In [14]:
import pandas as pd
import numpy as np
import emoji

In [15]:
# Load the two labelled tweet exports and stack them into a single frame.
# Going by the file names, 4443 matched + 557 unmatched rows give 5000 tweets.
# The label column differs per file ('final_rating' vs 'majority_rating'), so
# after stacking each row is NaN in the label column that came from the other file.
shared_columns = ['tweet_id', 'tweet_text', 'year']

df1 = pd.read_csv(
    'matched_tweets_4443.csv',
    usecols=shared_columns + ['final_rating'],
    index_col='tweet_id',
)
df2 = pd.read_csv(
    'unmatched_tweets_557.csv',
    usecols=shared_columns + ['majority_rating'],
    index_col='tweet_id',
)

# Row-wise concatenation (the default axis); tweet_id stays as the index.
df = pd.concat([df1, df2])

In [16]:
# Replace every emoji with its textual name.

def _emoji_to_words(text):
    """Return `text` with emojis spelled out as plain words.

    Space delimiters are used so the emoji name is surrounded by spaces
    rather than colons. Underscores and hyphens are then turned into spaces;
    note that this applies to the whole tweet, not only to the emoji names.
    """
    spelled_out = emoji.demojize(text, delimiters=(" ", " "))
    return spelled_out.replace("_", " ").replace("-", " ")


df['tweet_text'] = df['tweet_text'].apply(_emoji_to_words)

In [17]:
# Collapse the two label columns into one: use 'majority_rating' where it is
# present and fall back to 'final_rating' where it is missing.
majority_rating = df['majority_rating']
final_rating = df['final_rating']
df['merged_rating'] = majority_rating.fillna(value=final_rating)

In [18]:
# Write the combined frame to 'dataset.csv'.
# to_csv includes the index (tweet_id) by default, so it ends up as the first column.
df.to_csv(path_or_buf='dataset.csv')

In [19]:
# Split the labelled tweets into relevant (merged_rating == 1) and
# irrelevant (merged_rating == 0) subsets and save each one to its own CSV.

def _export_rating_subset(frame, rating, csv_path):
    """Select the tweets with a given rating, save them, and return them.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame with 'tweet_text', 'year' and 'merged_rating' columns.
    rating : int
        Value of 'merged_rating' to keep.
    csv_path : str
        Destination CSV file; written with the frame's index as first column.

    Returns
    -------
    pd.DataFrame
        The selected 'tweet_text' and 'year' columns, with the former index
        moved into a regular column and replaced by a fresh 0..n-1 index.
    """
    subset = frame.loc[frame['merged_rating'] == rating, ['tweet_text', 'year']]
    # Save before resetting the index so the CSV keeps the original index column.
    subset.to_csv(csv_path)
    # Return a new frame instead of mutating the slice in place.
    return subset.reset_index()


relevant_df = _export_rating_subset(df, 1, 'relevant_dataset.csv')
irrelevant_df = _export_rating_subset(df, 0, 'irrelevant_dataset.csv')


In [20]:
# Display the relevant tweets; tweet_id appears as an ordinary column because
# the index was reset after the CSV export.
relevant_df

Unnamed: 0,tweet_id,tweet_text,year
0,688636812869369856,solar to hydrogen homes no more lpg be sure to...,2016
1,1316724254357041152,when hydrogen made from water electrolysis is ...,2020
2,657892234281537536,동영상 space engineers update hydrogen thrusters ...,2015
3,570260992715841537,greet the flying bum aircrafts aerospace hydro...,2015
4,617365883695038464,bmw reveals hydrogen stealth car can go miles ...,2015
...,...,...,...
2731,1064418282067771392,high purity hydrogen gas generator stable v w ...,2018
2732,699903784525254656,new technique for turning sunlight into hydrog...,2016
2733,1580906325403312129,megaphone will invest € billion in france le...,2022
2734,978094796588961793,our industry model is completely backwards ups...,2018


In [21]:
from transformers import pipeline


In [22]:
# Pull just the tweet texts of the relevant tweets into a plain Python list,
# which is the input handed to the sentiment pipelines.
data = relevant_df['tweet_text'].tolist()


In [35]:
# Build a sentiment classifier from the BERTweet sentiment checkpoint and
# score every relevant tweet with it.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model='finiteautomata/bertweet-base-sentiment-analysis',
)
results = sentiment_pipeline(data)

Downloading (…)lve/main/config.json:   0%|          | 0.00/949 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/338 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

In [32]:
# Tabulate the pipeline output and count how many tweets received each label.
# NOTE(review): the saved output of this cell (In [32]) predates the pipeline
# run in In [35]; re-run this cell to refresh the counts.
results_df = pd.DataFrame(results)
# (A bare `results` expression used to sit here; it was a no-op because only
# the last expression of a cell is displayed.)
results_df.groupby(['label']).count()

Unnamed: 0_level_0,score
label,Unnamed: 1_level_1
NEGATIVE,1305
POSITIVE,1431


In [38]:
# Put the current `results` into a frame and show the number of tweets per label.
results2_df = pd.DataFrame(data=results)
results2_df.groupby('label').count()


Unnamed: 0_level_0,score
label,Unnamed: 1_level_1
NEG,99
NEU,1810
POS,827


In [39]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Classify the same tweets with the FinancialBERT sentiment checkpoint.
finbert_checkpoint = "ahmedrachid/FinancialBERT-Sentiment-Analysis"

tokenizer = BertTokenizer.from_pretrained(finbert_checkpoint)
# The classification head is loaded with three output labels.
model = BertForSequenceClassification.from_pretrained(finbert_checkpoint, num_labels=3)

nlp = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

# NOTE: despite its name, this holds the raw pipeline output, not a DataFrame;
# it is wrapped in a DataFrame in a later cell.
results3_df = nlp(data)




Downloading (…)lve/main/config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/369 [00:00<?, ?B/s]

In [43]:

# Wrap the FinancialBERT pipeline output in a DataFrame (rebinding the same
# name) and count the tweets per predicted label.
results3_df = pd.DataFrame(data=results3_df)
results3_df.groupby('label').count()

Unnamed: 0_level_0,score
label,Unnamed: 1_level_1
negative,49
neutral,2107
positive,580


In [44]:
from transformers import pipeline, set_seed
# Quick GPT-2 text-generation demo on a fixed prompt.
generator = pipeline(task='text-generation', model='gpt2')
set_seed(42)  # fix the RNG seed before sampling
prompt = "Hello, how are you,"
generator(prompt, max_length=30, num_return_sequences=5)


Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, how are you, my name Issei. I'm so sorry…\n\nHearing this I was confused.\n\nThis one is"},
 {'generated_text': "Hello, how are you, and what are you doing?'' Mr. Brown asks. Mrs. Brown looks at him nervously. ``Don't tell"},
 {'generated_text': 'Hello, how are you, why does the sky get blue, is that how all the sky is blue, I thought you told me that at a'},
 {'generated_text': 'Hello, how are you, a girl?"\n\nLil\' Ciscy sighed weakly. "My name is Ciscy, you'},
 {'generated_text': "Hello, how are you, dear child? Did I miss anything? It's alright. Not that I'm not sick of all that, I just"}]