In [1]:
import torch
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import emoji
from tqdm import tqdm
from scipy.special import expit
from collections import defaultdict
from pysentimiento import create_analyzer

In [3]:
# Load the tweet export and discard every row that has no tweet_id,
# then preview the first rows of the resulting frame.
tweets = (
    pd.read_csv("tweets_data.csv")
    .dropna(subset=["tweet_id"])
)
tweets.head()

Unnamed: 0,author_id,created_at,geo,tweet_id,like_count,quote_count,reply_count,retweet_count,text
0,431948312,2017-01-30 23:55:49+00:00,,826217383039729664,0.0,0.0,0.0,0.0,We have the power👊💥😂💚 https://t.co/8en3JNwkny
1,418718492,2017-05-30 23:56:15+00:00,,869704038048202752,0.0,0.0,0.0,0.0,Sleep would be nice but Maiyahs cosies are bet...
2,340099697,2016-10-30 23:57:35+00:00,{'place_id': '31fd9d42c71e7afa'},792878143887605760,1.0,0.0,0.0,0.0,Lynn mad af cause Eric ate her chocolate chip ...
3,1249825087525728257,2020-10-30 23:59:39+00:00,,1322327339200630784,0.0,0.0,0.0,0.0,@KaileGedye text me cus I haven’t 🤣
4,205097885,2018-03-30 23:54:29+00:00,,979869505622069248,0.0,0.0,1.0,0.0,@juanlabia @Skelebird Yea but i feel like you ...


In [6]:
# Column of raw tweet bodies that every model below iterates over.
text = tweets["text"]

### Initialize sentiment and emotion models.

I used the [pysentimiento](https://github.com/pysentimiento/pysentimiento) package to derive the sentiment and emotion scores. The data produced from these models is saved to pickle files in the cells below.


The sentiment analysis model classifies three classes: `POS`, `NEG`, `NEU`


The emotion analysis model classifies seven classes: `anger`, `joy`, `sadness`, `fear`, `surprise`, `disgust`, `neutral`

In [4]:
# Build one English pysentimiento analyzer per task; the generator is consumed
# left to right, so the sentiment analyzer is created before the emotion one.
sentiment_analyzer, emotion_analyzer = (
    create_analyzer(task=task_name, lang="en")
    for task_name in ("sentiment", "emotion")
)


Iterate over the corpus, extract the sentiment probabilities for each tweet, and collect the output in a list.

In [5]:
# One entry per tweet, in corpus order: the `probas` attribute of the
# sentiment analyzer's prediction for that tweet's text.
sentiments = [
    sentiment_analyzer.predict(tweet_text).probas
    for tweet_text in tqdm(text.tolist())
]

Save the sentiment scores to a pickle file

In [None]:
# Turn the per-tweet sentiment results into a frame whose rows are labeled by
# tweet_id, then persist it.
# `data` is passed by keyword: Python rejects a positional argument placed
# after a keyword argument, so `pd.DataFrame(index=..., sentiments)` cannot run.
# The list was built by iterating the tweets in order, so entry i is labeled
# with the i-th tweet_id.
sentiments = pd.DataFrame(data=sentiments, index=tweets.tweet_id)
sentiments.to_pickle("sentiment.pkl")

Repeat the same process for emotions.

In [13]:
# One entry per tweet, in corpus order: the `probas` attribute of the
# emotion analyzer's prediction for that tweet's text.
emotions = [
    emotion_analyzer.predict(tweet_text).probas
    for tweet_text in tqdm(text.tolist())
]

In [None]:
# Turn the per-tweet emotion results into a frame whose rows are labeled by
# tweet_id, then persist it.
# `data` is passed by keyword: Python rejects a positional argument placed
# after a keyword argument, so `pd.DataFrame(index=..., emotions)` cannot run.
# The list was built by iterating the tweets in order, so entry i is labeled
# with the i-th tweet_id.
emotions = pd.DataFrame(data=emotions, index=tweets.tweet_id)
emotions.to_pickle("emotions.pkl")

Initialize [topic model](https://huggingface.co/cardiffnlp/tweet-topic-21-multi) tokenizer and classifier.

In [6]:
# The tokenizer and the classifier are loaded from the same checkpoint.
topic_checkpoint = "cardiffnlp/twitter-roberta-base-dec2021-tweet-topic-multi-all"

tokenizer = AutoTokenizer.from_pretrained(topic_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    topic_checkpoint,
    problem_type="multi_label_classification",
)
# Switch the model to evaluation mode before running inference.
model.eval()

# Mapping from class id to topic label, used later to name the score columns.
class_mapping = model.config.id2label

### Topics:

![](topics.png)

Extract topic scores and collect them.

In [None]:
# Score every tweet against the topic classifier and collect one array of
# per-topic scores per tweet, in corpus order.
topic_scores = []
# Inference only: with autograd disabled no computation graph is recorded for
# each forward pass, which saves memory and time over a long corpus.
with torch.no_grad():
    for txt in tqdm(text.tolist()):
        # NOTE(review): no truncation is requested here; a text longer than the
        # model's maximum sequence length would presumably fail — confirm
        # whether `truncation=True` is wanted.
        tokens = tokenizer(txt, return_tensors='pt')
        output = model(**tokens)

        # A single text is encoded per call, so take the first row of the
        # first output. No `.detach()` is needed because gradients are off.
        scores = output[0][0].numpy()
        # Element-wise logistic sigmoid: each topic gets its own independent
        # score in (0, 1), matching the multi-label setup of the model.
        scores = expit(scores)
        topic_scores.append(scores)

Save topic scores to a pickle file.

In [None]:
# Column names are the topic labels, in the order the id -> label mapping
# yields them.
cols = [label for label in class_mapping.values()]

# Rows are labeled by tweet_id; each row holds one tweet's per-topic scores.
topic_scores = pd.DataFrame(
    data=topic_scores,
    columns=cols,
    index=tweets.tweet_id,
)
topic_scores.to_pickle('topic_scores_df.pkl')