In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
!pip install emoji
import emoji
!pip install autocorrect
from autocorrect import Speller

!pip install transformers
!pip install simpletransformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from simpletransformers.classification import ClassificationModel, ClassificationArgs

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from scipy.stats import pearsonr

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[K     |████████████████████████████████| 240 kB 9.9 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234926 sha256=51e59186f692aa5888bf9fd15477278798b72bfe28574f1791eeeb2fcc135ca0
  Stored in directory: /root/.cache/pip/wheels/86/62/9e/a6b27a681abcde69970dbc0326ff51955f3beac72f15696984
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.2.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[K     |████████████████████████████████| 622 kB 13.6 MB/s 
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorr

In [None]:
# Load the anger train split and the gold test split (tab-separated, header on the first row).
tsv_options = {"sep": "\t", "header": 0}
Train = pd.read_csv('/content/EI-reg-En-anger-train.txt', **tsv_options)
Test = pd.read_csv('/content/2018-EI-reg-En-anger-test-gold.txt', **tsv_options)

# Peek at ten random training rows.
Train.sample(10)

Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Score
1246,2017-En-11305,Get to the gym and discover I forgot to put my...,anger,0.562
1509,2017-En-11639,I'm not sure if burning and looting really can...,anger,0.312
181,2017-En-10642,one month til someone's bday and i think it's ...,anger,0.435
170,2017-En-10545,@bismahmalik .....Seems like a fight ready to ...,anger,0.438
1167,2017-En-10657,Can't wait to be in my Ninja turtle costume ra...,anger,0.375
1003,2017-En-10991,@SpookyHelder ...specifically are the cause of...,anger,0.417
136,2017-En-10079,What the fuck am I supposed to do with no lunc...,anger,0.729
1473,2017-En-10348,@KimLy resent,anger,0.521
916,2017-En-10182,@canada4trumpnow @donlemon he has BAD TEMPERAM...,anger,0.646
84,2017-En-11620,@Supergold2002 Terence- got a lot of anger iss...,anger,0.625


In [None]:
#Split data into train (9/10) and validate (1/10)
#to do so, randomly shuffle the dataset
# A fixed seed keeps the shuffle, and therefore every downstream score,
# reproducible across "Restart & Run All".
# NOTE: this cell reassigns Train, so re-running it on its own shrinks Train
# again -- reload the data first.
SPLIT_SEED = 42
n = int(len(Train)/10)
Train = Train.sample(frac=1, random_state=SPLIT_SEED)
# .copy() gives each split its own data, so later column assignments
# (e.g. the cleaned tweets) do not raise SettingWithCopyWarning.
Validate = Train.iloc[:n, :].copy()
Train = Train.iloc[n:, :].copy()

In [None]:
Validate["Intensity Score"].describe()

count    170.000000
mean       0.476218
std        0.178939
min        0.125000
25%        0.354000
50%        0.458000
75%        0.583000
max        0.938000
Name: Intensity Score, dtype: float64

In [None]:
Train["Intensity Score"].describe()

count    1531.000000
mean        0.501164
std         0.168568
min         0.032000
25%         0.375000
50%         0.492000
75%         0.625000
max         0.976000
Name: Intensity Score, dtype: float64

In [None]:
Test["Intensity Score"].describe()

count    1002.000000
mean        0.519358
std         0.189535
min         0.050000
25%         0.379000
50%         0.516000
75%         0.656000
max         0.953000
Name: Intensity Score, dtype: float64

# Data Cleaning

In [None]:
class cleanTweets:
    """
    DESCRIPTION: Class to clean tweets.

    All cleaning runs eagerly in ``__init__``; the only public method,
    ``cleanTweets()``, returns the already-cleaned Series.

    The order of the steps matters:
      1. strip URLs, @mentions and #hashtags
      2. convert emojis to their text names -- before punctuation removal,
         which would otherwise delete the emoji characters themselves
      3. drop punctuation (except ? and !) and digits, collapse whitespace
      4. correct spelling
      5. lowercase -- before stop-word removal, so capitalised stop words
         ("I", "The", ...) match the lowercase stop-word list
      6. remove stop words and lemmatize
    """

    def __init__(self, X: pd.Series) -> None:
        """Store the tweets and run the full cleaning pipeline on them.

        X is expected to hold plain strings; the caller's Series is not
        modified, every step rebinds ``self.X`` to a new Series.
        """
        self.X = X

        self.__removeURL()
        self.__removeUser()
        self.__removeHashtag()
        # Must precede punctuation removal: emoji characters are neither
        # word characters nor whitespace and would be stripped by it.
        self.__convertEmoji()
        self.__removePunctuation()
        self.__removeNumbers()
        self.__removeSpace()
        self.__spellingCorrection()
        # Must precede stop-word removal: the comparison there is
        # case-sensitive.
        self.__lower()
        self.__removeStopWords()
        self.__lemmatize()

    def cleanTweets(self) -> pd.Series:
        """
        DESCRIPTION: Function used to clean tweets data.
        Takes in a Series (via the constructor) and returns the cleaned Series.
        """
        return self.X

    def __removeURL(self) -> None:
        """Remove every token starting with ``http``."""
        self.X = self.X.apply(lambda x: re.sub(r"http\S+", "", x))

    def __removeUser(self) -> None:
        """Remove user mentions (an ``@`` and everything up to the next whitespace)."""
        self.X = self.X.apply(lambda x: re.sub(r"@\S+", "", x))

    def __removeHashtag(self) -> None:
        """Remove hashtags entirely (the ``#`` and the word attached to it)."""
        self.X = self.X.apply(lambda x: re.sub(r"#\S+", "", x))

    def __convertEmoji(self) -> None:
        """Replace each emoji with its text name.

        Space delimiters (instead of the default colons) keep adjacent
        emojis and neighbouring words from being glued into one token.
        """
        self.X = self.X.apply(lambda x: emoji.demojize(x, delimiters=(" ", " ")))

    def __removePunctuation(self) -> None:
        """Remove every character that is not ``?``, ``!``, a word character or whitespace."""
        self.X = self.X.apply(lambda x: re.sub(r"[^?!\w\s]", "", x))

    def __removeNumbers(self) -> None:
        """Remove all digit runs from the tweets."""
        self.X = self.X.apply(lambda x: re.sub(r"\d+", "", x))

    def __removeSpace(self) -> None:
        """Collapse every run of whitespace into a single space."""
        self.X = self.X.apply(lambda x: re.sub(r"\s+", " ", x))

    def __spellingCorrection(self) -> None:
        """Correct spelling so that misspelt variants of a word become uniform."""
        spell = Speller(lang='en')
        self.X = self.X.apply(lambda x: spell(x))

    def __lower(self) -> None:
        """Lowercase all strings."""
        self.X = self.X.apply(lambda x: x.lower())

    def __removeStopWords(self) -> None:
        """Remove English stop words, as they are not anger-specific."""
        stop_words = set(stopwords.words('english'))
        self.X = self.X.apply(
            lambda x: ' '.join(word for word in x.split() if word not in stop_words)
        )

    def __lemmatize(self) -> None:
        """Lemmatize each word to make the vocabulary a bit smaller."""
        lemmatizer = WordNetLemmatizer()
        self.X = self.X.apply(
            lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split())
        )

In [None]:
#clean training data
cleaned_train = cleanTweets(Train["Tweet"])
# Build a new frame with .assign() rather than writing the column into Train
# in place: Train may be a slice of another frame, where an in-place column
# write raises SettingWithCopyWarning.
Train = Train.assign(Tweet=cleaned_train.cleanTweets())
Train["Tweet"].sample(10)

1472                  here tip everyone stop offended end
1246      get gym discover i forgot put gym shoe back bad
1363    ended paying p half tube party dont even get p...
586                                 manchester derby home
513                     mad tilting? slightly rate? cool?
212     peter radio piece ball pure class dig revenge ...
215     egyptian official expressed frustration outrag...
528     lt feel everything she would remember everythi...
848                             men rage strike wish best
808          apparently he supposed scottish accent??? im
Name: Tweet, dtype: object

In [None]:
#clean validation data
cleaned_validate = cleanTweets(Validate["Tweet"])
# Build a new frame with .assign() rather than writing the column into
# Validate in place: Validate may be a slice of another frame, where an
# in-place column write raises SettingWithCopyWarning.
Validate = Validate.assign(Tweet=cleaned_validate.cleanTweets())

In [None]:
# Run the test tweets through the same cleaning class and store the result
# back in the "Tweet" column.
test_tweets = Test["Tweet"]
cleaned_test = cleanTweets(test_tweets)
Test["Tweet"] = cleaned_test.cleanTweets()

# General Model Setup

In [None]:
def _to_model_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the tweet text and its intensity score, renamed to the
    "text" / "labels" columns that the training and prediction cells read."""
    return df[["Tweet", "Intensity Score"]].rename(
        columns={"Tweet": "text", "Intensity Score": "labels"}
    )

#reduce columns and rename
Train_short = _to_model_frame(Train)
Test_short = _to_model_frame(Test)
Validate_short = _to_model_frame(Validate)

# Setting optional model configuration
model_args = ClassificationArgs()
model_args.num_train_epochs = 30
model_args.regression = True
model_args.overwrite_output_dir = True
# Fixed seed so that repeated training runs give comparable scores.
model_args.manual_seed = 42
# NOTE(review): evaluation_strategy, eval_steps and save_total_limit look like
# Hugging Face TrainingArguments names -- confirm simpletransformers honours
# them. Its documented switches for evaluating while training are
# evaluate_during_training / evaluate_during_training_steps, together with an
# eval_df passed to train_model().
model_args.evaluation_strategy ="steps"
model_args.logging_steps=1000
model_args.eval_steps = 10
model_args.save_total_limit = 1

# Model Training - Baseline Model

In [None]:
# Baseline: the roberta-base checkpoint with a single output (regression).
baseline_kwargs = {
    "num_labels": 1,
    "ignore_mismatched_sizes": True,
    "args": model_args,
    "use_cuda": True,
}
model_base = ClassificationModel('roberta', 'roberta-base', **baseline_kwargs)

# Fine-tune on the cleaned training split.
model_base.train_model(Train_short)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/1531 [00:00<?, ?it/s]

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

Running Epoch 0 of 30:   0%|          | 0/192 [00:00<?, ?it/s]



Running Epoch 1 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 2 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 3 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 4 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 5 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 6 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 7 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 8 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 9 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 10 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 11 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 12 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 13 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 14 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 15 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 16 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 17 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 18 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 19 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 20 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 21 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 22 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 23 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 24 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 25 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 26 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 27 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 28 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 29 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

(5760, 0.009215029667346641)

In [None]:
# Score the baseline model on the validation split.
validate_texts = Validate_short["text"].tolist()
predictions, raw_outputs = model_base.predict(validate_texts)

# Pearson correlation between the gold intensity scores and the predictions.
pearsonr(Validate_short["labels"].tolist(), predictions)[0]

  0%|          | 0/170 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

0.7722410084083505

# Model Training - Emotion Model

In [None]:
# Emotion model: the cardiffnlp/twitter-roberta-base-emotion checkpoint with a
# single output (regression); ignore_mismatched_sizes lets the differently
# sized classifier head be re-initialised.
emotion_kwargs = {
    "num_labels": 1,
    "ignore_mismatched_sizes": True,
    "args": model_args,
    "use_cuda": True,
}
model = ClassificationModel('roberta',
                            'cardiffnlp/twitter-roberta-base-emotion',
                            **emotion_kwargs)

# Fine-tune on the cleaned training split.
model.train_model(Train_short)

Downloading:   0%|          | 0.00/768 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([1, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

  0%|          | 0/1531 [00:00<?, ?it/s]

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

Running Epoch 0 of 30:   0%|          | 0/192 [00:00<?, ?it/s]



Running Epoch 1 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 2 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 3 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 4 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 5 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 6 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 7 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 8 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 9 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 10 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 11 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 12 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 13 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 14 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 15 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 16 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 17 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 18 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 19 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 20 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 21 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 22 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 23 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 24 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 25 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 26 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 27 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 28 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

Running Epoch 29 of 30:   0%|          | 0/192 [00:00<?, ?it/s]

(5760, 0.0069217452968726115)

In [None]:
# Score the emotion model on the validation split.
validate_texts = Validate_short["text"].tolist()
predictions, raw_outputs = model.predict(validate_texts)

# Pearson correlation between the gold intensity scores and the predictions.
pearsonr(Validate_short["labels"].tolist(), predictions)[0]

  0%|          | 0/170 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

0.8008023475420939

# Evaluation on Test Set

In [None]:
# Baseline model: predict intensity for the test tweets.
test_texts = Test_short["text"].tolist()
predictions, raw_outputs = model_base.predict(test_texts)

# Pearson correlation between the gold intensity scores and the predictions.
pearsonr(Test_short["labels"].tolist(), predictions)[0]

  0%|          | 0/1002 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

0.7782347430821897

In [None]:
# Emotion model: predict intensity for the test tweets.
test_texts = Test_short["text"].tolist()
predictions, raw_outputs = model.predict(test_texts)

# Pearson correlation between the gold intensity scores and the predictions.
pearsonr(Test_short["labels"].tolist(), predictions)[0]

  0%|          | 0/1002 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

0.7980677990705975