In [1]:
#%%
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer 
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pickle
from gensim.models import word2vec
anger = pd.read_csv('/Users/yindima/Desktop/170 project/Train dataset/anger-ratings-0to1.train.txt', sep = '\t', names =  ['id','text','emotion','intensity'])
fear = pd.read_csv('/Users/yindima//Desktop/170 project/Train dataset/fear-ratings-0to1.train.txt', sep = '\t', names =  ['id','text','emotion','intensity'])
joy = pd.read_csv('/Users/yindima//Desktop/170 project/Train dataset/joy-ratings-0to1.train.txt', sep = '\t', names =  ['id','text','emotion','intensity'])
sadness = pd.read_csv('/Users/yindima//Desktop/170 project/Train dataset/sadness-ratings-0to1.train.txt', sep = '\t', names =  ['id','text','emotion','intensity'])

data = pd.concat([anger, fear, joy, sadness],ignore_index=True)

Our labeled twitter data is from http://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html

In [2]:
data_cleaned = data.drop(['id','intensity'], axis=1)

In [3]:
data_cleaned.head(10)

Unnamed: 0,text,emotion
0,How the fu*k! Who the heck! moved my fridge!.....,anger
1,So my Indian Uber driver just called someone t...,anger
2,@DPD_UK I asked for my parcel to be delivered ...,anger
3,so ef whichever butt wipe pulled the fire alar...,anger
4,Don't join @BTCare they put the phone down on ...,anger
5,My blood is boiling,anger
6,When you've still got a whole season of Wentwo...,anger
7,@bt_uk why does tracking show my equipment del...,anger
8,@TeamShanny legit why i am so furious with him...,anger
9,How is it suppose to work if you do that? Wtf ...,anger


# Preprocessing data
## uniformity
We firstly need to bring the words into uniform, so we would need to make every word lowercase and remove punctuation and stop words.

In [4]:
#Making all letters lowercase
data_cleaned['text'] = data_cleaned['text'].apply(lambda x: x.lower())
#Removing Punctuation, Symbols
data_cleaned['text'] = data_cleaned['text'].str.replace('[^\w\s]',' ')
#Removing Stop Words
stop_words = stopwords.words('english')
for i in range(len(data_cleaned)):
    text = ''
    for word in data_cleaned.loc[i]['text'].split():
        if word not in stop_words:
            text+=' '+word
    data_cleaned.loc[i]['text'] = text.strip(' ')


In [5]:
data_cleaned.head(10)

Unnamed: 0,text,emotion
0,fu k heck moved fridge knock landlord door ang...,anger
1,indian uber driver called someone n word movin...,anger
2,dpd_uk asked parcel delivered pick store addre...,anger
3,ef whichever butt wipe pulled fire alarm davis...,anger
4,join btcare put phone talk rude taking money a...,anger
5,blood boiling,anger
6,still got whole season wentworth watch stupid ...,anger
7,bt_uk tracking show equipment delivered servic...,anger
8,teamshanny legit furious people fucking idiots,anger
9,suppose work wtf dude thanks pissing,anger


## Lemmatisation
we will need to return the words into their root 

In [6]:
lemmatizer = WordNetLemmatizer()
for i in range(len(data_cleaned)):
    text = ''
    for word in data_cleaned.loc[i]['text'].split():
        text+=' '+lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(word),pos = 'a'), pos = 'v')
    data_cleaned.loc[i]['text'] = text.strip(' ')

In [7]:
data_cleaned.head(10)

Unnamed: 0,text,emotion
0,fu k heck move fridge knock landlord door angr...,anger
1,indian uber driver call someone n word move ve...,anger
2,dpd_uk ask parcel deliver pick store address f...,anger
3,ef whichever butt wipe pull fire alarm davis b...,anger
4,join btcare put phone talk rude take money acc...,anger
5,blood boil,anger
6,still get whole season wentworth watch stupid ...,anger
7,bt_uk track show equipment deliver service sud...,anger
8,teamshanny legit furious people fuck idiot,anger
9,suppose work wtf dude thank piss,anger


### removing rare words
firstly get the rarest 5,000 words, since they may not valueable for being feature, so we will remove them from data

In [8]:
word_dict = {}
for i in range(len(data_cleaned)):
    for word in data_cleaned.loc[i]['text'].split():
        if word in word_dict:
            word_dict[word]+=1
        else:
            word_dict[word] = 1
rarest_words = sorted(list(word_dict.keys()), key = lambda x:word_dict[x])[:5000]

for i in range(len(data_cleaned)):
    text = ''
    for word in data_cleaned.loc[i]['text'].split():
        if word not in rarest_words:
            text+=' '+word
    data_cleaned.loc[i]['text'] = text.strip(' ')

In [9]:
data_cleaned.head(10)

Unnamed: 0,text,emotion
0,move knock door angry mad,anger
1,indian uber driver call someone n word move ve...,anger
2,ask deliver pick store address fume,anger
3,whichever butt wipe pull fire alarm bc sound p...,anger
4,join put phone talk rude take money acc fume,anger
5,blood boil,anger
6,still get whole season watch stupid cunt work ...,anger
7,track show equipment deliver service already 3...,anger
8,legit furious people fuck idiot,anger
9,suppose work wtf dude thank piss,anger


### start from simplest model, only care about anger and joy

In [10]:
anger_and_joy_cleaned = pd.concat([data_cleaned[data_cleaned['emotion'] == 'joy'],data_cleaned[data_cleaned['emotion'] == 'anger']], ignore_index = True)

In [11]:
anger_and_joy_cleaned.head()

Unnamed: 0,text,emotion
0,get back see garydelaney burslem amaze face st...,joy
1,oh dear even absolute hilarity think laugh muc...,joy
2,wait week game cheer friday,joy
3,thank much sweet thoughtful make day joyful love,joy
4,feel bless work family nothing love amp make s...,joy


### Feature extraction
#### We will try 2 different feature extraction, TF-IDF and count vector

In [103]:
#TF-IDF
#This parameter gives the relative importance of a term in the data 
#and is a measure of how frequently and rarely it appears in the text. 
#This can be directly extracted in python
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(anger_and_joy_cleaned.emotion.values)
X_train, X_val, y_train, y_val = train_test_split(anger_and_joy_cleaned.text.values, y, stratify=y, random_state=23, test_size=0.1, shuffle=True)

In [104]:
tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.fit_transform(X_val)

In [105]:
# Model 1: Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_val_tfidf)
print("naive bayes tfidf accuracy %s" %  accuracy_score(y_pred, y_val))
# Model 2: Linear SVM
from sklearn.linear_model import SGDClassifier
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_tfidf, y_train)
y_pred = lsvm.predict(X_val_tfidf)
print('svm using tfidf accuracy %s' % accuracy_score(y_pred, y_val))

# Model 3: logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_val_tfidf)
print('log reg tfidf accuracy %s' % accuracy_score(y_pred, y_val))

# Model 4: Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_val_tfidf)
print('random forest tfidf accuracy %s' % accuracy_score(y_pred, y_val))


naive bayes tfidf accuracy 0.4226190476190476
svm using tfidf accuracy 0.5119047619047619
log reg tfidf accuracy 0.4583333333333333




random forest tfidf accuracy 0.44642857142857145


In [107]:
#count vector
#This is another feature we consider and as the name suggests we transform our tweet into an
#array having the count of appearances of each word in it. 
#The intuition here is that the text that conveys similar emotions may have the same words repeated over 
#and over again. This is more like the direct approach.
count_vect = CountVectorizer(analyzer='word',ngram_range = (1,2))
count_vect.fit(anger_and_joy_cleaned['text'])
X_train_count =  count_vect.transform(X_train)
X_val_count =  count_vect.transform(X_val)

In [108]:
# Model 1: Multinomial Naive Bayes Classifier

nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_val))

# Model 2: Linear SVM

lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_val))

# Model 3: Logistic Regression

logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_val))

# Model 4: Random Forest Classifier

rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_count, y_train)
y_pred = rf.predict(X_val_count)
print('random forest with count vectors accuracy %s' % accuracy_score(y_pred, y_val))

naive bayes count vectors accuracy 0.9404761904761905
lsvm using count vectors accuracy 0.9523809523809523
log reg count vectors accuracy 0.9345238095238095




random forest with count vectors accuracy 0.9285714285714286


# Try on 4 classes with count vector

In [26]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data_cleaned.emotion.values)
X_train, X_val, y_train, y_val = train_test_split(data_cleaned.text.values, y, stratify=y, random_state=23, test_size=0.1, shuffle=True)

count_vect = CountVectorizer(analyzer='word',ngram_range = (1,2))
count_vect.fit(data_cleaned['text'])
X_train_count =  count_vect.transform(X_train)
X_val_count =  count_vect.transform(X_val)

print(lbl_enc.__dict__)


{'classes_': array(['anger', 'fear', 'joy', 'sadness'], dtype=object)}


In [27]:
# Model 1: Multinomial Naive Bayes Classifier

nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_val))

# Model 2: Linear SVM

lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_val))

# Model 3: Logistic Regression

logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_val))

# Model 4: Random Forest Classifier

rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_count, y_train)
y_pred = rf.predict(X_val_count)
print('random forest with count vectors accuracy %s' % accuracy_score(y_pred, y_val))

naive bayes count vectors accuracy 0.8480662983425414
lsvm using count vectors accuracy 0.9005524861878453
log reg count vectors accuracy 0.8784530386740331




random forest with count vectors accuracy 0.8453038674033149


# Try on more test data

In [28]:
anger_test = pd.read_csv('/Users/yindima//Desktop/170 project/Test dataset/anger-ratings-0to1.test.gold.txt', sep = '\t', names =  ['id','text','emotion','intensity'])
fear_test = pd.read_csv('/Users/yindima//Desktop/170 project/Test dataset/fear-ratings-0to1.test.gold.txt', sep = '\t', names =  ['id','text','emotion','intensity'])
joy_test = pd.read_csv('/Users/yindima//Desktop/170 project/Test dataset/joy-ratings-0to1.test.gold.txt', sep = '\t', names =  ['id','text','emotion','intensity'])
sadness_test = pd.read_csv('/Users/yindima//Desktop/170 project/Test dataset/sadness-ratings-0to1.test.gold.txt', sep = '\t', names =  ['id','text','emotion','intensity'])

test_data = pd.concat([anger_test, fear_test, joy_test, sadness_test],ignore_index=True)
test_data_cleaned = test_data.drop(['id','intensity'], axis=1)

In [29]:
def unification(data_cleaned):
    #Making all letters lowercase
    data_cleaned['text'] = data_cleaned['text'].apply(lambda x: x.lower())
    #Removing Punctuation, Symbols
    data_cleaned['text'] = data_cleaned['text'].str.replace('[^\w\s]',' ')
    #Removing Stop Words
    stop_words = stopwords.words('english')
    for i in range(len(data_cleaned)):
        text = ''
        for word in data_cleaned.loc[i]['text'].split():
            if word not in stop_words:
                text+=' '+word
        data_cleaned.loc[i]['text'] = text.strip(' ')
    return data_cleaned

def lemmatisation(data_cleaned):
    lemmatizer = WordNetLemmatizer()
    for i in range(len(data_cleaned)):
        text = ''
        for word in data_cleaned.loc[i]['text'].split():
            text+=' '+lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(word),pos = 'a'), pos = 'v')
        data_cleaned.loc[i]['text'] = text.strip(' ')
    return data_cleaned

def removing_rare_words(data_cleaned, num_to_ignore = 5000):
    word_dict = {}
    for i in range(len(data_cleaned)):
        for word in data_cleaned.loc[i]['text'].split():
            if word in word_dict:
                word_dict[word]+=1
            else:
                word_dict[word] = 1
    rarest_words = sorted(list(word_dict.keys()), key = lambda x:word_dict[x])[:num_to_ignore]

    for i in range(len(data_cleaned)):
        text = ''
        for word in data_cleaned.loc[i]['text'].split():
            if word not in rarest_words:
                text+=' '+word
        data_cleaned.loc[i]['text'] = text.strip(' ')
    return data_cleaned


    

In [30]:
test_data_cleaned = removing_rare_words(lemmatisation(unification(test_data_cleaned)))

In [31]:
test_data_cleaned.head()

Unnamed: 0,text,emotion
0,point today someone say something kind burst eye,anger
1,game day minus 14 30 relentless,anger
2,game piss game year blood boil time turn,anger
3,find candice candace pout like,anger
4,come mum 25k tweet,anger


In [32]:
lbl_enc = preprocessing.LabelEncoder()
y_test = lbl_enc.fit_transform(test_data_cleaned.emotion.values)
x_test = test_data_cleaned['text']
x_test_count =  count_vect.transform(x_test)

In [33]:
# Model 1: Multinomial Naive Bayes Classifier


y_pred = nb.predict(x_test_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_test))

# Model 2: Linear SVM


y_pred = lsvm.predict(x_test_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_test))

# Model 3: Logistic Regression


y_pred = logreg.predict(x_test_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_test))

# Model 4: Random Forest Classifier


y_pred = rf.predict(x_test_count)
print('random forest with count vectors accuracy %s' % accuracy_score(y_pred, y_test))

naive bayes count vectors accuracy 0.746658179503501
lsvm using count vectors accuracy 0.821769573520051
log reg count vectors accuracy 0.8128580521960534
random forest with count vectors accuracy 0.8157224697644813


# Try on dev data

In [34]:
anger_dev = pd.read_csv('/Users/yindima//Desktop/170 project/Dev dataset/anger-ratings-0to1.dev.gold.txt', sep = '\t', names =  ['id','text','emotion','intensity'])
fear_dev = pd.read_csv('/Users/yindima//Desktop/170 project/Dev dataset/fear-ratings-0to1.dev.gold.txt', sep = '\t', names =  ['id','text','emotion','intensity'])
joy_dev = pd.read_csv('/Users/yindima//Desktop/170 project/Dev dataset/joy-ratings-0to1.dev.gold.txt', sep = '\t', names =  ['id','text','emotion','intensity'])
sadness_dev = pd.read_csv('/Users/yindima//Desktop/170 project/Dev dataset/sadness-ratings-0to1.dev.gold.txt', sep = '\t', names =  ['id','text','emotion','intensity'])

dev_data = pd.concat([anger_dev, fear_dev, joy_dev, sadness_dev],ignore_index=True)
dev_data_cleaned = dev_data.drop(['id','intensity'], axis=1)

In [35]:
dev_data_cleaned = removing_rare_words(lemmatisation(unification(dev_data_cleaned)),1000)
dev_data_cleaned.head()

Unnamed: 0,text,emotion
0,dont insult,anger
1,would take offense actually snap,anger
2,game affront god man must never speak,anger
3,ask start rag call,anger
4,sometimes get mad something minuscule try ruin...,anger


In [36]:
lbl_enc = preprocessing.LabelEncoder()
y_dev = lbl_enc.fit_transform(dev_data_cleaned.emotion.values)
x_dev = dev_data_cleaned['text']
x_dev_count =  count_vect.transform(x_dev)

In [37]:
# Model 1: Multinomial Naive Bayes Classifier


y_pred = nb.predict(x_dev_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_dev))

# Model 2: Linear SVM


y_pred = lsvm.predict(x_dev_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_dev))

# Model 3: Logistic Regression


y_pred = logreg.predict(x_dev_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_dev))

# Model 4: Random Forest Classifier


y_pred = rf.predict(x_dev_count)
print('random forest with count vectors accuracy %s' % accuracy_score(y_pred, y_dev))

naive bayes count vectors accuracy 0.7463976945244957
lsvm using count vectors accuracy 0.7982708933717579
log reg count vectors accuracy 0.7896253602305475
random forest with count vectors accuracy 0.7665706051873199


# Save the model into file

In [128]:
# save the model to disk
pickle.dump(logreg, open('/Users/yindima/Desktop/170 project/emotion_model/logreg_model', 'wb'))
# save the model to disk
pickle.dump(lsvm, open('/Users/yindima/Desktop/170 project/emotion_model/lsvm_model', 'wb'))

# Try other modern word embedding tools

### BERT

In [66]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [67]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()
data_cleaned_index = data_cleaned.copy()
data_cleaned_index['seg'] = data_cleaned_index['text']
for i in range(len(data_cleaned_index)):
    data_cleaned_index.loc[i]['text']  = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(data_cleaned_index.loc[i]['text']))
    data_cleaned_index.loc[i]['seg'] = [1]*len(data_cleaned_index.loc[i]['text'])
   
    tokens_tensor = torch.tensor([data_cleaned_index.loc[i]['text']])
    segments_tensors = torch.tensor([data_cleaned_index.loc[i]['seg']])
    
    
    encoded_layers, x = model(tokens_tensor, segments_tensors)
    
    token_embeddings = torch.stack(encoded_layers, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1,0,2)
    
    token_vecs = encoded_layers[11][0]
    # Calculate the average of all 22 token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)
    data_cleaned_index.loc[i]['text'] = sentence_embedding
    del tokens_tensor,segments_tensors,encoded_layers, x,token_embeddings,token_vecs,sentence_embedding
data_cleaned_index.head()

Unnamed: 0,text,emotion,seg
0,"[tensor(-0.0974, grad_fn=<SelectBackward>), te...",anger,"[1, 1, 1, 1, 1]"
1,"[tensor(-0.1502, grad_fn=<SelectBackward>), te...",anger,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
2,"[tensor(0.1921, grad_fn=<SelectBackward>), ten...",anger,"[1, 1, 1, 1, 1, 1, 1]"
3,"[tensor(0.1514, grad_fn=<SelectBackward>), ten...",anger,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,"[tensor(0.0650, grad_fn=<SelectBackward>), ten...",anger,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"


In [68]:
for i in range(len(data_cleaned_index)):
    data_cleaned_index.loc[i]['text'] = data_cleaned_index.loc[i]['text'].detach().numpy()

In [69]:
data_cleaned_index.head()

Unnamed: 0,text,emotion,seg
0,"[-0.09739778, 0.6434248, -0.30084196, -0.60367...",anger,"[1, 1, 1, 1, 1]"
1,"[-0.15020666, -0.025147611, 0.58002543, -0.028...",anger,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
2,"[0.19213702, 0.16715051, 0.5674692, -0.3713170...",anger,"[1, 1, 1, 1, 1, 1, 1]"
3,"[0.15143839, 0.3220565, 0.3240387, -0.00035042...",anger,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,"[0.06495898, -0.0065595717, 0.5851037, 0.08516...",anger,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"


In [70]:
data_cleaned_index_c = data_cleaned_index.copy()
for i in range(len(data_cleaned_index_c)):
    data_cleaned_index_c.loc[i]['text'] = list(data_cleaned_index_c.loc[i]['text'])


In [71]:
lbl_enc = preprocessing.LabelEncoder()
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

y = lbl_enc.fit_transform(data_cleaned_index.emotion.values)
X_train_bert, X_val_bert, y_train_bert, y_val_bert = train_test_split(data_cleaned_index.text.values, y, stratify=y, random_state=23, test_size=0.1, shuffle=True)
scaler.fit(list(data_cleaned_index.text.values))
X_train_bert = scaler.transform(list(X_train_bert))
X_val_bert = scaler.transform(list(X_val_bert))

In [72]:
##### Model 1: Multinomial Naive Bayes Classifier

nb = MultinomialNB()
nb.fit(list(X_train_bert), y_train_bert)
y_pred = nb.predict(X_val_bert)
print("naive bayes BERT accuracy %s" %  accuracy_score(y_pred, y_val_bert))
# Model 2: Linear SVM

lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_bert, y_train_bert)
y_pred = lsvm.predict(X_val_bert)
print('svm using BERT accuracy %s' % accuracy_score(y_pred, y_val_bert))

# Model 3: logistic regression

logreg = LogisticRegression(C=1)
logreg.fit(X_train_bert, y_train_bert)
y_pred = logreg.predict(X_val_bert)
print('log reg BERT accuracy %s' % accuracy_score(y_pred, y_val_bert))

# Model 4: Random Forest Classifier

rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_bert, y_train_bert)
y_pred = rf.predict(X_val_bert)
print('random forest BERT accuracy %s' % accuracy_score(y_pred, y_val_bert))

naive bayes BERT accuracy 0.39502762430939226
svm using BERT accuracy 0.5524861878453039




log reg BERT accuracy 0.5939226519337016
random forest BERT accuracy 0.5027624309392266


# Classify relevant and irrelevant tweets

In [73]:
relevant = pd.read_csv('/Users/yindima/Desktop/170 project/relevant.csv')
relevant.head()

Unnamed: 0,tweetid,text,d,lb,r,Unnamed: 5,Unnamed: 6
0,1.242949e+18,@instinctnaturel @TJCrick4 @dilenz2 @rachs80 @...,2020-03-25,,0.0,,
1,1.242718e+18,"RT @RebekahsRight: ""The SARS-CoV-2 coronavirus...",2020-03-25,4.0,1.0,,
2,1.242828e+18,@realDonaldTrump Ozone is one the most powerfu...,2020-03-25,1.0,0.0,,
3,1.242881e+18,RT @OccupyOneLove: @LillianOrlando4 @oveoblu ....,2020-03-25,4.0,0.0,,
4,1.242893e+18,@ScottyDdoogie @unperturbable @RepAdamSchiff W...,2020-03-25,2.0,1.0,,


In [74]:
with_r = relevant[pd.notna(relevant['r'])].reset_index()
with_r.head()

Unnamed: 0,index,tweetid,text,d,lb,r,Unnamed: 5,Unnamed: 6
0,0,1.242949e+18,@instinctnaturel @TJCrick4 @dilenz2 @rachs80 @...,2020-03-25,,0.0,,
1,1,1.242718e+18,"RT @RebekahsRight: ""The SARS-CoV-2 coronavirus...",2020-03-25,4.0,1.0,,
2,2,1.242828e+18,@realDonaldTrump Ozone is one the most powerfu...,2020-03-25,1.0,0.0,,
3,3,1.242881e+18,RT @OccupyOneLove: @LillianOrlando4 @oveoblu ....,2020-03-25,4.0,0.0,,
4,4,1.242893e+18,@ScottyDdoogie @unperturbable @RepAdamSchiff W...,2020-03-25,2.0,1.0,,


In [75]:
relevant_cleaned = removing_rare_words(lemmatisation(unification(with_r)),10000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [76]:
relevant_cleaned

Unnamed: 0,index,tweetid,text,d,lb,r,Unnamed: 5,Unnamed: 6
0,0,1.242949e+18,instinctnaturel tjcrick4 dilenz2 rachs80 ...,2020-03-25,,0.0,,
1,1,1.242718e+18,rt rebekahsright the sars cov 2 coronavirus...,2020-03-25,4.0,1.0,,
2,2,1.242828e+18,realdonaldtrump ozone is one the most powerfu...,2020-03-25,1.0,0.0,,
3,3,1.242881e+18,rt occupyonelove lillianorlando4 oveoblu ...,2020-03-25,4.0,0.0,,
4,4,1.242893e+18,scottyddoogie unperturbable repadamschiff w...,2020-03-25,2.0,1.0,,
5,5,1.242924e+18,but the virus and the disease it causes in hum...,2020-03-25,2.0,1.0,,
6,6,1.242933e+18,w_terrence covid 19 chinese originated viru...,2020-03-25,,1.0,,
7,7,1.242931e+18,secpompeo this particular coronavirus has a n...,2020-03-25,5.0,0.0,,
8,8,1.242697e+18,realdonaldtrump the novel strain of coronavir...,2020-03-25,2.0,0.0,,
9,9,1.242705e+18,lilyella_tang ptcwang jlin7 official names ...,2020-03-25,5.0,1.0,,


In [77]:
#fit relevant data into count vector
lbl_enc = preprocessing.LabelEncoder()
y_rel = lbl_enc.fit_transform(relevant_cleaned.r.values)
X_train_rel, X_val_rel, y_train_rel, y_val_rel = train_test_split(relevant_cleaned.text.values, y_rel, stratify=y_rel, random_state=23, test_size=0.4, shuffle=True)

count_vect = CountVectorizer(analyzer='word')
count_vect.fit(relevant_cleaned['text'])
X_train_rel_count =  count_vect.transform(X_train_rel)
X_val_rel_count =  count_vect.transform(X_val_rel)



In [78]:
# Model 1: Multinomial Naive Bayes Classifier

nb = MultinomialNB()
nb.fit(X_train_rel_count, y_train_rel)
y_pred = nb.predict(X_val_rel_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_val_rel))

# Model 2: Linear SVM

lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_rel_count, y_train_rel)
y_pred = lsvm.predict(X_val_rel_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_val_rel))

# Model 3: Logistic Regression

logreg = LogisticRegression(C=1)
logreg.fit(X_train_rel_count, y_train_rel)
y_pred = logreg.predict(X_val_rel_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_val_rel))

# Model 4: Random Forest Classifier

rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_rel_count, y_train_rel)
y_pred = rf.predict(X_val_rel_count)
print('random forest with count vectors accuracy %s' % accuracy_score(y_pred, y_val_rel))

naive bayes count vectors accuracy 0.5642540620384048
lsvm using count vectors accuracy 0.5974889217134417
log reg count vectors accuracy 0.5952732644017725




random forest with count vectors accuracy 0.5989660265878878


In [79]:
#Try TF-IDF
lbl_enc = preprocessing.LabelEncoder()
y_rel = lbl_enc.fit_transform(relevant_cleaned.r.values)
X_train_rel, X_val_rel, y_train_rel, y_val_rel = train_test_split(relevant_cleaned.text.values, y_rel, stratify=y_rel, random_state=23, test_size=0.4, shuffle=True)
tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))
X_train_tfidf = tfidf.fit_transform(X_train_rel)
X_val_tfidf = tfidf.fit_transform(X_val_rel)

# Model 1: Multinomial Naive Bayes Classifier

nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train_rel)
y_pred = nb.predict(X_val_tfidf)
print("naive bayes tfidf accuracy %s" %  accuracy_score(y_pred, y_val_rel))

# Model 2: Linear SVM

lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_tfidf, y_train_rel)
y_pred = lsvm.predict(X_val_tfidf)
print('svm using tfidf accuracy %s' % accuracy_score(y_pred, y_val_rel))

# Model 3: logistic regression

logreg = LogisticRegression(C=1)
logreg.fit(X_train_tfidf, y_train_rel)
y_pred = logreg.predict(X_val_tfidf)
print('log reg tfidf accuracy %s' % accuracy_score(y_pred, y_val_rel))

# Model 4: Random Forest Classifier

rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_tfidf, y_train_rel)
y_pred = rf.predict(X_val_tfidf)
print('random forest tfidf accuracy %s' % accuracy_score(y_pred, y_val_rel))



naive bayes tfidf accuracy 0.5480059084194978
svm using tfidf accuracy 0.5753323485967504
log reg tfidf accuracy 0.5672082717872969




random forest tfidf accuracy 0.5576070901033974


In [80]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
