In [1]:
# basic imports and loading of data

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

original = pd.read_csv('TweetsNR.csv').dropna()
dataframe = original.copy()
pd.set_option('display.max_rows', 100)

original

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I'd have responded, if I were going","I'd have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn't they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I've wondered about rake to. The client has ...,", don't force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [2]:
# basic info

original.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27480 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27480 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27480 non-null  object
dtypes: object(4)
memory usage: 1.0+ MB


In [3]:
# dataframe for work

dataframe = original.copy()
dataframe.drop(['textID', 'selected_text'], axis=1, inplace=True)
dataframe

Unnamed: 0,text,sentiment
0,"I'd have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn't they put them on t...",negative
...,...,...
27476,wish we could come see u on Denver husband l...,negative
27477,I've wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive
27479,But it was worth it ****.,positive


In [4]:
# sentiment data into numbers for machine learning

dataframe['sentiment_num'] = dataframe['sentiment'].replace(['negative','neutral','positive'], [-1, 0, 1])
dataframe.sample(n=10, random_state=7)

Unnamed: 0,text,sentiment,sentiment_num
1315,it was a biligual sweatshop LOL I talk 2 him ...,negative,-1
6422,On the bus to NYC http://yfrog.com/08kaifj,neutral,0
17769,: Ok its suppose 2b followfriday not unfollow ...,positive,1
16344,had such a fun time with allegra tonite!!! we ...,positive,1
15836,Very cute - I don't think I can make it to Ma...,neutral,0
18695,dunno. Maybe the flu. I feel a bitbetter now.,positive,1
11403,ya i did i seen all them but Robert,neutral,0
22643,I was the blue lol http://twitpic.com/67zgz,neutral,0
1332,Waking up early to go to the gym,neutral,0
19474,it drained my energy,negative,-1


In [121]:
# rule based sentiment analysis - VADER

from nltk import sentiment
from sklearn.metrics import accuracy_score, classification_report

tokens = dataframe['text'].astype(str)

VA = sentiment.vader.SentimentIntensityAnalyzer()
all_vs = []
for sentence in tokens:
    vs = VA.polarity_scores(sentence)
    all_vs.append(vs['compound'])
    
all_vs = [1 if num>=0.1 else -1 if num<=-0.1 else 0 for num in all_vs]

examples = [1315, 6422, 17769, 16344, 15836, 18695, 11403, 22643, 1332, 19474]
for num in examples:
    print(f'{tokens[num]} - {VA.polarity_scores(tokens[num])}')
    actual = dataframe['sentiment_num'][num]
    predicted = all_vs[num-1]
    print(f'Actual label: {actual} / Predicted label: {predicted} {"<CORRECT>" if actual == predicted else "<WRONG>"}\n')

acc_va = accuracy_score(dataframe["sentiment_num"], all_vs)
print(f'Accuracy of VADER: {acc_va}')
print(f'Evaluation of VADER:\n {classification_report(dataframe["sentiment_num"], all_vs)}')
print(f'Confusion matrix of Vader:\n {pd.crosstab(dataframe["sentiment_num"], all_vs, rownames=["Actual"], colnames=["Predicted"])}')

 it was a biligual sweatshop LOL I talk 2 him once in a while but not as much, he got an r6 - {'neg': 0.0, 'neu': 0.882, 'pos': 0.118, 'compound': 0.3108}
Actual label: -1 / Predicted label: 1 <WRONG>

On the bus to NYC   http://yfrog.com/08kaifj - {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Actual label: 0 / Predicted label: 0 <CORRECT>

: Ok its suppose 2b followfriday not unfollow Friday  aw well I have nice tweeters anyway! <-almost doesnt sound right...lol;) - {'neg': 0.0, 'neu': 0.67, 'pos': 0.33, 'compound': 0.75}
Actual label: 1 / Predicted label: 1 <CORRECT>

had such a fun time with allegra tonite!!! we saw 17again!! good movie - {'neg': 0.0, 'neu': 0.576, 'pos': 0.424, 'compound': 0.811}
Actual label: 1 / Predicted label: 1 <CORRECT>

 Very cute - I don't think I can make it to MakerFaire, sadly - {'neg': 0.199, 'neu': 0.568, 'pos': 0.234, 'compound': 0.1263}
Actual label: 0 / Predicted label: 1 <WRONG>

 dunno. Maybe the flu. I feel a bitbetter now. - {'neg': 0.30

In [122]:
# rule based sentiment analysis - TextBlob

import textblob

all_ts = []
for sentence in tokens:
    ts = textblob.TextBlob(sentence).sentiment.polarity
    all_ts.append(ts)
    
all_ts = [1 if num>=0.1 else -1 if num<=-0.1 else 0 for num in all_ts]

examples = [1315, 6422, 17769, 16344, 15836, 18695, 11403, 22643, 1332, 19474]
for num in examples:
    print(f'{tokens[num]} - {textblob.TextBlob(tokens[num]).sentiment.polarity}')
    actual = dataframe['sentiment_num'][num]
    predicted = all_ts[num-1]
    print(f'Actual label: {actual} / Predicted label: {predicted} {"<CORRECT>" if actual == predicted else "<WRONG>"}\n')

acc_tb = accuracy_score(dataframe["sentiment_num"], all_ts)
print(f'Accuracy of TextBlob: {acc_tb}')
print(f'Evaluation of TextBlob:\n {classification_report(dataframe["sentiment_num"], all_ts)}')
print(f'Confusion matrix of TextBlob:\n {pd.crosstab(dataframe["sentiment_num"], all_ts, rownames=["Actual"], colnames=["Predicted"])}')

 it was a biligual sweatshop LOL I talk 2 him once in a while but not as much, he got an r6 - 0.5
Actual label: -1 / Predicted label: 1 <WRONG>

On the bus to NYC   http://yfrog.com/08kaifj - 0.0
Actual label: 0 / Predicted label: 0 <CORRECT>

: Ok its suppose 2b followfriday not unfollow Friday  aw well I have nice tweeters anyway! <-almost doesnt sound right...lol;) - 0.475
Actual label: 1 / Predicted label: 1 <CORRECT>

had such a fun time with allegra tonite!!! we saw 17again!! good movie - 0.5385091145833333
Actual label: 1 / Predicted label: 1 <CORRECT>

 Very cute - I don't think I can make it to MakerFaire, sadly - 0.07500000000000001
Actual label: 0 / Predicted label: 0 <CORRECT>

 dunno. Maybe the flu. I feel a bitbetter now. - 0.0
Actual label: 1 / Predicted label: 0 <WRONG>

 ya i did i seen all them but Robert - 0.0
Actual label: 0 / Predicted label: 0 <CORRECT>

I was the blue  lol http://twitpic.com/67zgz - 0.4
Actual label: 0 / Predicted label: 1 <WRONG>

Waking up earl

In [123]:
# rule based sentiment analysis - AFINN

from afinn import Afinn

afn = Afinn(emoticons=True) 

all_afs = []
for sentence in tokens:
    afs = afn.score(sentence)
    all_afs.append(afs)
    
all_afs = [1 if num>=1 else -1 if num<=-1 else 0 for num in all_afs]

examples = [1315, 6422, 17769, 16344, 15836, 18695, 11403, 22643, 1332, 19474]
for num in examples:
    print(f'{tokens[num]} - {afn.score(tokens[num])}')
    actual = dataframe['sentiment_num'][num]
    predicted = all_afs[num-1]
    print(f'Actual label: {actual} / Predicted label: {predicted} {"<CORRECT>" if actual == predicted else "<WRONG>"}\n')

acc_afn = accuracy_score(dataframe["sentiment_num"], all_afs)
print(f'Accuracy of AFINN: {acc_afn}')
print(f'Evaluation of AFINN:\n {classification_report(dataframe["sentiment_num"], all_afs)}')
print(f'Confusion matrix of AFINN:\n {pd.crosstab(dataframe["sentiment_num"], all_afs, rownames=["Actual"], colnames=["Predicted"])}')

 it was a biligual sweatshop LOL I talk 2 him once in a while but not as much, he got an r6 - 3.0
Actual label: -1 / Predicted label: 1 <WRONG>

On the bus to NYC   http://yfrog.com/08kaifj - 0.0
Actual label: 0 / Predicted label: 0 <CORRECT>

: Ok its suppose 2b followfriday not unfollow Friday  aw well I have nice tweeters anyway! <-almost doesnt sound right...lol;) - 8.0
Actual label: 1 / Predicted label: 1 <CORRECT>

had such a fun time with allegra tonite!!! we saw 17again!! good movie - 7.0
Actual label: 1 / Predicted label: 1 <CORRECT>

 Very cute - I don't think I can make it to MakerFaire, sadly - 0.0
Actual label: 0 / Predicted label: 0 <CORRECT>

 dunno. Maybe the flu. I feel a bitbetter now. - -2.0
Actual label: 1 / Predicted label: -1 <WRONG>

 ya i did i seen all them but Robert - 0.0
Actual label: 0 / Predicted label: 0 <CORRECT>

I was the blue  lol http://twitpic.com/67zgz - 3.0
Actual label: 0 / Predicted label: 1 <WRONG>

Waking up early to go to the gym - 0.0
Actual

In [124]:
results_rb = pd.DataFrame([['VADER', acc_va],
                        ['TextBlob', acc_tb],
                        ['AFINN', acc_afn]],
                       columns=['Model', 'Accuracy'])
results_rb.to_csv('results_rb.csv')
results_rb

Unnamed: 0,Model,Accuracy
0,VADER,0.635408
1,TextBlob,0.592504
2,AFINN,0.647525


In [5]:
# test different tokenizers

from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.tokenize.casual import casual_tokenize
import twikenizer as twk
import twokenize as tok

dataframe['text'] = dataframe['text'].convert_dtypes(str)

# the best
tt = TweetTokenizer(reduce_len=True)
dataframe['Tweet Tokenizer'] = dataframe['text'].apply(tt.tokenize)

# very bad
dataframe['Word Tokenize'] = dataframe['text'].apply(word_tokenize)

# right in the middle
dataframe['Casual Tokenize'] = dataframe['text'].apply(casual_tokenize)

# very decent
dataframe['Twokenize'] = dataframe['text'].apply(tok.tokenizeRawTweetText)

# bad and very slow
ti = twk.Twikenizer()
ser = []
for s in dataframe['text']:
    ser.append(ti.tokenize(s))
dataframe['Twikenizer'] = ser

# Ranking:
# 1. Tweet Tokenizer
# 2. Twokenize
# 3. Casual Tokenize
# 4. Word Tokenize
# 5. Twikenizer

dataframe.sample(n=10, random_state=7)

Unnamed: 0,text,sentiment,sentiment_num,Tweet Tokenizer,Word Tokenize,Casual Tokenize,Twokenize,Twikenizer
1315,it was a biligual sweatshop LOL I talk 2 him ...,negative,-1,"[it, was, a, biligual, sweatshop, LOL, I, talk...","[it, was, a, biligual, sweatshop, LOL, I, talk...","[it, was, a, biligual, sweatshop, LOL, I, talk...","[it, was, a, biligual, sweatshop, LOL, I, talk...","[it, was, a, biligual, sweatshop, LOL, I, talk..."
6422,On the bus to NYC http://yfrog.com/08kaifj,neutral,0,"[On, the, bus, to, NYC, http://yfrog.com/08kaifj]","[On, the, bus, to, NYC, http, :, //yfrog.com/0...","[On, the, bus, to, NYC, http://yfrog.com/08kaifj]","[On, the, bus, to, NYC, http://yfrog.com/08kaifj]","[On, the, bus, to, NYC, http, :, /, /, yfrog, ..."
17769,: Ok its suppose 2b followfriday not unfollow ...,positive,1,"[:, Ok, its, suppose, 2b, followfriday, not, u...","[:, Ok, its, suppose, 2b, followfriday, not, u...","[:, Ok, its, suppose, 2b, followfriday, not, u...","[:, Ok, its, suppose, 2b, followfriday, not, u...","[:, Ok, its, suppose, 2b, followfriday, not, u..."
16344,had such a fun time with allegra tonite!!! we ...,positive,1,"[had, such, a, fun, time, with, allegra, tonit...","[had, such, a, fun, time, with, allegra, tonit...","[had, such, a, fun, time, with, allegra, tonit...","[had, such, a, fun, time, with, allegra, tonit...","[had, such, a, fun, time, with, allegra, tonit..."
15836,Very cute - I don't think I can make it to Ma...,neutral,0,"[Very, cute, -, I, don't, think, I, can, make,...","[Very, cute, -, I, do, n't, think, I, can, mak...","[Very, cute, -, I, don't, think, I, can, make,...","[Very, cute, -, I, don't, think, I, can, make,...","[Very, cute, -, I, don, ', t, think, I, can, m..."
18695,dunno. Maybe the flu. I feel a bitbetter now.,positive,1,"[dunno, ., Maybe, the, flu, ., I, feel, a, bit...","[dunno, ., Maybe, the, flu, ., I, feel, a, bit...","[dunno, ., Maybe, the, flu, ., I, feel, a, bit...","[dunno, ., Maybe, the, flu, ., I, feel, a, bit...","[dunno, ., Maybe, the, flu, ., I, feel, a, bit..."
11403,ya i did i seen all them but Robert,neutral,0,"[ya, i, did, i, seen, all, them, but, Robert]","[ya, i, did, i, seen, all, them, but, Robert]","[ya, i, did, i, seen, all, them, but, Robert]","[ya, i, did, i, seen, all, them, but, Robert]","[ya, i, did, i, seen, all, them, but, Robert]"
22643,I was the blue lol http://twitpic.com/67zgz,neutral,0,"[I, was, the, blue, lol, http://twitpic.com/67...","[I, was, the, blue, lol, http, :, //twitpic.co...","[I, was, the, blue, lol, http://twitpic.com/67...","[I, was, the, blue, lol, http://twitpic.com/67...","[I, was, the, blue, lol, http, :, /, /, twitpi..."
1332,Waking up early to go to the gym,neutral,0,"[Waking, up, early, to, go, to, the, gym]","[Waking, up, early, to, go, to, the, gym]","[Waking, up, early, to, go, to, the, gym]","[Waking, up, early, to, go, to, the, gym]","[Waking, up, early, to, go, to, the, gym]"
19474,it drained my energy,negative,-1,"[it, drained, my, energy]","[it, drained, my, energy]","[it, drained, my, energy]","[it, drained, my, energy]","[it, drained, my, energy]"


In [6]:
# preprocessing: all lower case, remove stop words and punctuation (mostly), remove short words (less than 3), stemming, POS tag, lemmatization, remove repeated characters

dataframe.drop(['Word Tokenize', 'Casual Tokenize', 'Twokenize', 'Twikenizer'], axis=1, inplace=True)

import nltk
import string
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re


tag_dict = {'J': wn.ADJ,
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV}

def lemmatize(sent):
    lemmas = []
    for word_tag in sent:
        lemmas.append(lemmatizer.lemmatize(word_tag[0], tag_dict.get(word_tag[1][0], wn.NOUN)))
    return lemmas

def remove_repeated_characters(tokens):  #not the best, if it can't find the word, reduces correct words into incorrect
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')  # alphanumeric, repeated characters, rest of the word
    match_substitution = r'\1\2\3'  # 1, 2 or 3 repeated characters
    def replace(old_word):
        if wn.synsets(old_word):
            return old_word
        new_word = repeat_pattern.sub(match_substitution, old_word)
        return replace(new_word) if new_word != old_word else new_word
    correct_tokens = [replace(word) for word in tokens]
    return correct_tokens

def remove_digits(token):
    pattern = r'[0-9]'
    new_token = re.sub(pattern, '', token)
    return new_token

# nltk.download()
stop_words = stopwords.words('english')
stop_words.remove('no')
stop_words.remove('not')
stop_words += ['dont', 'cant', 'thats', 'didnt', 'wont', 'ive', 'whats', 'havent', 'ima', 'aint', 'canot', 'isnt', 'shes', 'hes', 'yal'] 
lemmatizer = WordNetLemmatizer()

tokens = dataframe['Tweet Tokenizer'].values
tokens = [[word.lower() for word in sentence if word not in string.punctuation] for sentence in tokens]
tokens = [[word for word in sentence if '.' not in word and not word.startswith(('@', '#', '_'))] for sentence in tokens]
tokens = [[remove_digits(word) for word in sentence] for sentence in tokens]
tokens = [remove_repeated_characters(sentence) for sentence in tokens]
tokens = [lemmatize(pos_tag(token)) for token in tokens]
tokens = [[word.split("'")[0] for word in sentence] for sentence in tokens]
tokens = [[word for word in sentence if len(word)>=3 or len(word)>=45 or word=='no'] for sentence in tokens]
tokens = [[word for word in sentence if word not in stop_words and word!='...'] for sentence in tokens]

dataframe['Tweet Tokenizer Proc'] = tokens

dataframe.sample(n=10, random_state=7)

Unnamed: 0,text,sentiment,sentiment_num,Tweet Tokenizer,Tweet Tokenizer Proc
1315,it was a biligual sweatshop LOL I talk 2 him ...,negative,-1,"[it, was, a, biligual, sweatshop, LOL, I, talk...","[biligual, sweatshop, lol, talk, not, much, get]"
6422,On the bus to NYC http://yfrog.com/08kaifj,neutral,0,"[On, the, bus, to, NYC, http://yfrog.com/08kaifj]","[bus, nyc]"
17769,: Ok its suppose 2b followfriday not unfollow ...,positive,1,"[:, Ok, its, suppose, 2b, followfriday, not, u...","[suppose, folowfriday, not, unfolow, friday, w..."
16344,had such a fun time with allegra tonite!!! we ...,positive,1,"[had, such, a, fun, time, with, allegra, tonit...","[fun, time, alegra, tonite, saw, good, movie]"
15836,Very cute - I don't think I can make it to Ma...,neutral,0,"[Very, cute, -, I, don't, think, I, can, make,...","[cute, think, make, makerfaire, sadly]"
18695,dunno. Maybe the flu. I feel a bitbetter now.,positive,1,"[dunno, ., Maybe, the, flu, ., I, feel, a, bit...","[duno, maybe, flu, feel, bitbeter]"
11403,ya i did i seen all them but Robert,neutral,0,"[ya, i, did, i, seen, all, them, but, Robert]","[see, robert]"
22643,I was the blue lol http://twitpic.com/67zgz,neutral,0,"[I, was, the, blue, lol, http://twitpic.com/67...","[blue, lol]"
1332,Waking up early to go to the gym,neutral,0,"[Waking, up, early, to, go, to, the, gym]","[wake, early, gym]"
19474,it drained my energy,negative,-1,"[it, drained, my, energy]","[drain, energy]"


In [28]:
dataframe.to_csv('tokenized_tweets.csv')

In [2]:
dataframe = pd.read_csv('tokenized_tweets.csv')

In [138]:
# checking the most frequent words

from nltk import FreqDist

all_words = dataframe['Tweet Tokenizer Proc'].to_numpy()
all_words = np.concatenate(all_words).ravel().tolist()

fdist = FreqDist(all_words)
fdist.most_common(25)

[('get', 3011),
 ('day', 2484),
 ('not', 1875),
 ('good', 1829),
 ('work', 1524),
 ('no', 1464),
 ('like', 1440),
 ('love', 1386),
 ('today', 1167),
 ('time', 1098),
 ('one', 1073),
 ('know', 1054),
 ('lol', 1026),
 ('think', 1021),
 ('see', 1013),
 ('happy', 1008),
 ('want', 987),
 ('make', 971),
 ('miss', 962),
 ('really', 921),
 ('back', 920),
 ('well', 909),
 ('night', 809),
 ('feel', 792),
 ('mother', 791)]

In [5]:
tokens = dataframe['Tweet Tokenizer Proc'].astype(str)

# Bag of Words, BoN = ngram_range=(1,3) uni OK, bi 28.9GiB, tri 55.8GiB
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(lowercase=False)

bow = count_vect.fit_transform(tokens).toarray()

# TF-IDF, ngram_range=(1, 3), max_features=10000
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(min_df=2, ngram_range=(1, 3), strip_accents='unicode', norm='l2')

tfidf = tfidf_vect.fit_transform(tokens).toarray()

In [185]:
import random

res = random.choices(list(count_vect.vocabulary_.items()), k=10)
print(f'BoW - Number of sentences: {bow.shape[0]}, Unique words: {bow.shape[1]} \nExample:')
res

BoW - Number of sentences: 27480, Unique words: 17964 
Example:


[('bylaurenluke', 2260),
 ('superpower', 15216),
 ('brings', 1995),
 ('lexi', 9026),
 ('krathong', 8695),
 ('enlgland', 4970),
 ('muzik', 10467),
 ('nba', 10605),
 ('cade', 2279),
 ('shelter', 13950)]

In [186]:
print(f'TfIdf - Number of sentences: {tfidf.shape[0]}, Unique words and ngrams: {tfidf.shape[1]} \nExample:')

nums = random.choices(range(tfidf.shape[1]), k=10)
res = [(tfidf_vect.get_feature_names_out()[i], tfidf_vect.idf_[i]) for i in nums]
res

TfIdf - Number of sentences: 27480, Unique words and ngrams: 23658 
Example:


[('get hang', 9.42949066654802),
 ('bikini', 10.122637847107965),
 ('goto', 9.834955774656184),
 ('small', 7.414587646005755),
 ('hahahaha', 8.043196305428129),
 ('thought would', 10.122637847107965),
 ('shape', 9.024025558439854),
 ('school day', 9.42949066654802),
 ('sunshine gona', 10.122637847107965),
 ('ant', 9.42949066654802)]

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier

def base_classifier(rep, test=0.2):
    x_train, x_test, y_train, y_test = train_test_split(rep, dataframe['sentiment_num'], test_size=test, random_state=7)
    dummy = DummyClassifier(strategy='uniform', random_state=7)
    dummy.fit(x_train, y_train)
    return dummy.score(x_test, y_test)

print(f'Base Dummy BoW   {base_classifier(bow)}')
print(f'Base Dummy TdIdf {base_classifier(tfidf)}')

Base Dummy BoW   0.32787481804949054
Base Dummy TdIdf 0.32787481804949054


In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
    
    
def ml_cross(classifier, rep, test=0.2):
    x_train, x_test, y_train, y_test = train_test_split(rep, dataframe['sentiment_num'], test_size=test, random_state=7)
    classifier.fit(x_train, y_train)
    cv_scores = cross_val_score(classifier, x_train, y_train, cv=5, n_jobs=-1)
    cv_mean_score = np.mean(cv_scores)
    test_score = classifier.score(x_test, y_test)
    return cv_scores, cv_mean_score, test_score

In [15]:
mnb = MultinomialNB(alpha=1)
temp, mnb_bow_cv_mean_score, mnb_bow_test_score = ml_cross(mnb, bow)
print(f'Multinomial Naive Bayes scores BoW: {temp} \nMean score: {mnb_bow_cv_mean_score} \nTest score: {mnb_bow_test_score}')

Multinomial Naive Bayes scores BoW: [0.65476461 0.64225608 0.6472595  0.65863088 0.65468608] 
Mean score: 0.6515194307972825 
Test score: 0.6550218340611353


In [16]:
temp, mnb_tfidf_cv_mean_score, mnb_tfidf_test_score = ml_cross(mnb, tfidf)
print(f'Multinomial Naive Bayes scores TfIdf: {temp} \nMean score: {mnb_tfidf_cv_mean_score} \nTest score: {mnb_tfidf_test_score}')

Multinomial Naive Bayes scores TfIdf: [0.63133955 0.62952013 0.62065044 0.63179441 0.63717015] 
Mean score: 0.6300949360998265 
Test score: 0.6381004366812227


In [10]:
lr = LogisticRegression(penalty='l2', max_iter=10000, C=1, random_state=7)
temp, lr_bow_cv_mean_score, lr_bow_test_score = ml_cross(lr, bow)
print(f'Logisitc Regression scores BoW: {temp} \nMean score: {lr_bow_cv_mean_score} \nTest score: {lr_bow_test_score}')

Logisitc Regression scores BoW: [0.68796907 0.6906982  0.69092563        nan 0.70200182] 
Mean score: nan 
Test score: 0.6925036390101892


In [11]:
temp, lr_tfidf_cv_mean_score, lr_tfidf_test_score = ml_cross(lr, tfidf)
print(f'Logisitc Regression scores TfIdf: {temp} \nMean score: {lr_tfidf_cv_mean_score} \nTest score: {lr_tfidf_test_score}')

Logisitc Regression scores TfIdf: [0.68887878 0.69115306 0.68592222 0.69752104 0.69131028] 
Mean score: 0.6909570757462851 
Test score: 0.6932314410480349


In [12]:
svm = LinearSVC(penalty='l2', max_iter=10000, C=1, random_state=7)
temp, svm_bow_cv_mean_score, svm_bow_test_score = ml_cross(svm, bow)
print(f'Linear SVM scores BoW: {temp} \nMean score: {svm_bow_cv_mean_score} \nTest score: {svm_bow_test_score}')

Linear SVM scores BoW: [0.66340687 0.666136   0.66704571 0.67295884 0.67470428] 
Mean score: 0.6688503390619338 
Test score: 0.6663027656477438


In [13]:
temp, svm_tfidf_cv_mean_score, svm_tfidf_test_score = ml_cross(svm, tfidf)
print(f'Linear SVM scores TfIdf: {temp} \nMean score: {svm_tfidf_cv_mean_score} \nTest score: {svm_tfidf_test_score}')

Linear SVM scores TfIdf: [0.67068456 0.68046395 0.66499886 0.67455083 0.67993631] 
Mean score: 0.6741269018105859 
Test score: 0.6750363901018923


In [13]:
sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=10000, random_state=7)
temp, svmsgd_bow_cv_mean_score, svmsgd_bow_test_score = ml_cross(sgd, bow)
print(f'Linear SVM (SGD) scores BoW: {temp} \nMean score: {svmsgd_bow_cv_mean_score} \nTest score: {svmsgd_bow_test_score}')

Linear SVM (SGD) scores BoW: [0.69865818 0.69820332 0.69797589        nan        nan] 
Mean score: nan 
Test score: 0.7083333333333334


In [6]:
temp, svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score = ml_cross(sgd, tfidf)
print(f'Linear SVM (SGD) scores TfIdf: {temp} \nMean score: {svmsgd_tfidf_cv_mean_score} \nTest score: {svmsgd_tfidf_test_score}')

Linear SVM (SGD) scores TfIdf: [0.69752104 0.69843075 0.69456448 0.70798272 0.69267516] 
Mean score: 0.6982348271621213 
Test score: 0.7079694323144105


In [8]:
rfc = RandomForestClassifier(n_estimators=20, random_state=7)
temp, rfc_bow_cv_mean_score, rfc_bow_test_score = ml_cross(rfc, bow)
print(f'Random Forest scores BoW: {temp} \nMean score: {rfc_bow_cv_mean_score} \nTest score: {rfc_bow_test_score}')

Random Forest scores BoW: [0.6784171  0.70138731 0.69774846 0.68796907 0.69517743] 
Mean score: 0.6921398761625668 
Test score: 0.7003275109170306


In [None]:
tfidf_small = np.float32(tfidf)
rfc = RandomForestClassifier(n_estimators=20, random_state=7)
temp, rfc_tfidf_cv_mean_score, rfc_tfidf_test_score = ml_cross(rfc, tfidf_small)
print(f'Random Forest scores TfIdf: {temp} \nMean score: {rfc_tfidf_cv_mean_score} \nTest score: {rfc_tfidf_test_score}')

Random Forest scores TfIdf: [0.6881965  0.70025017 0.69274505 0.69115306 0.69494995] 
Mean score: 0.6934589470072552 
Test score: 0.7030567685589519


In [None]:
# results_ml = pd.DataFrame([['Naive Bayes', mnb_bow_cv_mean_score, mnb_bow_test_score, mnb_tfidf_cv_mean_score, mnb_tfidf_test_score],
#                         ['Logistic Regression', lr_bow_cv_mean_score, lr_bow_test_score, lr_tfidf_cv_mean_score, lr_tfidf_test_score],
#                         ['Linear SVM', svm_bow_cv_mean_score, svm_bow_test_score, svm_tfidf_cv_mean_score, svm_tfidf_test_score],
#                         ['Linear SVM (SGD)', svmsgd_bow_cv_mean_score, svmsgd_bow_test_score, svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score],
#                         ['Random Forest', rfc_bow_cv_mean_score, rfc_bow_test_score, rfc_tfidf_cv_mean_score, rfc_tfidf_test_score]],
#                        columns=['Model', 'CV Score (BoW)', 'Test Score (BoW)', 'CV Score (TF-IDF)', 'Test Score (TF-IDF)']).T
# results_ml.to_csv('results_ml.csv')
# results_ml

In [16]:
# train Word2Vec word embedding; Continuous Bag of Words and Skip-Gram

from gensim.models import Word2Vec, FastText

tokens = dataframe['Tweet Tokenizer Proc']
#model.wv.key_to_index #dict with embeddings
#model.wv.get_normed_vectors() # all normalized vectors

w2v_cbow = Word2Vec(sentences=tokens, window=2, min_count=1, sg=0, cbow_mean=0, workers=6)
w2v_cbow_tokens = [[w2v_cbow.wv[word].tolist() for word in sentence] for sentence in tokens]
w2v_cbow_tokens = [np.add.reduce(word) if word != [] else np.random.uniform(low=-0.2, high=0.2, size=100) for word in w2v_cbow_tokens]

w2v_sg = Word2Vec(sentences=tokens, window=2, min_count=1, sg=1, workers=6) 
w2v_sg_tokens = [[w2v_sg.wv[word].tolist() for word in sentence] for sentence in tokens]
w2v_sg_tokens = [np.add.reduce(word) if word != [] else np.random.uniform(low=-0.2, high=0.2, size=100) for word in w2v_sg_tokens]

ft_cbow = FastText(tokens, window=2, min_count=1, sg=0, workers=6)
ft_cbow_tokens = [[ft_cbow.wv[word].tolist() for word in sentence] for sentence in tokens]
ft_cbow_tokens = [np.add.reduce(word) if word != [] else np.random.uniform(low=-0.2, high=0.2, size=100) for word in ft_cbow_tokens]

ft_sg = FastText(tokens, window=2, min_count=1, sg=1, workers=6)
ft_sg_tokens = [[ft_sg.wv[word].tolist() for word in sentence] for sentence in tokens]
ft_sg_tokens = [np.add.reduce(word) if word != [] else np.random.uniform(low=-0.2, high=0.2, size=100) for word in ft_sg_tokens]

In [18]:
# train Doc2Vec, word embedding on a document (in this case sentence)

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def gensim_dbow():
    tokens = dataframe['Tweet Tokenizer Proc'].to_numpy()
    tagged_data = [TaggedDocument(words=tokens[i], tags=[str(i)]) for i, _d in enumerate(tokens)]
    model = Doc2Vec(vector_size=100, min_count=1, epochs=10, workers=12)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=10)
    return model

dbow = gensim_dbow()
dbow_tokens = [[dbow.wv[word].tolist() for word in sentence] for sentence in tokens]
dbow_tokens = [np.add.reduce(word) if word != [] else np.random.uniform(low=-0.2, high=0.2, size=100) for word in dbow_tokens]

In [20]:
# average word vectors

def avg_word_vecs(words, model, vocab, num_feat):
    vecs = np.zeros((num_feat,),dtype="float64")
    nwords = 0.
    for word in words:
        if word in vocab:
            nwords = nwords + 1.
            vecs = np.add(vecs, model.wv[word])
    if nwords:
        vecs = np.divide(vecs, nwords)
    return vecs

def avg_word_vect(corp, model, num_feat):
    vocab = set(model.wv.index_to_key)
    feat = [avg_word_vecs(token, model, vocab, num_feat)
                for token in corp]
    return np.array(feat)

w2v_model = Word2Vec(tokens, vector_size=100, window=100, min_count=2, sample=1e-3, sg=1, epochs=5, workers=10)
avg_vecs = avg_word_vect(corp=tokens, model=w2v_model, num_feat=100)

In [21]:
# glove and spacy pretrained

import spacy

embeddings_dict = {}
with open('glove.twitter.27B.100d.txt', 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = [float(item) for item in values[1:]]
        embeddings_dict[word] = vector
    
glove_tokens = [[embeddings_dict[word] if word in embeddings_dict else np.random.uniform(low=-0.2, high=0.2, size=100) for word in sentence] for sentence in tokens]
glove_tokens = [np.add.reduce(word) if word != [] else np.random.uniform(low=-0.2, high=0.2, size=100) for word in glove_tokens]

sp = spacy.load('en_core_web_lg')
spacy_tokens = [[sp.vocab.get_vector(word) if sp.vocab.has_vector else np.random.uniform(low=-0.2, high=0.2, size=300) for word in sentence] for sentence in tokens]
spacy_tokens = [np.add.reduce(word) if word != [] else np.random.uniform(low=-0.2, high=0.2, size=300) for word in spacy_tokens]

In [22]:
from sklearn.model_selection import train_test_split

mnb = MultinomialNB(alpha=1)

from sklearn.preprocessing import MinMaxScaler

# scale because NB only accepts positive values
w2v_cbow_tokens_nb = MinMaxScaler().fit_transform(w2v_cbow_tokens)
w2v_sg_tokens_nb = MinMaxScaler().fit_transform(w2v_sg_tokens)
ft_cbow_tokens_nb = MinMaxScaler().fit_transform(ft_cbow_tokens)
ft_sg_tokens_nb = MinMaxScaler().fit_transform(ft_sg_tokens)
dbow_tokens_nb = MinMaxScaler().fit_transform(dbow_tokens)
avg_vecs_nb = MinMaxScaler().fit_transform(avg_vecs)
glove_tokens_nb = MinMaxScaler().fit_transform(glove_tokens)
spacy_tokens_nb = MinMaxScaler().fit_transform(spacy_tokens)

temp, w2v_cbow_ms_mnb, w2v_cbow_ts_mnb = ml_cross(mnb, w2v_cbow_tokens_nb)
print(f'Naive Bayes scores Word2Vec CBoW:      {temp}')
temp, w2v_sg_ms_mnb, w2v_sg_ts_mnb = ml_cross(mnb, w2v_sg_tokens_nb)
print(f'Naive Bayes scores Word2Vec Skip-Gram: {temp}')

temp, ft_cbow_ms_mnb, ft_cbow_ts_mnb = ml_cross(mnb, ft_cbow_tokens_nb)
print(f'Naive Bayes scores FastText CBoW:      {temp}')
temp, ft_sg_ms_mnb, ft_sg_ts_mnb = ml_cross(mnb, ft_sg_tokens_nb)
print(f'Naive Bayes scores FastText Skip-Gram: {temp}')

temp, dbow_ms_mnb, dbow_ts_mnb = ml_cross(mnb, dbow_tokens_nb)
print(f'Naive Bayes scores Doc2Vec:            {temp}')
temp, avg_vec_ms_mnb, avg_vec_ts_mnb = ml_cross(mnb, avg_vecs_nb)
print(f'Naive Bayes scores Avg. Vectors:       {temp}')

temp, glove_ms_mnb, glove_ts_mnb = ml_cross(mnb, glove_tokens_nb)
print(f'Naive Bayes scores Glove:              {temp}')
temp, spacy_ms_mnb, spacy_ts_mnb = ml_cross(mnb, spacy_tokens_nb)
print(f'Naive Bayes scores Spacy:              {temp}')

Naive Bayes scores Word2Vec CBoW:      [0.42915624 0.41960428 0.4296111  0.42574483 0.42902639]
Naive Bayes scores Word2Vec Skip-Gram: [0.43256766 0.42369798 0.4321128  0.42324312 0.43198362]
Naive Bayes scores FastText CBoW:      [0.42051399 0.40641346 0.42028656 0.41869456 0.42151956]
Naive Bayes scores FastText Skip-Gram: [0.43302252 0.41482829 0.42824653 0.41755743 0.42902639]
Naive Bayes scores Doc2Vec:            [0.44393905 0.43438708 0.43870821 0.43484194 0.44972702]
Naive Bayes scores Avg. Vectors:       [0.47304981 0.4746418  0.4664544  0.47327723 0.47520473]
Naive Bayes scores Glove:              [0.41642029 0.41687514 0.415738   0.41346373 0.41537762]
Naive Bayes scores Spacy:              [0.44575847 0.44211963 0.44712304 0.44575847 0.44836215]


In [23]:
lr = LogisticRegression(penalty='l2', max_iter=10000, C=1, random_state=7)

temp, w2v_cbow_ms_lr, w2v_cbow_ts_lr = ml_cross(lr, w2v_cbow_tokens)
print(f'Logistic Regression scores Word2Vec CBoW:      {temp}')
temp, w2v_sg_ms_lr, w2v_sg_ts_lr = ml_cross(lr, w2v_sg_tokens)
print(f'Logistic Regression scores Word2Vec Skip-Gram: {temp}')

temp, ft_cbow_ms_lr, ft_cbow_ts_lr = ml_cross(lr, ft_cbow_tokens)
print(f'Logistic Regression scores FastText CBoW:      {temp}')
temp, ft_sg_ms_lr, ft_sg_ts_lr = ml_cross(lr, ft_sg_tokens)
print(f'Logistic Regression scores FastText Skip-Gram: {temp}')

temp, dbow_ms_lr, dbow_ts_lr = ml_cross(lr, dbow_tokens)
print(f'Logistic Regression scores Doc2Vec:            {temp}')
temp, avg_vec_ms_lr, avg_vec_ts_lr = ml_cross(lr, avg_vecs)
print(f'Logistic Regression scores Avg. Vectors:       {temp}')

temp, glove_ms_lr, glove_ts_lr = ml_cross(lr, glove_tokens)
print(f'Logistic Regression scores Glove:              {temp}')
temp, spacy_ms_lr, spacy_ts_lr = ml_cross(lr, spacy_tokens)
print(f'Logistic Regression scores Spacy:              {temp}')

Logistic Regression scores Word2Vec CBoW:      [0.60814191 0.60495793 0.60632249 0.60359336 0.60850773]
Logistic Regression scores Word2Vec Skip-Gram: [0.62178758 0.63202183 0.62451672 0.62337958 0.62420382]
Logistic Regression scores FastText CBoW:      [0.58107801 0.58357971 0.59040255 0.56925176 0.5878071 ]
Logistic Regression scores FastText Skip-Gram: [0.61269047 0.62178758 0.60996134 0.61769388 0.61260237]
Logistic Regression scores Doc2Vec:            [0.63042984 0.62883784 0.63679782 0.63588811 0.63466788]
Logistic Regression scores Avg. Vectors:       [0.62701842 0.61928588 0.62087787 0.61746645 0.61919927]
Logistic Regression scores Glove:              [0.66954742 0.65180805 0.6554469  0.65362747 0.65991811]
Logistic Regression scores Spacy:              [0.6743234  0.66931999 0.6800091  0.67318626 0.67470428]


In [24]:
svm = LinearSVC(penalty='l2', max_iter=10000, C=1, random_state=7, dual=False)

temp, w2v_cbow_ms_svm, w2v_cbow_ts_svm = ml_cross(svm, w2v_cbow_tokens)
print(f'Support Vector Machines scores Word2Vec CBoW:      {temp}')
temp, w2v_sg_ms_svm, w2v_sg_ts_svm = ml_cross(svm, w2v_sg_tokens)
print(f'Support Vector Machines scores Word2Vec Skip-Gram: {temp}')

temp, ft_cbow_ms_svm, ft_cbow_ts_svm = ml_cross(svm, ft_cbow_tokens)
print(f'Support Vector Machines scores FastText CBoW:      {temp}')
temp, ft_sg_ms_svm, ft_sg_ts_svm = ml_cross(svm, ft_sg_tokens)
print(f'Support Vector Machines scores FastText Skip-Gram: {temp}')

temp, dbow_ms_svm, dbow_ts_svm = ml_cross(svm, dbow_tokens)
print(f'Support Vector Machines scores Doc2Vec:            {temp}')
temp, avg_vec_ms_svm, avg_vec_ts_svm = ml_cross(svm, avg_vecs)
print(f'Support Vector Machines scores Avg. Vectors:       {temp}')

temp, glove_ms_svm, glove_ts_svm = ml_cross(svm, glove_tokens)
print(f'Support Vector Machines scores Glove:              {temp}')
temp, spacy_ms_svm, spacy_ts_svm = ml_cross(svm, spacy_tokens)
print(f'Support Vector Machines scores Spacy:              {temp}')

Support Vector Machines scores Word2Vec CBoW:      [0.61428247 0.61109848 0.60836934 0.60382079 0.60873521]
Support Vector Machines scores Word2Vec Skip-Gram: [0.62337958 0.62883784 0.625199   0.62724585 0.62829845]
Support Vector Machines scores FastText CBoW:      [0.60200136 0.60950648 0.5990448  0.5867637  0.60395814]
Support Vector Machines scores FastText Skip-Gram: [0.6211053  0.62474414 0.61496475 0.62428929 0.62215651]
Support Vector Machines scores Doc2Vec:            [0.63315897 0.62770071 0.63247669 0.63861724 0.63193813]
Support Vector Machines scores Avg. Vectors:       [0.63384126 0.62474414 0.63293154 0.62997498 0.62420382]
Support Vector Machines scores Glove:              [0.66431658 0.65590175 0.66113259 0.65499204 0.66264786]
Support Vector Machines scores Spacy:              [0.68182852 0.67500569 0.68091881 0.67932681 0.67879891]


In [25]:
sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=10000, random_state=7)

temp, w2v_cbow_ms_sgd, w2v_cbow_ts_sgd = ml_cross(sgd, w2v_cbow_tokens)
print(f'Stohastic Gradient Descent (SVM) scores Word2Vec CBoW:      {temp}')
temp, w2v_sg_ms_sgd, w2v_sg_ts_sgd = ml_cross(sgd, w2v_sg_tokens)
print(f'Stohastic Gradient Descent (SVM) scores Word2Vec Skip-Gram: {temp}')

temp, ft_cbow_ms_sgd, ft_cbow_ts_sgd = ml_cross(sgd, ft_cbow_tokens)
print(f'Stohastic Gradient Descent (SVM) scores FastText CBoW:      {temp}')
temp, ft_sg_ms_sgd, ft_sg_ts_sgd = ml_cross(sgd, ft_sg_tokens)
print(f'Stohastic Gradient Descent (SVM) scores FastText Skip-Gram: {temp}')

temp, dbow_ms_sgd, dbow_ts_sgd = ml_cross(sgd, dbow_tokens)
print(f'Stohastic Gradient Descent (SVM) scores Doc2Vec:            {temp}')
temp, avg_vec_ms_sgd, avg_vec_ts_sgd = ml_cross(sgd, avg_vecs)
print(f'Stohastic Gradient Descent (SVM) scores Avg. Vectors:       {temp}')

temp, glove_ms_sgd, glove_ts_sgd = ml_cross(sgd, glove_tokens)
print(f'Stohastic Gradient Descent (SVM) scores Glove:              {temp}')
temp, spacy_ms_sgd, spacy_ts_sgd = ml_cross(sgd, spacy_tokens)
print(f'Stohastic Gradient Descent (SVM) scores Spacy:              {temp}')

Stohastic Gradient Descent (SVM) scores Word2Vec CBoW:      [0.55628838 0.59153969 0.60382079 0.51012054 0.58257507]
Stohastic Gradient Descent (SVM) scores Word2Vec Skip-Gram: [0.54241528 0.51648851 0.61678417 0.57630202 0.56619654]
Stohastic Gradient Descent (SVM) scores FastText CBoW:      [0.48987946 0.49556516 0.51284967 0.37411872 0.51819836]
Stohastic Gradient Descent (SVM) scores FastText Skip-Gram: [0.48669547 0.58585399 0.55310439 0.58585399 0.54299363]
Stohastic Gradient Descent (SVM) scores Doc2Vec:            [0.53013418 0.59472368 0.57175347 0.59153969 0.5843949 ]
Stohastic Gradient Descent (SVM) scores Avg. Vectors:       [0.61746645 0.61928588 0.61974073 0.62383443 0.616697  ]
Stohastic Gradient Descent (SVM) scores Glove:              [0.58517171 0.63452354 0.60859677 0.54650898 0.56574158]
Stohastic Gradient Descent (SVM) scores Spacy:              [0.63361383 0.58289743 0.58744599 0.59290425 0.60191083]


In [26]:
rfc = RandomForestClassifier(n_estimators=100, random_state=7)

temp, w2v_cbow_ms_rfc, w2v_cbow_ts_rfc = ml_cross(rfc, w2v_cbow_tokens)
print(f'Random Forest scores Word2Vec CBoW:      {temp}')
temp, w2v_sg_ms_rfc, w2v_sg_ts_rfc = ml_cross(rfc, w2v_sg_tokens)
print(f'Random Forest scores Word2Vec Skip-Gram: {temp}')

temp, ft_cbow_ms_rfc, ft_cbow_ts_rfc = ml_cross(rfc, ft_cbow_tokens)
print(f'Random Forest scores FastText CBoW:      {temp}')
temp, ft_sg_ms_rfc, ft_sg_ts_rfc = ml_cross(rfc, ft_sg_tokens)
print(f'Random Forest scores FastText Skip-Gram: {temp}')

temp, dbow_ms_rfc, dbow_ts_rfc = ml_cross(rfc, dbow_tokens)
print(f'Random Forest scores Doc2Vec:            {temp}')
temp, avg_vec_ms_rfc, avg_vec_ts_rfc = ml_cross(rfc, avg_vecs)
print(f'Random Forest scores Avg. Vectors:       {temp}')

temp, glove_ms_rfc, glove_ts_rfc = ml_cross(rfc, glove_tokens)
print(f'Random Forest scores Glove:              {temp}')
temp, spacy_ms_rfc, spacy_ts_rfc = ml_cross(rfc, spacy_tokens)
print(f'Random Forest scores Spacy:              {temp}')

Random Forest scores Word2Vec CBoW:      [0.55060268 0.55378667 0.54946554 0.55424153 0.56619654]
Random Forest scores Word2Vec Skip-Gram: [0.56584035 0.57539231 0.5801683  0.57266318 0.5843949 ]
Random Forest scores FastText CBoW:      [0.49965886 0.48987946 0.49897657 0.51057539 0.50523203]
Random Forest scores FastText Skip-Gram: [0.54628156 0.54628156 0.55446896 0.55310439 0.56119199]
Random Forest scores Doc2Vec:            [0.58630885 0.59290425 0.59108483 0.58267    0.59190173]
Random Forest scores Avg. Vectors:       [0.63088469 0.62087787 0.62883784 0.62224244 0.62261146]
Random Forest scores Glove:              [0.63270412 0.623607   0.62337958 0.62406186 0.64058235]
Random Forest scores Spacy:              [0.62019559 0.62952013 0.61564703 0.60859677 0.63034577]


In [27]:
results_ml_adv = pd.DataFrame([['Word2Vec CBoW', w2v_cbow_ms_mnb, w2v_cbow_ts_mnb,
                         w2v_cbow_ms_lr, w2v_cbow_ts_lr,
                         w2v_cbow_ms_svm, w2v_cbow_ts_svm,
                         w2v_cbow_ms_sgd, w2v_cbow_ts_sgd,
                         w2v_cbow_ms_rfc, w2v_cbow_ts_rfc],
                        ['Word2Vec Skip-Gram', w2v_sg_ms_mnb, w2v_sg_ts_mnb,
                         w2v_sg_ms_lr, w2v_sg_ts_lr,
                         w2v_sg_ms_svm, w2v_sg_ts_svm,
                         w2v_sg_ms_sgd, w2v_sg_ts_sgd,
                         w2v_sg_ms_rfc, w2v_sg_ts_rfc],
                        ['FastText CBoW', ft_cbow_ms_mnb, ft_cbow_ts_mnb,
                         ft_cbow_ms_lr, ft_cbow_ts_lr,
                         ft_cbow_ms_svm, ft_cbow_ts_svm,
                         ft_cbow_ms_sgd, ft_cbow_ts_sgd,
                         ft_cbow_ms_rfc, ft_cbow_ts_rfc],
                        ['FastText Skip-Gram', ft_sg_ms_mnb, ft_sg_ts_mnb,
                         ft_sg_ms_lr, ft_sg_ts_lr,
                         ft_sg_ms_svm, ft_sg_ts_svm,
                         ft_sg_ms_sgd, ft_sg_ts_sgd,
                         ft_sg_ms_rfc, ft_sg_ts_rfc],
                        ['Doc2Vec', dbow_ms_mnb, dbow_ts_mnb,
                         dbow_ms_lr, dbow_ts_lr,
                         dbow_ms_svm, dbow_ts_svm,
                         dbow_ms_sgd, dbow_ts_sgd,
                         dbow_ms_rfc, dbow_ts_rfc],
                        ['Avg. Vector', avg_vec_ms_mnb, avg_vec_ts_mnb,
                         avg_vec_ms_lr, avg_vec_ts_lr,
                         avg_vec_ms_svm, avg_vec_ts_svm,
                         avg_vec_ms_sgd, avg_vec_ts_sgd,
                         avg_vec_ms_rfc, avg_vec_ts_rfc],
                        ['Glove', glove_ms_mnb, glove_ts_mnb,
                         glove_ms_lr, glove_ts_lr,
                         glove_ms_svm, glove_ts_svm,
                         glove_ms_sgd, glove_ts_sgd,
                         glove_ms_rfc, glove_ts_rfc],
                        ['Spacy', spacy_ms_mnb, spacy_ts_mnb,
                         spacy_ms_lr, spacy_ts_lr,
                         spacy_ms_svm, spacy_ts_svm,
                         spacy_ms_sgd, spacy_ts_sgd,
                         spacy_ms_rfc, spacy_ts_rfc]],
                         columns=['Representation', 'MultinomoalNB MS', 'MultinomoalNB TS',
                                  'Logistic Regression MS', 'Logistic Regression TS',
                                  'Support Vector Machines MS', 'Support Vector Machines TS', 
                                  'Stohastic Gradient Descent (SVM) MS', 'Stohastic Gradient Descent (SVM) TS',
                                  'Random Forest MS', 'Random Forest TS'])
results_ml_adv.to_csv('results_ml_adv.csv')
results_ml_adv

Unnamed: 0,Representation,MultinomoalNB MS,MultinomoalNB TS,Logistic Regression MS,Logistic Regression TS,Support Vector Machines MS,Support Vector Machines TS,Stohastic Gradient Descent (SVM) MS,Stohastic Gradient Descent (SVM) TS,Random Forest MS,Random Forest TS
0,Word2Vec CBoW,0.426629,0.433406,0.606305,0.619905,0.609261,0.618086,0.568869,0.547307,0.554859,0.569505
1,Word2Vec Skip-Gram,0.428721,0.440138,0.625182,0.627729,0.626592,0.632278,0.563637,0.555495,0.575692,0.586426
2,FastText CBoW,0.417486,0.425582,0.582424,0.594978,0.600255,0.614447,0.478122,0.547489,0.500864,0.505822
3,FastText Skip-Gram,0.424536,0.435953,0.614947,0.621179,0.621452,0.629367,0.5509,0.582424,0.552266,0.558042
4,Doc2Vec,0.440321,0.44778,0.633324,0.638464,0.632778,0.639374,0.574509,0.595706,0.588974,0.59607
5,Avg. Vector,0.472526,0.486536,0.62077,0.620451,0.629139,0.628457,0.619405,0.620997,0.625091,0.626092
6,Glove,0.415575,0.431405,0.65807,0.661936,0.659798,0.665757,0.588109,0.579148,0.628867,0.631914
7,Spacy,0.445824,0.462882,0.674309,0.683588,0.679176,0.685044,0.599754,0.575873,0.620861,0.631186


In [29]:
from sklearn.neural_network import MLPClassifier

def quick_mlp(rep, test=0.2):
    mlp = MLPClassifier(solver='adam', alpha=1e-5, learning_rate='adaptive', early_stopping=True, activation = 'relu', hidden_layer_sizes=(512, 512), random_state=7)
    x_train, x_test, y_train, y_test = train_test_split(rep, dataframe['sentiment_num'], test_size=test, random_state=7)
    mlp.fit(x_train, y_train)
    return mlp.score(x_test, y_test)
        
print(f'Multi Layer Perceptron BoW:   {quick_mlp(bow)}')
print(f'Multi Layer Perceptron TfIdf: {quick_mlp(tfidf)}')

Multi Layer Perceptron BoW:   0.7019650655021834
Multi Layer Perceptron TfIdf: 0.6883187772925764


In [4]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import Adam #,Adadelta, RMSprop
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report
from keras.callbacks import EarlyStopping

callback = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

def regular_dl(x_train, x_test, y_train, y_test, rtype='bow'):
    np.random.seed(7)
    classes = 3
    batch_size = 100
    epochs = 20

    y_train = to_categorical(y_train, classes)
    y_test = to_categorical(y_test, classes)

    model = Sequential()
    if rtype=='bow':
        model.add(Dense(1000, input_shape=(bow.shape[1],)))
    else:
        model.add(Dense(1000, input_shape=(tfidf.shape[1],)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(500))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(50))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print (model.summary())
    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, workers=6, use_multiprocessing=True, validation_split=0.1, callbacks=[callback])

    y_train_predclass =  np.argmax(model.predict(x_train), axis=1)
    y_test_predclass =  np.argmax(model.predict(x_test), axis=1)
    y_train = np.argmax(y_train, axis=1)
    y_test = np.argmax(y_test, axis=1)

    print(f"\n\nDeep Neural Network - Train accuracy: {accuracy_score(y_train, y_train_predclass)}")
    print(f"\nDeep Neural Network - Test accuracy: {accuracy_score(y_test, y_test_predclass)}")
    print("\nDeep Neural Network - Train Classification Report")
    print(classification_report(y_train, y_train_predclass))
    print("\nDeep Neural Network - Test Classification Report")
    print(classification_report(y_test, y_test_predclass))

In [46]:
x_train, x_test, y_train, y_test = train_test_split(bow, dataframe['sentiment_num'], test_size=0.2, random_state=7)
regular_dl(x_train, x_test, y_train, y_test)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 1000)              17965000  
                                                                 
 activation_8 (Activation)   (None, 1000)              0         
                                                                 
 dropout_6 (Dropout)         (None, 1000)              0         
                                                                 
 dense_9 (Dense)             (None, 500)               500500    
                                                                 
 activation_9 (Activation)   (None, 500)               0         
                                                                 
 dropout_7 (Dropout)         (None, 500)               0         
                                                                 
 dense_10 (Dense)            (None, 50)               

In [7]:
x_train, x_test, y_train, y_test = train_test_split(tfidf, dataframe['sentiment_num'], test_size=0.2, random_state=7)
regular_dl(x_train, x_test, y_train, y_test, rtype='tfidf')

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1000)              23659000  
                                                                 
 activation (Activation)     (None, 1000)              0         
                                                                 
 dropout (Dropout)           (None, 1000)              0         
                                                                 
 dense_1 (Dense)             (None, 500)               500500    
                                                                 
 activation_1 (Activation)   (None, 500)               0         
                                                                 
 dropout_1 (Dropout)         (None, 500)               0         
                                                                 
 dense_2 (Dense)             (None, 50)                2

In [12]:
from keras.preprocessing import sequence
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

def CNN(rep, rtype='bow'):
    if rtype == 'bow':
        max_features = bow.shape[1]
        max_length = bow.shape[1]
    else:
        max_features = tfidf.shape[1]
        max_length = tfidf.shape[1]
        
    x_train, x_test, y_train, y_test = train_test_split(rep, dataframe['sentiment_num'], test_size=0.2, random_state=7)
    print(len(x_train), 'train observations')
    print(len(x_test), 'test observations')

    x_train = sequence.data_utils.pad_sequences(x_train, maxlen=max_length)
    x_test = sequence.data_utils.pad_sequences(x_test, maxlen=max_length)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

    batch_size = 32
    embedding_dims = 60
    num_kernels = 260
    kernel_size = 3
    hidden_dims = 300
    epochs = 3

    model = Sequential()
    model.add(Embedding(max_features, embedding_dims, input_length=max_length))
    model.add(Dropout(0.2))
    model.add(Conv1D(num_kernels, kernel_size, padding='valid', activation='relu', strides=1))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.5))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())

    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1, verbose=1, callbacks=[callback])

    y_train_predclass = np.argmax(model.predict(x_train), axis=1)
    y_test_predclass = np.argmax(model.predict(x_test), axis=1)
    y_train_predclass.shape = y_train.shape
    y_test_predclass.shape = y_test.shape

    print(f'\n\nCNN 1D - Train accuracy: {accuracy_score(y_train, y_train_predclass)}')
    print(f'\nCNN 1D of Training data\n {classification_report(y_train, y_train_predclass)}')
    print(f'\nCNN 1D - Train Confusion Matrix\n\n {pd.crosstab(y_train, y_train_predclass, rownames=["Actual"], colnames=["Predicted"])}')
    print(f'\nCNN 1D - Test accuracy: {accuracy_score(y_test, y_test_predclass)}')
    print(f'\nCNN 1D of Test data\n {classification_report(y_test, y_test_predclass)}')
    print(f'\nCNN 1D - Test Confusion Matrix\n\n {pd.crosstab(y_test, y_test_predclass, rownames=["Actual"], colnames=["Predicted"])}')

In [None]:
CNN(bow)

In [None]:
CNN(tfidf, rtype='tfidf')

In [14]:
from keras.layers import LSTM, Bidirectional

def RNN_LSTM(rep, rtype='bow'):
    if rtype == 'bow':
        max_features = bow.shape[1]
        max_length = bow.shape[1]
    else:
        max_features = tfidf.shape[1]
        max_length = tfidf.shape[1]
        
    batch_size = 64
    x_train, x_test, y_train, y_test = train_test_split(rep, dataframe['sentiment_num'], test_size=0.2, random_state=7)

    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=max_length))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='softmax'))
    model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])
    print(model.summary())

    model.fit(x_train, y_train, batch_size=batch_size, epochs=4, validation_split=0.1, verbose=1, callbacks=[callback])

    y_train_predclass = np.argmax(model.predict(x_train), axis=1)
    y_test_predclass = np.argmax(model.predict(x_test), axis=1)
    y_train_predclass.shape = y_train.shape
    y_test_predclass.shape = y_test.shape

    print("\n\nLSTM Bidirectional Sentiment Classification - Train accuracy:", (round(accuracy_score(y_train, y_train_predclass), 3)))
    print("\nLSTM Bidirectional Sentiment Classification of Training data\n", classification_report(y_train, y_train_predclass))
    print("\nLSTM Bidirectional Sentiment Classification - Train Confusion Matrix\n\n", pd.crosstab(y_train, y_train_predclass, rownames=["Actual"], colnames=["Predicted"]))
    print("\nLSTM Bidirectional Sentiment Classification - Test accuracy:", (round(accuracy_score(y_test, y_test_predclass), 3)))
    print("\nLSTM Bidirectional Sentiment Classification of Test data\n", classification_report(y_test, y_test_predclass))
    print("\nLSTM Bidirectional Sentiment Classification - Test Confusion Matrix\n\n", pd.crosstab(y_test, y_test_predclass, rownames=["Actual"], colnames=["Predicted"]))

In [None]:
RNN_LSTM(bow)

In [None]:
RNN_LSTM(tfidf, rtype='tfidf')