## Imports

In [1]:
# Use this on Colab to update plotly

!pip install plotly --upgrade

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import plotly.graph_objects as go
import nltk
import string
import spacy

In [3]:
nltk.download('popular')

In [4]:
df = pd.read_csv("https://github.com/murpi/wilddata/raw/master/quests/tweets.zip")
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


## Preprocessing

In [5]:
df = df[df['sentiment'] != 'neutral']
df['sentiment'].value_counts(normalize=True)

positive    0.524476
negative    0.475524
Name: sentiment, dtype: float64

In [6]:
df.reset_index(drop=True, inplace=True)

In [7]:
stopwordsenglish = nltk.corpus.stopwords.words('english')
nlp = spacy.load("en_core_web_sm")
my_punct = ["''", "``", '""', "'s", '-']

def clean(text):

  word_tokens = nltk.word_tokenize(text.lower())
  word_tokens_clean = []

  for word in word_tokens:
    if word not in stopwordsenglish:
      if word not in string.punctuation:
        if word not in my_punct:
          word_tokens_clean.append(word)

  nlp_tokens = nlp(' '.join([str(word) for word in word_tokens_clean]))
  word_tokens_lemma = [word.lemma_ for word in nlp_tokens]
  lemma_string = " ".join(word_tokens_lemma)
  
  return lemma_string


In [8]:
clean("You are better when I am well.")

'well well'

In [9]:
# Expect about 2 minutes 30 seconds of loading time on Colab here

df_token = df.copy()
df_token['clean'] = df_token['text'].apply(clean)
df_token

Unnamed: 0,textID,text,selected_text,sentiment,clean
0,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad miss san diego
1,088c60f138,my boss is bullying me...,bullying me,negative,boss bully ...
2,9642c003ef,what interview! leave me alone,leave me alone,negative,interview leave alone
3,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,son put release already buy
4,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,2 am feeding baby fun smile coo
...,...,...,...,...,...
16358,b78ec00df5,enjoy ur night,enjoy,positive,enjoy ur night
16359,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,wish could come see u denver husband lose job ...
16360,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,wonder rake client make clear .net force devs ...
16361,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good enjoy break probably need hectic week...


In [10]:
X = df_token['clean']
y = df_token['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=32, train_size = 0.75)

## Bag of Words

In [11]:
vectorizerCV = CountVectorizer()
vectorizerCV.fit(X_train)

X_train_CV = vectorizerCV.transform(X_train)
X_test_CV = vectorizerCV.transform(X_test)

## TfIdf

In [12]:
vectorizerTV = TfidfVectorizer()
vectorizerTV.fit(X_train)

X_train_TV = vectorizerTV.transform(X_train)
X_test_TV = vectorizerTV.transform(X_test)

## Storing Results

In [13]:
dico = {'classifier_model': ['Logistic Regression', 'K Nearest Neighbors', 'Decision Tree', 'Random Forest'],
        'train_score_bow': np.zeros(4),
        'test_score_bow': np.zeros(4),
        'train_score_tfidf': np.zeros(4),
        'test_score_tfidf': np.zeros(4)}

results = pd.DataFrame.from_dict(dico)

## Logistic Regression Classification

In [14]:
modelLR_CV = LogisticRegression().fit(X_train_CV, y_train)

print(f"Accuracy score on the train dataset with Bag of Words Vectorizer: {modelLR_CV.score(X_train_CV, y_train)}")
print(f"Accuracy score on the test dataset with Bag of Words Vectorizer: {modelLR_CV.score(X_test_CV, y_test)}")

Accuracy score on the train dataset with Bag of Words Vectorizer: 0.9529009126466753
Accuracy score on the test dataset with Bag of Words Vectorizer: 0.8660474211684185


In [15]:
modelLR_TV = LogisticRegression().fit(X_train_TV, y_train)

print(f"Accuracy score on the train dataset with TfIdf Vectorizer: {modelLR_TV.score(X_train_TV, y_train)}")
print(f"Accuracy score on the test dataset with TfIdf Vectorizer: {modelLR_TV.score(X_test_TV, y_test)}")

Accuracy score on the train dataset with TfIdf Vectorizer: 0.9255215123859192
Accuracy score on the test dataset with TfIdf Vectorizer: 0.868247372280616


## KNN Classification

In [16]:
modelKNN_CV = KNeighborsClassifier().fit(X_train_CV, y_train)

print(f"Accuracy score on the train dataset with Bag of Words Vectorizer: {modelKNN_CV.score(X_train_CV, y_train)}")
print(f"Accuracy score on the test dataset with Bag of Words Vectorizer: {modelKNN_CV.score(X_test_CV, y_test)}")

Accuracy score on the train dataset with Bag of Words Vectorizer: 0.8472946544980443
Accuracy score on the test dataset with Bag of Words Vectorizer: 0.7726717184062576


In [17]:
modelKNN_TV = KNeighborsClassifier().fit(X_train_TV, y_train)

print(f"Accuracy score on the train dataset with TfIdf Vectorizer: {modelKNN_TV.score(X_train_TV, y_train)}")
print(f"Accuracy score on the test dataset with TfIdf Vectorizer: {modelKNN_TV.score(X_test_TV, y_test)}")

Accuracy score on the train dataset with TfIdf Vectorizer: 0.5896349413298566
Accuracy score on the test dataset with TfIdf Vectorizer: 0.5690540210217551


## Decision Tree Classification

In [18]:
modelDTC_CV = DecisionTreeClassifier().fit(X_train_CV, y_train)

print(f"Accuracy score on the train dataset with Bag of Words Vectorizer: {modelDTC_CV.score(X_train_CV, y_train)}")
print(f"Accuracy score on the test dataset with Bag of Words Vectorizer: {modelDTC_CV.score(X_test_CV, y_test)}")

Accuracy score on the train dataset with Bag of Words Vectorizer: 0.9991851368970013
Accuracy score on the test dataset with Bag of Words Vectorizer: 0.8086042532388169


In [19]:
modelDTC_TV = DecisionTreeClassifier().fit(X_train_TV, y_train)

print(f"Accuracy score on the train dataset with TfIdf Vectorizer: {modelDTC_TV.score(X_train_TV, y_train)}")
print(f"Accuracy score on the test dataset with TfIdf Vectorizer: {modelDTC_TV.score(X_test_TV, y_test)}")

Accuracy score on the train dataset with TfIdf Vectorizer: 0.9991851368970013
Accuracy score on the test dataset with TfIdf Vectorizer: 0.8132485944756783


## Random Forest Classification

In [20]:
modelRFC_CV = RandomForestClassifier(n_estimators=20).fit(X_train_CV, y_train)

print(f"Accuracy score on the train dataset with Bag of Words Vectorizer: {modelRFC_CV.score(X_train_CV, y_train)}")
print(f"Accuracy score on the test dataset with Bag of Words Vectorizer: {modelRFC_CV.score(X_test_CV, y_test)}")

Accuracy score on the train dataset with Bag of Words Vectorizer: 0.9971479791395046
Accuracy score on the test dataset with Bag of Words Vectorizer: 0.8442923490589098


In [21]:
modelRFC_TV = RandomForestClassifier(n_estimators=20).fit(X_train_TV, y_train)

print(f"Accuracy score on the train dataset with TfIdf Vectorizer: {modelRFC_TV.score(X_train_TV, y_train)}")
print(f"Accuracy score on the test dataset with TfIdf Vectorizer: {modelRFC_TV.score(X_test_TV, y_test)}")

Accuracy score on the train dataset with TfIdf Vectorizer: 0.9976368970013038
Accuracy score on the test dataset with TfIdf Vectorizer: 0.8508922023955023


## Visualizing Results

In [22]:
results.iloc[0, 1:] = [modelLR_CV.score(X_train_CV, y_train), modelLR_CV.score(X_test_CV, y_test), modelLR_TV.score(X_train_TV, y_train), modelLR_TV.score(X_test_TV, y_test)]
results.iloc[1, 1:] = [modelKNN_CV.score(X_train_CV, y_train), modelKNN_CV.score(X_test_CV, y_test), modelKNN_TV.score(X_train_TV, y_train), modelKNN_TV.score(X_test_TV, y_test)]
results.iloc[2, 1:] = [modelDTC_CV.score(X_train_CV, y_train), modelDTC_CV.score(X_test_CV, y_test), modelDTC_TV.score(X_train_TV, y_train), modelDTC_TV.score(X_test_TV, y_test)]
results.iloc[3, 1:] = [modelRFC_CV.score(X_train_CV, y_train), modelRFC_CV.score(X_test_CV, y_test), modelRFC_TV.score(X_train_TV, y_train), modelRFC_TV.score(X_test_TV, y_test)]
results


Unnamed: 0,classifier_model,train_score_bow,test_score_bow,train_score_tfidf,test_score_tfidf
0,Logistic Regression,0.952901,0.866047,0.925522,0.868247
1,K Nearest Neighbors,0.847295,0.772672,0.589961,0.568565
2,Decision Tree,0.999185,0.808604,0.999185,0.813249
3,Random Forest,0.997148,0.844292,0.997637,0.850892


In [23]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=results.classifier_model, y=results.train_score_bow, name='Bag of Words Train', line_shape='spline', line_color='blue'))
fig.add_trace(go.Scatter(x=results.classifier_model, y=results.test_score_bow, name='Bag of Words Test', line_shape='spline', line_color='darkorange'))
fig.add_trace(go.Scatter(x=results.classifier_model, y=results.train_score_tfidf, name='TfIdf Train', line_shape='spline', line_color='cyan'))
fig.add_trace(go.Scatter(x=results.classifier_model, y=results.test_score_tfidf, name='TfIdf Test', line_shape='spline', line_color='yellow'))

fig.update_layout(title='Accuracy Score per Model and vectorizer', xaxis_title='Classifier Model', yaxis_title='Accuracy Score', template='plotly_dark', height=700, width=900)
fig.show()

The train scores are shown in blue colors, and the test score in yellowish colors. Based on this, we can see that the model that performs best on the test set is the Logistic Regression model, while having a lower score than Decision Tree and Random Forest on the train set, meaning that there is very little overfitting, if at all. Bag of Words seems to perform a bit better overall than TfIdf.

On the other hand, KNN model's performance is very low, on top of being also the slowest one. It is also worth noting that although other models perform roughly similar on both vectorizers, KNN also drops a lot of performance on the TfIDF compared to Bag of Words.