In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import average_precision_score, accuracy_score, confusion_matrix, classification_report

In [23]:
df = pd.read_csv('../data/full-corpus.csv')

## Balancing Classes

In [24]:
def get_sentiment_df(df):
    pos_df = df[df['Sentiment'] == 'positive']
    neg_df = df[df['Sentiment'] == 'negative']
    neutral_df = df[df['Sentiment'] == 'neutral']
    irr_df = df[df['Sentiment'] == 'irrelevant']
    return pos_df, neg_df, neutral_df, irr_df

In [25]:
pos_df, neg_df, neutral_df, irr_df = get_sentiment_df(df)

In [26]:
# Downsample negative class to match positive class
df_neg_down = resample(neg_df,
                    replace=False,    # sample without replacement
                    n_samples=len(pos_df),     # to match minority class
                    random_state=42) # reproducible results
len(df_neg_down)

519

In [27]:
# Downsample negative class to match positive class
df_neutral_down = resample(neutral_df,
                    replace=False,    # sample without replacement
                    n_samples=len(pos_df),     # to match minority class
                    random_state=42) # reproducible results
len(df_neutral_down)

519

In [28]:
# Downsample negative class to match positive class
df_irr_down = resample(irr_df,
                    replace=False,    # sample without replacement
                    n_samples=len(pos_df),     # to match minority class
                    random_state=42) # reproducible results
len(df_irr_down)

519

In [29]:
df_balanced = pd.concat([pos_df, df_neg_down, df_neutral_down])

## Performing Sentiment Classification on Balanced Dataset

In [30]:
y = df_balanced.pop('Sentiment')
y.head()

0    positive
1    positive
2    positive
3    positive
4    positive
Name: Sentiment, dtype: object

In [32]:
X_train, X_test, y_train, y_test = train_test_split(df_balanced, y, random_state=42)

In [33]:
train_text = X_train['TweetText'].to_numpy()
test_text = X_test['TweetText'].to_numpy()

In [34]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_text)
tfidf_transformer = TfidfTransformer()

In [35]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [36]:
y_pred = clf.predict(count_vect.transform(test_text))
# print(average_precision_score(y_test, y_pred))
accuracy_score(y_test, y_pred)

0.6435897435897436

## Now that model is trained, Get misclassified tweet examples

In [38]:
y_pred

array(['negative', 'positive', 'positive', 'neutral', 'negative',
       'negative', 'positive', 'positive', 'negative', 'negative',
       'negative', 'negative', 'negative', 'positive', 'negative',
       'neutral', 'negative', 'positive', 'positive', 'neutral',
       'negative', 'positive', 'negative', 'negative', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'negative', 'positive', 'negative', 'positive', 'positive',
       'positive', 'positive', 'negative', 'positive', 'negative',
       'positive', 'positive', 'positive', 'negative', 'negative',
       'negative', 'positive', 'negative', 'positive', 'negative',
       'positive', 'neutral', 'positive', 'neutral', 'negative',
       'neutral', 'negative', 'negative', 'positive', 'negative',
       'positive', 'negative', 'positive', 'positive', 'positive',
       'positive', 'neutral', 'negative', 'neutral', 'positive',
       'positive', 'negative', 'negative', 'positive', 'negative',
   

In [39]:
y_test

2831     neutral
1146    positive
1331    positive
3834    positive
3237     neutral
4210     neutral
59      positive
2508    positive
445     negative
1332    positive
30      positive
547      neutral
347     negative
1807     neutral
466     negative
2851     neutral
2467    positive
1329    positive
1857     neutral
1355    negative
418     negative
1808     neutral
191     negative
346     negative
162     positive
2473    positive
1218    positive
1252    positive
1239    positive
3940    negative
          ...   
3859    positive
1232    positive
1300    positive
4184     neutral
2704     neutral
2563    negative
1426     neutral
1803     neutral
388     negative
1925     neutral
3912    negative
86      positive
4279     neutral
1343    positive
1243    positive
1265    positive
1895     neutral
1336    positive
436     negative
962      neutral
3945    negative
886      neutral
1815     neutral
1199    positive
113     positive
274     negative
1422     neutral
1152    positi

Following tweet in test set was classified as negative when it should have been neutral

In [41]:
test_text[0]

'Listening to Doug Leland #Microsoft  big data guru. Social media impact huge not just data wise, but within tools to manage data #gartnerSYM'

Following tweet predicted as negative when true label is positive

In [43]:
test_text[9]

'Seriously!? Why do I still find the yellow pages ph book at my doorstep? These days I just #google it on my iphone.'

## Get word associated with Negative and Positive Tweets

In [45]:
ind_max = clf.feature_log_prob_.argsort()[::-1]
ind_max

array([[4116, 3503, 1981, ...,  307, 3549, 1506],
       [   0, 2295, 2296, ..., 3707,  748, 1691],
       [   0, 2311, 2312, ..., 3624, 3549,  307]])

In [46]:
ind_top = ind_max[:,:100]
ind_top.shape

(3, 100)

In [51]:
feature_names = np.array(count_vect.get_feature_names())
feature_names.shape

(4117,)

In [48]:
feature_names_neg = feature_names[ind_top[0]]
feature_names_neg

array(['zzzzzzzzzzzzzzzzzzzz', 'tcpj_mickey', 'kdtd4zre',
       'keepnupwittboyd', 'keepsake', 'keepthedesktop', 'tcmagazine',
       'tbnyx7p7', 'key', 'keyboards', 'keynotes', 'keys', 'kfwq8r4t',
       'khoslaventures', 'kickoff', 'kid', 'kidding', 'kill', 'karşı',
       'karth_vader_', 'kalifornia', 'kaiylw0lf', 'john', 'johnnyvegas',
       'join', 'jolieesharmeda', 'jolly', 'teched_na', 'jsq96nuq',
       'techcrunch', 'killed', 'teamed', 'justin', 'justsaying',
       'jw1ubodi', 'jzmvbdnm', 'k10svnnr', 'k2rrc2ir', 'k3txy6jz',
       'kahnfla7', 'teaching', 'killing', 'kind', 'kinda', 'la', 'lab',
       'labs', 'lack', 'lacks', 'tah_med', 'laggin', 'land', 'l5mtm6ig',
       'languages', 'laptop', 'tagged', 'late', 'latest', 'launched',
       'zzk4ftii', 'launching', 'law', 'lapse', 'joelplane', 'l5j2ueeq',
       'kwfpth4p', 'kindahow', 'tap', 'kiss', 'kissed', 'tango',
       'kj890kmf', 'kjwugom3', 'talks', 'l4hqkv0c', 'klout', 'takes',
       'knowyourmobile', 'knw', 'ko

In [49]:
feature_names_pos = feature_names[ind_top[1]]
feature_names_pos

array(['00', 'mixedchik22', 'mixer', 'mmm', 'mn2nova', 'mn39r2vc',
       'mobileburn', 'mobileme', 'models', 'mole', 'mom', 'moment',
       'monday', 'monopolistic', 'monopoly', 'month', 'months', 'mood',
       'missing', 'minute', 'mins', 'mine', 'merchandising', 'messages',
       'messed', 'messy', 'metro', 'microsoftstores', 'middle',
       'midnight', 'morning', 'might', 'mikeferri', 'miley', 'mileycyrus',
       'millenia', 'millions', 'mimecastsa', 'min', 'mind', 'mightily',
       'mentor', 'mountainview', 'movies', 'navigation', 'naw',
       'nb4dhlsg', 'ndk', 'nearest', 'neat', 'neck', 'need4s', 'needed',
       'needless', 'neednewipadguide', 'needsomethingtoread', 'negative',
       'nerdboner', 'nerdiness', 'nervous', 'netbooks', 'nav', 'native',
       'nationalgeographic', 'nansen', 'moving', 'mp', 'mr', 'mrgareth',
       'msft', 'msg', 'mspegypt', 'mswx6eo6', 'mouse', 'mts11', 'muchhh',
       'mulling', 'music', 'muuwbgpn', 'mv', 'n9', 'nagoul1', 'nam',
       'm

In [50]:
feature_names_neu = feature_names[ind_top[2]]
feature_names_neu

array(['00', 'moment', 'moments', 'mommy_gaga', 'moms', 'mon', 'money',
       'monitoring', 'montrelcox', 'mom', 'mood', 'moto', 'motoactiv',
       'motorola', 'mountainview', 'mouse', 'mousecombos', 'movement',
       'movie', 'morning', 'moving', 'modify', 'mobiletechworld',
       'mikeshatzkin', 'milestone', 'miley', 'millenia', 'million',
       'millions', 'mimecastsa', 'min', 'models', 'mind', 'mindtree',
       'mines', 'missed', 'mixer', 'mmm', 'mms14pdw', 'mn2nova',
       'mobilenews', 'mindshare', 'mp3', 'mpromo', 'mr_malie504',
       'nanotech', 'nationalgeographic', 'native', 'natural',
       'navigation', 'naw', 'nawwaf91', 'nayarivera', 'naming',
       'ncqrwpyb', 'ndrpciy9', 'nearest', 'neat', 'need4s', 'nejbye',
       'neon_ness', 'neowin', 'nerdboner', 'ndk', 'named', 'nagoya', 'n9',
       'mrgareth', 'mrkt', 'ms', 'msamberpriley', 'msantram', 'mshfeqe6',
       'msleamichele', 'mspegypt', 'msqsvmdk', 'mswx6eo6', 'mt9rdrcz',
       'mts11', 'mtu7tgx7', 'mugamb