In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
names = ['timestamp','date','query','handle','message']
df = pd.read_csv('sentiment140.csv',encoding='Latin1',names=names)
df['sentiment'] = df.index
#Instead of positive being 4 make positive 1
df.loc[df['sentiment'] == 4,'sentiment']=1

In [3]:
df.head()

Unnamed: 0,timestamp,date,query,handle,message,sentiment
0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,0
0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,0
0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,0
0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",0


In [4]:
# sample the data for faster processing
df = df.sample(frac=0.1, replace=True, random_state=1)

In [5]:
# check for label balancing
print('Total Positive Labels: ', df[df.sentiment == 1].shape[0])
print('Total Negative Labels: ', df[df.sentiment == 0].shape[0])

Total Positive Labels:  80295
Total Negative Labels:  79705


In [6]:
def preprocessor(s):
    import inflect
    from nltk.stem.lancaster import LancasterStemmer
    '''
    Preprocessing by doing the following
    - lowercasing everything
    - removing punctuation
    - replacing sequences of numbers with a single token
    - shortening long words using stemming
    '''

    #lowercasing all words
    def to_lowercase(words):
        new_words = []
        for word in words:
            new_word = word.lower()
            new_words.append(new_word)
        return new_words

    #Removing punctuation
    def remove_punctuation(words):
        new_words = []
        for word in words:
            new_word = re.sub(r'[^\w\s]', '', word)
            if new_word != '':
                new_words.append(new_word)
        return new_words

    #replacing sequences of numbers with a single token
    def replace_numbers(words):
        p = inflect.engine()
        new_words = []
        for word in words:
            if word.isdigit():
                new_word = p.number_to_words(word)
                new_words.append(new_word)
            else:
                new_words.append(word)
        return new_words

    # shortening long words using stemming
    def stem_words(words):
        stemmer = LancasterStemmer()
        stems = []
        for word in words:
            stem = stemmer.stem(word)
            stems.append(stem)
        return stems

    # Combine all the steps into one function
    def word_preprocessor(words):
        result= ''
        first_step = to_lowercase(words)
        for element in first_step: result += str(element)

        #Feed lowercased data into punctuation step
        result2 = ''
        second_step = remove_punctuation(result)
        for element in second_step: result2 += str(element)

        #Feed lowercased+un-punctuated data into the 3rd step
        result3 = ''
        third_step = replace_numbers(result2)
        for element in third_step: result3 += str(element)
            
        #Feed lowercased+un-punctuated+num adjusted data into the 4th step
        result4 = ''
        fourth_step = stem_words(result3)
        for element in fourth_step: result4 += str(element)

        return result4
    
    return word_preprocessor(s)

In [7]:
X = df['message'].values
y = df['sentiment']

In [8]:
#Transform/Preprocess the data
X_processed = []
[X_processed.append(preprocessor(tweet)) for tweet in X]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [9]:
# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.33, random_state=42)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [12]:
#bigram
bigram_vectorizer = CountVectorizer(ngram_range=(1,2), min_df=5)

X_2 = bigram_vectorizer.fit_transform(X_train)

In [13]:
print('The size of the bigram vocabulary is: ', X_2.shape[1])

The size of the bigram vocabulary is:  39711


In [15]:
Y_2 = bigram_vectorizer.transform(X_test)

In [17]:
#Fitting the Naive Bayes Model on vectorized bigrams
alpha =  0.1
alpha_accuracy = []

clf=MultinomialNB(alpha = alpha)
clf.fit(X_2,y_train)
pred = clf.predict(Y_2)
alpha_accuracy.append(metrics.accuracy_score(y_test, pred))
print('Accuracy: ' , metrics.accuracy_score(y_test, pred))

Accuracy:  0.7794128787878788
