In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
names = ['timestamp','date','query','handle','message']
df = pd.read_csv('sentiment140.csv',encoding='Latin1',names=names)
df['sentiment'] = df.index
#Instead of positive being 4 make positive 1
df.loc[df['sentiment'] == 4,'sentiment']=1

In [3]:
df.head()

Unnamed: 0,timestamp,date,query,handle,message,sentiment
0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,0
0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,0
0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,0
0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",0


In [4]:
# sample the data for faster processing
df = df.sample(frac=0.1, replace=True, random_state=1)

In [5]:
# check for label balancing
print('Total Positive Labels: ', df[df.sentiment == 1].shape[0])
print('Total Negative Labels: ', df[df.sentiment == 0].shape[0])

Total Positive Labels:  80295
Total Negative Labels:  79705


In [6]:
def preprocessor(s):
    import inflect
    from nltk.stem.lancaster import LancasterStemmer
    '''
    Preprocessing by doing the following
    - lowercasing everything
    - removing punctuation
    - replacing sequences of numbers with a single token
    - shortening long words using stemming
    '''

    #lowercasing all words
    def to_lowercase(words):
        new_words = []
        for word in words:
            new_word = word.lower()
            new_words.append(new_word)
        return new_words

    #Removing punctuation
    def remove_punctuation(words):
        new_words = []
        for word in words:
            new_word = re.sub(r'[^\w\s]', '', word)
            if new_word != '':
                new_words.append(new_word)
        return new_words

    #replacing sequences of numbers with a single token
    def replace_numbers(words):
        p = inflect.engine()
        new_words = []
        for word in words:
            if word.isdigit():
                new_word = p.number_to_words(word)
                new_words.append(new_word)
            else:
                new_words.append(word)
        return new_words

    # shortening long words using stemming
    def stem_words(words):
        stemmer = LancasterStemmer()
        stems = []
        for word in words:
            stem = stemmer.stem(word)
            stems.append(stem)
        return stems

    # Combine all the steps into one function
    def word_preprocessor(words):
        result= ''
        first_step = to_lowercase(words)
        for element in first_step: result += str(element)

        #Feed lowercased data into punctuation step
        result2 = ''
        second_step = remove_punctuation(result)
        for element in second_step: result2 += str(element)

        #Feed lowercased+un-punctuated data into the 3rd step
        result3 = ''
        third_step = replace_numbers(result2)
        for element in third_step: result3 += str(element)
            
        #Feed lowercased+un-punctuated+num adjusted data into the 4th step
        result4 = ''
        fourth_step = stem_words(result3)
        for element in fourth_step: result4 += str(element)

        return result4
    
    return word_preprocessor(s)

In [7]:
X = df['message'].values
y = df['sentiment']

In [8]:
#Transform/Preprocess the data
X_processed = []
for tweet in X:
    X_processed.append(preprocessor(tweet))

In [9]:
# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.33, random_state=42)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [11]:
#bigram
bigram_vectorizer = CountVectorizer(ngram_range=(1,2), min_df=5)

X_2 = bigram_vectorizer.fit_transform(X_train)

In [12]:
print('The size of the bigram vocabulary is: ', X_2.shape[1])

The size of the bigram vocabulary is:  39711


In [13]:
Y_2 = bigram_vectorizer.transform(X_test)

In [14]:
#Fitting the Naive Bayes Model on vectorized bigrams
alpha =  0.1
alpha_accuracy = []

clf=MultinomialNB(alpha = alpha)
clf.fit(X_2,y_train)
pred = clf.predict(Y_2)
alpha_accuracy.append(metrics.accuracy_score(y_test, pred))
print('Accuracy: ' , metrics.accuracy_score(y_test, pred))

Accuracy:  0.7794128787878788


Election tweets

In [15]:
election_data = pd.read_csv('candidates.csv')


In [17]:
election_data.head()

Unnamed: 0,candidate,tweet,date
0,Bill Nelson,"""The stakes could not be higher. ... What you ...",2018-10-28 23:56:02
1,Bill Nelson,@FLGovScott @USACEHQ I laugh even in an electi...,2018-10-28 23:57:05
2,Bill Nelson,Governor Rick Scott has been amazing since the...,2018-10-28 23:57:22
3,Bill Nelson,Look out my friends in Florida You have bill...,2018-10-28 23:58:03
4,Bill Nelson,@GKeile DON'T forget about Rick Scott in Flori...,2018-10-28 23:58:15


In [19]:
el_tweets = election_data['tweet'].values

In [21]:
#Preprocess and transform the data
el_processed = []
for tweet in el_tweets:
    el_processed.append(preprocessor(tweet))
    
el_processed

['the stakes could not be higher  what you do in florida is going to impact the rest of the country  kamalaharris stumping for her senate colleague bill nelson in miami this morning httpstcoexhzheightsmok',
 'flgovscott usacehq i laugh even in an election year bill nelson no where to be found he is an empty seat like the ads',
 'governor rick scott has been amazing since the hurricane  hes always down here providing support fema is everywhere red cross you name it were being well taken care of\n\n\n\nbill nelson is absolutely nowhere to be found  big surprise hes a democrat',
 'look out my friends in florida   you have bill nelson trying to take your guns away you have a scumbag running for governor guilim who will turn your state into a sanctuary state you will need your guns more then ever this is a demon rat plot to disarm you',
 'gkeile dont forget about rick scott in florida over the donothing bill nelson vote rickscott voteredtosaveamerica httpstcoumlyhxywav',
 'rick scotts compa

In [22]:
el_transformed = bigram_vectorizer.transform(el_processed)

In [23]:
prediction = clf.predict(el_transformed)

In [24]:
election_data['prediction'] = prediction

In [26]:
election_data.head(10)

Unnamed: 0,candidate,tweet,date,prediction
0,Bill Nelson,"""The stakes could not be higher. ... What you ...",2018-10-28 23:56:02,0
1,Bill Nelson,@FLGovScott @USACEHQ I laugh even in an electi...,2018-10-28 23:57:05,0
2,Bill Nelson,Governor Rick Scott has been amazing since the...,2018-10-28 23:57:22,0
3,Bill Nelson,Look out my friends in Florida You have bill...,2018-10-28 23:58:03,1
4,Bill Nelson,@GKeile DON'T forget about Rick Scott in Flori...,2018-10-28 23:58:15,0
5,Bill Nelson,Rick Scott’s company ripped off Medicare. \n\n...,2018-11-04 23:59:24,1
6,Bill Nelson,Powerful reminder why the election of @RonDeSa...,2018-11-04 23:59:25,1
7,Bill Nelson,@realDonaldTrump @ScottforFlorida You are the ...,2018-11-04 23:59:38,1
8,Bill Nelson,@realDonaldTrump You are the con man in the wo...,2018-11-04 23:59:43,1
9,Bill Nelson,"In all the time I’ve been President, almost tw...",2018-11-04 23:59:46,1


In [31]:
final_pd = pd.DataFrame({'positive' : election_data.groupby('candidate')['prediction'].sum()}).reset_index()

In [32]:
final_pd

Unnamed: 0,candidate,positive
0,Bill Nelson,6
1,Claire McCaskill,8
2,Dean Heller,8
3,Jacky Rosen,5
4,Joe Donnelly,8
5,Joe Manchin,6
6,Jon Tester,7
7,Josh Hawley,7
8,Kyrsten Sinema,7
9,Marsha Blackburn,7


In [47]:
def Winner(candidate1, candidate2, data):
    candidate1_votes = data[final_pd.candidate == candidate1]['positive'].values
    candidate2_votes = data[final_pd.candidate == candidate2]['positive'].values
    
    if candidate1_votes == candidate2_votes:
        return "It's a Tie"
    else:
        winner_votes = max(candidate1_votes, candidate2_votes)[0]
        if candidate1_votes == winner_votes:
            return candidate1
        else:
            return candidate2

In [50]:
print('Arizona [Kyrsten Sinema vs Martha McSally] _  Winner: ', Winner('Kyrsten Sinema', 'Martha McSally', final_pd))
print('Nevada [Jacky Rosen vs Dean Heller] _  Winner: ', Winner('Jacky Rosen', 'Dean Heller', final_pd))
print('Florida [Bill Nelson vs Rick Scott] _  Winner: ', Winner('Bill Nelson', 'Rick Scott', final_pd))
print('Montana [Jon Tester vs Matt Rosendale] _  Winner: ', Winner('Jon Tester', 'Matt Rosendale', final_pd) )
print('Missouri [Josh Hawley vs Claire McCaskill] _  Winner: ', Winner('Josh Hawley', 'Claire McCaskill', final_pd))
print('Tennessee [Marsha Blackburn vs Phil Bredesen] _  Winner: ', Winner('Marsha Blackburn', 'Phil Bredesen', final_pd))
print('Indiana [Mike Braun vs Joe Donnelly] _  Winner: ', Winner('Mike Braun', 'Joe Donnelly', final_pd))
print('West Virginia [Joe Manchin III vs Patrick Morrisey] _  Winner: ', Winner('Joe Manchin', 'Patrick Morrisey', final_pd))

Arizona [Kyrsten Sinema vs Martha McSally] _  Winner:  Martha McSally
Nevada [Jacky Rosen vs Dean Heller] _  Winner:  Dean Heller
Florida [Bill Nelson vs Rick Scott] _  Winner:  Bill Nelson
Montana [Jon Tester vs Matt Rosendale] _  Winner:  Matt Rosendale
Missouri [Josh Hawley vs Claire McCaskill] _  Winner:  Claire McCaskill
Tennessee [Marsha Blackburn vs Phil Bredesen] _  Winner:  Phil Bredesen
Indiana [Mike Braun vs Joe Donnelly] _  Winner:  It's a Tie
West Virginia [Joe Manchin III vs Patrick Morrisey] _  Winner:  Patrick Morrisey
