# Political Tweets Sentiment Analysis (SVM, Logistic Regression, Naive Bayes)

In [1]:
import math
import pandas as pd
import numpy as np
import csv
import sys, os
import string
import re
import datetime
# Natural Language Processing
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem.snowball import SnowballStemmer # load the stemmer module from NLTK
dataloc = '/home/composersyf/Documents/Political Data Science Project/TwitterData'
import emoji
# scikit-learn
from sklearn.metrics import accuracy_score, confusion_matrix

#### Loading Components for NLP

In [2]:
twtokenizer = TweetTokenizer()
stemmer=SnowballStemmer('english')

In [3]:
punctuation = list(set(string.punctuation)) + ['…','’','...','—',':/','”','..', '“']

In [4]:
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so',
'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should'] # Removed 'not','now', 'no','nor'

## Building Sentiment Model

#### Get the Training Data

In [5]:
# To download the Twitter Sentiment Files
# nltk.download('twitter_samples')

In [6]:
# Get ~ 10k Sentiment Labeled Tweets
from nltk.corpus import twitter_samples

In [7]:
neg_tweets = twitter_samples.strings('negative_tweets.json')
pos_tweets = twitter_samples.strings('positive_tweets.json')

In [8]:
def clean_tweets(tweet):
    #tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    cleanWords = twtokenizer.tokenize(tweet)
    
    # Convert to Lowercase
    cleanWords = [t.lower() for t in cleanWords]
    
    # Convert Emoji's to Word Label
    cleanWords = [emoji.demojize(word) for word in cleanWords]
    
    # Normalize (remove punctuation)
    #Remove punctuation
    cleanWords = [word for word in cleanWords if word not in punctuation]
    
    # punc = string.punctuation
    # cleanWords = [t for t in cleanWords if t not in punc]
    # cleanWords = [re.sub('[^0-9a-z]', "", x) for x in cleanWords]
    
    # Remove Empty Vectors
    cleanWords = [x for x in cleanWords if x != '']
     
    # Remove StopWords
    # cleanWords = [word for word in cleanWords if word not in stopwords_short]
    cleanWords = [word for word in cleanWords if word not in stopwords]
    
    # Identify Digits & Convert to Num
    #cleanWords = [re.sub("\d+", "NUM", x) for x in cleanWords]
    
    # Remove all Web/URL References
    # Could be better to replace with 'URL'
    cleanWords = [word for word in cleanWords if word[0:3] != 'htt']
    
    # Stem Words
    cleanWords = [stemmer.stem(x) for x in cleanWords] # call stemmer to stem the input
    
    # Remove Multiple Letters, Replace with only 3
    
    return cleanWords

In [9]:
neg_tweets_clean = [clean_tweets(tweet) for tweet in neg_tweets]
pos_tweets_clean = [clean_tweets(tweet) for tweet in pos_tweets]

#### Train the Model

Source: http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/

In [10]:
def getTermFreq(textList):
    #calculate the term frequency for each text list
    TF = {}
    for row in textList:
        #print(row)
        for word in row:
            # print(word)
            if word in TF:
                TF[word] += 1
            else:
                TF[word] = 1
    return TF

In [11]:
# Unigram Language Model
def genUniLM(TF):
    u_theta = pd.DataFrame.from_dict(TF, orient = "index")
    u_theta.columns = ['TF']
    # u_theta.sort('TF', ascending = False)[0:10]
    # Total Number of Words in Training Corpus
    nWords = u_theta['TF'].sum()
    nWords
    # Number of Unique Words in Training Corpus
    vSize = len(u_theta['TF'])
    vSize
    # Calculate Probabilty of Each Word by TTF/N
    u_theta['p'] = u_theta/nWords
    u_theta = u_theta.sort_values('TF', ascending = False)
    # Check that Probability Sums to 1
    print("Total Probability: ",u_theta['p'].sum())
    return u_theta

In [12]:
# Get Term Frequency for All Negative Tweets
#TF_neg = getTermFreq(neg_tweets_clean)
#u_theta_neg = genUniLM(TF_neg)

In [13]:
# Get Term Frequency for All Positive Tweets
#TF_pos = getTermFreq(pos_tweets_clean)
#u_theta_pos = genUniLM(TF_pos)

In [14]:
# Defind the functions to turn the corpus into document term matrix
def create_countVectors(tokens):
    doc_TF = {}
    for token in tokens:
        if token in doc_TF:
            doc_TF[token] += 1
        else:
            doc_TF[token] = 1
    return doc_TF

def createDTM(corpus):
    dtmHASH = {}
    for key in corpus.keys():
        dtmHASH[key] = create_countVectors(corpus[key])
    return dtmHASH

In [15]:
neg_tweets_corpus={}
for i,l in enumerate(neg_tweets_clean):
    neg_tweets_corpus[i]=l
neg_tweets_DTM=createDTM(neg_tweets_corpus)
neg_tweets_df=pd.DataFrame.from_dict(neg_tweets_DTM, orient = 'index')
neg_tweets_df.shape

(5000, 9312)

In [16]:
pos_tweets_corpus={}
for i,l in enumerate(pos_tweets_clean):
    pos_tweets_corpus[i]=l
pos_tweets_DTM=createDTM(pos_tweets_corpus)
pos_tweets_df=pd.DataFrame.from_dict(pos_tweets_DTM, orient = 'index')
pos_tweets_df.shape

(5000, 11346)

In [17]:
#stack the positive DTM and negative DTM together vertically
tweets_df=pd.concat([pos_tweets_df,neg_tweets_df],join="outer",axis=0)
tweets_df.shape

(10000, 17890)

In [18]:
# Replace the NA values with zero
tweets_df=tweets_df.fillna(0)

In [19]:
# Drop the terms whose term frequency is just 1
col_sums=tweets_df.sum(axis=0)
tweets_df_clean=tweets_df.iloc[:,np.where(col_sums>1)[0]]
tweets_df_clean.shape

(10000, 4652)

In [20]:
#total number of features present in the training set
feature_names=tweets_df_clean.columns.values
len(feature_names)

4652

### (1) Train an SVM Model

In [21]:
from sklearn import svm

In [22]:
Y=["Positive"]*5000+["Negative"]*5000

In [23]:
svm_clf=svm.SVC(kernel="linear",probability=True)
svm_clf.fit(tweets_df_clean,Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

### (2) Train a Logistic Regression (Maximum Entropy) Model

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
lr_clf=LogisticRegression(n_jobs=2)
lr_clf.fit(tweets_df_clean,Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=2,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### **Evaludate the SVM & Logistic Regression model on the validation set

In [26]:
validation_sample=pd.read_csv("/home/composersyf/Downloads/sentiment_validation.csv")

In [27]:
validation_sample_text=validation_sample.Tweet
validation_tweets_clean = [clean_tweets(tweet) for tweet in np.array(validation_sample_text)]

In [28]:
validation_tweets_corpus={}
for i,l in enumerate(validation_tweets_clean):
    validation_tweets_corpus[i]=l
validation_tweets_DTM=createDTM(validation_tweets_corpus)
validation_tweets_df=pd.DataFrame.from_dict(validation_tweets_DTM, orient = 'index')
validation_tweets_df=validation_tweets_df.fillna(0)
validation_tweets_df.shape

(2194, 5994)

In [29]:
validation_tweets_df_part1=validation_tweets_df.loc[:,np.intersect1d(validation_tweets_df.columns.values,feature_names)]

In [30]:
validation_tweets_df_part2=pd.DataFrame(np.zeros((2194,len(np.setdiff1d(feature_names,validation_tweets_df.columns.values)))))
validation_tweets_df_part2.columns=np.setdiff1d(feature_names,validation_tweets_df.columns.values)

In [31]:
validation_tweets_df_final=pd.concat([validation_tweets_df_part1,validation_tweets_df_part2],axis=1,join="outer")

In [32]:
column_names_before=validation_tweets_df_final.columns.values

In [33]:
validation_tweets_df_final=validation_tweets_df_final.loc[:,tweets_df_clean.columns.values]
column_names_after=validation_tweets_df_final.columns.values

In [34]:
print(len(column_names_before)==len(column_names_after))
print((column_names_before==column_names_after).sum())

True
0


In [35]:
from collections import Counter
Counter(validation_sample.sentiment)

Counter({'Negative': 1054, 'Neutral': 824, 'Positive': 316})

#### - Evaluate the SVM model

In [36]:
svm_pred=svm_clf.predict(validation_tweets_df_final)

In [37]:
svm_result=validation_sample.loc[:,['sentiment']]
svm_result['pred_sentiment']=svm_pred

In [38]:
svm_result_1=svm_result[svm_result.sentiment!='Neutral']
c=0
for i in range(svm_result_1.shape[0]):
    if svm_result_1.sentiment.iloc[i]==svm_result_1.pred_sentiment.iloc[i]:
        c+=1
print('SVM Accuracy:',round(c*100/svm_result_1.shape[0],2),'%')

SVM Accuracy: 62.12 %


In [39]:
confusion_matrix(svm_result_1.iloc[:,0],svm_result_1.iloc[:,1])

array([[736, 318],
       [201, 115]])

In [40]:
svm_result_2=svm_result[svm_result.sentiment=='Neutral']
print('SVM imbalance (Positive rate):', Counter(svm_result_2.pred_sentiment)['Positive']/svm_result_2.shape[0]*100, '%')

SVM imbalance (Positive rate): 28.398058252427184 %


In [41]:
svm_clf.predict_proba(validation_tweets_df_final)[:,0] #it's predicting the probability of negative sentiment

array([ 0.00949859,  0.12273114,  0.77612369, ...,  0.79343947,
        0.75273737,  0.94003868])

In [44]:
threshold=[i*0.01+0 for i in range(51)]

In [63]:
svm_pred_prob=svm_clf.predict_proba(validation_tweets_df_final)[:,0]
for t in threshold:
    svm_result=validation_sample.loc[:,['sentiment']]
    svm_result['pred_sentiment']=np.where(svm_pred_prob>t,"Negative","Positive")
    svm_result_1=svm_result[svm_result.sentiment!='Neutral']
    print("threshold: ",t)
    c=0
    for i in range(svm_result_1.shape[0]):
        if svm_result_1.sentiment.iloc[i]==svm_result_1.pred_sentiment.iloc[i]:
            c+=1
    print('SVM Accuracy:',round(c*100/svm_result_1.shape[0],2),'%')
    svm_result_2=svm_result[svm_result.sentiment=='Neutral']
    print('SVM imbalance (Positive rate):', Counter(svm_result_2.pred_sentiment)['Positive']/svm_result_2.shape[0]*100, '%')
    if t==0.29:
        print(confusion_matrix(svm_result_1.iloc[:,0],svm_result_1.iloc[:,1]))
    print('')
#svm_result['pred_sentiment']=svm_pred

threshold:  0.0
SVM Accuracy: 76.93 %
SVM imbalance (Positive rate): 0.0 %

threshold:  0.01
SVM Accuracy: 76.93 %
SVM imbalance (Positive rate): 0.3640776699029126 %

threshold:  0.02
SVM Accuracy: 76.72 %
SVM imbalance (Positive rate): 0.8495145631067961 %

threshold:  0.03
SVM Accuracy: 76.06 %
SVM imbalance (Positive rate): 1.3349514563106795 %

threshold:  0.04
SVM Accuracy: 75.91 %
SVM imbalance (Positive rate): 1.9417475728155338 %

threshold:  0.05
SVM Accuracy: 75.91 %
SVM imbalance (Positive rate): 2.063106796116505 %

threshold:  0.06
SVM Accuracy: 75.84 %
SVM imbalance (Positive rate): 2.3058252427184467 %

threshold:  0.07
SVM Accuracy: 75.77 %
SVM imbalance (Positive rate): 3.1553398058252426 %

threshold:  0.08
SVM Accuracy: 75.47 %
SVM imbalance (Positive rate): 3.3980582524271843 %

threshold:  0.09
SVM Accuracy: 75.26 %
SVM imbalance (Positive rate): 3.640776699029126 %

threshold:  0.1
SVM Accuracy: 74.82 %
SVM imbalance (Positive rate): 4.247572815533981 %

threshol

#### - Evaluate the Logistic Regression Model

In [46]:
lr_pred=lr_clf.predict(validation_tweets_df_final)

In [47]:
lr_result=validation_sample.loc[:,['sentiment']]
lr_result['pred_sentiment']=lr_pred

In [48]:
lr_result_1=lr_result[lr_result.sentiment!='Neutral']
c=0
for i in range(lr_result_1.shape[0]):
    if lr_result_1.sentiment.iloc[i]==lr_result_1.pred_sentiment.iloc[i]:
        c+=1
print('LR Accuracy:',round(c*100/lr_result_1.shape[0],2),'%')

LR Accuracy: 39.05 %


In [49]:
lr_result_2=lr_result[lr_result.sentiment=='Neutral']
print('LR imbalance (Positive rate):', Counter(lr_result_2.pred_sentiment)['Positive']/lr_result_2.shape[0]*100, '%')

LR imbalance (Positive rate): 73.30097087378641 %


In [50]:
confusion_matrix(lr_result_1.iloc[:,0],lr_result_1.iloc[:,1])

array([[292, 762],
       [ 73, 243]])

In [51]:
lr_clf.predict_proba(validation_tweets_df_final)[:,0] #it's predicting the probability of negative sentiment

array([ 0.14816387,  0.21407164,  0.55348139, ...,  0.49612704,
        0.53752547,  0.59540195])

In [62]:
lr_pred_prob=lr_clf.predict_proba(validation_tweets_df_final)[:,0]
for t in threshold:
    lr_result=validation_sample.loc[:,['sentiment']]
    lr_result['pred_sentiment']=np.where(lr_pred_prob>t,"Negative","Positive")
    lr_result_1=lr_result[lr_result.sentiment!='Neutral']
    print("threshold: ",t)
    c=0
    for i in range(lr_result_1.shape[0]):
        if lr_result_1.sentiment.iloc[i]==lr_result_1.pred_sentiment.iloc[i]:
            c+=1
    print('LR Accuracy:',round(c*100/lr_result_1.shape[0],2),'%')
    lr_result_2=lr_result[lr_result.sentiment=='Neutral']
    print('LR imbalance (Positive rate):', Counter(lr_result_2.pred_sentiment)['Positive']/lr_result_2.shape[0]*100, '%')
    if round(t,2)==0.4:
        print(confusion_matrix(lr_result_1.iloc[:,0],lr_result_1.iloc[:,1]))
    print('')
#svm_result['pred_sentiment']=svm_pred

threshold:  0.0
LR Accuracy: 76.93 %
LR imbalance (Positive rate): 0.0 %

threshold:  0.01
LR Accuracy: 76.93 %
LR imbalance (Positive rate): 0.24271844660194172 %

threshold:  0.02
LR Accuracy: 76.93 %
LR imbalance (Positive rate): 0.24271844660194172 %

threshold:  0.03
LR Accuracy: 76.93 %
LR imbalance (Positive rate): 0.24271844660194172 %

threshold:  0.04
LR Accuracy: 76.93 %
LR imbalance (Positive rate): 0.24271844660194172 %

threshold:  0.05
LR Accuracy: 76.93 %
LR imbalance (Positive rate): 0.24271844660194172 %

threshold:  0.06
LR Accuracy: 76.93 %
LR imbalance (Positive rate): 0.24271844660194172 %

threshold:  0.07
LR Accuracy: 76.93 %
LR imbalance (Positive rate): 0.24271844660194172 %

threshold:  0.08
LR Accuracy: 76.93 %
LR imbalance (Positive rate): 0.24271844660194172 %

threshold:  0.09
LR Accuracy: 76.93 %
LR imbalance (Positive rate): 0.24271844660194172 %

threshold:  0.1
LR Accuracy: 76.93 %
LR imbalance (Positive rate): 0.24271844660194172 %

threshold:  0.11


### (3) Train a Naive Bayes Classifier

In [52]:
from sklearn.naive_bayes import BernoulliNB

In [53]:
nb_clf = BernoulliNB()
nb_clf.fit(tweets_df_clean, Y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

#### - Evaluate the Naive Bayes Model

In [55]:
nb_pred=nb_clf.predict(validation_tweets_df_final)

In [56]:
nb_result=validation_sample.loc[:,['sentiment']]
nb_result['pred_sentiment']=nb_pred

In [57]:
nb_result_1=nb_result[nb_result.sentiment!='Neutral']
c=0
for i in range(nb_result_1.shape[0]):
    if nb_result_1.sentiment.iloc[i]==nb_result_1.pred_sentiment.iloc[i]:
        c+=1
print('NB Accuracy:',round(c*100/nb_result_1.shape[0],2),'%')

NB Accuracy: 51.97 %


In [58]:
nb_result_2=nb_result[nb_result.sentiment=='Neutral']
print('NB imbalance (Positive rate):', Counter(nb_result_2.pred_sentiment)['Positive']/nb_result_2.shape[0]*100, '%')

NB imbalance (Positive rate): 64.92718446601941 %


In [59]:
confusion_matrix(nb_result_1.iloc[:,0],nb_result_1.iloc[:,1])

array([[473, 581],
       [ 77, 239]])

In [61]:
nb_pred_prob=nb_clf.predict_proba(validation_tweets_df_final)[:,0]
for t in threshold:
    nb_result=validation_sample.loc[:,['sentiment']]
    nb_result['pred_sentiment']=np.where(nb_pred_prob>t,"Negative","Positive")
    nb_result_1=nb_result[nb_result.sentiment!='Neutral']
    print("threshold: ",t)
    c=0
    for i in range(nb_result_1.shape[0]):
        if nb_result_1.sentiment.iloc[i]==nb_result_1.pred_sentiment.iloc[i]:
            c+=1
    print('NB Accuracy:',round(c*100/nb_result_1.shape[0],2),'%')
    nb_result_2=nb_result[nb_result.sentiment=='Neutral']
    print('NB imbalance (Positive rate):', Counter(nb_result_2.pred_sentiment)['Positive']/nb_result_2.shape[0]*100, '%')
    if t==0.19:
        print(confusion_matrix(nb_result_1.iloc[:,0],nb_result_1.iloc[:,1]))
    print('')

threshold:  0.0
NB Accuracy: 76.93 %
NB imbalance (Positive rate): 0.0 %

threshold:  0.01
NB Accuracy: 77.3 %
NB imbalance (Positive rate): 1.0922330097087378 %

threshold:  0.02
NB Accuracy: 77.66 %
NB imbalance (Positive rate): 2.1844660194174756 %

threshold:  0.03
NB Accuracy: 77.01 %
NB imbalance (Positive rate): 3.762135922330097 %

threshold:  0.04
NB Accuracy: 76.57 %
NB imbalance (Positive rate): 6.310679611650485 %

threshold:  0.05
NB Accuracy: 76.13 %
NB imbalance (Positive rate): 8.37378640776699 %

threshold:  0.06
NB Accuracy: 75.91 %
NB imbalance (Positive rate): 9.466019417475728 %

threshold:  0.07
NB Accuracy: 75.33 %
NB imbalance (Positive rate): 11.286407766990292 %

threshold:  0.08
NB Accuracy: 74.89 %
NB imbalance (Positive rate): 12.62135922330097 %

threshold:  0.09
NB Accuracy: 75.04 %
NB imbalance (Positive rate): 13.95631067961165 %

threshold:  0.1
NB Accuracy: 74.6 %
NB imbalance (Positive rate): 15.291262135922329 %

threshold:  0.11
NB Accuracy: 74.45 