In [1]:
import pandas as pd
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split #holdout method
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

#read csv into dataframe
GOPSentiDf = pd.read_csv('2016GOPPresDebSenti.csv')
AirlineSentiDf = pd.read_csv('TweetsUSAirlineSenti.csv')

#Begin feature removal
GOPSentiDf.count()

id                           13871
candidate                    13775
candidate_confidence         13871
relevant_yn                  13871
relevant_yn_confidence       13871
sentiment                    13871
sentiment_confidence         13871
subject_matter               13545
subject_matter_confidence    13871
candidate_gold                  28
name                         13871
relevant_yn_gold                32
retweet_count                13871
sentiment_gold                  15
subject_matter_gold             18
text                         13871
tweet_coord                     21
tweet_created                13871
tweet_id                     13871
tweet_location                9959
user_timezone                 9468
dtype: int64

In [2]:
AirlineSentiDf.count()

tweet_id                        14640
airline_sentiment               14640
airline_sentiment_confidence    14640
negativereason                   9178
negativereason_confidence       10522
airline                         14640
airline_sentiment_gold             40
name                            14640
negativereason_gold                32
retweet_count                   14640
text                            14640
tweet_coord                      1019
tweet_created                   14640
tweet_location                   9907
user_timezone                    9820
dtype: int64

In [3]:
#we want the sentiment analysis to be as general as possible, independent of user location, when they tweeted, 
#the tweet's subject matter, how often their tweets get retweeted, who is the user, 

GOPSentiDf.drop('user_timezone', 1, inplace = True)
GOPSentiDf.drop('tweet_location', 1, inplace = True)
GOPSentiDf.drop('tweet_id', 1, inplace = True)
GOPSentiDf.drop('tweet_created', 1, inplace = True)
GOPSentiDf.drop('tweet_coord', 1, inplace = True)
#subject matter gold is the specific topic of tweeted text, such as Religion, Abortion, Immigration, FOX news, etc.
GOPSentiDf.drop('subject_matter_gold', 1, inplace = True)
#sentiment gold is repetitive of the sentiment column
GOPSentiDf.drop('sentiment_gold', 1, inplace = True)
GOPSentiDf.drop('retweet_count', 1, inplace = True)
#relevant_yn_gold is repetitive of relevant
GOPSentiDf.drop('relevant_yn_gold', 1, inplace = True)
GOPSentiDf.drop('name', 1, inplace = True)
#candidate_gold is repetitive of candidate
GOPSentiDf.drop('candidate_gold', 1, inplace = True)
GOPSentiDf.drop('subject_matter_confidence', 1, inplace = True)
GOPSentiDf.drop('subject_matter', 1, inplace = True)
GOPSentiDf.drop('relevant_yn_confidence', 1, inplace = True)
GOPSentiDf.drop('relevant_yn', 1, inplace = True)
GOPSentiDf.drop('candidate_confidence', 1, inplace = True)
GOPSentiDf.drop('candidate', 1, inplace = True)

#we want to do the same for the tweets for the US Airlines
AirlineSentiDf.drop('user_timezone', 1, inplace = True)
AirlineSentiDf.drop('tweet_location', 1, inplace = True)
AirlineSentiDf.drop('tweet_created', 1, inplace = True)
AirlineSentiDf.drop('tweet_coord', 1, inplace = True)
AirlineSentiDf.drop('retweet_count', 1, inplace = True)
AirlineSentiDf.drop('negativereason_gold', 1, inplace = True)
AirlineSentiDf.drop('name', 1, inplace = True)
AirlineSentiDf.drop('airline_sentiment_gold', 1, inplace = True)
AirlineSentiDf.drop('airline', 1, inplace = True)
AirlineSentiDf.drop('negativereason_confidence', 1, inplace = True)
AirlineSentiDf.drop('negativereason', 1, inplace = True)

print("removed unnecessary columns")

removed unnecessary columns


In [4]:
GOPSentiDf.count()

id                      13871
sentiment               13871
sentiment_confidence    13871
text                    13871
dtype: int64

In [5]:
AirlineSentiDf.count()

tweet_id                        14640
airline_sentiment               14640
airline_sentiment_confidence    14640
text                            14640
dtype: int64

In [6]:
#combine GOPSentiDf and AirlineSentiDf
GOPIdAr = GOPSentiDf.values[0:,][:,0]
GOPSentiAr = GOPSentiDf.values[0:,][:,1]
GOPSentiConAr = GOPSentiDf.values[0:,][:,2]
GOPTxtAr= GOPSentiDf.values[0:,][:,3]

AirIdAr = AirlineSentiDf.values[0:,][:,0]
AirSentiAr = AirlineSentiDf.values[0:,][:,1]
AirSentiConAr = AirlineSentiDf.values[0:,][:,2]
AirTxtAr = AirlineSentiDf.values[0:,][:,3]

IdAr = []
SentiAr = []
SentiConAr = []
TxtAr = []

#for i in range(len(GOPIdAr)):
#    IdAr.append(GOPIdAr[i])
#    SentiAr.append(GOPSentiAr[i])
#    SentiConAr.append(GOPSentiConAr[i])
#    TxtAr.append(GOPTxtAr[i])

for i in range(len(AirIdAr)):
    IdAr.append(AirIdAr[i])
    SentiAr.append(AirSentiAr[i])
    SentiConAr.append(AirSentiConAr[i])
    TxtAr.append(AirTxtAr[i])

print("combined the 2 dataframes")

combined the 2 dataframes


In [7]:
#remove all stopwords, hashtags, web links, retweets (RT), direct @s, and symbols
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

shrtTxtAr = []
index = 0
for i in range(len(TxtAr)):
    shrtTxtAr.append('')
for text in TxtAr:
    TxtAr[index] = TxtAr[index].replace('RT ', '')
    while (TxtAr[index].find('#') != -1):
        TxtAr[index] = TxtAr[index].replace('#', '')
    while (TxtAr[index].find('@') != -1):
        TxtAr[index] = TxtAr[index].replace('@', '')
    while (TxtAr[index].find('http') != -1):
        cnt = TxtAr[index].find('http')
        TxtAr[index] = TxtAr[index].replace(TxtAr[index][cnt:], '')
    text = TxtAr[index]
    words = word_tokenize(text)
    for w in words:
        if (w.isalpha() and w not in stop_words):
            #stem all the words for easier classification later
            w = stemmer.stem(w)
            shrtTxtAr[index] = shrtTxtAr[index] + ' ' + w
    TxtAr[index] = shrtTxtAr[index]
    index += 1
print("cleaned the text and updated both text columns")

cleaned the text and updated both text columns


In [8]:
#collect and categorize all words as whether they're positive, negative, or neutral according to how they were classified
posWordsFreq = []
negWordsFreq = []
neuWordsFreq = []

for i in range(len(SentiAr)):
    words = word_tokenize(TxtAr[i])
    if (SentiAr[i].lower() == 'positive'):
        for w in range(len(words)):
            posWordsFreq.append(words[w])
    elif (SentiAr[i].lower() == 'negative'):
        for w in range(len(words)):
            negWordsFreq.append(words[w])
    else:
        for w in range(len(words)):
            neuWordsFreq.append(words[w])

posWordsFreq = nltk.FreqDist(posWordsFreq)
negWordsFreq = nltk.FreqDist(negWordsFreq)
neuWordsFreq = nltk.FreqDist(neuWordsFreq)

posWordsKeysAr = list(posWordsFreq.keys())
posWordsValuesAr = list(posWordsFreq.values())
negWordsKeysAr = list(negWordsFreq.keys())
negWordsValuesAr = list(negWordsFreq.values())
neuWordsKeysAr = list(neuWordsFreq.keys())
neuWordsValuesAr = list(neuWordsFreq.values())

print("created positive, negative, and neutral arrays of words")

created positive, negative, and neutral arrays of words


In [9]:
#create features: number of negative and positive words and emoticons, number of emoticons
NumPosAr = []
NumNegAr = []
NumNeuAr = []

#determine if a word is more positive, negative, or neutral and store at one of three previously instantiated arrays
for texts in TxtAr:
    numPos = 0
    numNeg = 0
    numNeu = 0
    words = word_tokenize(texts)
    for w in words:
        posVal = 0
        negVal = 0
        neuVal = 0
        if w in posWordsKeysAr:
            posVal = posWordsValuesAr[posWordsKeysAr.index(w)]
        if w in negWordsKeysAr:
            negVal = negWordsValuesAr[negWordsKeysAr.index(w)]
        if w in neuWordsKeysAr:
            neuVal = neuWordsValuesAr[neuWordsKeysAr.index(w)]
        if posVal == max(posVal, negVal, neuVal):
            numPos += 1
        if negVal == max(posVal, negVal, neuVal):
            numNeg += 1
        if neuVal == max(posVal, negVal, neuVal):
            numNeu += 1
    NumPosAr.append(numPos)
    NumNegAr.append(numNeg)
    NumNeuAr.append(numNeu)

print("counted all positive, negative, and neutral words per text line")

counted all positive, negative, and neutral words per text line


In [10]:
#combine all the arrays into a dataframe
df = pd.DataFrame({'ID':IdAr, 'Text':TxtAr, '#Positive Words':NumPosAr, '#Negative Words':NumNegAr, '#Neutral Words':NumNeuAr,
                  'Sentiment Confidence':SentiConAr, 'Sentiment':SentiAr})
df = df[['ID', 'Text', '#Positive Words', '#Negative Words', '#Neutral Words', 'Sentiment Confidence', 'Sentiment']]

print("Recombined to new dataframe")

Recombined to new dataframe


In [11]:
df.count()

ID                      14640
Text                    14640
#Positive Words         14640
#Negative Words         14640
#Neutral Words          14640
Sentiment Confidence    14640
Sentiment               14640
dtype: int64

In [12]:
#classify using Naive Bayes, SVM, and maximum entropy with k-fold
X = df.values[0:,[2, 3, 4]]
Y = df.values[0:,][:,6]

seed = 11

kfold = model_selection.KFold(n_splits = 10, random_state = seed)

clfNB = MultinomialNB(alpha = 1) #alpha for Laplacian correction
clfSVM = LinearSVC(random_state = seed)
clfLogReg = LogisticRegression(random_state = seed)
eclf = VotingClassifier(estimators=[('MultiNm', clfNB), ('LinSVC', clfSVM), ('lr', clfLogReg)], voting = 'hard') #hard voting for majority voting

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = seed)
clfNB.fit(X_train, Y_train)
clfSVM.fit(X_train, Y_train)
clfLogReg.fit(X_train, Y_train)
results = model_selection.cross_val_score(eclf, X, Y, cv = kfold)

predNB = clfNB.predict(X_test)
predSVM = clfSVM.predict(X_test)
predLogReg = clfLogReg.predict(X_test)

print("Trained Multinomial Naive Bayes, Linear SVM, and Logarithmic Regression models")

Trained Multinomial Naive Bayes, Linear SVM, and Logarithmic Regression models


In [13]:
print("Multinomial Naive Bayes:" + str(accuracy_score(Y_test, predNB)))
print("Linear SVM:" + str(accuracy_score(Y_test, predSVM)))
print("Logarithmic Regression:" + str(accuracy_score(Y_test, predLogReg)))
print("Ensemble:" + str(results.mean()))

Multinomial Naive Bayes:0.815573770492
Linear SVM:0.819672131148
Logarithmic Regression:0.822131147541
Ensemble:0.814071038251


In [14]:
#list of emotions, not emojis
#create 2d array with 50 columns and 3 rows, where the row determines the number of character per emoji
#i.e., :) contains 2 characters so it belongs it row 0, while :-D goes to row 1, and so on
PosEmoAr = [[':)', ':]', ':}', '=)', '=]', '=}', ':B', '=B', '<3', '^^', ':*', '=*', ';)', ';]', 
             ';}', '=P', '=p', ':P', ':p', ':b', '=b'], 
            [':o)', ':o]', ':o}', ':-]', ':-)', ':-}', '=^]', '=^)', '=^}', ':-D', ':-B', 
             ':^D', ':^B', '=^B', '=^D', ':\')', ':\']', '=\'}', '^.^', '^-^', '^_^', ':-*', 
             ':-p', ':-P', ':-b', ':^p', ':^P', ':^b', '\\o\\', '/o/', '=^p', '=^P', '=^b', '\\o/']]
NegEmoAr = [['D:', 'D=', ':(', ':[', ':{', '=(', '=[', '={', '=\\', ':\\', '=/', ':/', '=$', 'Oo'], 
            ['D-:', 'D^:', 'D^=', ':o(', ':o[', ':^(', ':^[', ':^{', '=^(', '=^{', '>=(', '>=[', '>={', 
             ':-[', ':-(', '=^[', '>=[', ':\'(', ':\'[', ':\'{', '=\'{', '=\'(', '=\'[', 'o.O', 'O_o', ':o{'],
            ['>:-{', '>:-[', '>:-(', '>=^[', '>=^(', '>:-{', '>=^{'],
            ['>:-=(', ':$:-{']]
NeuEmoAr = [[':|', '=|', '><', ':o', ':O', '=0', ':@', '=@', ':x', '=X', ':#', '=#'], 
            [':-|', '>.<', '>_<', ':^o', ':^@', '-.-', '-_-', ':-x', ':-X', ':-@', ':-#', ':^x', ':^#'], 
            ['-.-\'', '-_-\'']]

print("created emoticons comparator")

created emoticons comparator


In [15]:
#need to define a function that breaks down a text into several features: text, number of positive, negative, and neutral things,
#sentiment confidence. Then classifier classifies it to negative, positive, or neutral