In [2]:
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import nltk

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
# NOTE(review): nltk.tokenize.moses is not shipped with recent NLTK releases -
# confirm the NLTK version this notebook is pinned to.
from nltk.tokenize.moses import MosesDetokenizer

# Load both labelled tweet datasets from CSV files in the working directory.
GOPSentiDf, AirlineSentiDf = (
    pd.read_csv(csv_name)
    for csv_name in ('2016GOPPresDebSenti.csv', 'TweetsUSAirlineSenti.csv')
)

# Begin feature removal: per-column non-null counts show which columns are sparse.
GOPSentiDf.count()

id                           13871
candidate                    13775
candidate_confidence         13871
relevant_yn                  13871
relevant_yn_confidence       13871
sentiment                    13871
sentiment_confidence         13871
subject_matter               13545
subject_matter_confidence    13871
candidate_gold                  28
name                         13871
relevant_yn_gold                32
retweet_count                13871
sentiment_gold                  15
subject_matter_gold             18
text                         13871
tweet_coord                     21
tweet_created                13871
tweet_id                     13871
tweet_location                9959
user_timezone                 9468
dtype: int64

In [3]:
# Per-column non-null counts for the airline dataset (same check as for the GOP data).
AirlineSentiDf.count()

tweet_id                        14640
airline_sentiment               14640
airline_sentiment_confidence    14640
negativereason                   9178
negativereason_confidence       10522
airline                         14640
airline_sentiment_gold             40
name                            14640
negativereason_gold                32
retweet_count                   14640
text                            14640
tweet_coord                      1019
tweet_created                   14640
tweet_location                   9907
user_timezone                    9820
dtype: int64

In [4]:
# We want the sentiment analysis to be as general as possible: independent of user
# location, when they tweeted, the tweet's subject matter, how often their tweets
# get retweeted, and who the user is. Only the id, the sentiment label, its
# confidence and the tweet text are kept.
#
# The columns are passed through the `columns=` keyword: passing the axis
# positionally (`drop(name, 1)`) is rejected by pandas >= 2.0.
GOPSentiDf.drop(
    columns=[
        'user_timezone',
        'tweet_location',
        'tweet_id',
        'tweet_created',
        'tweet_coord',
        # subject matter gold is the specific topic of tweeted text, such as
        # Religion, Abortion, Immigration, FOX news, etc.
        'subject_matter_gold',
        # sentiment gold is repetitive of the sentiment column
        'sentiment_gold',
        'retweet_count',
        # relevant_yn_gold is repetitive of relevant
        'relevant_yn_gold',
        'name',
        # candidate_gold is repetitive of candidate
        'candidate_gold',
        'subject_matter_confidence',
        'subject_matter',
        'relevant_yn_confidence',
        'relevant_yn',
        'candidate_confidence',
        'candidate',
    ],
    inplace=True,
)

# We want to do the same for the tweets for the US Airlines.
AirlineSentiDf.drop(
    columns=[
        'user_timezone',
        'tweet_location',
        'tweet_created',
        'tweet_coord',
        'retweet_count',
        'negativereason_gold',
        'name',
        'airline_sentiment_gold',
        'airline',
        'negativereason_confidence',
        'negativereason',
    ],
    inplace=True,
)

print("removed unnecessary columns")

removed unnecessary columns


In [5]:
# Confirm which columns remain in the GOP frame after the drops above.
GOPSentiDf.count()

id                      13871
sentiment               13871
sentiment_confidence    13871
text                    13871
dtype: int64

In [6]:
# Confirm which columns remain in the airline frame after the drops above.
AirlineSentiDf.count()

tweet_id                        14640
airline_sentiment               14640
airline_sentiment_confidence    14640
text                            14640
dtype: int64

In [7]:
# Lists of emoticons (text smileys), not emojis.
# Each table is a ragged 2-D list whose row index encodes emoticon length:
# row k holds the emoticons that are k + 2 characters long.
# E.g. ':)' has 2 characters so it belongs in row 0, ':-D' goes in row 1, and so on.
# Every emoticon appears at most once per row, so iterating a row never counts
# the same emoticon twice.
PosEmoAr = [
    # 2 characters
    [':)', ':]', ':}', '=)', '=]', '=}', ':B', '=B', '<3', '^^', ':*', '=*', ';)', ';]',
     ';}', '=P', '=p', ':P', ':p', ':b', '=b'],
    # 3 characters
    [':o)', ':o]', ':o}', ':-]', ':-)', ':-}', '=^]', '=^)', '=^}', ':-D', ':-B',
     ':^D', ':^B', '=^B', '=^D', ':\')', ':\']', '=\'}', '^.^', '^-^', '^_^', ':-*',
     ':-p', ':-P', ':-b', ':^p', ':^P', ':^b', '\\o\\', '/o/', '=^p', '=^P', '=^b', '\\o/'],
]
NegEmoAr = [
    # 2 characters
    ['D:', 'D=', ':(', ':[', ':{', '=(', '=[', '={', '=\\', ':\\', '=/', ':/', '=$', 'Oo'],
    # 3 characters
    ['D-:', 'D^:', 'D^=', ':o(', ':o[', ':^(', ':^[', ':^{', '=^(', '=^{', '>=(', '>=[', '>={',
     ':-[', ':-(', '=^[', ':\'(', ':\'[', ':\'{', '=\'{', '=\'(', '=\'[', 'o.O', 'O_o', ':o{'],
    # 4 characters
    ['>:-{', '>:-[', '>:-(', '>=^[', '>=^(', '>=^{'],
    # 5 characters
    ['>:-=(', ':$:-{'],
]
NeuEmoAr = [
    # 2 characters
    [':|', '=|', '><', ':o', ':O', '=0', ':@', '=@', ':x', '=X', ':#', '=#'],
    # 3 characters
    [':-|', '>.<', '>_<', ':^o', ':^@', '-.-', '-_-', ':-x', ':-X', ':-@', ':-#', ':^x', ':^#'],
    # 4 characters
    ['-.-\'', '-_-\''],
]

print("created emoticons comparator")

created emoticons comparator


In [11]:
# Create features: number of negative and positive words and emoticons, number of emoticons.
# The tweet text is taken by position (column 3 of each trimmed frame); every
# counter starts as a list of zeros with one slot per tweet.
GOPTxtAr = GOPSentiDf.values[:, 3]
gop_tweet_total = len(GOPTxtAr)
GOPNumPosAr = [0] * gop_tweet_total
GOPNumNegAr = [0] * gop_tweet_total
GOPNumEmoAr = [0] * gop_tweet_total

AirTxtAr = AirlineSentiDf.values[:, 3]
air_tweet_total = len(AirTxtAr)
AirNumPosAr = [0] * air_tweet_total
AirNumNegAr = [0] * air_tweet_total
AirNumEmoAr = [0] * air_tweet_total

print("created new features")

created new features


In [None]:
# Remove all stopwords, hashtags, web links, retweet markers (RT), and direct @s.
stop_words = set(stopwords.words("english"))

# One regex pass strips, in order of the alternatives:
#   * hashtags and @mentions: from the '#'/'@' up to the next whitespace
#     (or end of text, so a trailing '#'/'@' cannot stall the cleaning);
#   * links: from 'http' up to the next whitespace, keeping any text after it;
#   * the retweet marker 'RT' only when it is a standalone word.
_TWEET_NOISE_RE = re.compile(r'[#@]\S*|http\S*|\bRT\b')


def clean_tweet(text, stop_words):
    """Return `text` without hashtags, mentions, links, 'RT' and stop words.

    Parameters
    ----------
    text : str
        Raw tweet text.
    stop_words : collection of str
        Lower-case stop words; tokens are compared case-insensitively.

    Returns
    -------
    str
        The kept tokens, each preceded by a single space (so a non-empty
        result starts with a space, as in the earlier version of this cell);
        '' when no token survives.
    """
    stripped = _TWEET_NOISE_RE.sub('', text)
    kept = [tok for tok in word_tokenize(stripped) if tok.lower() not in stop_words]
    return ''.join(' ' + tok for tok in kept)


# Clean each dataset with the same helper. The cleaned strings are kept in the
# shrtTxt* lists and also written back into the text arrays in place.
shrtTxtGOPAr = [clean_tweet(tweet, stop_words) for tweet in GOPTxtAr]
GOPTxtAr[:] = shrtTxtGOPAr

shrtTxtAirAr = [clean_tweet(tweet, stop_words) for tweet in AirTxtAr]
AirTxtAr[:] = shrtTxtAirAr