In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab 

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

#read csv into dataframe
GOPSentiDf = pd.read_csv('2016GOPPresDebSenti.csv')
AirlineSentiDf = pd.read_csv('TweetsUSAirlineSenti.csv')

#Begin feature removal
GOPSentiDf.count()

id                           13871
candidate                    13775
candidate_confidence         13871
relevant_yn                  13871
relevant_yn_confidence       13871
sentiment                    13871
sentiment_confidence         13871
subject_matter               13545
subject_matter_confidence    13871
candidate_gold                  28
name                         13871
relevant_yn_gold                32
retweet_count                13871
sentiment_gold                  15
subject_matter_gold             18
text                         13871
tweet_coord                     21
tweet_created                13871
tweet_id                     13871
tweet_location                9959
user_timezone                 9468
dtype: int64

In [2]:
AirlineSentiDf.count()

tweet_id                        14640
airline_sentiment               14640
airline_sentiment_confidence    14640
negativereason                   9178
negativereason_confidence       10522
airline                         14640
airline_sentiment_gold             40
name                            14640
negativereason_gold                32
retweet_count                   14640
text                            14640
tweet_coord                      1019
tweet_created                   14640
tweet_location                   9907
user_timezone                    9820
dtype: int64

In [3]:
#we want the sentiment analysis to be as general as possible, independent of user location, when they tweeted, 
#the tweet's subject matter, how often their tweets get retweeted, who is the user, 

GOPSentiDf.drop('user_timezone', 1, inplace = True)
GOPSentiDf.drop('tweet_location', 1, inplace = True)
GOPSentiDf.drop('tweet_id', 1, inplace = True)
GOPSentiDf.drop('tweet_created', 1, inplace = True)
GOPSentiDf.drop('tweet_coord', 1, inplace = True)
#subject matter gold is the specific topic of tweeted text, such as Religion, Abortion, Immigration, FOX news, etc.
GOPSentiDf.drop('subject_matter_gold', 1, inplace = True)
#sentiment gold is repetitive of the sentiment column
GOPSentiDf.drop('sentiment_gold', 1, inplace = True)
GOPSentiDf.drop('retweet_count', 1, inplace = True)
#relevant_yn_gold is repetitive of relevant
GOPSentiDf.drop('relevant_yn_gold', 1, inplace = True)
GOPSentiDf.drop('name', 1, inplace = True)
#candidate_gold is repetitive of candidate
GOPSentiDf.drop('candidate_gold', 1, inplace = True)
GOPSentiDf.drop('subject_matter_confidence', 1, inplace = True)
GOPSentiDf.drop('subject_matter', 1, inplace = True)
GOPSentiDf.drop('relevant_yn_confidence', 1, inplace = True)
GOPSentiDf.drop('relevant_yn', 1, inplace = True)
GOPSentiDf.drop('candidate_confidence', 1, inplace = True)
GOPSentiDf.drop('candidate', 1, inplace = True)

#we want to do the same for the tweets for the US Airlines
AirlineSentiDf.drop('user_timezone', 1, inplace = True)
AirlineSentiDf.drop('tweet_location', 1, inplace = True)
AirlineSentiDf.drop('tweet_created', 1, inplace = True)
AirlineSentiDf.drop('tweet_coord', 1, inplace = True)
AirlineSentiDf.drop('retweet_count', 1, inplace = True)
AirlineSentiDf.drop('negativereason_gold', 1, inplace = True)
AirlineSentiDf.drop('name', 1, inplace = True)
AirlineSentiDf.drop('airline_sentiment_gold', 1, inplace = True)
AirlineSentiDf.drop('airline', 1, inplace = True)
AirlineSentiDf.drop('negativereason_confidence', 1, inplace = True)
AirlineSentiDf.drop('negativereason', 1, inplace = True)

In [4]:
GOPSentiDf.count()

id                      13871
sentiment               13871
sentiment_confidence    13871
text                    13871
dtype: int64

In [5]:
AirlineSentiDf.count()

tweet_id                        14640
airline_sentiment               14640
airline_sentiment_confidence    14640
text                            14640
dtype: int64

In [6]:
#list of emotions, not emojis
#create 2d array with 50 columns and 3 rows, where the row determines the number of character per emoji
#i.e., :) contains 2 characters so it belongs it row 0, while :-D goes to row 1, and so on
PosEmoAr = [[':)', ':]', ':}', '=)', '=]', '=}', ':B', '=B', '<3', '^^', ':*', '=*', ';)', ';]', 
             ';}', '=P', '=p', ':P', ':p', ':b', '=b'], 
            [':o)', ':o]', ':o}', ':-]', ':-)', ':-}', '=^]', '=^)', '=^}', ':-D', ':-B', 
             ':^D', ':^B', '=^B', '=^D', ':\')', ':\']', '=\'}', '^.^', '^-^', '^_^', ':-*', 
             ':-p', ':-P', ':-b', ':^p', ':^P', ':^b', '\\o\\', '/o/', '=^p', '=^P', '=^b', '\\o/']]
NegEmoAr = [['D:', 'D=', ':(', ':[', ':{', '=(', '=[', '={', '=\\', ':\\', '=/', ':/', '=$', 'Oo'], 
            ['D-:', 'D^:', 'D^=', ':o(', ':o[', ':^(', ':^[', ':^{', '=^(', '=^{', '>=(', '>=[', '>={', 
             ':-[', ':-(', '=^[', '>=[', ':\'(', ':\'[', ':\'{', '=\'{', '=\'(', '=\'[', 'o.O', 'O_o', ':o{'],
            ['>:-{', '>:-[', '>:-(', '>=^[', '>=^(', '>:-{', '>=^{'],
            ['>:-=(', ':$:-{']]
NeuEmoAr = [[':|', '=|', '><', ':o', ':O', '=0', ':@', '=@', ':x', '=X', ':#', '=#'], 
            [':-|', '>.<', '>_<', ':^o', ':^@', '-.-', '-_-', ':-x', ':-X', ':-@', ':-#', ':^x', ':^#'], 
            ['-.-\'', '-_-\'']]

In [7]:
#create features: number of negative and positive words and emoticons, number of emoticons
GOPTxtAr = GOPSentiDf.values[0:,][:,3]
GOPNumPosAr = []
GOPNumNegAr = []
GOPNumEmoAr = []
for i in range(len(GOPTxtAr)):
    GOPNumPosAr.append(0)
    GOPNumNegAr.append(0)
    GOPNumEmoAr.append(0)
    
AirTxtAr = AirlineSentiDf.values[0:,][:,3]
AirNumPosAr = []
AirNumNegAr = []
AirNumEmoAr = []

for i in range(len(AirTxtAr)):
    AirNumPosAr.append(0)
    AirNumNegAr.append(0)
    AirNumEmoAr.append(0)
    
index = 0
for text in GOPTxtAr:
    #remove indefinite and definite articles
    GOPTxtAr[index] = str(GOPTxtAr[index]).replace('The ', '')
    GOPTxtAr[index] = str(GOPTxtAr[index]).replace('the ', '')
    GOPTxtAr[index] = str(GOPTxtAr[index]).replace('An ', '')
    GOPTxtAr[index] = str(GOPTxtAr[index]).replace('an ', '')
    GOPTxtAr[index] = str(GOPTxtAr[index]).replace('A ', '')
    GOPTxtAr[index] = str(GOPTxtAr[index]).replace('a ', '')
    index += 1

index = 0
for text in AirTxtAr:
    AirTxtAr[index] = str(AirTxtAr[index]).replace('The ', '')
    AirTxtAr[index] = str(AirTxtAr[index]).replace('the ', '')
    AirTxtAr[index] = str(AirTxtAr[index]).replace('An ', '')
    AirTxtAr[index] = str(AirTxtAr[index]).replace('an ', '')
    AirTxtAr[index] = str(AirTxtAr[index]).replace('A ', '')
    AirTxtAr[index] = str(AirTxtAr[index]).replace('a ', '')
    index += 1

In [11]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.moses import MosesDetokenizer

updatedGOP = []
x = 0
for text in GOPTxtAr:
    ex_s = GOPTxtAr[x]
    words = word_tokenize(ex_s)
    sw = set(stopwords.words("english"))
    filtered_sentence = []

    for w in words:
        if w not in sw:
            filtered_sentence.append(w)
    
    y = 0
    for text in filtered_sentence:
        if filtered_sentence[y] == '#':
            filtered_sentence[y] = ' '
            filtered_sentence[y] = ' '
        if filtered_sentence[y] == 'https':
            filtered_sentence[y] = ' '
            filtered_sentence[y+1]= ' '
            filtered_sentence[y+2] = ' '  
        if filtered_sentence[y] == 'http':
            filtered_sentence[y] = ' '
            filtered_sentence[y+1]= ' '
            filtered_sentence[y+2] = ' '  
        y += 1
    detokenizer = MosesDetokenizer()
    updatedGOP.append(detokenizer.detokenize(filtered_sentence, return_str=True))
    x += 1

print(updatedGOP)





In [12]:
updatedAir = []
x = 0
for text in AirTxtAr:
    ex_s = AIrTxtAr[x]
    words = word_tokenize(ex_s)
    sw = set(stopwords.words("english"))
    filtered_sentence = []

    for w in words:
        if w not in sw:
            filtered_sentence.append(w)
    
    y = 0
    for text in filtered_sentence:
        if filtered_sentence[y] == '#':
            filtered_sentence[y] = ' '
            filtered_sentence[y] = ' '
        if filtered_sentence[y] == 'https':
            filtered_sentence[y] = ' '
            filtered_sentence[y+1]= ' '
            filtered_sentence[y+2] = ' '  
        if filtered_sentence[y] == 'http':
            filtered_sentence[y] = ' '
            filtered_sentence[y+1]= ' '
            filtered_sentence[y+2] = ' '  
        y += 1
    detokenizer = MosesDetokenizer()
    updatedGOP.append(detokenizer.detokenize(filtered_sentence, return_str=True))
    x += 1

print(updatedGOP)

IndexError: list assignment index out of range