In [6]:
#import statements
import pandas as pd
import numpy as np
import os
import re
import time
# from langdetect import detect  
import ast
# from bs4 import BeautifulSoup
# import requests, json
from nltk import word_tokenize,pos_tag_sents,WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.corpus import wordnet
from collections import Counter
from textblob import TextBlob
import constants
import nltk
from polyglot.detect import Detector 
import cld2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
sia = SIA()

In [3]:
# Reads the emoticon look up table file
header = ['EmoticonSymbol','SentimentScore']
emoticon_data = pd.read_csv('EmoticonLookupTable.txt', delimiter='\t', encoding = 'ISO-8859-1',names=header)

In [4]:
#Writing emoticons to a dictionary
emoji_dict = emoticon_data.groupby('EmoticonSymbol')['SentimentScore'].apply(list).to_dict()

In [7]:
# Reading the slang dictionary that is already created
f = open("Slangdictionary.txt","r")
res1=f.read()
f.close()
slangdict = ast.literal_eval(res1)

In [8]:
#read the training data from the csv file
cleaneddata = pd.read_table('clean_data_Cldpoly.csv',
                    sep='|', 
                   # delimiter=',',
                    usecols=[0,1],
                    dtype={'label':int,'comment':str},
                    keep_default_na=False)

In [9]:
cleaneddata.head()

Unnamed: 0,label,comment
0,0,"I highly doubt this mostly ignored, surely uns..."
1,0,Holy shit they are dropping an Halloween surpr...
2,0,Chafetz is a known liar (see Personal Problem ...
3,0,Kansas Number 1 in imaginary Muslim terrorists...
4,1,wow it is totally unreasonable to assume that ...


In [10]:
def featureextraction(dataframe, field, func, column_names):
    return pd.concat((
        dataframe,
        dataframe[field].apply(
            lambda cell: pd.Series(func(cell), index=column_names))), axis=1)

In [11]:
## function to get list of emojis in a comment
def find_emoji(text):
    return list(x for x in text.split() if x in emoji_dict.keys() )

In [14]:
## function to get positive word count

def get_pos_neg_word_count(tokens):
    
    pos_word_count = 0
    neg_word_count = 0    
    pos_flag = False
    neg_flag = False
    flip_count = 0
        
    for word in tokens:
        senti = sia.polarity_scores(str(word)) 
        if senti["pos"] == 1:
            pos_word_count += 1
            pos_flag = True
            if neg_flag:
                flip_count += 1
                neg_flag =  False
                
        elif senti["neg"] == 1:
            neg_word_count += 1
            neg_flag = True
            if pos_flag:
                flip_count +=1
                pos_flag = False
    return pos_word_count, neg_word_count,flip_count
    
    

In [15]:
# Extracting the features for each comment 
# Punctuation Features and presence of sarcastic symbol
def allfeatures(user_comment):
    if '!' or '.' or '?' in user_comment:
        Numofexclaimations = user_comment.count('!')
        Numofdots = user_comment.count('.')
        Numofquestionmarks = user_comment.count('?')
    else:
        Numofexclaimations = 0
        Numofdots = 0
        Numofquestionmarks = 0
    if '(!)' in user_comment:
        SarcasticSymbol = 1
    else:
        SarcasticSymbol = 0
    sentiments = TextBlob(str(user_comment)).sentiment
    polarity = sentiments.polarity
    subjectivity = sentiments.subjectivity
    numofcapitals = sum(x.isupper() for x in user_comment.split() if len(x) > 1 )
    elist = find_emoji(user_comment)
    pscore =0
    nscore = 0
    for item in elist:
        if (emoji_dict[item][0] == 1):
            pscore += 1
        elif (emoji_dict[item][0] == -1):
            nscore += 1
    tokens = word_tokenize(user_comment)
    pos_words, neg_words, flip_count = get_pos_neg_word_count(tokens)
    
            
    return Numofexclaimations,Numofdots,Numofquestionmarks,SarcasticSymbol,polarity,subjectivity,numofcapitals,pscore,nscore,pos_words, neg_words, flip_count
                

In [16]:
featureddataset = featureextraction(cleaneddata, 'comment', allfeatures, ['Numofexclaimations', 'Numofdots','Numofquestionmarks','SarcasticSymbol','Polarity', 'Subjectivity','NumofCapitalWords','PositiveEmojiCount','NegativeEmojiCount','PosWords','NegWords','FlipCount'])

In [34]:
def get_parent_sentiment(comment):
    sentiments = TextBlob(str(comment)).sentiment
    polarity = sentiments.polarity
    if polarity >= 0.1:
        return 1
    elif polarity < -0.1:
        return -1
    else:
        return 0
sentiments = TextBlob(str("you are ")).sentiment

In [35]:
featureddataset['parent_sentiment'] = cleaneddata.parent_comment.apply(get_parent_sentiment)

In [18]:
start_time = time.time()
txt = cleaneddata['comment'].tolist()
#POS tagging for all the tokens in the sentence
tagged_texts = pos_tag_sents(map(word_tokenize, txt))
end_time = time.time()
cleaneddata['POS'] = tagged_texts
print("time taken ", end_time-start_time)

time taken  79.26964211463928


In [19]:

#helper function to collect number of interjection
def comment_interjection(user_comment):
    count = Counter(tag for word,tag in user_comment)
    return count['UH']

In [70]:
#get heighly emotional words (associated with POS tags)

def get_high_emotion_words(postags):
    highly_pos = 0
    highly_neg = 0
    POS_list = ['JJ','JJR','JJS', 'RB','RBR','RBS','VB','VBD','VBG','VBN','VBP','VBZ']
    num_tokens = len(postags)
    for i in range(num_tokens):
        if postags[i][1] in POS_list:            
            #check sentiment of next word
            if i < (num_tokens - 1) :
                senti_word = sia.polarity_scores(postags[i+1][0])
                if senti_word['pos'] == 1:
                    highly_pos += 1
                if senti_word['neg'] == 1:
                    highly_neg += 1
                
    return highly_pos, highly_neg        
    

In [71]:
featureddataset['highly_positive'],featureddataset['highly_negative'] = zip(*cleaneddata['POS'].map(get_high_emotion_words)) 

In [20]:
##feature extraction
# number of interjection
featureddataset['interjection']  = cleaneddata.POS.apply(comment_interjection)


In [72]:
featureddataset
        

Unnamed: 0,label,comment,Numofexclaimations,Numofdots,Numofquestionmarks,SarcasticSymbol,Polarity,Subjectivity,NumofCapitalWords,PositiveEmojiCount,NegativeEmojiCount,PosWords,NegWords,FlipCount,interjection,highly_positive,highly_negative
0,0,"I highly doubt this mostly ignored, surely uns...",0.0,1.0,0.0,0.0,0.041313,0.601616,0.0,0.0,0.0,2.0,5.0,4.0,0,0,4
1,0,Holy shit they are dropping an Halloween surpr...,0.0,1.0,0.0,0.0,-0.200000,0.800000,0.0,0.0,0.0,1.0,1.0,1.0,0,0,0
2,0,Chafetz is a known liar (see Personal Problem ...,0.0,0.0,1.0,0.0,0.000000,0.300000,0.0,0.0,0.0,0.0,2.0,0.0,0,0,1
3,0,Kansas Number 1 in imaginary Muslim terrorists...,0.0,0.0,0.0,0.0,0.200000,0.300000,0.0,0.0,0.0,2.0,2.0,3.0,0,0,1
4,1,wow it is totally unreasonable to assume that ...,0.0,0.0,0.0,0.0,-0.100000,0.783333,0.0,0.0,0.0,2.0,2.0,3.0,0,1,0
5,1,Ho ho ho... But Melania said that there is no ...,1.0,3.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0,0,1
6,0,Like this: \ /,0.0,0.0,0.0,0.0,-0.250000,1.000000,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0
7,0,Too bad all the women who don't have kids are ...,1.0,0.0,0.0,0.0,0.100000,0.722222,0.0,0.0,0.0,1.0,1.0,1.0,0,1,1
8,1,I can't wait until @potus starts a twitter war...,0.0,1.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0,0,1
9,0,"Uh, HRC DID toast Trump in the debates.",0.0,1.0,0.0,0.0,0.000000,0.000000,2.0,0.0,0.0,0.0,0.0,0.0,0,0,0


In [73]:
# Create a list of the feature column's names
features = featureddataset.columns[2:]

In [74]:
features

Index(['Numofexclaimations', 'Numofdots', 'Numofquestionmarks',
       'SarcasticSymbol', 'Polarity', 'Subjectivity', 'NumofCapitalWords',
       'PositiveEmojiCount', 'NegativeEmojiCount', 'PosWords', 'NegWords',
       'FlipCount', 'interjection', 'highly_positive', 'highly_negative'],
      dtype='object')

In [75]:

train, test =train_test_split(featureddataset, test_size=0.2) 

In [76]:
X_train = train[features]
y_train = train['label']
X_test = test[features]
y_test = test['label']

In [77]:
# Create a random forest Classifier. 
clf = RandomForestClassifier(n_estimators = 110,n_jobs=-1)#, random_state=0)

In [78]:
# Train the Classifier to take the training features 
%timeit
clf.fit(X_train, y_train )

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=110, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [79]:
# Apply the Classifier we trained to the test data
y_predit = clf.predict(X_test)

In [80]:
print(confusion_matrix(y_test,y_predit))  
print(classification_report(y_test,y_predit))  
print(accuracy_score(y_test,y_predit)) 

[[3129 1639]
 [2037 2789]]
              precision    recall  f1-score   support

           0       0.61      0.66      0.63      4768
           1       0.63      0.58      0.60      4826

   micro avg       0.62      0.62      0.62      9594
   macro avg       0.62      0.62      0.62      9594
weighted avg       0.62      0.62      0.62      9594

0.6168438607462998


In [81]:
print(clf.feature_importances_)

[0.06092406 0.07988101 0.02405276 0.         0.29124662 0.29749295
 0.03509814 0.0021044  0.00148277 0.04903525 0.04373443 0.01919574
 0.04542278 0.02576461 0.02456448]
