In [None]:
## Script used for project presentation demo


In [1]:
#import statements
import pandas as pd
import numpy as np
import os
import re
import time 
import ast
from nltk import word_tokenize,pos_tag_sents,WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.corpus import wordnet
from textblob import TextBlob
import constants
from collections import Counter
import nltk
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.externals import joblib

  from numpy.core.umath_tests import inner1d


In [2]:
# Shared VADER SentimentIntensityAnalyzer instance; the word-level helpers
# (get_pos_neg_word_count, get_high_emotion_words) call sia.polarity_scores().
sia = SIA()

In [6]:
# Load the saved Random Forest model from disk; this cell only loads it,
# the prediction itself is done in the Random Forest cell.
# NOTE(review): joblib is already imported in the first cell, so this re-import is redundant.
# NOTE(review): sklearn.externals.joblib appears to be deprecated in newer scikit-learn
# releases in favour of the standalone `joblib` package - confirm against the pinned version.
# NOTE(review): joblib.load presumably unpickles the file; only load model files from a trusted source.
from sklearn.externals import joblib
loaded_model = joblib.load('RF_oldfeatures.sav')



In [7]:
# Read the demo test data from the pipe-delimited csv files.
# Demo_rf.csv and Demo_CNN.csv each contain 10 rows picked randomly from the
# clean_test_balanced file; the saved models give 70% accuracy on them.

def _read_demo_file(path):
    """Load the first three columns of a '|'-separated demo file.

    keep_default_na=False stops pandas from turning its default NA marker
    strings into NaN, so every cell is kept as read.
    """
    return pd.read_table(path, sep='|', usecols=[0, 1, 2], keep_default_na=False)


test_data_RF = _read_demo_file('Demo_rf.csv')
test_data_CNN = _read_demo_file('Demo_CNN.csv')


In [9]:
# Display the Random Forest demo rows.
# NOTE(review): the saved output of this cell already shows a POS column, which is
# only added by the feature-extraction cell - the cells were executed out of order.
test_data_RF

Unnamed: 0,label,comment,parent_comment,POS
0,0,I used to hold my tek-dek against the window,Skateboard for me. Honestly didn't realize so ...,"[(I, PRP), (used, VBD), (to, TO), (hold, VB), ..."
1,0,"They tried to email Matt Daemon, but it bounce...",Or maybe even that they found Matt Damon.,"[(They, PRP), (tried, VBD), (to, TO), (email, ..."
2,1,"Yeah, 43% winrate as ADC and 39% winrate as mi...",Let's take a moment and appreciate how balance...,"[(Yeah, UH), (,, ,), (43, CD), (%, NN), (winra..."
3,1,"Bro, Bitcoin is the currency of the future!",I take my money in currency please.,"[(Bro, NNP), (,, ,), (Bitcoin, NNP), (is, VBZ)..."
4,1,You're Northernlion's alt account aren't you?,Broken stopwatch idea This may or may not have...,"[(You, PRP), ('re, VBP), (Northernlion, NNP), ..."
5,1,Here's a little secret on how to make LOTS AND...,You can pay people money with Facebook. I have...,"[(Here, RB), ('s, VBZ), (a, DT), (little, JJ),..."
6,1,"The police did a great job, they saved the ban...",California police killed hostage in July bank ...,"[(The, DT), (police, NN), (did, VBD), (a, DT),..."
7,0,There's a difference between believing in God'...,"We can be welcoming, and we should. Hospitalit...","[(There, EX), ('s, VBZ), (a, DT), (difference,..."
8,1,Finally!,4K Video camera finally packed in a phone,"[(Finally, RB), (!, .)]"
9,1,I'll have to make a note to ask her about how ...,Who wants to be the person to ask her about th...,"[(I, PRP), ('ll, MD), (have, VB), (to, TO), (m..."


In [8]:
## Random Forest Model

# Read the emoticon look-up table (tab-separated, no header row in the file)
header = ['EmoticonSymbol','SentimentScore']
emoticon_data = pd.read_csv('EmoticonLookupTable.txt', delimiter='\t', encoding = 'ISO-8859-1',names=header)

# Map every emoticon symbol to the list of sentiment scores recorded for it
emoji_dict = emoticon_data.groupby('EmoticonSymbol')['SentimentScore'].apply(list).to_dict()

# Read the slang dictionary that is already created.
# The context manager closes the file even if read() raises.
with open("Slangdictionary.txt","r") as f:
    res1 = f.read()
# literal_eval only parses Python literals, so the file content is never executed
slangdict = ast.literal_eval(res1)

# POS-tag every comment and store the tags next to the comment.
# NOTE: this adds a "POS" column to test_data_RF in place.
text = test_data_RF['comment']
pos_tagged_texts = pos_tag_sents(map(word_tokenize, text))
test_data_RF["POS"] = pos_tagged_texts

## function to count positive words, negative words and polarity flips in a list of tokens
def get_pos_neg_word_count(tokens):
    """Count positive tokens, negative tokens and polarity flips.

    A token is positive when its VADER "pos" score equals 1 and negative when
    its "neg" score equals 1; every other token is ignored. A flip is counted
    each time a positive/negative token follows one of the opposite polarity
    (ignored tokens in between do not reset this).

    Returns a tuple (pos_word_count, neg_word_count, flip_count).
    """
    tally = {"pos": 0, "neg": 0}
    flips = 0
    previous = None  # polarity of the most recent positive/negative token
    for token in tokens:
        scores = sia.polarity_scores(str(token))
        if scores["pos"] == 1:
            current = "pos"
        elif scores["neg"] == 1:
            current = "neg"
        else:
            continue
        tally[current] += 1
        if previous is not None and previous != current:
            flips += 1
        previous = current
    return tally["pos"], tally["neg"], flips


## function to get list of emojis in a comment
def find_emoji(text):
    """Return the whitespace-separated tokens of text that are keys of emoji_dict, in order."""
    return [token for token in text.split() if token in emoji_dict]
  
#helper function to count the number of interjections (POS tag 'UH') in a tagged comment
def comment_interjection(user_comment):
    """Return how many (word, tag) pairs in user_comment carry the 'UH' tag."""
    tag_counts = Counter()
    for _word, tag in user_comment:
        tag_counts[tag] += 1
    return tag_counts['UH']

#get highly emotional words (associated with POS tags)
def get_high_emotion_words(postags):
    """Count strongly positive/negative words that follow an adjective, adverb or verb.

    postags is a sequence of (word, tag) pairs. For every pair whose tag is an
    adjective, adverb or verb tag, the word of the NEXT pair is scored with
    VADER; it counts as highly positive when its "pos" score is 1 and highly
    negative when its "neg" score is 1. The last pair has no successor and
    therefore never contributes.

    Returns a tuple (highly_pos, highly_neg).
    """
    POS_list = ['JJ','JJR','JJS', 'RB','RBR','RBS','VB','VBD','VBG','VBN','VBP','VBZ']
    last_index = len(postags) - 1
    strong_positive = 0
    strong_negative = 0
    for position, pair in enumerate(postags):
        if pair[1] not in POS_list:
            continue
        if position >= last_index:
            # no following word to score
            continue
        following_scores = sia.polarity_scores(postags[position + 1][0])
        if following_scores['pos'] == 1:
            strong_positive += 1
        if following_scores['neg'] == 1:
            strong_negative += 1
    return strong_positive, strong_negative
    
# get the parent sentiment
def get_parent_sentiment(comment):
    """Bucket the TextBlob polarity of comment into -1, 0 or 1.

    Returns 1 when polarity >= 0.1, -1 when polarity < -0.1, and 0 otherwise
    (so exactly -0.1 maps to 0).
    """
    polarity = TextBlob(str(comment)).sentiment.polarity
    if polarity < -0.1:
        return -1
    return 1 if polarity >= 0.1 else 0

# Concatenate the extracted features onto the original dataframe
def featureextraction(dataframe, field, func, column_names):
    """Append per-row features computed from one column.

    func is applied to every cell of dataframe[field]; its result is turned
    into a Series labelled with column_names. The resulting feature columns
    are concatenated to the right of dataframe and the combined frame is
    returned (dataframe itself is not modified).
    """
    def _cell_to_features(cell):
        return pd.Series(func(cell), index=column_names)

    extracted = dataframe[field].apply(_cell_to_features)
    return pd.concat((dataframe, extracted), axis=1)


# Extracting the features for each comment 
# Punctuation Features and presence of sarcastic symbol
def allfeatures(user_comment):
    """Extract punctuation, sentiment, capitalisation and emoji features from a comment.

    Parameters
    ----------
    user_comment : str
        Raw comment text.

    Returns
    -------
    tuple
        (Numofexclaimations, Numofdots, Numofquestionmarks, SarcasticSymbol,
        polarity, subjectivity, numofcapitals, pscore, nscore) where
        SarcasticSymbol is 1 if '(!)' occurs in the comment, polarity and
        subjectivity come from TextBlob, numofcapitals counts all-uppercase
        tokens longer than one character, and pscore/nscore count emoticons
        whose first score in emoji_dict is 1 / -1.
    """
    # The original guard `'!' or '.' or '?' in user_comment` was always true
    # (a non-empty string literal is truthy), so its else branch never ran.
    # str.count already yields 0 for absent characters, so no guard is needed.
    Numofexclaimations = user_comment.count('!')
    Numofdots = user_comment.count('.')
    Numofquestionmarks = user_comment.count('?')

    SarcasticSymbol = 1 if '(!)' in user_comment else 0

    sentiments = TextBlob(str(user_comment)).sentiment
    polarity = sentiments.polarity
    subjectivity = sentiments.subjectivity

    # Single-character tokens such as "I" or "A" are not treated as shouting
    numofcapitals = sum(1 for word in user_comment.split() if len(word) > 1 and word.isupper())

    pscore = 0
    nscore = 0
    for item in find_emoji(user_comment):
        # only the first score recorded for an emoticon is considered
        first_score = emoji_dict[item][0]
        if first_score == 1:
            pscore += 1
        elif first_score == -1:
            nscore += 1

    return Numofexclaimations,Numofdots,Numofquestionmarks,SarcasticSymbol,polarity,subjectivity,numofcapitals,pscore,nscore
                
# Names for the values returned by allfeatures, in the order of its return tuple
comment_feature_names = [
    'Numofexclaimations', 'Numofdots', 'Numofquestionmarks', 'SarcasticSymbol',
    'Polarity', 'Subjectivity', 'NumofCapitalWords',
    'PositiveEmojiCount', 'NegativeEmojiCount',
]

# Per-comment text features, plus the interjection count taken from the POS tags
input_features = featureextraction(test_data_RF, 'comment', allfeatures, comment_feature_names)
input_features['interjection'] = test_data_RF['POS'].apply(comment_interjection)

# Complete list of feature columns passed to the model
features = comment_feature_names + ['interjection']

X_test = input_features[features]
y_test = input_features['label']

# Predict with the loaded model and score it on the same rows
y_predict = loaded_model.predict(X_test)
result = loaded_model.score(X_test, y_test)
num_comments = len(input_features['comment'])
for actual, predicted in zip(y_test, y_predict):
    print("Test label= %s, Predicted= %s" % (actual, predicted))

print("\nWith Random Forest Accuracy is ", result*100)

Test label= 0, Predicted= 1
Test label= 0, Predicted= 0
Test label= 1, Predicted= 1
Test label= 1, Predicted= 1
Test label= 1, Predicted= 0
Test label= 1, Predicted= 1
Test label= 1, Predicted= 1
Test label= 0, Predicted= 0
Test label= 1, Predicted= 1
Test label= 1, Predicted= 0

With Random Forest Accuracy is  70.0


In [10]:
# Display the Random Forest demo rows again; the frame now carries the POS column
# that the feature-extraction cell added in place.
test_data_RF


Unnamed: 0,label,comment,parent_comment,POS
0,0,I used to hold my tek-dek against the window,Skateboard for me. Honestly didn't realize so ...,"[(I, PRP), (used, VBD), (to, TO), (hold, VB), ..."
1,0,"They tried to email Matt Daemon, but it bounce...",Or maybe even that they found Matt Damon.,"[(They, PRP), (tried, VBD), (to, TO), (email, ..."
2,1,"Yeah, 43% winrate as ADC and 39% winrate as mi...",Let's take a moment and appreciate how balance...,"[(Yeah, UH), (,, ,), (43, CD), (%, NN), (winra..."
3,1,"Bro, Bitcoin is the currency of the future!",I take my money in currency please.,"[(Bro, NNP), (,, ,), (Bitcoin, NNP), (is, VBZ)..."
4,1,You're Northernlion's alt account aren't you?,Broken stopwatch idea This may or may not have...,"[(You, PRP), ('re, VBP), (Northernlion, NNP), ..."
5,1,Here's a little secret on how to make LOTS AND...,You can pay people money with Facebook. I have...,"[(Here, RB), ('s, VBZ), (a, DT), (little, JJ),..."
6,1,"The police did a great job, they saved the ban...",California police killed hostage in July bank ...,"[(The, DT), (police, NN), (did, VBD), (a, DT),..."
7,0,There's a difference between believing in God'...,"We can be welcoming, and we should. Hospitalit...","[(There, EX), ('s, VBZ), (a, DT), (difference,..."
8,1,Finally!,4K Video camera finally packed in a phone,"[(Finally, RB), (!, .)]"
9,1,I'll have to make a note to ask her about how ...,Who wants to be the person to ask her about th...,"[(I, PRP), ('ll, MD), (have, VB), (to, TO), (m..."


In [12]:
## CNN model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import re
# Anything that is not a letter, a digit or a space
special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)
# Runs of digits
replace_numbers = re.compile(r'\d+', re.IGNORECASE)


def text_to_wordlist(text):
    """Normalise a comment for the CNN tokenizer.

    The text is lower-cased, all whitespace runs are collapsed to single
    spaces, every character other than letters, digits and spaces is removed,
    and each run of digits is replaced by the letter 'n'. Returns the cleaned
    string.
    """
    collapsed = " ".join(text.lower().split())
    without_specials = special_character_removal.sub('', collapsed)
    return replace_numbers.sub('n', without_specials)

# Clean every CNN demo comment before tokenizing.
# A comprehension is used so the loop variable does not overwrite the
# module-level `text` defined in the Random Forest cell.
X_test = test_data_CNN['comment']
test_comments = [text_to_wordlist(comment) for comment in X_test]

# NOTE(review): a fresh Tokenizer is fitted on the demo comments here, so the
# word -> index mapping is derived from these rows alone. If the CNN was trained
# with a different tokenizer, these indices will not match its embedding
# vocabulary - confirm and load the training tokenizer instead if one was saved.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(test_comments)
test_sequences = tokenizer.texts_to_sequences(test_comments)

word_index = tokenizer.word_index
# Pad (after the tokens) to a fixed sequence length of 2500
test_data = pad_sequences(test_sequences, maxlen=2500, padding='post')


from keras.models import model_from_json
# Read the model architecture; the context manager closes the file even if read() raises
with open('model_4.json','r') as json_file:
    loaded_json = json_file.read()

# NOTE: this rebinds loaded_model, which previously held the Random Forest model
loaded_model = model_from_json(loaded_json)
loaded_model.load_weights('model_4.h5')
loaded_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): predict_classes appears to be unavailable in newer Keras releases - confirm
# against the pinned version before upgrading.
y_predict = loaded_model.predict_classes(test_data)

y_test = test_data_CNN['label']
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
num_comments = len(test_data_CNN['comment'])
for actual, predicted in zip(y_test, y_predict):
    print("Test input label= %s, Predicted= %s" % (actual, predicted))

print("\nWith CNN Accuracy is :", accuracy_score(y_test,y_predict) * 100)

Test input label= 0, Predicted= [0]
Test input label= 0, Predicted= [0]
Test input label= 1, Predicted= [0]
Test input label= 0, Predicted= [0]
Test input label= 1, Predicted= [0]
Test input label= 0, Predicted= [0]
Test input label= 0, Predicted= [0]
Test input label= 1, Predicted= [1]
Test input label= 1, Predicted= [0]
Test input label= 1, Predicted= [1]

With CNN Accuracy is : 70.0
