In [1]:
#import statements
import pandas as pd
import numpy as np
import os
import re
import time
from langdetect import detect  
import ast
from bs4 import BeautifulSoup
import requests, json
from nltk import word_tokenize,pos_tag_sents,WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
from textblob import TextBlob
from sklearn.svm import SVC  
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
# from polyglot.detect import Detector 
# import cld2
from sklearn.ensemble import GradientBoostingClassifier 

In [2]:
# Reads the emoticon look up table file
header = ['EmoticonSymbol','SentimentScore']
emoticon_data = pd.read_csv('EmoticonLookupTable.txt', delimiter='\t', encoding = 'ISO-8859-1',names=header)
#Writing emoticons to a dictionary
emoji_dict = emoticon_data.groupby('EmoticonSymbol')['SentimentScore'].apply(list).to_dict()

In [3]:
#Getting acronyms and slangs from html page and creating a dictionary
resp = requests.get("http://www.netlingo.com/acronyms.php")
soup = BeautifulSoup(resp.text, "html.parser")
slangdict= {}
key=""
value=""
for div in soup.findAll('div', attrs={'class':'list_box3'}):
  for li in div.findAll('li'):
   for a in li.findAll('a'):
       key =a.text
   value = li.text.split(key)[1]
   slangdict[key]=value

In [4]:
#Removing the "-or-" terms in the dictionary and retaining one acronym
for key,value in slangdict.items():
    if "-or-" in value:
       removestring = re.findall('-or-(.*)',value)
       removestring = ''.join(removestring)
       newvalue = value.replace(removestring,'')
       newvalue = newvalue.replace("-or-",'')
       slangdict[key] = newvalue
    elif "-or" in value:
       removestring = re.findall('-or(.*)',value)
       removestring = ''.join(removestring)
       newvalue = value.replace(removestring,'')
       newvalue = newvalue.replace("-or",'')
       slangdict[key] = newvalue
    

In [5]:
key_to_be_replaced = []
for keys in slangdict.keys():
    if " or " in keys:
        key_to_be_replaced.append(keys)

#print(key_to_be_replaced)

for keys in key_to_be_replaced:
    getkeys = keys.split("or")
    for x in getkeys:
        x = x.strip()
        slangdict[x]= slangdict[keys]
    slangdict.pop(keys,None)    
    

In [6]:
file = open("Slangdictionary.txt",'w',encoding='utf-8')
file.write(str(slangdict))
file.close()


In [8]:
#read the training data from the csv file
header = ['label','comment','parent_comment']
cleaneddata = pd.read_table('clean_data_test_balanced_Wparent.csv',
                    sep='|', 
                   # delimiter=',',
                    usecols=[0,1,2],
                    dtype={'label':int,'comment':str},
                    keep_default_na=False)

In [9]:
print(cleaneddata.shape)

(243572, 3)


In [10]:
# Reading the slang dictionary that is already created
f = open("Slangdictionary.txt","r")
res1=f.read()
f.close()
slangdict = ast.literal_eval(res1)

In [11]:
#method to extract features
def featureextraction(dataframe, field, func, column_names):
    return pd.concat((
        dataframe,
        dataframe[field].apply(
            lambda cell: pd.Series(func(cell), index=column_names))), axis=1)

In [12]:
## function to get list of emojis in a comment
def find_emoji(text):
    return list(x for x in text.split() if x in emoji_dict.keys() )

In [13]:
# Extracting the features for each comment 
# Punctuation Features and presence of sarcastic symbol
def allfeatures(user_comment):
    if '!' or '.' or '?' in user_comment:
        Numofexclaimations = user_comment.count('!')
        Numofdots = user_comment.count('.')
        Numofquestionmarks = user_comment.count('?')
    else:
        Numofexclaimations = 0
        Numofdots = 0
        Numofquestionmarks = 0
    if '(!)' in user_comment:
        SarcasticSymbol = 1
    else:
        SarcasticSymbol = 0
    sentiments = TextBlob(str(user_comment)).sentiment
    polarity = sentiments.polarity
    subjectivity = sentiments.subjectivity
    numofcapitals = sum(x.isupper() for x in user_comment.split() if len(x) > 1 )
    elist = find_emoji(user_comment)
    pscore =0
    nscore = 0
    for item in elist:
        if (emoji_dict[item][0] == 1):
            pscore += 1
        elif (emoji_dict[item][0] == -1):
            nscore += 1
    return Numofexclaimations,Numofdots,Numofquestionmarks,SarcasticSymbol,polarity,subjectivity,numofcapitals,pscore,nscore

In [14]:
#feature set 1
start_time = time.time() 
featureddataset = featureextraction(cleaneddata, 'comment', allfeatures, ['Numofexclaimations', 'Numofdots','Numofquestionmarks','SarcasticSymbol','Polarity', 'Subjectivity','NumofCapitalWords','PositiveEmojiCount','NegativeEmojiCount'])
end_time = time.time() 
print("time taken ", end_time-start_time)

time taken  461.1776177883148


In [15]:
#helper function to collect number of interjection
def comment_interjection(user_comment):
    count = Counter(tag for word,tag in user_comment)
    return count['UH']

In [16]:
#feature 2
#feature extraction using POS
start_time = time.time()
txt = cleaneddata['comment'].tolist()
#POS tagging for all the tokens in the sentence
tagged_texts = pos_tag_sents(map(word_tokenize, txt))
end_time = time.time()
cleaneddata['POS'] = tagged_texts
print("time taken ", end_time-start_time)

# number of interjection
featureddataset['interjection']  = cleaneddata.POS.apply(comment_interjection)

time taken  137.26815843582153


In [17]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

sia = SIA()
#feature set 3
#get heighly emotional words (associated with POS tags)
def get_high_emotion_words(postags):
    highly_pos = 0
    highly_neg = 0
    POS_list = ['JJ','JJR','JJS', 'RB','RBR','RBS','VB','VBD','VBG','VBN','VBP','VBZ']
    num_tokens = len(postags)
    for i in range(num_tokens):
        if postags[i][1] in POS_list:            
            #check sentiment of next word
            if i < (num_tokens - 1) :
                senti_word = sia.polarity_scores(postags[i+1][0])
                if senti_word['pos'] == 1:
                    highly_pos += 1
                if senti_word['neg'] == 1:
                    highly_neg += 1
                
    return highly_pos, highly_neg 



In [18]:
featureddataset['highly_positive'],featureddataset['highly_negative'] = zip(*cleaneddata['POS'].map(get_high_emotion_words)) 

In [19]:

def get_parent_sentiment(comment):
    sentiments = TextBlob(str(comment)).sentiment
    polarity = sentiments.polarity
    if polarity >= 0.1:
        return 1
    elif polarity < -0.1:
        return -1
    else:
        return 0

In [20]:
# feature set 4
start_time = time.time()
featureddataset['parent_sentiment'] = cleaneddata.parent_comment.apply(get_parent_sentiment)
end_time = time.time()
print("time taken ", end_time-start_time)

time taken  119.58494448661804


In [21]:
# Additional features pos_words, neg_words, flip_count
def get_pos_neg_word_count(tokens):
    
    pos_word_count = 0
    neg_word_count = 0    
    pos_flag = False
    neg_flag = False
    flip_count = 0
        
    for word in tokens:
        senti = sia.polarity_scores(str(word)) 
        if senti["pos"] == 1:
            pos_word_count += 1
            pos_flag = True
            if neg_flag:
                flip_count += 1
                neg_flag =  False
                
        elif senti["neg"] == 1:
            neg_word_count += 1
            neg_flag = True
            if pos_flag:
                flip_count +=1
                pos_flag = False
    return pos_word_count, neg_word_count,flip_count
    

In [22]:
# feature set 5
start_time = time.time()
emotion_dataset = featureextraction(featureddataset, 'comment', get_pos_neg_word_count, ['PosWords','NegWords','FlipCount'])
end_time = time.time()
print("time taken ", end_time-start_time)


time taken  191.66995072364807


In [None]:
featureddataset = featureextraction(featureddataset, 'comment', get_pos_neg_word_count, ['PosWords','NegWords','FlipCount'])

In [23]:
from keras.preprocessing.text import Tokenizer
# 
# load the Glove embedding into memory
embeddings_index = {}
f = open('glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# Tokenize the comments
Word_tokenizer = Tokenizer()
Word_tokenizer.fit_on_texts(featureddataset['comment'])
# Word_tokenizer.num_words = 100000
vocab_size = len(Word_tokenizer.word_index) + 1
#encode the train tokens to sequence
sequences = Word_tokenizer.texts_to_sequences(featureddataset['comment'])

# create embedding matrix
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in Word_tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Using TensorFlow backend.


Found 400000 word vectors.


In [24]:
# featureddataset['embedding'] = sequences
emotion_dataset['embedding'] = sequences

In [26]:
# from keras.preprocessing.text import Tokenizer

# # Tokenize the comments
# Word_tokenizer = Tokenizer()
# Word_tokenizer.fit_on_texts(featureddataset['comment'])
# # Word_tokenizer.num_words = 100000
# vocab_size = len(Word_tokenizer.word_index) + 1
# #encode the train tokens to sequence
# sequences = Word_tokenizer.texts_to_sequences(featureddataset['comment'])

Using TensorFlow backend.


In [27]:
# embedding_matrix = np.zeros((vocab_size, 100))
# for word, i in Word_tokenizer.word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

In [25]:
embedding_matrix.shape

(79339, 100)

In [26]:
import math
#helper to calculate the squareroot of square of the embedding matrix
embedding_square ={}
keys = range(vocab_size)
for i in keys:
    embedding = embedding_matrix[i]
    sum_square = 0
    for j in range(len(embedding)):
        values = embedding[j]
        sum_square += values*values
    embedding_square[i] = math.sqrt(sum_square)
# print(embedding_square)

In [27]:
len(embedding_square)

79339

In [28]:
file = open("embedding_square_test.txt",'w',encoding='utf-8')
file.write(str(embedding_square))
file.close()

In [29]:
# import math
# helper to calculate the cosine similarity between two words
def cosine_similarity(word1,word2,v1,v2):
#     "compute cosine similarity of v1 to v2: (v1 dot v2)/{||v1||*||v2||)"
    sumxy = 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        sumxy += x*y
    return sumxy/(embedding_square[word1]*embedding_square[word2])

In [30]:
#helper function to calculate the cosine similarity between the words in te comments
def calculate_similarity(comment_token):
    token_array = np.matrix(comment_token)
    comment_len = token_array.shape[1]
    most_similar = least_similar = most_dissimilar = least_dissimilar = 0
    if comment_len > 0:
        mat = np.empty(shape=(comment_len,comment_len))
        mat[:] = np.nan
        for i in range(0,comment_len):
            for j in range(i+1,comment_len):
                a = comment_token[i]
                b = comment_token[j]
                mat[i][j] = cosine_similarity(a,b,embedding_matrix[a],embedding_matrix[b].T)
                mat[j][i] = mat[i][j]
    #get the most similar
        similar_mat = np.nanmax(mat,axis=0)
        most_similar = np.nanmax(similar_mat)
        least_similar = np.nanmin(similar_mat)
    #get the most dissimilar 
        dissimilar_mat = np.nanmin(mat,axis=0)
        most_dissimilar = np.nanmax(dissimilar_mat)
        least_dissimilar = np.nanmin(dissimilar_mat)
        
    return most_similar, least_similar, most_dissimilar, least_dissimilar            

In [31]:
# feature set 6
start_time = time.time()
embedded_dataset = featureextraction(emotion_dataset, 'embedding', calculate_similarity, ['most_similar','least_similar','most_dissimilar','least_dissimilar'])
end_time = time.time()
print("time taken ", end_time-start_time)

  app.launch_new_instance()


time taken  1624.5822041034698


In [32]:
values = {'most_similar': 0, 'least_similar': 0, 'most_dissimilar': 0, 'least_dissimilar': 0}
embedded_dataset = embedded_dataset.fillna(value=values)
# embedded_dataset = embedded_dataset.drop(columns=['embedding'])

In [33]:
embedded_dataset.head()

Unnamed: 0,label,comment,parent_comment,Numofexclaimations,Numofdots,Numofquestionmarks,SarcasticSymbol,Polarity,Subjectivity,NumofCapitalWords,...,highly_negative,parent_sentiment,PosWords,NegWords,FlipCount,embedding,most_similar,least_similar,most_dissimilar,least_dissimilar
0,0,Actually most of her supporters and sane peopl...,Hillary's Surrogotes Told to Blame Media for '...,0.0,5.0,0.0,0.0,0.5,0.5,0.0,...,0,0,0,0,0,"[135, 160, 8, 112, 1388, 5, 3768, 40, 497, 32,...",1.0,0.126212,0.068733,-0.133883
1,0,They can't survive without an echo chamber whi...,Thank God Liberals like to live in concentrate...,0.0,1.0,0.0,0.0,0.8,0.75,0.0,...,0,0,0,0,0,"[20, 89, 2783, 244, 52, 4613, 6090, 168, 7, 13...",0.812221,0.351907,0.30781,0.149742
2,0,you're pretty cute yourself 1729 total,Saw this cutie training his Attack today...,0.0,0.0,0.0,0.0,0.25,0.916667,0.0,...,0,0,0,0,0,"[75, 134, 1292, 460, 37933, 1260]",0.639487,-0.068457,-0.068457,-0.162475
3,0,If you kill me you'll crash the meme market,If you were locked in a room with 49 other peo...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,-1,0,0,0,"[27, 6, 328, 51, 551, 2285, 1, 1224, 622]",0.885117,0.045072,0.045072,-0.16871
4,0,I bet he wrote that last message as he was sob...,You're not even that pretty!,0.0,1.0,0.0,0.0,0.0,0.066667,0.0,...,1,1,0,0,0,"[4, 404, 28, 1516, 10, 216, 727, 31, 28, 25, 2...",1.0,0.206977,0.206977,-0.079646


In [34]:
embedded_dataset.isnull().values.any()

False

In [35]:
#write the cleaned data with features into a csv file
embedded_dataset.to_csv('clean_testdata_with_all_features.csv',
           sep= '|',
           index=False)

In [99]:
# # embedded_dataset.drop(columns=['embedding'])
# from sklearn.model_selection import train_test_split
# # training_data, test_data = train_test_split(embedded_dataset, test_size=0.20)
# training_data, test_data, y_train, y_test = train_test_split(embedded_dataset, embedded_dataset['label'], 
#                                                     test_size=0.20)

In [2]:
# load the train data with features
traindata_withfeature = pd.read_table('clean_data_with_all_features.csv',
                    sep='|', 
                   # delimiter=',',
#                     usecols=[0,1,2],
                    dtype={'label':int,'comment':str},
                    keep_default_na=False)

# load the test data with features
testdata_withfeature = pd.read_table('clean_testdata_with_all_features.csv',
                    sep='|', 
                   # delimiter=',',
#                     usecols=[0,1,2],
                    dtype={'label':int,'comment':str},
                    keep_default_na=False)


In [3]:
testdata_withfeature.head()

Unnamed: 0,label,comment,parent_comment,Numofexclaimations,Numofdots,Numofquestionmarks,SarcasticSymbol,Polarity,Subjectivity,NumofCapitalWords,...,highly_negative,parent_sentiment,PosWords,NegWords,FlipCount,embedding,most_similar,least_similar,most_dissimilar,least_dissimilar
0,0,Actually most of her supporters and sane peopl...,Hillary's Surrogotes Told to Blame Media for '...,0.0,5.0,0.0,0.0,0.5,0.5,0.0,...,0,0,0,0,0,"[135, 160, 8, 112, 1388, 5, 3768, 40, 497, 32,...",1.0,0.126212,0.068733,-0.133883
1,0,They can't survive without an echo chamber whi...,Thank God Liberals like to live in concentrate...,0.0,1.0,0.0,0.0,0.8,0.75,0.0,...,0,0,0,0,0,"[20, 89, 2783, 244, 52, 4613, 6090, 168, 7, 13...",0.812221,0.351907,0.30781,0.149742
2,0,you're pretty cute yourself 1729 total,Saw this cutie training his Attack today...,0.0,0.0,0.0,0.0,0.25,0.916667,0.0,...,0,0,0,0,0,"[75, 134, 1292, 460, 37933, 1260]",0.639487,-0.068457,-0.068457,-0.162475
3,0,If you kill me you'll crash the meme market,If you were locked in a room with 49 other peo...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,-1,0,0,0,"[27, 6, 328, 51, 551, 2285, 1, 1224, 622]",0.885117,0.045072,0.045072,-0.16871
4,0,I bet he wrote that last message as he was sob...,You're not even that pretty!,0.0,1.0,0.0,0.0,0.0,0.066667,0.0,...,1,1,0,0,0,"[4, 404, 28, 1516, 10, 216, 727, 31, 28, 25, 2...",1.0,0.206977,0.206977,-0.079646


In [4]:
# drop embedding
traindata_withfeature = traindata_withfeature.drop(columns=['embedding'])

testdata_withfeature = testdata_withfeature.drop(columns=['embedding'])

In [40]:
# drop parent sentiment
# parent_sentiment
traindata_Wparent = traindata_withfeature
traindata_Wparent = traindata_Wparent.drop(columns=['parent_sentiment'])

testdata_Wparent = testdata_withfeature
testdata_Wparent = testdata_Wparent.drop(columns=['parent_sentiment'])

In [50]:
# drop new added features(5)
traindata_parent = traindata_withfeature
traindata_parent = traindata_parent.drop(columns=['highly_positive','highly_negative','PosWords','NegWords','FlipCount'])

testdata_parent = testdata_withfeature
testdata_parent = testdata_parent.drop(columns=['highly_positive','highly_negative','PosWords','NegWords','FlipCount'])

In [16]:
# with only embeddings
traindata = traindata_withfeature
traindata = traindata.drop(columns=['parent_sentiment','highly_positive','highly_negative','PosWords','NegWords','FlipCount'])

testdata = testdata_withfeature
testdata = testdata.drop(columns=['parent_sentiment','highly_positive','highly_negative','PosWords','NegWords','FlipCount'])

In [19]:
testdata.shape
traindata.shape

(978039, 17)

In [51]:
#train data features
newtrain = pd.DataFrame(traindata_parent.iloc[:, 3:])
#train data labels
targetlabel = traindata_parent.iloc[:,0]

In [52]:
# test data features
newtest = pd.DataFrame(testdata_parent.iloc[:, 3:])
#test data labels
testlabel = testdata_parent.iloc[:,0]

In [53]:
# gradient boosting algorithm
start_time = time.time()
gradient_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=1)
gradient_clf.fit(newtrain,targetlabel)
end_time = time.time()
print("time taken ", end_time-start_time)

time taken  52.24272012710571


In [54]:
gradient_predictions = gradient_clf.predict(newtest)

In [55]:
print("Confusion Matrix")
print(confusion_matrix(testlabel, gradient_predictions))
print("Classification Report")
print(classification_report(testlabel, gradient_predictions))

Confusion Matrix
[[78483 42081]
 [54489 68519]]
Classification Report
              precision    recall  f1-score   support

           0       0.59      0.65      0.62    120564
           1       0.62      0.56      0.59    123008

   micro avg       0.60      0.60      0.60    243572
   macro avg       0.60      0.60      0.60    243572
weighted avg       0.61      0.60      0.60    243572



In [57]:
#random forest
from sklearn.ensemble import RandomForestClassifier 

start_time = time.time()
random_clf = RandomForestClassifier(n_jobs=2, random_state=0)
random_clf.fit(newtrain,targetlabel)
end_time = time.time()
print("time taken ", end_time-start_time)



time taken  23.141639947891235


In [58]:
random_predictions = random_clf.predict(newtest)

In [59]:
print("Confusion Matrix")
print(confusion_matrix(testlabel, random_predictions))
print("Classification Report")
print(classification_report(testlabel, random_predictions))

Confusion Matrix
[[80770 39794]
 [60120 62888]]
Classification Report
              precision    recall  f1-score   support

           0       0.57      0.67      0.62    120564
           1       0.61      0.51      0.56    123008

   micro avg       0.59      0.59      0.59    243572
   macro avg       0.59      0.59      0.59    243572
weighted avg       0.59      0.59      0.59    243572



In [60]:
# embedded_dataset.drop(columns=['embedding'])
print(accuracy_score(testlabel, random_predictions))

0.5897968567815677


In [128]:
newtrain = pd.DataFrame(embedded_dataset.iloc[:, 3:])

In [45]:
print(newtrain.shape)

(978039, 20)


In [130]:
targetlabel = embedded_dataset.iloc[:,0]

In [131]:
print(targetlabel.shape)

(1010781,)


In [None]:
# testing with with parent comment concatinated

In [135]:
start_time = time.time()
# data['comment'] = data.comment.apply(comment_clean)
data_test[['comment','parent_comment']] = data_test[['comment','parent_comment']].applymap(comment_clean)
# remove data with empty comments
valid_test_comment = data_test['comment'] != ' '
data_test = data_test[valid_comment]
end_time = time.time()
print("time taken ", end_time-start_time)

  


time taken  21.321204900741577


In [None]:
start_time = time.time() 
featureddataset = featureextraction(data_test, 'comment', allfeatures, ['Numofexclaimations', 'Numofdots','Numofquestionmarks','SarcasticSymbol','Polarity', 'Subjectivity','NumofCapitalWords','PositiveEmojiCount','NegativeEmojiCount'])
end_time = time.time() 
print("time taken ", end_time-start_time)

In [103]:
test_data_new = test_data.iloc[:, 3:] 

In [119]:
print(test_data_new.shape)

(202157, 15)


In [104]:
test_data_target = test_data.iloc[:,0]

In [120]:
print(test_data_target.shape)

(202157,)


In [105]:
predictions = clf.predict(test_data_new)

In [106]:
print("Confusion Matrix")
print(confusion_matrix(test_data_target, predictions))
print("Classification Report")
print(classification_report(test_data_target, predictions))

Confusion Matrix
[[71179 29832]
 [52228 48918]]
Classification Report
              precision    recall  f1-score   support

           0       0.58      0.70      0.63    101011
           1       0.62      0.48      0.54    101146

   micro avg       0.59      0.59      0.59    202157
   macro avg       0.60      0.59      0.59    202157
weighted avg       0.60      0.59      0.59    202157



In [107]:
print(accuracy_score(test_data_target, predictions))

0.5940778701702142


In [122]:
#learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
# for learning_rate in learning_rates:
#     gb = GradientBoostingClassifier(n_estimators=20, learning_rate = learning_rate, max_features=2, max_depth = 2, random_state = 0)
#     gb.fit(newtrain,targetlabel)
#     predictions = clf.predict(test_data_new)
#     print("Confusion Matrix")
#     print(confusion_matrix(test_data_target, predictions))
#     print()
#     print("Classification Report")
#     print(classification_report(test_data_target, predictions))
#     print()
#     print(accuracy_score(test_data_target, predictions))

In [112]:
# Create a list of the feature column's names
features = embedded_dataset.columns[3:]

In [113]:
X_train = training_data[features]
y_train = training_data['label']
X_test = test_data[features]
y_test = test_data['label']

In [114]:
from sklearn.ensemble import RandomForestClassifier
# Create a random forest Classifier. 
random_clf = RandomForestClassifier(n_jobs=2, random_state=0)

In [115]:
# Train the Classifier to take the training features 
start_time = time.time()
random_clf.fit(X_train, y_train )
end_time = time.time()
print("time taken ", end_time-start_time)



time taken  14.977846622467041


In [116]:
# Apply the Classifier we trained to the test data
y_predit = random_clf.predict(X_test)

In [117]:
print(confusion_matrix(y_test,y_predit))  
print(classification_report(y_test,y_predit))  
print(accuracy_score(y_test,y_predit))

[[69374 31637]
 [51588 49558]]
              precision    recall  f1-score   support

           0       0.57      0.69      0.63    101011
           1       0.61      0.49      0.54    101146

   micro avg       0.59      0.59      0.59    202157
   macro avg       0.59      0.59      0.58    202157
weighted avg       0.59      0.59      0.58    202157

0.588315022482526


In [134]:
#testing on the test dataset
#read the testing data from the csv file
#read the training data from the csv file
header = ['label','comment','parent_comment']
test_clean = pd.read_table('clean_data_test_balanced_final.csv',
                    sep='|', 
                    usecols=[0,1,2],
                    dtype={'label':int,'comment':str},
                    keep_default_na=False)

#extract features
start_time = time.time() 
feature_testdata = featureextraction(test_clean, 'comment', allfeatures, ['Numofexclaimations', 'Numofdots','Numofquestionmarks','SarcasticSymbol','Polarity', 'Subjectivity','NumofCapitalWords','PositiveEmojiCount','NegativeEmojiCount'])
end_time = time.time() 

txt = test_clean['comment'].tolist()
#POS tagging for all the tokens in the sentence
tagged_texts = pos_tag_sents(map(word_tokenize, txt))
end_time = time.time()
test_clean['POS'] = tagged_texts

# number of interjection
feature_testdata['interjection']  = test_clean.POS.apply(comment_interjection)

#emotion intensity
feature_testdata['highly_positive'],feature_testdata['highly_negative'] = zip(*cleaneddata['POS'].map(get_high_emotion_words))

#parent comment
feature_testdata['parent_sentiment'] = test_clean.parent_comment.apply(get_parent_sentiment)

#positive and negative word count
emotion_test_dataset = featureextraction(feature_testdata, 'comment', get_pos_neg_word_count, ['PosWords','NegWords','FlipCount'])

#embedding matrix creation
# Tokenize the comments
Word_tokenizer_test = Tokenizer()
Word_tokenizer_test.fit_on_texts(feature_testdata['comment'])
# Word_tokenizer.num_words = 100000
vocab_size = len(Word_tokenizer_test.word_index) + 1
#encode the train tokens to sequence
sequences = Word_tokenizer_test.texts_to_sequences(feature_testdata['comment'])

# create embedding matrix
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in Word_tokenizer_test.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
#add the embedding to dataset
emotion_test_dataset['embedding'] = sequences

# import math
#helper to calculate the squareroot of square of the embedding matrix
embedding_square ={}
keys = range(vocab_size)
for i in keys:
    embedding = embedding_matrix[i]
    sum_square = 0
    for j in range(len(embedding)):
        values = embedding[j]
        sum_square += values*values
    embedding_square[i] = math.sqrt(sum_square)

#similarity features
embedded_test_dataset = featureextraction(emotion_test_dataset, 'embedding', calculate_similarity, ['most_similar','least_similar','most_dissimilar','least_dissimilar'])
print("time taken ", end_time-start_time)

values = {'most_similar': 0, 'least_similar': 0, 'most_dissimilar': 0, 'least_dissimilar': 0}
embedded_test_dataset = embedded_test_dataset.fillna(value=values)
embedded_test_dataset = embedded_test_dataset.drop(columns=['embedding'])

#split the test data into the comment and label
test_data_new = test_data.iloc[:, 3:] 
test_data_target = test_data.iloc[:,0]
#predict using gradient boosting
predictions_gradientboost = gradient_clf.predict(test_data_new)
print("Confusion Matrix")
print(confusion_matrix(test_data_target, predictions_gradientboost))
print("Classification Report")
print(classification_report(test_data_target, predictions_gradientboost))
print(accuracy_score(test_data_target, predictions_gradientboost))

#predict using random forest
predictions_randomforest = random_clf.predict(test_data_new)
print("Confusion Matrix")
print(confusion_matrix(test_data_target, predictions_randomforest))
print("Classification Report")
print(classification_report(test_data_target, predictions_randomforest))
print(accuracy_score(test_data_target, predictions_randomforest))

In [None]:
from sklearn.svm import SVC  
start_time = time.time()
# linear kernel
svclassifier = SVC(kernel='linear',C=1,gamma=1)  
svclassifier.fit(X_train, y_train) 

y_pred = svclassifier.predict(X_test)  

end_time = time.time()
print("time taken ", end_time-start_time)
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))

In [None]:
# Guassian kernel
start_time = time.time()
svclassifier = SVC(kernel='rbf',C=1,gamma=1)  
svclassifier.fit(X_train, y_train) 

y_pred = svclassifier.predict(X_test)  
end_time = time.time()
print("time taken ", end_time-start_time)
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))

In [None]:
# polynomial kernel with degree 3
start_time = time.time()
svclassifier = SVC(kernel='poly', degree=3) 
svclassifier.fit(X_train, y_train) 

y_pred = svclassifier.predict(X_test) 
end_time = time.time()
print("time taken ", end_time-start_time)

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))