In [50]:
#import statements
import pandas as pd
import numpy as np
import os
import re
import time
from langdetect import detect  
import ast
from bs4 import BeautifulSoup
import requests, json
from nltk import word_tokenize,pos_tag_sents,WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
from textblob import TextBlob
import nltk
from polyglot.detect import Detector 
import cld2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [51]:
#read the training data from the csv file
header = ['label','comment','parent_comment']
data = pd.read_table('train-balanced.csv',
                    sep='\t', 
#                     delimiter=',', 
                    names=header,
                    usecols=[0,1,9],
                    dtype={'label':int,'comment':str, 'parent_comment':str},
                    keep_default_na=False)

In [52]:
# Reads the emoticon look up table file
header = ['EmoticonSymbol','SentimentScore']
emoticon_data = pd.read_csv('EmoticonLookupTable.txt', delimiter='\t', encoding = 'ISO-8859-1',names=header)

In [53]:
#Writing emoticons to a dictionary
emoji_dict = emoticon_data.groupby('EmoticonSymbol')['SentimentScore'].apply(list).to_dict()

In [54]:
# Reading the slang dictionary that is already created
f = open("Slangdictionary.txt","r")
res1=f.read()
f.close()
slangdict = ast.literal_eval(res1)

In [55]:
#helper function to clean the comments
def comment_clean(user_comment):
    # remove trailing \r and \n    
    user_comment.rstrip('\r\n')
    
    #remove the # from hashtag
    if '#' in user_comment:
        hash_tag = re.search('#',user_comment)
        if hash_tag is not None:
            user_comment = user_comment.replace(hash_tag.group(0),' ')
    #remove the redit tags(r/) from comment
    if 'r/' in user_comment:
        r_tag = re.search('r/',user_comment)
        if r_tag is not None:
            user_comment = user_comment.replace(r_tag.group(0),' ')
    #remove the URL links from comments  
    if 'HTTP' in user_comment:
        # url of the form [link name](http://url)
        url_link = re.search('\[(.*)\(HTTP(.*)\)', user_comment)
        if url_link is not None:
            user_comment = user_comment.replace(url_link.group(0),' ')
    if 'http' in user_comment:
        # url of the form [link name](http://url)
        url_link = re.search('\[(.*)\(http(.*)\)', user_comment)
        if url_link is not None:
            user_comment = user_comment.replace(url_link.group(0),' ')
        else:
            #url of the form http:/
            url_link = re.search('http(.*)', user_comment)
            if url_link is not None:
                user_comment = user_comment.replace(url_link.group(0),' ') 
                
    # remove numbers from comments
    user_comment_not_num = re.sub(r'\d+', '', user_comment)   
    
    # Check if the comment has exactly 2 stars
    if user_comment.count('*')==2:
        boldwords = re.search(r"\*(.*?)\*",user_comment)
        #print(boldwords.group(0))
        # Check if the comments have any other text other than **
        if boldwords.group(0) != "**":
            Wordstocapitalize = re.findall(r"\*(.*?)\*",boldwords.group(0))
            Wordstocapitalize = "".join( Wordstocapitalize)
            # Replace the user comment with capitalized words
            user_comment = user_comment.replace(boldwords.group(0),Wordstocapitalize.upper())
    comment_words = re.sub(r"[^a-zA-Z0-9\s\']","",user_comment)         
    comment_words=comment_words.split()
    for word in comment_words:
        if word.upper() in slangdict.keys():
            user_comment = user_comment.replace(word.upper(),slangdict[word.upper()])
        elif word in slangdict.keys():
            user_comment = user_comment.replace(word,slangdict[word]) 
        
    # replace non english comments with empty string
    try:
        isReliable, textBytesFound, details = cld2.detect(user_comment_not_num)
    except:
        try_text = ''.join(x for x in user_comment_not_num if x.isprintable())
        isReliable, textBytesFound, details = cld2.detect(try_text)
    cld_match = details[0][0]
    if not (cld_match == 'ENGLISH'):
        poly_match = Detector(user_comment_not_num, quiet=True).language.name
        if (poly_match != 'English'):
            user_comment = ' '    
#     text = ''.join([l for l in user_comment_not_num if unicodedata.category(unicode(l))[0] not in ('S', 'M', 'C')])
#     isReliable, textBytesFound, details = cld2.detect(text)
#     cld_match = details[0][0]
#     if not (cld_match == 'ENGLISH'):
#         poly_match = Detector(text, quiet=True).language.name
#         if (poly_match != 'English'):
#             user_comment = ' '               
   
            
    return user_comment           

In [56]:
#clean each commentnon_eng_data = []
start_time = time.time()
# data['comment'] = data.comment.apply(comment_clean)
data[['comment','parent_comment']] = data[['comment','parent_comment']].applymap(comment_clean)
# remove data with empty comments
valid_comment = data['comment'] != ' '
data = data[valid_comment]
end_time = time.time()
print("time taken ", end_time-start_time)

time taken  87.768639087677


In [57]:
# data['parent_comment'] = data.parent_comment.apply(comment_clean)

In [58]:
#write the cleaned data into a csv file
data.to_csv('clean_data_Cldpoly_withparent.csv',
           sep= '|',
           index=False)

In [59]:
#read the training data from the csv file
cleaneddata = pd.read_table('clean_data_Cldpoly_withparent.csv',
                    sep='|', 
                   # delimiter=',',
                    usecols=[0,1,2],
                    dtype={'label':int,'comment':str},
                    keep_default_na=False)

In [60]:
cleaneddata.head()

Unnamed: 0,label,comment,parent_comment
0,0,Nice Crib and Nice Hand.,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",deadass don't kill my buzz
4,0,I could use one of those tools.,Yep can confirm I saw the tool they use for th...


In [61]:
def featureextraction(dataframe, field, func, column_names):
    return pd.concat((
        dataframe,
        dataframe[field].apply(
            lambda cell: pd.Series(func(cell), index=column_names))), axis=1)

In [62]:
## function to get list of emojis in a comment
def find_emoji(text):
    return list(x for x in text.split() if x in emoji_dict.keys() )

In [63]:
# Extracting the features for each comment 
# Punctuation Features and presence of sarcastic symbol
def allfeatures(user_comment):
    if '!' or '.' or '?' in user_comment:
        Numofexclaimations = user_comment.count('!')
        Numofdots = user_comment.count('.')
        Numofquestionmarks = user_comment.count('?')
    else:
        Numofexclaimations = 0
        Numofdots = 0
        Numofquestionmarks = 0
    if '(!)' in user_comment:
        SarcasticSymbol = 1
    else:
        SarcasticSymbol = 0
    sentiments = TextBlob(str(user_comment)).sentiment
    polarity = sentiments.polarity
    subjectivity = sentiments.subjectivity
    numofcapitals = sum(x.isupper() for x in user_comment.split() if len(x) > 1 )
    elist = find_emoji(user_comment)
    pscore =0
    nscore = 0
    for item in elist:
        if (emoji_dict[item][0] == 1):
            pscore += 1
        elif (emoji_dict[item][0] == -1):
            nscore += 1
    return Numofexclaimations,Numofdots,Numofquestionmarks,SarcasticSymbol,polarity,subjectivity,numofcapitals,pscore,nscore

In [64]:
featureddataset = featureextraction(cleaneddata, 'comment', allfeatures, ['Numofexclaimations', 'Numofdots','Numofquestionmarks','SarcasticSymbol','Polarity', 'Subjectivity','NumofCapitalWords','PositiveEmojiCount','NegativeEmojiCount'])

MemoryError: 

In [34]:
def get_parent_sentiment(comment):
    sentiments = TextBlob(str(comment)).sentiment
    polarity = sentiments.polarity
    if polarity >= 0.1:
        return 1
    elif polarity < -0.1:
        return -1
    else:
        return 0
# sentiments = TextBlob(str("you are ")).sentiment

In [35]:
featureddataset['parent_sentiment'] = cleaneddata.parent_comment.apply(get_parent_sentiment)

In [36]:
start_time = time.time()
txt = cleaneddata['comment'].tolist()
#POS tagging for all the tokens in the sentence
tagged_texts = pos_tag_sents(map(word_tokenize, txt))
end_time = time.time()
cleaneddata['POS'] = tagged_texts
print("time taken ", end_time-start_time)

time taken  83.13779640197754


In [37]:

#helper function to collect number of interjection
def comment_interjection(user_comment):
    count = Counter(tag for word,tag in user_comment)
    return count['UH']

In [38]:
##feature extraction
# number of interjection
featureddataset['interjection']  = cleaneddata.POS.apply(comment_interjection)


In [39]:
featureddataset.head()
        

Unnamed: 0,label,comment,parent_comment,Numofexclaimations,Numofdots,Numofquestionmarks,SarcasticSymbol,Polarity,Subjectivity,NumofCapitalWords,PositiveEmojiCount,NegativeEmojiCount,parent_sentiment,interjection
0,0,"I highly doubt this mostly ignored, surely uns...","The GOP has the reputation, in recent times, o...",0.0,1.0,0.0,0.0,0.041313,0.601616,0.0,0.0,0.0,0,0
1,0,Holy shit they are dropping an Halloween surpr...,Donald Trump Used Legally Dubious Method to Av...,0.0,1.0,0.0,0.0,-0.2,0.8,0.0,0.0,0.0,1,0
2,0,Chafetz is a known liar (see PP vids) why does...,Some principles you've got there,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
3,0,Kansas Number 1 in imaginary Muslim terrorists...,Kansas is probably the last state to have a te...,0.0,0.0,0.0,0.0,0.2,0.3,0.0,0.0,0.0,0,0
4,1,wow it is totally unreasonable to assume that ...,Clinton campaign accuses FBI of 'blatant doubl...,0.0,0.0,0.0,0.0,-0.1,0.783333,0.0,0.0,0.0,-1,0


In [43]:
# Create a list of the feature column's names
features = featureddataset.columns[3:]

In [44]:
features

Index(['Numofexclaimations', 'Numofdots', 'Numofquestionmarks',
       'SarcasticSymbol', 'Polarity', 'Subjectivity', 'NumofCapitalWords',
       'PositiveEmojiCount', 'NegativeEmojiCount', 'parent_sentiment',
       'interjection'],
      dtype='object')

In [41]:

train, test =train_test_split(featureddataset, test_size=0.2) 

In [45]:
X_train = train[features]
y_train = train['label']
X_test = test[features]
y_test = test['label']

In [42]:
# Create a random forest Classifier. 
clf = RandomForestClassifier(n_jobs=2, random_state=0)

In [46]:
# Train the Classifier to take the training features 
clf.fit(X_train, y_train )



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [47]:
# Apply the Classifier we trained to the test data
y_predit = clf.predict(X_test)

In [48]:
print(confusion_matrix(y_test,y_predit))  
print(classification_report(y_test,y_predit))  
print(accuracy_score(y_test,y_predit)) 

[[3225 1569]
 [2017 2648]]
              precision    recall  f1-score   support

           0       0.62      0.67      0.64      4794
           1       0.63      0.57      0.60      4665

   micro avg       0.62      0.62      0.62      9459
   macro avg       0.62      0.62      0.62      9459
weighted avg       0.62      0.62      0.62      9459

0.6208901575219368
