In [1]:
#import statements
import pandas as pd
import numpy as np
import os
import re
import time
from langdetect import detect  
import ast
from bs4 import BeautifulSoup
import requests, json
from nltk import word_tokenize,pos_tag_sents,WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
from textblob import TextBlob

from sklearn.svm import SVC  
from sklearn.metrics import classification_report, confusion_matrix  

# from polyglot.detect import Detector 
# import cld2

In [2]:
# Show the contents of the current working directory so the input files used
# by the cells below can be checked by eye.
os.listdir(os.getcwd())

['.git',
 '.ipynb_checkpoints',
 'clean_data.csv',
 'clean_data.ipynb',
 'clean_data_cld2withpoly.csv',
 'clean_data_Cldpoly.csv',
 'Emoticon and Slang dictionary Creation.ipynb',
 'EmoticonLookupTable.txt',
 'preprocessed_added_features.ipynb',
 'README.md',
 'Slangdictionary.txt',
 'text_process.ipynb',
 'train-balanced.csv']

In [3]:
# Load the tab-separated emoticon lookup table; the file has no header row,
# so the two column names are supplied explicitly.
header = ['EmoticonSymbol', 'SentimentScore']
emoticon_data = pd.read_csv(
    'EmoticonLookupTable.txt',
    sep='\t',
    names=header,
    encoding='ISO-8859-1',
)

In [4]:
# Map every emoticon symbol to the list of sentiment scores recorded for it.
emoji_dict = (
    emoticon_data
    .groupby('EmoticonSymbol')['SentimentScore']
    .apply(list)
    .to_dict()
)

In [5]:
#Getting acronyms and slangs from html page and creating a dictionary
# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
resp = requests.get("http://www.netlingo.com/acronyms.php", timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
slangdict = {}
key = ""
value = ""
for div in soup.findAll('div', attrs={'class': 'list_box3'}):
    for li in div.findAll('li'):
        anchors = li.findAll('a')
        # Skip list items without a usable anchor: reusing the key of the
        # previous item (or splitting on an empty string) would store garbage
        # or raise.
        if not anchors or not anchors[-1].text:
            continue
        # The text of the last anchor in the item is the slang term.
        key = anchors[-1].text
        # Everything after the first occurrence of the term is its meaning;
        # partition keeps the whole meaning even when it repeats the term.
        value = li.text.partition(key)[2]
        slangdict[key] = value

In [6]:
#Removing the "-or-" terms in the dictionary and retaining one acronym
# Only the text before the first separator is kept. Splitting on the separator
# (instead of deleting the tail with str.replace) cannot accidentally remove an
# earlier occurrence of the same text from the retained meaning.
# Iterating over a snapshot keeps the loop independent of the writes below.
for key, value in list(slangdict.items()):
    if "-or-" in value:
        slangdict[key] = value.split("-or-", 1)[0]
    elif "-or" in value:
        # Separator that lacks its closing dash.
        slangdict[key] = value.split("-or", 1)[0]
    

In [7]:
# Remove four punctuation-only keys from the slang dictionary. Passing None as
# the default makes each pop a no-op (rather than a KeyError) when the key is
# absent; the cell displays the value returned by the last pop.
slangdict.pop('!',None)
slangdict.pop('*$',None)
slangdict.pop('**//',None)
slangdict.pop(',!!!!',None)

'Talk to the hand'

In [8]:
# Split combined keys of the form "A or B" into separate entries that share
# the same meaning.
key_to_be_replaced = [keys for keys in slangdict.keys() if " or " in keys]

for keys in key_to_be_replaced:
    # Split on the same " or " separator that selected the key; splitting on a
    # bare "or" would also cut inside any term containing those two letters.
    getkeys = keys.split(" or ")
    for x in getkeys:
        x = x.strip()
        if x:  # never register an empty string as a slang term
            slangdict[x] = slangdict[keys]
    slangdict.pop(keys, None)
    

In [9]:
#read the training data from the csv file
# The file is comma-separated. Only one of `sep`/`delimiter` may be given:
# current pandas raises ValueError when both are specified, so the comma is
# passed once via `sep`.
header = ['label', 'comment']
data = pd.read_csv('train-balanced.csv',
                   sep=',',
                   names=header,
                   usecols=[0, 1],
                   dtype={'label': int, 'comment': str},
                   # keep empty comments as '' instead of converting them to NaN
                   keep_default_na=False)
data.size

2021652

In [10]:
#helper function to clean the comments
def comment_clean(user_comment):
    """Clean one raw comment string and return the cleaned string.

    Steps, applied in order:
      1. strip trailing carriage returns / newlines;
      2. replace every '#' and every 'r/' with a space;
      3. replace a markdown link ``[name](http...)`` -- or, failing that,
         everything from the first 'http' to the end of that line -- with a
         space;
      4. if the comment contains exactly two '*', replace ``*text*`` with
         ``TEXT``;
      5. expand tokens found in the module-level ``slangdict``.

    Parameters:
        user_comment (str): the raw comment text.

    Returns:
        str: the cleaned comment.
    """
    # str.rstrip returns a new string, so the result has to be assigned.
    user_comment = user_comment.rstrip('\r\n')

    # remove the # from hashtags and the reddit tags (r/) from the comment
    user_comment = user_comment.replace('#', ' ')
    user_comment = user_comment.replace('r/', ' ')

    # remove the URL links from comments
    if 'HTTP' in user_comment:
        # url of the form [link name](HTTP://url)
        url_link = re.search(r'\[(.*)\(HTTP(.*)\)', user_comment)
        if url_link is not None:
            user_comment = user_comment.replace(url_link.group(0), ' ')
    if 'http' in user_comment:
        # url of the form [link name](http://url) ...
        url_link = re.search(r'\[(.*)\(http(.*)\)', user_comment)
        if url_link is None:
            # ... otherwise a bare url: drop from 'http' to the end of the line
            url_link = re.search(r'http(.*)', user_comment)
        if url_link is not None:
            user_comment = user_comment.replace(url_link.group(0), ' ')

    # Check if the comment has exactly 2 stars
    if user_comment.count('*') == 2:
        boldwords = re.search(r"\*(.*?)\*", user_comment)
        # '.' does not match a newline, so there is no match when the two stars
        # sit on different lines; an empty group means the comment held "**".
        if boldwords is not None and boldwords.group(1):
            # Replace the starred span with its capitalized text
            user_comment = user_comment.replace(boldwords.group(0),
                                                boldwords.group(1).upper())

    # Tokens are taken from a copy of the comment with punctuation removed.
    comment_words = re.sub(r"[^a-zA-Z0-9\s\']", "", user_comment).split()
    for word in comment_words:
        if word.upper() in slangdict:
            # NOTE(review): the upper-cased form is what gets replaced, so a
            # lower-case token whose upper-case form is a key is left as-is
            # unless the upper-case spelling also appears in the comment --
            # confirm whether lower-case slang should be expanded too.
            target = word.upper()
        elif word in slangdict:
            target = word
        else:
            continue
        expansion = slangdict[target]
        # Replace whole tokens only: a plain str.replace would also rewrite the
        # term wherever it occurs inside a longer word.
        whole_word = (r"(?<![A-Za-z0-9'])" + re.escape(target)
                      + r"(?![A-Za-z0-9'])")
        # A function replacement inserts the expansion verbatim (no backslash
        # or group-reference processing).
        user_comment = re.sub(whole_word, lambda match: expansion, user_comment)

    # NOTE: language filtering (cld2 / polyglot) is currently disabled, so
    # non-English comments are returned like any other comment.
    return user_comment

In [11]:
# Clean every comment, drop the rows whose cleaned comment is a single space,
# and report how long the pass took.
start_time = time.time()
data['comment'] = data['comment'].apply(comment_clean)
# remove data with empty comments
valid_comment = data['comment'].ne(' ')
data = data.loc[valid_comment]
end_time = time.time()
print("time taken ", end_time - start_time)

time taken  23.532181978225708


In [12]:
# Save the cleaned data as a pipe-delimited file (without the index column),
# then show the number of cells in the frame.
data.to_csv('clean_data_Cldpoly.csv', sep='|', index=False)
data.size

2021562

In [13]:
# Reload the cleaned, pipe-delimited data written by the previous step; empty
# comments are kept as '' rather than converted to NaN.
cleaneddata = pd.read_table(
    'clean_data_Cldpoly.csv',
    sep='|',
    usecols=[0, 1],
    dtype={'label': int, 'comment': str},
    keep_default_na=False,
)
cleaneddata.size

2021562

In [14]:
# Reading the slang dictionary that is already created
# (this replaces the `slangdict` built by the scraping cells above).
# The context manager closes the file even if the read fails.
with open("Slangdictionary.txt", "r") as f:
    res1 = f.read()
# literal_eval only accepts Python literals, so the file's contents are parsed
# as data and never executed.
slangdict = ast.literal_eval(res1)

In [15]:
# Capitalizing the characters with *...* and Replacing the words found in slang dictionary with the full forms

def preprocessing(user_comment):
    """Capitalize a ``*starred*`` span and expand slang tokens in one comment.

    If the comment contains exactly two '*', the ``*text*`` span is replaced
    by ``TEXT``. Afterwards every token found in the module-level ``slangdict``
    is replaced by its dictionary value.

    Parameters:
        user_comment (str): the comment text.

    Returns:
        str: the processed comment.
    """
    # Check if the comment has exactly 2 stars
    if user_comment.count('*') == 2:
        boldwords = re.search(r"\*(.*?)\*", user_comment)
        # '.' does not match a newline, so there is no match when the two stars
        # sit on different lines; an empty group means the comment held "**".
        if boldwords is not None and boldwords.group(1):
            # Replace the starred span with its capitalized text
            user_comment = user_comment.replace(boldwords.group(0),
                                                boldwords.group(1).upper())

    # Tokens are taken from a copy of the comment with punctuation removed.
    comment_words = re.sub(r"[^a-zA-Z0-9\s\']", "", user_comment).split()
    for word in comment_words:
        if word.upper() in slangdict:
            # NOTE(review): the upper-cased form is what gets replaced, so a
            # lower-case token whose upper-case form is a key is left as-is
            # unless the upper-case spelling also appears in the comment --
            # confirm whether lower-case slang should be expanded too.
            target = word.upper()
        elif word in slangdict:
            target = word
        else:
            continue
        expansion = slangdict[target]
        # Replace whole tokens only: a plain str.replace would also rewrite the
        # term wherever it occurs inside a longer word.
        whole_word = (r"(?<![A-Za-z0-9'])" + re.escape(target)
                      + r"(?![A-Za-z0-9'])")
        # A function replacement inserts the expansion verbatim (no backslash
        # or group-reference processing).
        user_comment = re.sub(whole_word, lambda match: expansion, user_comment)
    return user_comment


In [16]:
# Run the star-capitalization / slang-expansion step over every cleaned comment.
cleaneddata['comment'] = cleaneddata['comment'].apply(preprocessing)

In [17]:
def featureextraction(dataframe,field, func, column_names):
    """Return ``dataframe`` with extra feature columns computed from one field.

    ``func`` is called on every cell of ``dataframe[field]``; the values it
    returns are labelled with ``column_names`` and the resulting columns are
    concatenated to the right of the original frame.
    """
    def _labelled_row(cell):
        # One Series per input cell, indexed by the requested column names.
        return pd.Series(func(cell), index=column_names)

    feature_columns = dataframe[field].apply(_labelled_row)
    return pd.concat([dataframe, feature_columns], axis=1)

In [18]:
## function to get list of emojis in a comment
def find_emoji(text):
    """Return, in order, the whitespace-separated tokens of ``text`` that are keys of ``emoji_dict``."""
    matches = []
    for token in text.split():
        if token in emoji_dict:
            matches.append(token)
    return matches

In [19]:
# Extracting the features for each comment
# Punctuation Features and presence of sarcastic symbol
def allfeatures(user_comment):
    """Compute the hand-crafted features for one comment.

    Returns a 9-tuple, in this order:
        number of '!', number of '.', number of '?',
        1 if the comment contains '(!)' else 0,
        TextBlob polarity, TextBlob subjectivity,
        number of whitespace-separated tokens longer than one character for
        which ``str.isupper()`` is true,
        number of emoticon tokens whose first score in ``emoji_dict`` is 1,
        number of emoticon tokens whose first score in ``emoji_dict`` is -1.
    """
    # str.count returns 0 when the character is absent, so no presence check is
    # needed (a test written as `'!' or '.' or '?' in s` is always true anyway).
    Numofexclaimations = user_comment.count('!')
    Numofdots = user_comment.count('.')
    Numofquestionmarks = user_comment.count('?')

    SarcasticSymbol = 1 if '(!)' in user_comment else 0

    sentiments = TextBlob(str(user_comment)).sentiment
    polarity = sentiments.polarity
    subjectivity = sentiments.subjectivity

    numofcapitals = sum(x.isupper() for x in user_comment.split() if len(x) > 1)

    pscore = 0
    nscore = 0
    for item in find_emoji(user_comment):
        # Only the first score recorded for the emoticon is considered.
        score = emoji_dict[item][0]
        if score == 1:
            pscore += 1
        elif score == -1:
            nscore += 1
    return Numofexclaimations,Numofdots,Numofquestionmarks,SarcasticSymbol,polarity,subjectivity,numofcapitals,pscore,nscore

In [20]:
# Append the per-comment features to the cleaned data; the column names are
# listed in the same order as the tuple returned by allfeatures.
featureddataset = featureextraction(
    cleaneddata,
    'comment',
    allfeatures,
    [
        'Numofexclaimations',
        'Numofdots',
        'Numofquestionmarks',
        'SarcasticSymbol',
        'Polarity',
        'Subjectivity',
        'NumofCapitalWords',
        'PositiveEmojiCount',
        'NegativeEmojiCount',
    ],
)
featureddataset.size

11118591

In [21]:

# Tokenize every comment and POS-tag all of them in one batch; the timer covers
# tokenizing and tagging only, not the column assignment.
start_time = time.time()
txt = cleaneddata['comment'].tolist()
#POS tagging for all the tokens in the sentence
tagged_texts = pos_tag_sents(word_tokenize(comment_text) for comment_text in txt)
end_time = time.time()
cleaneddata['POS'] = tagged_texts
print("time taken ", end_time - start_time)

time taken  1254.7709407806396


In [22]:

#helper function to collect number of interjection
def comment_interjection(user_comment):
    """Count the 'UH' tags in an iterable of (word, tag) pairs."""
    return sum(1 for _word, tag in user_comment if tag == 'UH')

In [23]:
##feature extraction
# number of interjections ('UH' tags) in each comment's POS-tag list
featureddataset['Interjection'] = cleaneddata['POS'].apply(comment_interjection)


In [24]:
# Print the first rows of the feature table as plain text, then display the
# frame's total number of cells (DataFrame.size).
print(featureddataset.head())
featureddataset.size

   label                                            comment  \
0      0                          Nice Crib  and Nice Hand.   
1      0  You do know west teams play against west teams...   
2      0  They were underdogs earlier today, but since G...   
3      0  This meme isn't funny none of the "new york ni...   
4      0                    I could use one of those tools.   

   Numofexclaimations  Numofdots  Numofquestionmarks  SarcasticSymbol  \
0                 0.0        1.0                 0.0              0.0   
1                 0.0        0.0                 1.0              0.0   
2                 0.0        0.0                 0.0              0.0   
3                 0.0        1.0                 0.0              0.0   
4                 0.0        1.0                 0.0              0.0   

   Polarity  Subjectivity  NumofCapitalWords  PositiveEmojiCount  \
0  0.600000      1.000000                0.0                 0.0   
1  0.392857      0.517857                0.0  

12129372

In [25]:
# Display the label together with every numeric feature column
# (the raw comment text is left out).
featureddataset[[
    'label',
    'Numofexclaimations',
    'Numofdots',
    'Numofquestionmarks',
    'SarcasticSymbol',
    'Polarity',
    'Subjectivity',
    'NumofCapitalWords',
    'PositiveEmojiCount',
    'NegativeEmojiCount',
    'Interjection',
]]

Unnamed: 0,label,Numofexclaimations,Numofdots,Numofquestionmarks,SarcasticSymbol,Polarity,Subjectivity,NumofCapitalWords,PositiveEmojiCount,NegativeEmojiCount,Interjection
0,0,0.0,1.0,0.0,0.0,0.600000,1.000000,0.0,0.0,0.0,0
1,0,0.0,0.0,1.0,0.0,0.392857,0.517857,0.0,0.0,0.0,0
2,0,0.0,0.0,0.0,0.0,0.000000,0.500000,0.0,0.0,0.0,0
3,0,0.0,1.0,0.0,0.0,0.193182,0.727273,0.0,0.0,0.0,0
4,0,0.0,1.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0
5,0,0.0,0.0,0.0,0.0,0.075000,0.300000,0.0,0.0,0.0,0
6,0,0.0,3.0,0.0,0.0,-0.225000,0.750000,0.0,0.0,0.0,0
7,0,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0
8,0,0.0,0.0,0.0,0.0,0.700000,0.600000,0.0,0.0,0.0,0
9,0,0.0,1.0,0.0,0.0,0.500000,0.600000,0.0,0.0,0.0,0


In [26]:
from sklearn.model_selection import train_test_split

# Features are every column except the target and the raw comment text.
X = featureddataset.drop(['label', 'comment'], axis=1)
y = featureddataset['label']
# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and every metric computed from it below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# Linear-kernel SVM: fit on the training split, predict the held-out split and
# print the confusion matrix and the per-class report.
svclassifier = SVC(kernel='linear', C=1, gamma=1)
svclassifier.fit(X_train, y_train)

y_pred = svclassifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Gaussian (RBF) kernel SVM: fit on the training split, predict the held-out
# split and print the confusion matrix and the per-class report.
svclassifier = SVC(kernel='rbf', C=1, gamma=1)
svclassifier.fit(X_train, y_train)

y_pred = svclassifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Polynomial-kernel SVM (degree 3). The timer covers construction, fitting and
# prediction; the evaluation printout is outside the timed section.
start_time = time.time()
svclassifier = SVC(kernel='poly', degree=3)
svclassifier.fit(X_train, y_train)

y_pred = svclassifier.predict(X_test)
end_time = time.time()
print("time taken ", end_time - start_time)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Polynomial-kernel SVM (degree 4). The timer covers construction, fitting and
# prediction; the evaluation printout is outside the timed section.
start_time = time.time()
svclassifier = SVC(kernel='poly', degree=4)
svclassifier.fit(X_train, y_train)

y_pred = svclassifier.predict(X_test)
end_time = time.time()
print("time taken ", end_time - start_time)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))