In [1]:
import numpy as np
import pandas as pd

import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import spacy
import textstat
from textstat.textstat import textstatistics, legacy_round

import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [2]:
dataset=pd.read_csv("D:/Projects/Sarcasm Detection/App/train-balanced-sarcasm.csv")

In [3]:
data = dataset.copy()

In [4]:
data = data.dropna()

In [5]:
data.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [6]:
data.shape

(1010773, 10)

In [7]:
data["cleaned"]=data["comment"].astype(str)

## Sentiment Analysis

As sarcasm consists of sentiment expression, we examine the sentiment expressed in the reddit comments.

This assigns a value between -1 and +1 to each comment indicating how negative and positive the comments are represented.

In [8]:
analyzer = SentimentIntensityAnalyzer()

In [9]:
data['cc_score'] = data.comment.apply(analyzer.polarity_scores) 

In [10]:
data['cc_score']=data['cc_score'].apply(lambda x: x["compound"])

## All Capital words

*capital_words* consists of the number of whole capital words present in each comment.

In [11]:
def cap_count(text):
    cap=0
    for word in text.split():
        if word.isupper():
            cap +=1
    return cap

In [12]:
data["capital_words"]=data["comment"].apply(cap_count)

## Word Count

*tot_words* is the total number of words in the comment.

In [13]:
def tot_words(text):
    return(len(text.split()))

In [14]:
data["total_words"]=data["comment"].apply(tot_words)

## Puntuation marks!!

Here we have extracted the number of times a punctuation repeats in the comment. This is calculated for 7 different punctuations.

In [15]:
pun=[".",",","!","?","’","*","”"]
def punct(text,p):
    a=0
    for i in range(0,len(text)):
        if text[i]==p:
            a+=1
    return(a)


In [16]:
for p in pun:
    data[p]=data["comment"].apply(punct,args=(p))
data=data.rename(columns={".":"punc(.)",",":"punc(,)","!":"punc(!)",
                     "?":"punc(?)","’":"punc(')","*":"punc(*)","”":"punc(”)"})

## Repeated Characters

This is a boolean features taking value 0 if there are no unusual repetitions of characters in the comment and takes the value 1 if there are unusual(>5) repetitions of characters.

eg: This is sooooo funny! would take the value 1.

In [17]:
def repeat(text):
    text=text.split()
    words=[]
    for word in text:
        chars=0
        for char in word:
            if word.count(char)>=5:
                chars+=1
        words.append(chars)
    if max(words)>0:
        return(1)
    else:
        return(0)

In [18]:
data["char_repeated"]=data["comment"].apply(repeat)


## Unique Characters


*unique_char*   
The number of distinct characters in the comment.

*ratio_char*    
The ratio of distinct characters to total number of characters in the comment.

*tot_chars*   
The total number of characters in the comment.

In [19]:
def unique_char(text):
    chars=[]
    for i in text:
        if i not in chars:
            chars.append(i)
    return (len(chars))

In [20]:
data["unique_char"]=data["comment"].apply(unique_char)

In [21]:
data["ratio_char"]=data["unique_char"]/data["comment"].apply(len)

In [22]:
data["tot_chars"]=data["comment"].apply(len)

## Subreddit Ratio

The ratio of the number of non sarcastic comments to the number of sarcastic comments in the subreddit to which the comment belongs.

In [23]:
nsarc_count = dataset.subreddit[dataset['label']==0].value_counts()
sarc_count = dataset.subreddit[dataset['label']==1].value_counts()
all_count = dataset.subreddit.value_counts()

In [24]:
subreddits = pd.merge(nsarc_count, sarc_count,
                      right_index = True, left_index = True)

In [25]:
subreddits.columns = ['nsarc_count', 'sarc_count']

In [26]:
subreddits['ratio'] = subreddits.nsarc_count/subreddits.sarc_count

In [27]:
all_count = all_count.reset_index(drop=False)
subreddits = subreddits.reset_index(drop=False)

In [28]:
subreddits1 = pd.merge(all_count, subreddits, on = "index", how = "left")
subreddits = subreddits1.replace(np.NaN, 1)

In [29]:
ratio = dict(zip(list(subreddits['index']), list(subreddits['ratio'])))
data['ratio'] = data['subreddit'].map(ratio)

## Flesch score

Flesch score is an indicator of the readability of the comment, higher scores indicate material that is easier to read; lower numbers mark comments that are more difficult to read.It is calculated based on the average words per sentence and the average syllables per word.

In [30]:
def break_sentences(text):
    a_list =nltk.tokenize.sent_tokenize(text)
    return a_list 

In [31]:
def word_count(text):
    string1=text.strip()
    count=1
    for i in string1:
        if i==" ":
            count+=1
    return count

In [32]:
def sentence_count(text):
    sentences = break_sentences(text)
    return len(sentences)

In [33]:
def avg_sentence_length(text):
    words = word_count(text)
    sentences = sentence_count(text)
    average_sentence_length = float(words / sentences)
    return average_sentence_length

In [34]:
data["Avg_sentence_length"]=data["cleaned"].apply(avg_sentence_length)

In [35]:
def syllable_count(word):
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

In [36]:
def avg_syllables_per_word(text):
    syllable = syllable_count(text)
    words = word_count(text)
    ASPW = float(syllable) / float(words)
    return legacy_round(ASPW, 1)

In [37]:
data["Avg_syllables_per_word"]=data["cleaned"].apply(avg_syllables_per_word)

In [38]:
temp = data.nlargest(10,['Avg_syllables_per_word'])

In [39]:
temp = data.nsmallest(10,['Avg_syllables_per_word'])

In [40]:
def flesch_reading_ease(text):
    """
        Implements Flesch Formula:
        Reading Ease score = 206.835 - (1.015 × ASL) - (84.6 × ASW)
        Here,
          ASL = average sentence length (number of words 
                divided by number of sentences)
          ASW = average word length in syllables (number of syllables 
                divided by number of words)
    """
    FRE = 206.835 - float(1.015 * avg_sentence_length(text)) -\
          float(84.6 * avg_syllables_per_word(text))
    return legacy_round(FRE, 2)

In [41]:
data["Flesch_score"]=data["cleaned"].apply(flesch_reading_ease)

In [42]:
temp = data.nlargest(147538,['Flesch_score'])

In [43]:
temp = data.nsmallest(33260,['Flesch_score'])

## Swear Words

This is a boolean feature, which takes value 1 if swear words are present in the comment and 0 otherwise.

In [44]:
def swearWord(text):
    feature3=False
    Swearwords =["shit","fuck","damn","bitch","crap","piss","dick","darn",
                 "cock","pussy","asshole","fag","bastard","slut","douche",
                 "bloody","cunt","bugger","bollocks","arsehole"]
    for item in Swearwords:
        if item in text:
            feature3=True
    return feature3

In [45]:
data["SwearWord"]=data["cleaned"].apply(swearWord)

## n-grams

In [50]:
train_texts, valid_texts, y_train, y_valid = train_test_split(data['comment'], data['label'], random_state=17)

In [51]:
tf_idf = TfidfVectorizer(ngram_range=(1, 3), max_features=50000, min_df=2)
# multinomial logistic regression a.k.a softmax classifier
logit = LogisticRegression(C=1, n_jobs=4, solver='lbfgs',random_state=17, verbose=1)
# sklearn's pipeline
tfidf_logit_pipeline = Pipeline([('tf_idf', tf_idf),('logit', logit)])

In [52]:
tfidf_logit_pipeline.fit(train_texts, y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:   14.1s finished


Pipeline(steps=[('tf_idf',
                 TfidfVectorizer(max_features=50000, min_df=2,
                                 ngram_range=(1, 3))),
                ('logit',
                 LogisticRegression(C=1, n_jobs=4, random_state=17,
                                    verbose=1))])

In [53]:
valid_pred = tfidf_logit_pipeline.predict(valid_texts)

In [54]:
data['n_gram_prediction'] = tfidf_logit_pipeline.predict(data['comment'])

## Dataset after feature extraction

In [55]:
data.shape

(1010773, 31)

In [56]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1010773 entries, 0 to 1010825
Data columns (total 31 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   label                   1010773 non-null  int64  
 1   comment                 1010773 non-null  object 
 2   author                  1010773 non-null  object 
 3   subreddit               1010773 non-null  object 
 4   score                   1010773 non-null  int64  
 5   ups                     1010773 non-null  int64  
 6   downs                   1010773 non-null  int64  
 7   date                    1010773 non-null  object 
 8   created_utc             1010773 non-null  object 
 9   parent_comment          1010773 non-null  object 
 10  cleaned                 1010773 non-null  object 
 11  cc_score                1010773 non-null  float64
 12  capital_words           1010773 non-null  int64  
 13  total_words             1010773 non-null  int64  
 14  pu

In [57]:
data.head(5)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,...,char_repeated,unique_char,ratio_char,tot_chars,ratio,Avg_sentence_length,Avg_syllables_per_word,Flesch_score,SwearWord,n_gram_prediction
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ...",...,0,8,0.8,10,0.651861,3.0,0.3,178.41,False,0
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...,...,0,21,0.283784,74,0.899302,14.0,1.1,99.57,False,0
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.,...,0,30,0.247934,121,0.961192,19.0,1.7,43.73,False,0
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz,...,0,21,0.35,60,1.634532,12.0,1.5,67.76,False,0
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...,...,0,14,0.451613,31,0.985737,7.0,1.4,81.29,False,0


In [58]:
data.describe()

Unnamed: 0,label,score,ups,downs,cc_score,capital_words,total_words,punc(.),"punc(,)",punc(!),...,punc(”),char_repeated,unique_char,ratio_char,tot_chars,ratio,Avg_sentence_length,Avg_syllables_per_word,Flesch_score,n_gram_prediction
count,1010773.0,1010773.0,1010773.0,1010773.0,1010773.0,1010773.0,1010773.0,1010773.0,1010773.0,1010773.0,...,1010773.0,1010773.0,1010773.0,1010773.0,1010773.0,1010773.0,1010773.0,1010773.0,1010773.0,1010773.0
mean,0.4999817,6.885999,5.49914,-0.1458686,0.06431744,0.4198856,10.46145,0.7107095,0.3952787,0.09664386,...,0.0,0.0130326,19.1417,0.4630284,56.6923,1.141224,10.38804,1.529425,66.90383,0.4633968
std,0.5000002,48.34411,41.27402,0.3529746,0.388393,3.704036,10.53495,9.815932,0.8888833,0.3038451,...,0.0,0.1134141,6.163068,0.2033343,61.82128,1.435628,10.46128,3.827022,323.8463,0.4986587
min,0.0,-507.0,-507.0,-1.0,-1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0002,1.0,0.02439024,1.0,0.0,-281681.4,0.0
25%,0.0,1.0,0.0,0.0,-0.0057,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,15.0,0.3098592,27.0,0.7747748,5.0,1.3,50.5,0.0
50%,0.0,2.0,1.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,...,0.0,0.0,20.0,0.4255319,46.0,0.9450549,9.0,1.5,71.14,0.0
75%,1.0,4.0,3.0,0.0,0.3612,1.0,14.0,1.0,1.0,0.0,...,0.0,0.0,23.0,0.5806452,74.0,1.214965,14.0,1.7,89.75,1.0
max,1.0,9070.0,5163.0,0.0,0.9999,1663.0,2222.0,9794.0,420.0,44.0,...,0.0,1.0,65.0,1.0,10000.0,70.0,2222.0,3332.0,191.27,1.0
