In [None]:
#CelL No. 1

!pip install stanfordnlp
!apt-get install -y openjdk-11-jdk-headless -qq > /dev/null
# The CoreNLP zip file was downloaded and extracted manually; the commands below are kept for reference only.
"""
!wget -q http://nlp.stanford.edu/software/stanford-corenlp-4.4.0.zip
!unzip -q stanford-corenlp-4.4.0.zip
"""

# Starting the CoreNLP server from Python failed with an "Unable to create sub process" error, so the java command below was run directly from the command prompt instead.
"""
Start the Stanford CoreNLP server
import os
os.chdir('stanford-corenlp-4.4.0')
!java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000 &

Move back to the parent directory
os.chdir('..')
"""

In [None]:
#CelL No. 2

!pip install pandas
!pip install requests
!pip install openpyxl
!pip install Flask

import json

import pandas as pd
import requests
from openpyxl import Workbook

In [None]:
#CelL No. 3

#Preprocessing

def lemmatizee(text, url='http://localhost:9000', timeout=30):
    """Lemmatize ``text`` with a running Stanford CoreNLP server.

    The UTF-8 encoded text is posted to the server with the
    ``tokenize,ssplit,pos,lemma`` annotators, and the lemma of every token of
    every returned sentence is joined with single spaces.

    Args:
        text: The string to lemmatize.
        url: Base URL of the CoreNLP server. Defaults to the local server on
            port 9000.
        timeout: Seconds to wait for the server. When exceeded, ``requests``
            raises a timeout error instead of blocking the notebook forever.

    Returns:
        The space-separated lemmas, or an empty string when the server answers
        with a status other than 200 (the status code is printed).
    """
    properties = {
        'annotators': 'tokenize,ssplit,pos,lemma',
        'outputFormat': 'json'
    }
    response = requests.post(
        url,
        params={'properties': json.dumps(properties)},
        data=text.encode('utf-8'),
        headers={'Content-Type': 'text/plain'},
        timeout=timeout
    )
    if response.status_code != 200:
        print("Error:", response.status_code)
        return ''

    annotated = response.json()
    lemmas = [
        token['lemma']
        for sentence in annotated['sentences']
        for token in sentence['tokens']
    ]
    return ' '.join(lemmas).strip()
# The dataset is line-delimited JSON: one record per non-blank line.
with open('Sarcasm_Headlines_Dataset.json', 'r', encoding='utf-8') as file:
    data_list = [json.loads(line) for line in file if line.strip()]

Converted_Dataset = pd.DataFrame(data_list)
# .copy() so the column assignment below works on an independent frame.
Converted_Dataset = Converted_Dataset[['headline', 'is_sarcastic']].copy()

# Assign the lemmatized text back as a whole column. Writing to the rows
# yielded by iterrows() only changes temporary copies, so the saved file
# would still contain the raw headlines.
Converted_Dataset['headline'] = Converted_Dataset['headline'].apply(lemmatizee)

Converted_Dataset.to_excel('lemmatizedDataset.xlsx', index=False)

In [None]:
#CelL No. 4

#Module 1
import json
import requests

# Base URL for English concept lookups in the ConceptNet web API.
endpoint = 'http://api.conceptnet.io/c/en/'
# Query-string parameters sent with every lookup.
params = {
    'filter': 'core',
    'limit': 1000
}
def conceptNet(word, timeout=30):
    """Fetch the ConceptNet edges for ``word``, strongest first.

    Args:
        word: The term appended to ``endpoint`` to form the lookup URL.
        timeout: Seconds to wait for the API before ``requests`` raises a
            timeout error.

    Returns:
        The list of edge dicts from the response, sorted by descending
        ``weight`` (edges without a weight sort as 0). An empty list is
        returned when the API answers with a non-200 status (the status code
        is printed) or when the payload has no ``edges`` entry.
    """
    response = requests.get(endpoint + word, params=params, timeout=timeout)
    if response.status_code != 200:
        # Same reporting style as the CoreNLP helper: report and degrade to
        # "no expansion" instead of failing on an error page.
        print("Error:", response.status_code)
        return []

    data = response.json()
    edges = data.get('edges', [])
    edges.sort(key=lambda edge: edge.get('weight', 0), reverse=True)

    return edges

In [None]:
#CelL No. 5

!pip install senticnet
!pip install sentistrength

from senticnet.senticnet import SenticNet
from sentistrength import PySentiStr

In [None]:
#CelL No. 6

#Module 2
sn = SenticNet()
def senticNetScore(word):
    """Return the SenticNet polarity of ``word`` multiplied by 5.

    Returns None when the lookup raises KeyError.
    """
    try:
        raw_polarity = sn.polarity_value(word)
    except KeyError:
        return None
    return float(raw_polarity) * 5

senti = PySentiStr()

# Point the wrapper at the SentiStrength jar and its language data folder.
senti.setSentiStrengthPath('SentiStrength.jar')
senti.setSentiStrengthLanguageFolderPath('SentiStrength_Data')
def sentiStrengthScore(word):
    """Return whatever ``senti.getSentiment`` produces for ``word``, unchanged."""
    return senti.getSentiment(word)

def sWordScore(word):
    """Return a sentiment score for ``word`` from SenticNet and SentiStrength.

    * Both lexicons have a score: the mean of the two.
    * Only one has a score: that score.
    * Neither (SenticNet returns None and SentiStrength returns 0): the word
      is expanded through ConceptNet. The SenticNet scores of the end labels
      of the five highest-weighted edges are summed and divided by 5. Labels
      SenticNet does not know, and edges without an end label, contribute 0.
      The result is 0 when ConceptNet returns no edges.
    """
    sNet = senticNetScore(word)
    sStrength = sentiStrengthScore(word)[0]

    if sNet is not None and sStrength != 0:
        return (sNet + sStrength) / 2
    if sNet is not None:
        return sNet
    if sStrength != 0:
        return sStrength

    # Neither lexicon knows the word: fall back to related concepts.
    expansion = conceptNet(word)[:5]
    if not expansion:
        return 0

    score = 0
    for edge in expansion:
        try:
            label = edge['end']['label']
        except (KeyError, TypeError):
            # Malformed edge: counts as 0, as it always has.
            continue
        related = senticNetScore(label)
        # senticNetScore signals "unknown" with None; treat that as 0 rather
        # than relying on a catch-all except, which would also swallow
        # KeyboardInterrupt during a long run.
        if related is not None:
            score += related
    # Always divided by 5, so missing or unknown neighbours pull towards 0.
    return score / 5

def posNegScore(sentenceScores):
    """Sum the positive and the negative scores separately.

    Returns a ``(positive_total, negative_total)`` tuple; zeros are ignored.
    """
    positive_total, negative_total = 0, 0
    for value in sentenceScores:
        if value > 0:
            positive_total = positive_total + value
            continue
        if value < 0:
            negative_total = negative_total + value
    return positive_total, negative_total

def isContradiction(sentence):
    """Report whether ``sentence`` mixes positive and negative word scores.

    Prints the sentence, scores every NLTK token with ``sWordScore`` and
    returns True only when both the positive and the negative totals are
    non-zero.
    """
    print("Sentence: ",sentence)
    wordScores = [sWordScore(token) for token in nltk.word_tokenize(sentence)]
    positiveTotal, negativeTotal = posNegScore(wordScores)
    return bool(positiveTotal != 0 and negativeTotal != 0)

In [None]:
#CelL No. 7

!pip install space
!pip install nltk

import re
import spacy
import nltk
from nltk.stem import WordNetLemmatizer

In [None]:
#CelL No. 8

#Module 3

# spaCy pipeline loaded once; extractSubject and hasAntecedents call it on their input text.
spacyNLP = spacy.load("en_core_web_sm")
# Shared NLTK WordNet lemmatizer used by identicalPronouns.
lemmatizer = WordNetLemmatizer()

def extractSubject(sentence):
    """Return the text of the first token whose dependency label is ``nsubj``.

    Returns None when the parsed sentence has no such token.
    """
    subject_texts = (tok.text for tok in spacyNLP(sentence) if tok.dep_ == "nsubj")
    return next(subject_texts, None)

def hasAntecedents(text):
    """Report whether any verb subject in ``text`` lies inside a named entity.

    A token qualifies when its dependency label is ``nsubj``, its head is
    tagged ``VERB`` and its index falls within the span of one of the
    document's entities.
    """
    parsed = spacyNLP(text)
    return any(
        entity.start <= word.i < entity.end
        for word in parsed
        if word.dep_ == "nsubj" and word.head.pos_ == "VERB"
        for entity in parsed.ents
    )

def identicalPronouns(w1, w2):
    """Return True when both words have the same WordNet noun lemma."""
    first_lemma = lemmatizer.lemmatize(w1, 'n')
    second_lemma = lemmatizer.lemmatize(w2, 'n')
    return first_lemma == second_lemma

def identicalSubjects(w1, w2):
    """Return True when the two words match after removing every non-letter.

    Only ASCII letters are kept; the comparison is case-sensitive.
    """
    letters_only = [re.sub(r'[^a-zA-Z]', '', subject) for subject in (w1, w2)]
    return letters_only[0] == letters_only[1]

def definiteNounPhraseFeature(text, w2):
    """Return True when ``w2`` appears in ``text`` directly after the token 'the'."""
    words = nltk.word_tokenize(text)
    # Pair every token with its predecessor; the first token has none.
    for previous, current in zip(words, words[1:]):
        if current == w2 and previous == 'the':
            return True
    return False

def demonstrativeNounPhraseFeature(text, w2):
    """Return True when ``w2`` appears in ``text`` directly after a demonstrative.

    The demonstratives checked are 'this', 'that', 'these' and 'those'.
    """
    words = nltk.word_tokenize(text)
    return any(
        current == w2 and previous in ['this', 'that', 'these', 'those']
        for previous, current in zip(words, words[1:])
    )

def properNameFeature(w1, w2):
    """Return True when NLTK tags both words as proper nouns (NNP or NNPS).

    The two words are tagged together as one two-token sequence.
    """
    for _word, tag in nltk.pos_tag([w1, w2]):
        if tag not in ['NNP', 'NNPS']:
            return False
    return True

def checkCoherence(sentence):
    """Report whether the first two sentences of ``sentence`` look coherent.

    Returns False for text that NLTK splits into fewer than two sentences.
    Otherwise returns True when the whole text passes ``hasAntecedents``, or
    when the subjects of the first two sentences satisfy any of the pairwise
    checks: identical noun lemma, identical letters, definite or
    demonstrative noun phrase around the second subject, or both tagged as
    proper names.

    ``extractSubject`` returns None for a sentence without an ``nsubj``
    token. Checks that need a missing subject are skipped instead of passing
    None to helpers that expect strings.
    """
    sentences = nltk.sent_tokenize(sentence)
    if len(sentences) <= 1:
        return False
    if hasAntecedents(sentence):
        return True

    subject1 = extractSubject(sentences[0])
    subject2 = extractSubject(sentences[1])
    if subject2 is None:
        # Every remaining check needs the second subject.
        return False

    hasBoth = subject1 is not None
    if hasBoth and (identicalPronouns(subject1, subject2) or identicalSubjects(subject1, subject2)):
        return True
    # These two only look at the second sentence and its subject.
    if definiteNounPhraseFeature(sentences[1], subject2) or demonstrativeNounPhraseFeature(sentences[1], subject2):
        return True
    return hasBoth and properNameFeature(subject1, subject2)

In [None]:
#CelL No. 9

#Module 4
import re

def countEmoticons(headline):
    """Count characters that are neither word characters, whitespace nor commas."""
    return sum(1 for _ in re.finditer(r'[^\w\s,]', headline))

def countRepetitivePunctuations(headline):
    """Count runs of two or more consecutive non-word characters or underscores.

    Whitespace counts as a non-word character here, so ", " is one run.
    """
    run_pattern = re.compile(r'([\W_]){2,}')
    return sum(1 for _ in run_pattern.finditer(headline))

def countRepetitiveSequences(headline):
    """Count runs in which one non-whitespace character repeats back to back."""
    repeated_runs = re.finditer(r'(\S)\1{1,}', headline)
    return sum(1 for _ in repeated_runs)

def countCapitalLetters(headline):
    """Count the ASCII capital letters (A-Z) in ``headline``."""
    capitals = re.compile(r'[A-Z]').findall(headline)
    return len(capitals)

def countExclamationMarks(headline):
    """Count the exclamation marks in ``headline``."""
    marks = re.findall(r'!', headline)
    return len(marks)

# One entry per non-blank line of the word list, stripped and case-folded.
with open("Slang_and_Booster_Words.txt", 'r') as file:
    slang_booster = [entry.casefold() for entry in map(str.strip, file) if entry]

print(slang_booster)

def countBoostersAndSlangs(headline):
    """Count the whitespace-separated words of ``headline`` found in ``slang_booster``.

    Each word is lower-cased before the lookup.
    """
    matches = [word for word in headline.split() if word.lower() in slang_booster]
    return len(matches)

with open("Idioms.txt", 'r') as file:
    idioms = [line.strip().casefold() for line in file if line.strip()]

print(idioms)

# Every idiom as a tuple of its words, so entries made of several words can
# be compared against word windows of a headline. Built once from `idioms`.
_idiom_phrases = {tuple(idiom.split()) for idiom in idioms}
# The distinct idiom lengths (in words) that actually occur in the list.
_idiom_lengths = sorted({len(phrase) for phrase in _idiom_phrases})

def countIdioms(headline):
    """Count the idiom occurrences in ``headline``.

    The headline is split on whitespace and case-folded, the same
    normalisation applied to the idiom list. Every window of consecutive
    words whose length matches an idiom length is looked up, so single-word
    entries are counted per matching word and multi-word entries are counted
    per occurrence of the whole phrase.
    """
    words = [word.casefold() for word in headline.split()]
    return sum(
        1
        for size in _idiom_lengths
        for start in range(len(words) - size + 1)
        if tuple(words[start:start + size]) in _idiom_phrases
    )

In [None]:
#CelL No. 10

#FeatureSet Definition & Computation

import pandas as pd
import copy

# Column template for the feature table: four leading columns, then a
# low/med/high triple for each measured quantity. Every column starts empty.
features = {column: [] for column in ("headline", "is_sarcastic", "contra", "contra_plus_coher")}
features.update(
    (group + "_" + level, [])
    for group in ("pos", "neg", "emo", "punct", "char", "cap", "slangBooster", "exclaim", "idiom")
    for level in ("low", "med", "high")
)

def calLowMedHigh(score,a,b,t):
    """Return 1 when ``score`` falls in the bucket named by ``t``, else 0.

    Buckets: "low" is ``score < a``, "med" is ``a <= score <= b`` and "high"
    is ``score > b``. Any other ``t`` yields 0.
    """
    if t == "low":
        matched = score < a
    elif t == "med":
        matched = score >= a and score <= b
    elif t == "high":
        matched = score > b
    else:
        matched = False
    return 1 if matched else 0

def calculateFeatures(headline, isSarcastic = None):
    """Build the feature vector for a single headline.

    Args:
        headline: The headline text.
        isSarcastic: The label stored with the vector, or None when unknown.

    Returns:
        A dict with the keys of ``features``. ``"headline"`` holds the
        headline string itself; every other key holds a one-element list, so
        the dict converts into a one-row DataFrame.

    The word scores are computed once and reused for both the contradiction
    flag and the positive/negative buckets.
    """
    # Every column defaults to [0]; the ones computed below are overwritten.
    vector = {name: [0] for name in features}
    # Kept as a bare string, as callers have always received it.
    vector["headline"] = headline
    vector["is_sarcastic"] = [isSarcastic]

    # Copy without punctuation and digits, used for the word scoring.
    sen = re.sub(r'[^\w\s]|[\d]', '', headline)
    words = nltk.word_tokenize(sen)

    print("Sentence: ", sen)
    sample = [sWordScore(word) for word in words]
    print("Calculating Positive Negative Scores for", headline)
    posSum, negSum = posNegScore(sample)
    # Contradiction: the headline carries both positive and negative sentiment.
    contradiction = posSum != 0 and negSum != 0

    if len(nltk.sent_tokenize(headline)) > 1:
        # checkCoherence splits its argument into sentences again, so it gets
        # the original headline: `sen` has lost the punctuation that this
        # very branch relied on to detect more than one sentence.
        vector["contra_plus_coher"] = [1 if contradiction and checkCoherence(headline) else 0]
    else:
        vector["contra"] = [1 if contradiction else 0]

    print("Calculating Emoticons Scores for", headline)
    emoSum = countEmoticons(headline)
    print("Calculating Repetitive Punctuations Scores for", headline)
    punctSum = countRepetitivePunctuations(headline)
    print("Calculating Repetitive Sequence Scores for", headline)
    charSum = countRepetitiveSequences(headline)
    print("Calculating Capital Letters in", headline)
    capSum = countCapitalLetters(headline)
    print("Calculating Slangs and Boosters in", headline)
    bSSum = countBoostersAndSlangs(headline)
    print("Calculating Exclaimation Marks in", headline)
    exclaimSum = countExclamationMarks(headline)
    print("Calculating Idioms in", headline)
    idiomSum = countIdioms(headline)

    levels = ("low", "med", "high")

    # Positive sentiment is bucketed with thresholds 0 and 1.
    for level in levels:
        vector["pos_" + level] = [calLowMedHigh(posSum, 0, 1, level)]

    # Negative sums grow downwards, so the labels are mirrored: a sum below
    # -1 is "high" negativity and a sum above 0 would be "low".
    vector["neg_low"] = [calLowMedHigh(negSum, -1, 0, "high")]
    vector["neg_med"] = [calLowMedHigh(negSum, -1, 0, "med")]
    vector["neg_high"] = [calLowMedHigh(negSum, -1, 0, "low")]

    # All count-based features share the thresholds 1 and 3.
    counts = (
        ("emo", emoSum),
        ("punct", punctSum),
        ("char", charSum),
        ("cap", capSum),
        ("slangBooster", bSSum),
        ("exclaim", exclaimSum),
        ("idiom", idiomSum),
    )
    for prefix, count in counts:
        for level in levels:
            vector[prefix + "_" + level] = [calLowMedHigh(count, 1, 3, level)]

    return vector

In [None]:
#CelL No. 11

#Feature Set Computation over Lemmatized DataSet

# It took more than 76 hours for the dataset of 26710 headlines.
df1 = pd.read_excel("lemmatizedDataset.xlsx")

# Build one single-row frame per headline and concatenate them once at the
# end; concatenating inside the loop copies the growing table every iteration.
featureRows = [
    pd.DataFrame(calculateFeatures(headline, label))
    for headline, label in zip(df1["headline"], df1["is_sarcastic"])
]
# An empty input still yields a table with the expected columns.
final = pd.concat(featureRows, ignore_index=True) if featureRows else pd.DataFrame(features)
print(final)
final.to_excel('FeatureSet.xlsx', index=False)

In [None]:
#CelL No. 12

!pip install scikit-learn
import sklearn
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
#CelL No. 13

#Evaluation Metrics Definitions

# Reload the feature table from FeatureSet.xlsx so the evaluation cells can start from the saved file.
final = pd.read_excel("FeatureSet.xlsx")
# Empty results table with one column per reported metric.
results = pd.DataFrame({"Method" : [], "Precision" : [], "Recall": [], "F-Measure": [], "Accuracy": []})

def calculate_metrics(y_true, y_pred):
    """Return ``(precision, recall, f_measure, accuracy)`` for binary labels.

    Precision, recall and F1 are computed with ``average='binary'``.
    """
    binary_scorers = (precision_score, recall_score, f1_score)
    precision, recall, f_measure = [
        scorer(y_true, y_pred, average='binary') for scorer in binary_scorers
    ]
    return precision, recall, f_measure, accuracy_score(y_true, y_pred)

In [None]:
#CelL No. 14

#Contradiction in Sentiment Scores

y = final["is_sarcastic"]
# Predict sarcasm whenever either contradiction flag is set for the headline.
contradiction = [
    1 if (contraFlag == 1 or coherFlag == 1) else 0
    for contraFlag, coherFlag in zip(final['contra'], final["contra_plus_coher"])
]

contraPre, contraRecall, contraF, contraAccuracy = calculate_metrics(y, contradiction)
print("Contradiction in Sentiment Scores")
print(contraPre, contraRecall, contraF, contraAccuracy, sep="\n")

In [None]:
#CelL No. 15

#N-Grams Prediction

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# Bag of word uni-, bi- and tri-grams over the headlines.
ngramX = final["headline"]
ngramRange = (1,3)
vectorizer = CountVectorizer(ngram_range=ngramRange)
vectorizedX = vectorizer.fit_transform(ngramX)
ngramSVM = SVC(kernel='linear')
# Fitted on the full data so its decision margins are available afterwards;
# the reported metrics come from the 10-fold predictions below.
ngramSVM.fit(vectorizedX, y)
ngramPred = cross_val_predict(ngramSVM, vectorizedX, y, cv=10, n_jobs=4)

ngramPre, ngramRecall, ngramF, ngramAccuracy = calculate_metrics(y, ngramPred)
# Label matches the method being evaluated here (n-grams, not the feature space).
print("N-grams SVC Classification")
print(ngramPre)
print(ngramRecall)
print(ngramF)
print(ngramAccuracy)

In [None]:
#CelL No. 16

#Feature Space Prediction

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# Everything except the label and the raw text is a feature column.
featuresX = final.drop(columns=['is_sarcastic', 'headline'])
# The feature columns go to the SVM as they are. No text vectorizer is
# created here, so `vectorizer` and `vectorizedX` from the N-grams cell stay
# intact for the cells that still need the n-gram matrix.
featuresSVM = SVC(kernel='linear')
assert len(featuresX) == len(y), "Mismatched lengths between features and labels"
# Fitted on the full data so its decision margins are available afterwards;
# the reported metrics come from the 10-fold predictions below.
featuresSVM.fit(featuresX, y)
featuresPred = cross_val_predict(featuresSVM, featuresX, y, cv=10,n_jobs=4)

featuresPre, featuresRecall, featuresF, featuresAccuracy = calculate_metrics(y, featuresPred)
print("Feature Space Classification")
print(featuresPre)
print(featuresRecall)
print(featuresF)
print(featuresAccuracy)

In [None]:
#CelL No. 17

#N-grams + Feature Set Combined Prediction

# Decision margins of both fitted SVMs for every headline, computed in one
# batch call per model. `vectorizedX` must be the n-gram matrix that
# `ngramSVM` was fitted on.
ngramMargins = ngramSVM.decision_function(vectorizedX)
featureMargins = featuresSVM.decision_function(featuresX)

# Where the two models disagree, keep the prediction of the model whose
# margin is larger in absolute value (ties go to the feature-space model).
ngramFeatures = []
for ngramLabel, featureLabel, ngramsMargin, featuresMargin in zip(
        ngramPred, featuresPred, ngramMargins, featureMargins):
    if ngramLabel == featureLabel or abs(ngramsMargin) > abs(featuresMargin):
        ngramFeatures.append(ngramLabel)
    else:
        ngramFeatures.append(featureLabel)

combinedPre, combinedRecall, combinedF, combinedAccuracy = calculate_metrics(y, ngramFeatures)
print("Combined N-grams & Feature Set Classification")
print(combinedPre)
print(combinedRecall)
print(combinedF)
print(combinedAccuracy)

In [None]:
#CelL No. 18

#Saving Evaluation Metrics

# One list per output column; position k in every list describes method k.
res = {
    "Method": [
        "contradiction in Sentiment Scores",
        "N-grams SVC Classification",
        "Feature-Space SVC Classification",
        "Combined N-grams & Feature Space",
    ],
    "Precision": [contraPre, ngramPre, featuresPre, combinedPre],
    "Recall": [contraRecall, ngramRecall, featuresRecall, combinedRecall],
    "F-Measure": [contraF, ngramF, featuresF, combinedF],
    "Accuracy": [contraAccuracy, ngramAccuracy, featuresAccuracy, combinedAccuracy],
}

results = pd.DataFrame(res)
results.to_excel("Evaluation.xlsx", index=False)