In [1]:
# importing libraries
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
import contractions
from unidecode import unidecode
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import scikitplot as skplt
import seaborn as sns
from sklearn import metrics


In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Importing the data

In [2]:
import json
# Load the dataset, which stores one JSON record per line.
# The context manager closes the file handle as soon as parsing finishes
# (the bare open() call previously left it open), and blank lines are skipped
# so a trailing newline at the end of the file cannot break json.loads.
with open("Sarcasm_Headlines_Dataset.json", "r", encoding="utf-8") as dataset_file:
    data = [json.loads(line) for line in dataset_file if line.strip()]

In [3]:
# Turn the list of parsed records into a DataFrame (one row per record).
new_df = pd.DataFrame(data)

In [4]:
# Discard the article URL column; only the headline and label are kept.
data = new_df.drop(columns=["article_link"])

In [5]:
# Keep only the first 100 rows for the rest of the notebook.
data = data.iloc[:100]

In [7]:
# Print the index range, column names, non-null counts and dtypes of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   headline      100 non-null    object
 1   is_sarcastic  100 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.7+ KB


In [9]:
# Preview the first two rows of the frame.
data.head(2)

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0


# Splitting the data

In [11]:
# Hold out 25% of the headlines for evaluation; the fixed random_state makes
# the split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    data["headline"],
    data["is_sarcastic"],
    test_size=0.25,
    random_state=42,
)

# Preprocessing functions

In [12]:
# creating functions for preprocessing
def remove_newlines(data):
    """Replace newline and tab markers in a string with single spaces.

    Handles both real newline characters and the literal two-character
    sequence backslash + "n" (which appears when text has been escaped),
    as well as real tab characters.

    Args:
        data: The text to normalise.

    Returns:
        A copy of ``data`` with every newline/tab marker replaced by a space.
    """
    formatted_text = data
    # The escaped form is handled first, then the real control characters.
    for marker in ("\\n", "\n", "\t"):
        formatted_text = formatted_text.replace(marker, " ")
    return formatted_text

def contraction_map(data):
    """Return ``data`` after passing it through ``contractions.fix``."""
    return contractions.fix(data)

def handle_accented(data):
    """Return ``data`` after transliterating it with ``unidecode``."""
    return unidecode(data)

# Start from NLTK's English stop words, then take the negation words back out
# of the list so that cleaning_text does not filter them away.
stopword_list = stopwords.words("english")
for negation in ("no", "not", "nor"):
    stopword_list.remove(negation)
def cleaning_text(data):
    """Tokenize text and keep only lowercase, alphabetic, non-stop-word tokens.

    Args:
        data: The text to tokenize.

    Returns:
        A list of lowercased tokens. A token is dropped when its lowercase
        form is in ``stopword_list``, when it occurs inside the
        ``string.punctuation`` string, when it is a single character, or when
        it contains anything other than letters.
    """
    kept_tokens = []
    for token in word_tokenize(data):
        lowered = token.lower()
        # `token in punctuation` is a substring test against string.punctuation.
        if lowered in stopword_list or token in punctuation:
            continue
        if len(lowered) > 1 and lowered.isalpha():
            kept_tokens.append(lowered)
    return kept_tokens

def lemmatization(data):
    """Lemmatize each token and join the results into one string.

    Args:
        data: An iterable of word tokens.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    return " ".join(wordnet.lemmatize(token) for token in data)

In [13]:
# Apply the cleaning steps to every headline, in this order:
# newline/tab removal -> contraction expansion -> accent transliteration
# -> tokenizing and filtering -> lemmatizing and re-joining into a string.
clean_train = (
    x_train
    .apply(remove_newlines)
    .apply(contraction_map)
    .apply(handle_accented)
    .apply(cleaning_text)
    .apply(lemmatization)
)

# The held-out headlines go through exactly the same steps.
clean_test = (
    x_test
    .apply(remove_newlines)
    .apply(contraction_map)
    .apply(handle_accented)
    .apply(cleaning_text)
    .apply(lemmatization)
)

# Word indexing

In [15]:
# Word indexing: convert each cleaned headline into a list of integer word ids.
max_words = 1000
train_sent_list = clean_train.to_list()
test_sent_list = clean_test.to_list()

# The vocabulary is fitted on the training headlines only; "##oov##" is the
# placeholder token configured for out-of-vocabulary words.
tk = Tokenizer(num_words=max_words, oov_token="##oov##")
tk.fit_on_texts(train_sent_list)

x_train_seq = tk.texts_to_sequences(train_sent_list)
x_test_seq = tk.texts_to_sequences(test_sent_list)

In [17]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Give every sequence the same length: shorter ones are padded at the end,
# longer ones are cut at the end.
max_words_per_sent = 500
x_train_seq = pad_sequences(
    x_train_seq,
    maxlen=max_words_per_sent,
    padding='post',
    truncating='post',
)
x_test_seq = pad_sequences(
    x_test_seq,
    maxlen=max_words_per_sent,
    padding='post',
    truncating='post',
)

# Building a simple RNN

In [18]:
from tensorflow.keras.layers import Dense,Embedding,SimpleRNN,Bidirectional,LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
# Early-stopping callback: watches validation loss with a patience of 3 epochs
# and restores the best weights seen during training.
es = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [19]:
# Simple RNN classifier: embedding -> one recurrent layer -> dense head.
model = Sequential()
embdding_dim = 50
model.add(Embedding(input_dim=max_words + 1, output_dim=embdding_dim, input_length=max_words_per_sent))
# return_sequences=False makes the recurrent layer emit only its final state,
# so the sigmoid head produces one probability per headline, matching the
# single label in y_train. With return_sequences=True the model emitted a
# prediction for every one of the padded timesteps instead.
# NOTE(review): sequences are post-padded, so the final state is read after the
# padding steps; consider mask_zero=True on the Embedding layer - confirm first
# that no headline is empty after cleaning.
model.add(SimpleRNN(units=64, return_sequences=False))
model.add(Dense(20, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): the test split is also used as validation data for early
# stopping, so the reported validation scores are not an unbiased estimate.
history = model.fit(x_train_seq, y_train, batch_size=252, epochs=10, validation_data=(x_test_seq, y_test), callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


# Deep RNN

In [20]:
# Deep RNN classifier: embedding -> two stacked recurrent layers -> dense head.
model = Sequential()
embdding_dim = 50
model.add(Embedding(input_dim=max_words + 1, output_dim=embdding_dim, input_length=max_words_per_sent))
# The first recurrent layer keeps return_sequences=True because the layer
# stacked on top of it needs the full sequence as input.
model.add(SimpleRNN(units=64, return_sequences=True))
# The last recurrent layer returns only its final state so the sigmoid head
# produces one probability per headline, matching the single label in y_train.
# With return_sequences=True here the model emitted a prediction for every
# padded timestep instead.
model.add(SimpleRNN(units=32, return_sequences=False))
model.add(Dense(20, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): the test split is also used as validation data for early
# stopping, so the reported validation scores are not an unbiased estimate.
history = model.fit(x_train_seq, y_train, batch_size=252, epochs=10, validation_data=(x_test_seq, y_test), callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


# Bidirectional RNN

In [21]:
# Bidirectional RNN classifier: embedding -> bidirectional recurrent layer -> dense head.
model = Sequential()
embdding_dim = 50
model.add(Embedding(input_dim=max_words + 1, output_dim=embdding_dim, input_length=max_words_per_sent))
# return_sequences=False makes each direction emit only its final state, so the
# sigmoid head produces one probability per headline, matching the single label
# in y_train. With return_sequences=True the model emitted a prediction for
# every padded timestep instead.
model.add(Bidirectional(SimpleRNN(units=64, return_sequences=False)))
model.add(Dense(20, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): the test split is also used as validation data for early
# stopping, so the reported validation scores are not an unbiased estimate.
history = model.fit(x_train_seq, y_train, batch_size=252, epochs=10, validation_data=(x_test_seq, y_test), callbacks=[es])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


# LSTM

In [22]:
# LSTM classifier: embedding -> one LSTM layer -> dense head.
model = Sequential()
embdding_dim = 50
model.add(Embedding(input_dim=max_words + 1, output_dim=embdding_dim, input_length=max_words_per_sent))
# return_sequences=False makes the LSTM emit only its final state, so the
# sigmoid head produces one probability per headline, matching the single label
# in y_train. With return_sequences=True the model emitted a prediction for
# every padded timestep instead.
model.add(LSTM(units=8, return_sequences=False))
model.add(Dense(20, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): the test split is also used as validation data for early
# stopping, so the reported validation scores are not an unbiased estimate.
history = model.fit(x_train_seq, y_train, batch_size=252, epochs=10, validation_data=(x_test_seq, y_test), callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Bidirectional LSTM

In [23]:
# Bidirectional LSTM classifier: embedding -> bidirectional LSTM layer -> dense head.
model = Sequential()
embdding_dim = 50
model.add(Embedding(input_dim=max_words + 1, output_dim=embdding_dim, input_length=max_words_per_sent))
# return_sequences=False makes each direction emit only its final state, so the
# sigmoid head produces one probability per headline, matching the single label
# in y_train. With return_sequences=True the model emitted a prediction for
# every padded timestep instead.
model.add(Bidirectional(LSTM(units=8, return_sequences=False)))
model.add(Dense(20, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): the test split is also used as validation data for early
# stopping, so the reported validation scores are not an unbiased estimate.
history = model.fit(x_train_seq, y_train, batch_size=252, epochs=10, validation_data=(x_test_seq, y_test), callbacks=[es])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
