# Sarcasm Detection


In [29]:
# Importing required Libraries..

import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from keras.utils import to_categorical

In [1]:
# Reading the data..

data = pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)

In [2]:
print(data.shape)
data.head(10)

(26709, 3)


Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0
5,https://www.huffingtonpost.com/entry/advancing...,advancing the world's women,0
6,https://www.huffingtonpost.com/entry/how-meat-...,the fascinating case for eating lab-grown meat,0
7,https://www.huffingtonpost.com/entry/boxed-col...,"this ceo will send your kids to school, if you...",0
8,https://politics.theonion.com/top-snake-handle...,top snake handler leaves sinking huckabee camp...,1
9,https://www.huffingtonpost.com/entry/fridays-m...,friday's morning email: inside trump's presser...,0


In [3]:
# Dropping the columns that are not needed...

data = data.drop(['article_link'], axis = 1)

In [4]:
print(data.shape)
data.head(10)

(26709, 2)


Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
5,advancing the world's women,0
6,the fascinating case for eating lab-grown meat,0
7,"this ceo will send your kids to school, if you...",0
8,top snake handler leaves sinking huckabee camp...,1
9,friday's morning email: inside trump's presser...,0


In [5]:
# Finding max length of strings in 'headline' column, so we can pad all strings to make them the same length...

data["Headline Length"]= data["headline"].str.len()
max_len = data["Headline Length"].max(axis = 0)
print(max_len)
data = data.drop(['Headline Length'], axis = 1)

254


Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
5,advancing the world's women,0
6,the fascinating case for eating lab-grown meat,0
7,"this ceo will send your kids to school, if you...",0
8,top snake handler leaves sinking huckabee camp...,1
9,friday's morning email: inside trump's presser...,0


In [7]:
# Setting Model Parameters...

max_features = 10000
maxlen = max_len
embedding_size = 200

In [8]:
# Apply Keras Tokenizer...(strings to numbers)

tk = Tokenizer(
    num_words = max_features,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    char_level=False,
    oov_token=None
)

tk.fit_on_texts(texts=data['headline'])

In [9]:
# Specifing independent and dependent variables for our model....

X = tk.texts_to_sequences(data['headline'])
X = pad_sequences(X, maxlen = maxlen)
y = np.asarray(data['is_sarcastic'])

print("Number of Samples:", len(X))
print(X[0])
print("Number of Labels: ", len(y))
print(y[0])

Number of Samples: 26709
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0  

In [10]:
# Vocabulary size after tokenizing...

num_words = len(tk.word_index) + 1
print(num_words)

29657


In [19]:
# Glove Word Embeddings...

glove_file = "glove.6B.zip"

In [20]:
# Extract Glove embedding zip file

from zipfile import ZipFile
with ZipFile(glove_file, 'r') as z:
  z.extractall()

In [11]:
# Perform Glove Word Embeddings to encode our dependent variables...

EMBEDDING_FILE = './glove.6B.200d.txt'

embeddings = {}
for o in open(EMBEDDING_FILE, encoding="utf8"):
    word = o.split(" ")[0]
    # print(word)
    embd = o.split(" ")[1:]
    embd = np.asarray(embd, dtype='float32')
    # print(embd)
    embeddings[word] = embd



In [12]:
# Create a weight matrix for words in training docs...

embedding_matrix = np.zeros((num_words, 200))

for word, i in tk.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

len(embeddings.values())

# Converting labels to categorical data...
y_mod = to_categorical(y)

400000

In [13]:
# Building a Sequential Model for Sarcasm Detection

model = Sequential()
model.add(Embedding(num_words, embedding_size, weights = [embedding_matrix]))
model.add(Bidirectional(LSTM(128, return_sequences = False, recurrent_dropout=0.1)))
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 200)         5931400   
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               336896    
_________________________________________________________________
dense (Dense)                (None, 128)               32896     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 6,301,450
Trainable params: 6,301,450
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
# Training the Model...(with validation split = 0.2)

model.fit(X, y_mod, epochs = 3, batch_size=100, verbose = 1, validation_split = 0.2)

Train on 21367 samples, validate on 5342 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x23b116d1988>

In [62]:
# (Alternately With larger batch size for faster training)
# model.fit(X, y_mod, epochs = 5, batch_size=1000, verbose = 1, validation_split = 0.2)

Train on 21367 samples, validate on 5342 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1d41f17af08>

In [19]:
# Displaying the score and accuracy of our model...

score,acc = model.evaluate(X, y_mod, verbose = 0, batch_size = 32)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.15
acc: 0.95
