# Importing Necessary Libraries

In [1]:
import numpy as np 
import pandas as pd 
import os
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, LSTM, Bidirectional,Conv1D,MaxPooling1D,Flatten,Dropout
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
import tensorflow_hub as hub
import tensorflow_text as text

# Importing Dataset

In [2]:
data_1 = pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)
data_2 = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines=True)
data =  pd.concat([data_1, data_2])
data.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


# Text Cleaning & Preprocessing

In [3]:
def clean_text(text):
    text = text.lower()
    pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    text = pattern.sub('', text)
    text = " ".join(filter(lambda x:x[0]!='@', text.split()))
    emoji = re.compile("["
                           u"\U0001F600-\U0001FFFF"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    text = emoji.sub(r'', text)
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)        
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text) 
    text = re.sub(r"\'ll", " will", text)  
    text = re.sub(r"\'ve", " have", text)  
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"did't", "did not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"couldn't", "could not", text)
    text = re.sub(r"have't", "have not", text)
    text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-]", "", text)
    return text

In [4]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')

def CleanTokenize(df):
    head_lines = list()
    lines = df["headline"].values.tolist()

    for line in lines:
        line = clean_text(line)
        # tokenize the text
        tokens = word_tokenize(line)
        # remove puntuations
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # remove non alphabetic characters
        words = [word for word in stripped if word.isalpha()]
        stop_words = set(stopwords.words("english"))
        # remove stop words
        words = [w for w in words if not w in stop_words]
        head_lines.append(words)
    return head_lines

head_lines = CleanTokenize(data)
head_lines[0:10]

[nltk_data] Downloading package punkt to /home/intern1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/intern1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[['former',
  'versace',
  'store',
  'clerk',
  'sues',
  'secret',
  'black',
  'code',
  'minority',
  'shoppers'],
 ['roseanne',
  'revival',
  'catches',
  'thorny',
  'political',
  'mood',
  'better',
  'worse'],
 ['mom',
  'starting',
  'fear',
  'sons',
  'web',
  'series',
  'closest',
  'thing',
  'grandchild'],
 ['boehner',
  'wants',
  'wife',
  'listen',
  'come',
  'alternative',
  'debtreduction',
  'ideas'],
 ['jk', 'rowling', 'wishes', 'snape', 'happy', 'birthday', 'magical', 'way'],
 ['advancing', 'worlds', 'women'],
 ['fascinating', 'case', 'eating', 'labgrown', 'meat'],
 ['ceo', 'send', 'kids', 'school', 'work', 'company'],
 ['top', 'snake', 'handler', 'leaves', 'sinking', 'huckabee', 'campaign'],
 ['fridays', 'morning', 'email', 'inside', 'trumps', 'presser', 'ages']]

# Train Test Split the Data

In [5]:
validation_split = 0.2
max_length = 30 #when set to Mximum length ..doesnt give a good efficiency


tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(head_lines)
sequences = tokenizer_obj.texts_to_sequences(head_lines)

word_index = tokenizer_obj.word_index
print("unique tokens - ",len(word_index))
vocab_size = len(tokenizer_obj.word_index) + 1
print('vocab size -', vocab_size)

lines_pad = pad_sequences(sequences, maxlen=max_length, padding='post')
sentiment =  data['is_sarcastic'].values

indices = np.arange(lines_pad.shape[0])
np.random.shuffle(indices)
lines_pad = lines_pad[indices]
sentiment = sentiment[indices]

num_validation_samples = int(validation_split * lines_pad.shape[0])

X_train_pad = lines_pad[:-num_validation_samples]
y_train = sentiment[:-num_validation_samples]
X_test_pad = lines_pad[-num_validation_samples:]
y_test = sentiment[-num_validation_samples:]

unique tokens -  28657
vocab size - 28658


In [6]:
print('Shape of X_train_pad:', X_train_pad.shape)
print('Shape of y_train:', y_train.shape)

print('Shape of X_test_pad:', X_test_pad.shape)
print('Shape of y_test:', y_test.shape)

Shape of X_train_pad: (44263, 30)
Shape of y_train: (44263,)
Shape of X_test_pad: (11065, 30)
Shape of y_test: (11065,)


# Loading BERT Preprocess & Base Model from Tensorflow

In [7]:
preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

In [8]:
bert_preprocess_model = hub.KerasLayer(preprocess_url)

In [9]:
text_test = ['Very bad movie','I love Python']
text_preprocessed = bert_preprocess_model(text_test)
text_preprocessed.keys()

dict_keys(['input_word_ids', 'input_mask', 'input_type_ids'])

In [10]:
bert_model = hub.KerasLayer(encoder_url)

In [11]:
bert_results = bert_model(text_preprocessed)
bert_results.keys()

dict_keys(['default', 'pooled_output', 'encoder_outputs', 'sequence_output'])

In [12]:
def get_word_embedding(word):
    text_preprocessed = bert_preprocess_model([word])
    bert_results = bert_model(text_preprocessed)
    return np.array(bert_results['pooled_output'][0])
    

In [13]:
get_word_embedding("snake").shape

(768,)

# Determine the max_length of sentence in the corpus

In [14]:
all_lengths = [len(x) for x in head_lines]

In [15]:
max_length_sentence = max(all_lengths)

In [16]:
max_length_sentence

106

# Train_Test_Split

In [17]:
validation_split = 0.2
max_length = 25 #when set to Mximum length ..doesnt give a good efficiency


tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(head_lines)
sequences = tokenizer_obj.texts_to_sequences(head_lines)

word_index = tokenizer_obj.word_index
print("unique tokens - ",len(word_index))
vocab_size = len(tokenizer_obj.word_index) + 1
print('vocab size -', vocab_size)

lines_pad = pad_sequences(sequences, maxlen=max_length, padding='post')
sentiment =  data['is_sarcastic'].values

indices = np.arange(lines_pad.shape[0])
np.random.shuffle(indices)
lines_pad = lines_pad[indices]
sentiment = sentiment[indices]

num_validation_samples = int(validation_split * lines_pad.shape[0])

X_train_pad = lines_pad[:-num_validation_samples]
y_train = sentiment[:-num_validation_samples]
X_test_pad = lines_pad[-num_validation_samples:]
y_test = sentiment[-num_validation_samples:]

unique tokens -  28657
vocab size - 28658


In [18]:
print('Shape of X_train_pad:', X_train_pad.shape)
print('Shape of y_train:', y_train.shape)

print('Shape of X_test_pad:', X_test_pad.shape)
print('Shape of y_test:', y_test.shape)

Shape of X_train_pad: (44263, 25)
Shape of y_train: (44263,)
Shape of X_test_pad: (11065, 25)
Shape of y_test: (11065,)


# Creating Embedding BERT Matrix + Layer

In [None]:
embedding_dim = 768
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
c = 0
for word, i in word_index.items():
    print(f"=========================={i}th iteration=========================")
#     embedding_vector = embeddings_index.get(word)
    try:
        embedding_vector = get_word_embedding(word)
    except:
        embedding_vector = None
    if embedding_vector is not None:
        c+=1
        embedding_matrix[i] = embedding_vector
    
print(c)



## Saving Embedding Matrix for Future use

In [25]:
# import pickle as pkl
# with open("EmbeddingMatrixBERT.pkl",'wb') as f:
#     pkl.dump([embedding_dim,embedding_matrix],f)

## Loding Embedding Matrix

In [7]:
import pickle as pkl
with open("EmbeddingMatrixBERT.pkl",'rb') as f:
    embedding_dim,embedding_matrix = pkl.load(f)

In [26]:
embedding_layer = Embedding(len(word_index) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

## Creating a CheckPoint

In [42]:
checkpoint = ModelCheckpoint(filepath="Sarcasm_Bert_Lstm_MBSA_v2.h5", 
                             monitor='val_loss',
                             verbose=1, 
                             save_best_only=True,
                             mode='min')

# Creating the main Neural Network with LSTM

In [43]:
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(300,return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
print(model.summary())

Summary of the built model...
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 768)           22009344  
_________________________________________________________________
lstm_2 (LSTM)                (None, 30, 300)           1282800   
_________________________________________________________________
dropout_2 (Dropout)          (None, 30, 300)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 23,452,645
Trainable params: 1,443,301
Non-trainable params: 22,009,344
____

## Evaluating Model

In [44]:
history = model.fit(X_train_pad, y_train,epochs=100,callbacks=[checkpoint], validation_data=(X_test_pad, y_test))

Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.66679, saving model to Sarcasm_Bert_Lstm_MBSA_v2.h5
Epoch 2/100

Epoch 00002: val_loss improved from 0.66679 to 0.64296, saving model to Sarcasm_Bert_Lstm_MBSA_v2.h5
Epoch 3/100

Epoch 00003: val_loss improved from 0.64296 to 0.61545, saving model to Sarcasm_Bert_Lstm_MBSA_v2.h5
Epoch 4/100

Epoch 00004: val_loss improved from 0.61545 to 0.59951, saving model to Sarcasm_Bert_Lstm_MBSA_v2.h5
Epoch 5/100

Epoch 00005: val_loss improved from 0.59951 to 0.58558, saving model to Sarcasm_Bert_Lstm_MBSA_v2.h5
Epoch 6/100

Epoch 00006: val_loss improved from 0.58558 to 0.57080, saving model to Sarcasm_Bert_Lstm_MBSA_v2.h5
Epoch 7/100

Epoch 00007: val_loss improved from 0.57080 to 0.54570, saving model to Sarcasm_Bert_Lstm_MBSA_v2.h5
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.54570
Epoch 9/100

Epoch 00009: val_loss improved from 0.54570 to 0.53606, saving model to Sarcasm_Bert_Lstm_MBSA_v2.h5
Epoch 10/100

Epoch 00010: v

In [49]:
model.evaluate(X_test_pad,y_test)



[0.33465898036956787, 0.9047446846961975]

#Saving the model

In [16]:
model.save('Sarcasm_Bert_Lstm.h5')

# Load the Model

In [45]:
loaded_model = load_model('Sarcasm_Bert_Lstm_MBSA_v2.h5')

# Making Predictions

In [46]:
def predict_sarcasm(s):
    x_final = pd.DataFrame({"headline":[s]})
    test_lines = CleanTokenize(x_final)
    test_sequences = tokenizer_obj.texts_to_sequences(test_lines)
    test_review_pad = pad_sequences(test_sequences, maxlen=max_length, padding='post')
    pred = loaded_model.predict(test_review_pad)
    pred*=100
    if pred[0][0]>=50: return "It's a sarcasm!" 
    else: return "It's not a sarcasm."

In [47]:
predict_sarcasm("I was depressed. He asked me to be happy. I am not depressed anymore.")

"It's a sarcasm!"

In [51]:
predict_sarcasm("You just broke my car window.Great job!!")

"It's not a sarcasm."

In [52]:
predict_sarcasm("You just broke my car window. good Boy!")

"It's not a sarcasm."

In [53]:
predict_sarcasm("You just saved my dog's life. Sorry.")

"It's not a sarcasm."

In [54]:
predict_sarcasm("I want a million dollars!")

"It's not a sarcasm."

In [55]:
predict_sarcasm("I just won a million dollars!")

"It's a sarcasm!"

In [60]:
predict_sarcasm("Hey Aniket, are you a boy??")

"It's not a sarcasm."