# Sarcasm Detection
**Acknowledgement**

Misra, Rishabh, and Prahal Arora. "Sarcasm Detection using Hybrid Neural Network." arXiv preprint arXiv:1908.07414 (2019).

**Required Files given in below link.**

https://drive.google.com/drive/folders/1xUnF35naPGU63xwRDVGc-DkZ3M8V5mMk

## Install `Tensorflow2.0` 

In [0]:
!!pip uninstall tensorflow
!pip install tensorflow==2.0.0

## Get Required Files from Drive

In [0]:
# Mount Google Drive under /content/drive/ so the project files can be read.
from google.colab import drive
drive.mount('/content/drive/')

In [0]:
# Set your project path
import os

# os.chdir() returns None, so the path must be stored before changing
# directory; later cells build file paths from project_path.
project_path = '/content/drive/My Drive/PGP-AIML-UT-Austin-Jun19/NLP/Project'
os.chdir(project_path)

# **Reading and Exploring Data**

## Read the data from "Sarcasm_Headlines_Dataset.json", explore it, and get some insights about it.

In [0]:
import pandas as pd
# Load the headlines dataset straight from GitHub; lines=True is passed
# because the file is read as line-delimited JSON records.
df = pd.read_json ('https://raw.githubusercontent.com/rishabhmisra/News-Headlines-Dataset-For-Sarcasm-Detection/master/Sarcasm_Headlines_Dataset.json', lines=True)
df.tail()

# Share of rows in each is_sarcastic class (class balance).
df['is_sarcastic'].value_counts()/len(df)

In [0]:
# Class balance: count per is_sarcastic value divided by the number of rows.
df['is_sarcastic'].value_counts()/len(df)

## Drop `article_link` from dataset. 
We only need the headline text and the `is_sarcastic` column for this project, so we can drop the article link column here.

In [0]:
# Keep only the text and the label. The explicit copy makes df an independent
# frame, so the in-place column rewrites in later cells do not operate on a
# view of the original data (avoids pandas' SettingWithCopyWarning).
df = df[['headline', 'is_sarcastic']].copy()
df.tail()

In [0]:
# plt is used below (and in later plotting cells) but was never imported.
import matplotlib.pyplot as plt
import seaborn as sns

# Pass the column by keyword: newer seaborn releases no longer accept the
# data vector as the first positional argument of countplot.
sns.countplot(x=df['is_sarcastic'])
plt.xlabel('Label')
plt.title('Sarcasm vs Non-sarcasm')

## Get the Length of each line and find the maximum length.
Because the headlines differ in length, we need to pad our sequences to the maximum length.

In [0]:
import re

# Characters to strip: anything that is not a letter, digit or whitespace.
# The class must be A-Z, not A-z: the A-z range also covers the ASCII
# punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
# A raw string keeps \s from being treated as a string escape.
_non_alnum_re = re.compile(r'[^a-zA-Z0-9\s]')

# Lowercase each headline, then remove the unwanted characters.
df['headline'] = df['headline'].apply(lambda text: _non_alnum_re.sub('', text.lower()))

In [0]:
# Tokenizer is only imported in a later cell, so import it where it is first used.
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
# Fit on the headline column: iterating a DataFrame yields its column names,
# so passing df itself would build a vocabulary from the column names only.
tokenizer.fit_on_texts(df['headline'])

In [0]:
# Keep both lookup directions exposed by the fitted tokenizer.
word_idx = tokenizer.word_index
idx_word = tokenizer.index_word

In [0]:
# Encode the headline texts; passing df itself would iterate over the column
# names and return one sequence per column instead of one per headline.
sequences = tokenizer.texts_to_sequences(df['headline'])

In [0]:
from sklearn.model_selection import train_test_split

# Features are the headline texts, targets are the sarcasm labels.
X, y = df["headline"], df["is_sarcastic"]

# First hold out 15% of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
# ... then carve 15% of the remaining rows off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=52
)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Baseline classifier: TF-IDF features fed into multinomial Naive Bayes,
# fitted on the raw training headlines.
baseline_steps = (TfidfVectorizer(), MultinomialNB())
model = make_pipeline(*baseline_steps)
model.fit(X_train, y_train)

In [0]:
from sklearn.metrics import accuracy_score

# Score the fitted baseline on the training and validation splits.
train_accuracy = accuracy_score(y_train, model.predict(X_train))
print(f'Accuracy on train set: {train_accuracy}')

val_accuracy = accuracy_score(y_val, model.predict(X_val))
print(f'Accuracy on validation set: {val_accuracy}')

In [0]:
# Tokenizer is only imported in a later cell, so import it where it is used.
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the vocabulary from the training split only, so no validation/test
# text is used when fitting. lower=False leaves the text casing untouched.
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index
print(f"Found {len(word_index)} unique tokens.")

# Encode every split with the training vocabulary.
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)
sequences_val = tokenizer.texts_to_sequences(X_val)

# Longest encoded headline across all three splits, computed without
# concatenating the three lists into a temporary copy.
max_len = max(
    len(seq)
    for split in (sequences_train, sequences_val, sequences_test)
    for seq in split
)
print(f"The longest headline has {max_len} words.")

# **Modelling**

## Import the modules required for modelling.

In [0]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential

## Set Different Parameters for the model. 

In [0]:
# Value handed to the Tokenizer constructor in the tokenizer cell.
max_features = 10000
# Fixed sequence length used by pad_sequences and the Embedding layer.
maxlen = 135 ## Add your max length here ##
# Width of each word vector; the GloVe file loaded later is the 200d variant.
embedding_size = 200

## Apply the Keras Tokenizer to the headline column of your data.
Hint - First create a tokenizer instance using `Tokenizer(num_words=max_features)`,
and then fit this tokenizer instance on your data column `df['headline']` using `.fit_on_texts()`.

In [0]:
# The vocabulary cap is the num_words argument; Tokenizer has no maxlen
# parameter and raises TypeError on unrecognised keyword arguments.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

# word_index lists every token seen during fitting; the num_words cap is
# applied when texts are converted to sequences.
word_index = tokenizer.word_index
print(f"Found {len(word_index)} unique tokens.")

# Encode every split with the vocabulary learned from the training split.
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)
sequences_val = tokenizer.texts_to_sequences(X_val)

# Longest encoded headline across all three splits.
max_len = max(
    len(seq)
    for split in (sequences_train, sequences_val, sequences_test)
    for seq in split
)
print(f"The longest headline contains {max_len} words.")

## Define X and y for your model

In [0]:
# Encode every headline, pad to the fixed length, and collect the labels
# into a NumPy array.
encoded_headlines = tokenizer.texts_to_sequences(df['headline'])
X = pad_sequences(encoded_headlines, maxlen=maxlen)
y = np.asarray(df['is_sarcastic'])

# Quick sanity check: sizes plus the first sample and label.
print("Number of Samples:", len(X))
print(X[0])
print("Number of Labels: ", len(y))
print(y[0])

## Get the Vocabulary size ( 4 marks)
Hint : You can use tokenizer.word_index.

In [0]:
# Report how many distinct tokens the fitted tokenizer recorded.
word_index = tokenizer.word_index
print(f"Found {len(word_index)} unique tokens.")

# **Word Embedding**

## Get Glove Word Embeddings

In [0]:
import os

# Join the directory and file name properly: concatenating "./glove.6B.zip"
# onto a directory string yields ".../Project./glove.6B.zip". Fall back to
# the current working directory if project_path was never set to a string.
glove_file = os.path.join(project_path or os.getcwd(), "glove.6B.zip")

In [0]:
# Unpack the GloVe archive into the current working directory.
from zipfile import ZipFile

with ZipFile(glove_file, mode='r') as archive:
    archive.extractall()

# Get the Word Embeddings using Embedding file as given below.

In [0]:
EMBEDDING_FILE = './glove.6B.200d.txt'

# Maps each token in the GloVe file to its float32 vector.
embeddings = {}
# Open with a context manager so the handle is closed, and decode explicitly
# as UTF-8 instead of relying on the platform default encoding.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_lines:
    for line in glove_lines:
        # Each line is "<token> <v1> <v2> ..."; split it once.
        word, *values = line.rstrip('\n').split(" ")
        embeddings[word] = np.asarray(values, dtype='float32')



# Create a weight matrix for words in training docs

In [0]:
# Number of rows: tokenizer ids start at 1 (row 0 is left as zeros), and the
# vocabulary is capped at max_features.
num_words = min(max_features, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_size))

for word, i in tokenizer.word_index.items():
    # word_index contains every token seen during fitting; ids beyond the
    # cap have no row in the matrix, so skip them.
    if i >= num_words:
        continue
    embedding_vector = embeddings.get(word)
    # Tokens missing from GloVe keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

len(embeddings)

## Create and Compile your Model 
Use a Sequential model instance and add an Embedding layer, a Bidirectional(LSTM) layer, and then dense and dropout layers as required.
Finish with a final dense layer with sigmoid activation for binary classification.


In [0]:
### Embedding layer for hint
## model.add(Embedding(num_words, embedding_size, weights = [embedding_matrix]))
### Bidirectional LSTM layer for hint
## model.add(Bidirectional(LSTM(128, return_sequences = True)))
import tensorflow as tf

# Size the embedding table from the fitted tokenizer instead of a hard-coded
# count: ids run from 1 to len(word_index), and when the tokenizer has a
# num_words cap only ids below that cap are ever emitted.
vocab_size = len(tokenizer.word_index) + 1
if tokenizer.num_words:
    vocab_size = min(vocab_size, tokenizer.num_words)

# NOTE(review): this Embedding layer is trained from scratch; the GloVe
# embedding_matrix built earlier is not loaded into it - confirm intent.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_size, input_length=maxlen),
    Dropout(0.2),
    # The first recurrent layer returns the full sequence so that the second
    # one can consume it.
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    tf.keras.layers.Dense(24, activation='relu'),
    # Single sigmoid unit for the binary sarcastic / not-sarcastic output.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [0]:
from IPython.display import SVG
# Use the tf.keras utility: the model is a tf.keras model, and mixing in the
# standalone keras package is not guaranteed to work with it.
from tensorflow.keras.utils import model_to_dot

# Render the layer graph (with tensor shapes) as an inline SVG.
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))

## Fit your model with a batch size of 100 and validation_split = 0.2, and state the validation accuracy.

In [0]:
batch_size = 100
epochs = 5

# Pad the tokenized splits to the fixed length the Embedding layer expects.
train_padded = pad_sequences(sequences_train, maxlen=maxlen)
val_padded = pad_sequences(sequences_val, maxlen=maxlen)

# Validate on the validation split so the test split stays untouched until
# the final evaluation. BiLSTM receives the History object returned by
# fit(); the trained network itself remains in `model`.
BiLSTM = model.fit(train_padded, np.asarray(y_train),
                   batch_size=batch_size, epochs=epochs,
                   validation_data=(val_padded, np.asarray(y_val)),
                   verbose=2)

In [0]:
# Evaluate the trained network (`model`; BiLSTM holds the fit() return value)
# on the padded test sequences - the model cannot consume raw text.
test_padded = pad_sequences(sequences_test, maxlen=maxlen)
score, acc = model.evaluate(test_padded, np.asarray(y_test),
                            verbose=2, batch_size=batch_size)

In [0]:
# Per-class accuracy on the test split.
test_padded = pad_sequences(sequences_test, maxlen=maxlen)
true_labels = np.asarray(y_test)

# Predict the whole split in batches rather than one sample at a time. The
# network ends in a single sigmoid unit, so the predicted class comes from
# thresholding the probability at 0.5 (argmax over one value is always 0).
probabilities = model.predict(test_padded, batch_size=batch_size).ravel()
predicted_labels = (probabilities >= 0.5).astype(int)

is_positive = true_labels == 1
is_correct = predicted_labels == true_labels

pos_cnt = int(is_positive.sum())
neg_cnt = int((~is_positive).sum())
pos_correct = int((is_correct & is_positive).sum())
neg_correct = int((is_correct & ~is_positive).sum())

# Report NaN instead of dividing by zero if a class is absent from the split.
print("pos_acc", pos_correct/pos_cnt*100 if pos_cnt else float('nan'), "%")
print("neg_acc", neg_correct/neg_cnt*100 if neg_cnt else float('nan'), "%")

In [0]:
# Accuracy on the training split: evaluate the network (`model`) on padded
# integer sequences, since X_train itself holds raw headline text.
scores = model.evaluate(pad_sequences(sequences_train, maxlen=maxlen),
                        np.asarray(y_train), verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [0]:
# Accuracy on the test split: evaluate the network (`model`) on padded
# integer sequences, since X_test itself holds raw headline text.
scores = model.evaluate(pad_sequences(sequences_test, maxlen=maxlen),
                        np.asarray(y_test), verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [0]:
# plt is not imported by any import cell in this notebook.
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch, read from the fit() history.
plt.plot(BiLSTM.history['loss'], label='train loss')
plt.plot(BiLSTM.history['val_loss'], label='val loss')
plt.xlabel("epoch")
plt.ylabel("Cross-entropy loss")
plt.legend();

In [0]:
# plt is not imported by any import cell in this notebook.
import matplotlib.pyplot as plt

# Training vs. validation accuracy per epoch, read from the fit() history.
plt.plot(BiLSTM.history['accuracy'], label='train accuracy')
plt.plot(BiLSTM.history['val_accuracy'], label='val accuracy')
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend();

In [0]:
# Final loss/accuracy on the held-out test split, using the trained network
# (`model`) and padded integer sequences rather than raw headline text.
model.evaluate(pad_sequences(sequences_test, maxlen=maxlen), np.asarray(y_test))