# NLP Project
## Sarcasm Detection

# Libraries

In [47]:
import pandas as pd, string
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Dataset

In [48]:
# Load the training CSV into a DataFrame and preview its first rows.
train_data = pd.read_csv('Train_Dataset.csv')
train_data.head()

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


In [49]:
# Count missing values in each column of the training data.
train_data.isnull().sum()

tweet        0
sarcastic    0
dtype: int64

In [50]:
# Load the test CSV and preview its first rows.
# NOTE(review): in the cells shown here `test_data` is never used afterwards;
# the later "testing" split is sliced from the training data instead.
test_data = pd.read_csv('Test_Dataset.csv')
test_data.head()

Unnamed: 0,tweet,sarcastic
0,"Size on the the Toulouse team, That pack is mo...",0
1,Pinball!,0
2,So the Scottish Government want people to get ...,1
3,villainous pro tip : change the device name on...,0
4,I would date any of these men 🥺,0


In [51]:
# Count missing values in each column of the test data.
test_data.isnull().sum()

tweet        0
sarcastic    0
dtype: int64

In [52]:
# `data` is a second name for the same DataFrame object as `train_data`
# (plain assignment, not a copy).
data = train_data

# Preprocessing Data

In [53]:
# Translation table that deletes every character in string.punctuation
# (ASCII punctuation only).
table = str.maketrans('', '', string.punctuation)


def stopwords_lists(user):
    """Return the NLTK stop words for a language as a set.

    Args:
        user: Language name passed to ``nltk.corpus.stopwords.words``;
            it is lower-cased first (e.g. ``'English'`` -> ``'english'``).

    Returns:
        set[str]: Stop words for that language.
    """
    return set(stopwords.words(user.lower()))


# A set makes the per-token membership test O(1) instead of scanning a list.
stopword = stopwords_lists('english')


def clean_tweet(text):
    """Normalise one tweet into a space-separated string of content words.

    Steps: lower-case, strip HTML markup, put spaces around ``, . - /`` so
    they split into their own tokens, drop stop words, and delete the
    punctuation characters covered by ``table``.

    Args:
        text: Raw tweet text.

    Returns:
        str: Cleaned tokens joined by single spaces (may be empty).
    """
    # Strip markup first: inserting spaces around "." and "/" beforehand
    # would break tags such as "</a>" before the parser sees them.
    text = BeautifulSoup(text.lower(), 'html.parser').get_text()
    for mark in (",", ".", "-", "/"):
        text = text.replace(mark, " " + mark + " ")

    kept = []
    for token in text.split():
        # Compare against the stop words while inner apostrophes are still
        # present, so contractions like "don't" are recognised; once the
        # apostrophe is deleted ("dont") they would no longer match.
        core = token.strip(string.punctuation)
        word = token.translate(table)
        if not word:
            # Token was punctuation only; skip it rather than emit a blank.
            continue
        if core in stopword or word in stopword:
            continue
        kept.append(word)
    return " ".join(kept)


# Shuffle the rows reproducibly before they are split by position later on.
datastore = data.sample(frac=1.0, random_state=42).reset_index(drop=True)

sentences = [clean_tweet(tweet) for tweet in datastore['tweet']]
labels = datastore['sarcastic'].tolist()

  soup = BeautifulSoup(sentence, 'html.parser')


# Preparing the Training Data

In [54]:
# Number of rows in the working dataset.
len(data)

6934

In [55]:
# Text-pipeline settings.
max_length = 100        # every sequence is padded/truncated to this many tokens
vocab_size = 20000      # upper bound on the tokenizer vocabulary (num_words)
trunc_type = 'post'     # cut over-long sequences at the end
padding_type = 'post'   # append zeros after the real tokens
oov_tok = "<OOV>"       # placeholder for words unseen during fitting

# Positional 80/20 split of the cleaned sentences and their labels.
training_size = int(len(data)*0.8)
training_sentences = sentences[:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[:training_size]
testing_labels = labels[training_size:]

# Fit the vocabulary on the training sentences only, so held-out words
# map to the OOV token instead of leaking into the vocabulary.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

# Encode the sentences into sequences of word indices.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Pad/truncate both splits to the same fixed width. Without `maxlen` each
# split is padded to its own longest sentence, giving the two matrices
# different shapes, and the settings above would go unused.
training_padded = pad_sequences(training_sequences, maxlen=max_length,
                                padding=padding_type, truncating=trunc_type)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)

# pad_sequences already returns NumPy arrays; only the label lists need converting.
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

### Model Architecture

In [56]:
# `tf` is already imported in the Libraries cell at the top of the notebook.

embedding_dim = 64

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation and batch shuffling repeat from run to run.
tf.keras.utils.set_random_seed(42)

# Embedding -> bidirectional LSTM -> small dense head with a sigmoid output
# for the binary sarcastic / not-sarcastic label.
model = tf.keras.Sequential([
    # mask_zero=True marks index 0 (the padding value) as "no token", so the
    # LSTM skips the zero padding instead of reading it as real timesteps.
    tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Adam optimizer with an explicit learning rate.
adam = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 64)          1280000   
                                                                 
 bidirectional_4 (Bidirecti  (None, 128)               66048     
 onal)                                                           
                                                                 
 dense_8 (Dense)             (None, 24)                3096      
                                                                 
 dense_9 (Dense)             (None, 1)                 25        
                                                                 
Total params: 1349169 (5.15 MB)
Trainable params: 1349169 (5.15 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [57]:
import matplotlib.pyplot as plt

# Train the model, holding out 20% of the training arrays for validation.
No_of_epochs = 30
history = model.fit(
    training_padded,
    training_labels,
    epochs=No_of_epochs,
    batch_size=32,
    validation_split=0.2,
)

# Side-by-side learning curves: loss on the left, accuracy on the right.
curves = history.history
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(curves['loss'], label='Training Loss')
loss_ax.plot(curves['val_loss'], label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(curves['accuracy'], label='Training Accuracy')
acc_ax.plot(curves['val_accuracy'], label='Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

plt.show()

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30