In [None]:
# Mount Google Drive so the files under /content/drive/ (datasets, GloVe vectors) can be read.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import numpy as np 
import pandas as pd 
import nltk
import os
import re  # regular expressions, used to clean the data
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, LSTM, Bidirectional
#from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

In [None]:
# Load the two line-delimited JSON corpora and drop the column of each that is not used.
# The printed frame shows only `is_sarcastic` and `headline` columns after the merge.
data_1 = pd.read_json(
    "/content/drive/MyDrive/NLP_Project/sarcasm_detection_shared_task_twitter_training.json",
    lines=True,
).drop(columns=["context"])
data_3 = pd.read_json(
    "/content/drive/MyDrive/NLP_Project/Sarcasm_Headlines_Dataset_v2.json",
    lines=True,
).drop(columns=["article_link"])

# ignore_index=True gives the merged frame a fresh 0..n-1 index; without it the
# row labels of both sources are kept and therefore repeat.
data = pd.concat([data_1, data_3], ignore_index=True)
print(data)

       is_sarcastic                                           headline
0                 1  @USER @USER @USER I don't get this .. obviousl...
1                 1  @USER @USER trying to protest about . Talking ...
2                 1  @USER @USER @USER He makes an insane about of ...
3                 1  @USER @USER Meanwhile Trump won't even release...
4                 1  @USER @USER Pretty Sure the Anti-Lincoln Crowd...
...             ...                                                ...
28614             1       jews to celebrate rosh hashasha or something
28615             1  internal affairs investigator disappointed con...
28616             0  the most beautiful acceptance speech this week...
28617             1  mars probe destroyed by orbiting spielberg-gat...
28618             1                 dad clarifies this not a food stop

[33619 rows x 2 columns]


# Preprocessing

In [None]:
# Matches http:// and https:// URLs (raw string so `\(` / `\)` are not invalid escapes).
_URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

# Character class used to strip emoji and other pictographs.
# NOTE(review): the last range (U+24C2..U+1F251) is much wider than emoji and also
# covers e.g. CJK characters; kept as-is to preserve the existing cleaning behaviour.
_EMOJI_RE = re.compile("["
                       u"\U0001F600-\U0001FFFF"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"  # miscellaneous symbols
                       u"\U000024C2-\U0001F251"
                       "]+", flags=re.UNICODE)

# Contraction expansions, applied in this order on lower-cased text.
# `didn?'t` / `haven?'t` expand the real contractions and still accept the
# misspelled forms ("did't", "have't") that were matched previously.
_CONTRACTIONS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"don't", "do not"),
        (r"didn?'t", "did not"),
        (r"can't", "can not"),
        (r"it's", "it is"),
        (r"couldn't", "could not"),
        (r"haven?'t", "have not"),
    )
)

# Punctuation removed once the contractions have been expanded.
_PUNCT_RE = re.compile(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-]")


def clean_text(text):
    """Normalise one raw headline/tweet before tokenisation.

    Steps, in order: lower-case, turn typographic apostrophes into ASCII ones,
    remove URLs, drop whitespace-separated tokens that start with ``@``
    (user mentions), strip emoji, expand common English contractions and
    finally remove punctuation characters.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # U+2019 is what many keyboards emit for an apostrophe; without this step
    # none of the contraction rules below would match such text.
    text = text.lower().replace("\u2019", "'")
    text = _URL_RE.sub('', text)
    text = " ".join(token for token in text.split() if token[0] != '@')
    text = _EMOJI_RE.sub('', text)
    for pattern, replacement in _CONTRACTIONS:
        text = pattern.sub(replacement, text)
    return _PUNCT_RE.sub("", text)

In [None]:
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources needed below: the 'punkt' tokenizer data and the
# 'stopwords' corpus (same packages, same order as before).
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

def CleanTokenize(df):
    """Clean and tokenize every entry of ``df["headline"]``.

    Each headline is passed through ``clean_text``, split with
    ``word_tokenize``, stripped of punctuation characters, and filtered so
    that only purely alphabetic, non-stop-word tokens remain.

    Args:
        df: A DataFrame with a ``"headline"`` column of strings.

    Returns:
        A list with one list of tokens per row of ``df``, in row order.
    """
    # Both lookups are the same for every headline, so build them once
    # instead of once per row.
    punct_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words("english"))

    head_lines = []
    for line in df["headline"].values.tolist():
        tokens = word_tokenize(clean_text(line))
        # Remove punctuation inside each token, e.g. "well-known" -> "wellknown".
        stripped = (token.translate(punct_table) for token in tokens)
        # Keep alphabetic tokens only (drops numbers and now-empty tokens) and skip stop words.
        head_lines.append([w for w in stripped if w.isalpha() and w not in stop_words])
    return head_lines

# Clean and tokenize every headline: one list of tokens per row of `data`.
head_lines = CleanTokenize(data)
# Print the first cleaned headline as a sanity check; a bare expression in the
# middle of a cell is evaluated but never displayed.
print(head_lines[0:1])
print(type(head_lines))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
<class 'list'>


In [None]:
validation_split = 0.2  # fraction of the samples held out for validation
max_length = 25         # every sequence is padded/truncated to this many tokens
SEED = 42               # fixes the shuffle so the split below is reproducible

# Fit the vocabulary on the cleaned token lists and map every token to its integer id.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(head_lines)
sequences = tokenizer_obj.texts_to_sequences(head_lines)

word_index = tokenizer_obj.word_index  # token -> integer id
print("unique tokens - ",len(word_index))
vocab_size = len(word_index) + 1  # +1 because token ids start at 1, not 0
print('vocab size -', vocab_size)

# Pad (or truncate) every sequence to max_length; zeros are appended at the end.
lines_pad = pad_sequences(sequences, maxlen=max_length, padding='post')
sentiment = data['is_sarcastic'].values  # labels, aligned row-for-row with lines_pad

# Shuffle samples and labels with the same seeded permutation.
indices = np.random.default_rng(SEED).permutation(lines_pad.shape[0])
lines_pad = lines_pad[indices]
sentiment = sentiment[indices]

num_validation_samples = int(validation_split * lines_pad.shape[0])
# Split on an explicit position: slicing with [:-n] would return an empty
# training set when n == 0.
split_at = lines_pad.shape[0] - num_validation_samples

X_train_pad = lines_pad[:split_at]
y_train = sentiment[:split_at]
X_test_pad = lines_pad[split_at:]
y_test = sentiment[split_at:]

unique tokens -  33331
vocab size - 33332


In [None]:
# Report the dimensions of each split as a quick sanity check.
for name, arr in (
    ('X_train_pad', X_train_pad),
    ('y_train', y_train),
    ('X_test_pad', X_test_pad),
    ('y_test', y_test),
):
    print(f'Shape of {name}:', arr.shape)

Shape of X_train_pad: (26896, 25)
Shape of y_train: (26896,)
Shape of X_test_pad: (6723, 25)
Shape of y_test: (6723,)


In [None]:
embeddings_index = {}  # word -> float32 vector of length embedding_dim
embedding_dim = 100
# NOTE: despite its name this is the path of the GloVe text file, not a directory.
GLOVE_DIR = "/content/drive/MyDrive/NLP_Project/glove.twitter.27B.100d.txt"

# `with` closes the file even if a line fails to parse.
with open(GLOVE_DIR, encoding = "utf-8") as f:
    for line in f:
        values = line.split()  # expected layout: word followed by embedding_dim numbers
        if len(values) != embedding_dim + 1:
            # Skip lines that do not have exactly one word plus embedding_dim
            # coefficients (e.g. a truncated last line); storing them would put
            # a vector of the wrong length into the index.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 4907 word vectors.


In [None]:
# One row per token id (row 0 stays all-zero); rows of tokens without a
# pre-trained vector are left as zeros as well.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
c = 0  # number of vocabulary words found in embeddings_index
for word, i in word_index.items():
    vector = embeddings_index.get(word)
    if vector is None:
        continue
    embedding_matrix[i] = vector
    c += 1
print(c)

2292


In [None]:
# Embedding layer initialised from the pre-trained matrix and kept frozen.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,   # number of rows in embedding_matrix
    output_dim=embedding_dim,        # size of each embedding vector
    weights=[embedding_matrix],      # pre-trained vectors
    input_length=max_length,         # length of the padded input sequences
    trainable=False,                 # do not update the vectors during training
)

In [None]:
# Frozen embeddings -> single LSTM (64 units) -> sigmoid unit for the binary label.
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.25))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
print('Summary of the built model...')
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

Summary of the built model...
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 100)           3333200   
                                                                 
 lstm_2 (LSTM)               (None, 64)                42240     
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 3,375,505
Trainable params: 42,305
Non-trainable params: 3,333,200
_________________________________________________________________
None


In [None]:
# Train for 25 epochs; the held-out split is evaluated after every epoch and
# verbose=2 prints one summary line per epoch.
history = model.fit(
    X_train_pad,
    y_train,
    batch_size=32,
    epochs=25,
    validation_data=(X_test_pad, y_test),
    verbose=2,
)

Epoch 1/25
841/841 - 34s - loss: 0.6445 - acc: 0.6271 - val_loss: 0.6127 - val_acc: 0.6664 - 34s/epoch - 40ms/step
Epoch 2/25
841/841 - 30s - loss: 0.6060 - acc: 0.6657 - val_loss: 0.5852 - val_acc: 0.6836 - 30s/epoch - 35ms/step
Epoch 3/25
841/841 - 30s - loss: 0.5858 - acc: 0.6824 - val_loss: 0.5816 - val_acc: 0.6853 - 30s/epoch - 35ms/step
Epoch 4/25
841/841 - 29s - loss: 0.5707 - acc: 0.6939 - val_loss: 0.5644 - val_acc: 0.6921 - 29s/epoch - 34ms/step
Epoch 5/25
841/841 - 28s - loss: 0.5603 - acc: 0.7007 - val_loss: 0.5669 - val_acc: 0.6909 - 28s/epoch - 34ms/step
Epoch 6/25
841/841 - 28s - loss: 0.5511 - acc: 0.7088 - val_loss: 0.5609 - val_acc: 0.7018 - 28s/epoch - 34ms/step
Epoch 7/25
841/841 - 28s - loss: 0.5426 - acc: 0.7174 - val_loss: 0.5556 - val_acc: 0.7041 - 28s/epoch - 34ms/step
Epoch 8/25
841/841 - 29s - loss: 0.5365 - acc: 0.7194 - val_loss: 0.5535 - val_acc: 0.7037 - 29s/epoch - 34ms/step
Epoch 9/25
841/841 - 28s - loss: 0.5268 - acc: 0.7254 - val_loss: 0.5533 - val_a

In [None]:
def predict_sarcasm(s):
    """Classify a single sentence with the trained model.

    The sentence goes through the same pipeline as the training data
    (``CleanTokenize`` -> ``tokenizer_obj`` -> padding to ``max_length``) and
    the model output, scaled to a percentage, is compared against 50.

    Args:
        s: The sentence to classify.

    Returns:
        "It's a sarcasm!" when the scaled score is at least 50, otherwise
        "It's not a sarcasm.".
    """
    frame = pd.DataFrame({"headline": [s]})
    token_lists = CleanTokenize(frame)
    padded = pad_sequences(
        tokenizer_obj.texts_to_sequences(token_lists),
        maxlen=max_length,
        padding='post',
    )
    percent = model.predict(padded) * 100
    return "It's a sarcasm!" if percent[0][0] >= 50 else "It's not a sarcasm."

In [None]:
# Spot check on a hand-written, multi-sentence example.
predict_sarcasm("I was depressed. He asked me to be happy. I am not depressed anymore.")

"It's a sarcasm!"

In [None]:
# Spot check: praise attached to a negative event.
predict_sarcasm("You just broke my car window. Great job.")

"It's a sarcasm!"

In [None]:
# Spot check: a plain statement; compare with the near-identical sentence in the next cell.
predict_sarcasm("I want a million dollars!")

"It's not a sarcasm."

In [None]:
# Spot check: differs from the previous example only in "just won" vs "want".
predict_sarcasm("I just won a million dollars!")

"It's a sarcasm!"

In [None]:
# Spot check on a short opinionated sentence.
predict_sarcasm("I guess there is stupid everywhere")

"It's a sarcasm!"

In [None]:
# Spot check: neutral sentence; note it contains a typographic apostrophe (’).
predict_sarcasm("I’d love to discuss ideas and approaches to detect ")

"It's not a sarcasm."

In [None]:
# Spot check on a headline-style input.
# NOTE(review): this reads like a genuine news headline, yet the recorded output is
# "It's a sarcasm!" — presumably a false positive; verify against the dataset label.
predict_sarcasm("qatar deporting dutch woman who reported she was drugged and raped")

"It's a sarcasm!"

In [None]:
# Spot check: a direct insult with no obvious irony.
predict_sarcasm("you are stupid")

"It's a sarcasm!"