In [1]:
import numpy as np
import pandas as pd
import gensim
import os
from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, LSTM
# GRU and LSTM are recurrent (RNN) layers, used for sequence learning
from keras.layers import Embedding
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

In [2]:
# The dataset is stored in JSON Lines form: one JSON record per line of the file.
data = pd.read_json(path_or_buf="Sarcasm_Headlines_Dataset.json", lines=True)
df = pd.DataFrame(data=data)

In [3]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [4]:
# Pull out the target labels, then keep only the remaining feature column(s):
# the label and the article URL are removed from the feature frame.
y = df.loc[:, 'is_sarcastic']
df = df.drop(['is_sarcastic', 'article_link'], axis=1)

In [5]:
# Preview the first five rows of the feature frame.
df.head(n=5)

Unnamed: 0,headline
0,former versace store clerk sues over secret 'b...
1,the 'roseanne' revival catches up to our thorn...
2,mom starting to fear son's web series closest ...
3,"boehner just wants wife to listen, not come up..."
4,j.k. rowling wishes snape happy birthday in th...


In [6]:
# Preview the first five labels.
y.head(n=5)

0    0
1    0
2    1
3    1
4    0
Name: is_sarcastic, dtype: int64

In [7]:
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [8]:
# Display the feature frame; the recorded output shows pandas eliding the middle rows.
df

Unnamed: 0,headline
0,former versace store clerk sues over secret 'b...
1,the 'roseanne' revival catches up to our thorn...
2,mom starting to fear son's web series closest ...
3,"boehner just wants wife to listen, not come up..."
4,j.k. rowling wishes snape happy birthday in th...
...,...
26704,american politics in moral free-fall
26705,america's best 20 hikes
26706,reparations and obama
26707,israeli ban targeting boycott supporters raise...


In [9]:

def clean_text(df):
    """Normalize the ``headline`` column of ``df`` into cleaned, stemmed strings.

    Each headline is lower-cased, stripped of URLs and punctuation, tokenized
    with NLTK, reduced to alphabetic tokens that are not English stop words
    ("not" is deliberately kept), and Porter-stemmed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``headline`` column.

    Returns
    -------
    list of str
        Exactly one cleaned string per input row, in row order. A row whose
        headline is not a string yields ``""`` instead of being dropped, so
        the result stays aligned with row-wise labels.

    Raises
    ------
    LookupError
        Propagated from NLTK when the stop-word corpus or tokenizer data is
        not installed. This is no longer silently swallowed, because
        swallowing it made the function return an empty list.
    """
    # Loop-invariant helpers are built once rather than once per headline.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    symbol_pattern = re.compile(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]")
    table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words("english"))
    stop_words.discard("not")  # negation carries signal, so it is not filtered out
    stemmer = PorterStemmer()

    all_reviews = []
    for text in df["headline"].values.tolist():
        if not isinstance(text, str):
            # Keep a placeholder so len(result) == len(df) and labels stay aligned.
            all_reviews.append("")
            continue
        text = url_pattern.sub('', text.lower())
        text = symbol_pattern.sub("", text)
        stripped = (w.translate(table) for w in word_tokenize(text))
        words = [
            stemmer.stem(w)
            for w in stripped
            if w.isalpha() and w not in stop_words
        ]
        all_reviews.append(' '.join(words))
    return all_reviews

# Clean every headline, then preview the first twenty cleaned strings.
all_reviews = clean_text(df)
all_reviews[:20]


['former versac store clerk sue secret black code minor shopper',
 'roseann reviv catch thorni polit mood better wors',
 'mom start fear son web seri closest thing grandchild',
 'boehner want wife listen not come altern debtreduct idea',
 'jk rowl wish snape happi birthday magic way',
 'advanc world women',
 'fascin case eat labgrown meat',
 'ceo send kid school work compani',
 'top snake handler leav sink huckabe campaign',
 'friday morn email insid trump presser age',
 'airlin passeng tackl man rush cockpit bomb threat',
 'facebook reportedli work healthcar featur app',
 'north korea prais trump urg us voter reject dull hillari',
 'actual cnn jeffrey lord indefens',
 'barcelona hold huge protest support refuge',
 'nuclear bomb deton rehears spiderman music',
 'cosbi lawyer ask accus nt come forward smear legal team year ago',
 'stock analyst confus frighten boar market',
 'bloomberg program build better citi got bigger',
 'craig hick indict']

In [10]:
# Bag-of-words count features; terms appearing in fewer than 5 documents are ignored.
from sklearn.feature_extraction.text import CountVectorizer

CV = CountVectorizer(min_df=5)

# fit_transform returns a sparse matrix; it is densified into a NumPy array here.
count_matrix = CV.fit_transform(all_reviews)
X = count_matrix.toarray()


In [11]:
from sklearn.model_selection import train_test_split

# 80/20 split of the count features; the "c" suffix marks the count-based variables.
count_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_trainc, X_testc, y_trainc, y_testc = count_split

In [12]:
# TF-IDF features with the same min_df cut-off.
# This rebinds CV and X, which previously held the count vectorizer and count matrix.
from sklearn.feature_extraction.text import TfidfVectorizer

CV = TfidfVectorizer(min_df=5)
tfidf_matrix = CV.fit_transform(all_reviews)
X = tfidf_matrix.toarray()

In [13]:
from sklearn.model_selection import train_test_split

# 80/20 split of the TF-IDF features; the "t" suffix marks the TF-IDF-based variables.
tfidf_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_traint, X_testt, y_traint, y_testt = tfidf_split

In [14]:
from sklearn.model_selection import train_test_split

# NOTE: despite its name, this value is passed as train_size in the split below.
validation_split = 0.8
# Every sequence is padded or truncated to this many tokens.
max_length = 20

# Build the word -> integer index over the cleaned headlines and encode them.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(all_reviews)
word_index = tokenizer_obj.word_index
sequences = tokenizer_obj.texts_to_sequences(all_reviews)

print(f"unique tokens - {len(word_index)}")
# One more than the number of indexed words, leaving room for index 0,
# which is the default padding value written by pad_sequences.
vocab_size = len(word_index) + 1
print(f"vocab_size - {vocab_size}")

# Short sequences are padded at the end ('post').
lines_pad = pad_sequences(sequences, maxlen=max_length, padding='post')

X_train, X_test, y_train, y_test = train_test_split(
    lines_pad, y, train_size=validation_split, random_state=42
)


unique tokens - 18859
vocab_size - 18860


In [15]:
# Report the shapes of the padded-sequence split.
for label, arr in (
    ("X_train_pad", X_train),
    ("y_train", y_train),
    ("X_test_pad", X_test),
    ("y_test", y_test),
):
    print(f"Shape of {label}:", arr.shape)

Shape of X_train_pad: (21367, 20)
Shape of y_train: (21367,)
Shape of X_test_pad: (5342, 20)
Shape of y_test: (5342,)


We use pretrained GloVe embeddings to obtain a vector representation for each word, and then build the mapping from the words in
our vocabulary to those numeric vectors.

In [16]:
# Load the pretrained GloVe vectors into a dict: word -> float32 coefficient array.
embedding_index = {}
embedding_dim = 300
# The directory can be overridden with the GLOVE_DIR environment variable; the
# default is the original location. A raw string is used because "\G" is not a
# valid escape sequence.
glove_dir = os.environ.get("GLOVE_DIR", r"D:\GloveModel")
# The context manager closes the file even if a line fails to parse.
with open(os.path.join(glove_dir, 'glove.6B.300d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coeff = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coeff


In [17]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Words absent from GloVe keep an all-zero row.
# NOTE(review): the headlines were Porter-stemmed in clean_text, so stems may not
# match GloVe's vocabulary; coverage is worth measuring.
n_rows = len(word_index) + 1
embedding_matrix = np.zeros((n_rows, embedding_dim))
for token, row in word_index.items():
    if token in embedding_index:
        embedding_matrix[row] = embedding_index[token]

In [33]:
from keras.optimizers import Adam, SGD, RMSprop, Adadelta, Adagrad, Adamax, Nadam, Ftrl
from keras.layers import Flatten
from keras.layers import Dense,Embedding,LSTM,Dropout,Bidirectional,GRU

# The notebook used `embedding_layer` without defining it in any cell, so a fresh
# kernel failed here with a NameError. It is defined here as a frozen embedding
# initialised from the GloVe matrix, matching the recorded summary:
# output (None, 20, 300) with 5,658,000 non-trainable parameters.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=max_length,
    trainable=False,
)

# Frozen embeddings -> bidirectional LSTM -> single sigmoid unit for binary output.
model_glove = Sequential()
model_glove.add(embedding_layer)
model_glove.add(Bidirectional(LSTM(units=32, recurrent_dropout = 0.4, dropout = 0.4)))
model_glove.add(Dense(1, activation='sigmoid'))

model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model_glove.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 300)           5658000   
                                                                 
 bidirectional_2 (Bidirectio  (None, 64)               85248     
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5,743,313
Trainable params: 85,313
Non-trainable params: 5,658,000
_________________________________________________________________
None


In [39]:
import tensorflow as tf

# Stop on the validation loss rather than the training loss: training loss tends to
# keep falling while the model overfits, so monitoring it rarely triggers a useful
# stop. restore_best_weights rolls the model back to the best monitored epoch.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=4, restore_best_weights=True
)

# NOTE(review): the test split doubles as validation data here, so the test accuracy
# reported later is optimistic; consider carving a separate validation split.
model_glove.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1,
    callbacks=[callback],
)


Epoch 1/100


Epoch 2/100

KeyboardInterrupt: 

In [43]:
from sklearn.metrics import accuracy_score

# The sigmoid output is a probability; values above 0.5 are labelled 1, the rest 0.
probabilities = model_glove.predict(X_test)
y_pred = (probabilities > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8013852489704231


In [237]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Gradient-boosted trees on the count features. Both splits are passed as eval sets,
# so the log reports train (validation_0) and test (validation_1) loss per round.
# NOTE(review): passing early_stopping_rounds to fit() may be rejected by newer
# xgboost releases; confirm against the installed version.
eval_sets = [(X_trainc, y_trainc), (X_testc, y_testc)]
model = xgb.XGBClassifier()
# The trailing ';' keeps the cell from echoing the fitted estimator.
model.fit(X_trainc, y_trainc, eval_set=eval_sets, early_stopping_rounds=10);




[0]	validation_0-logloss:0.66346	validation_1-logloss:0.66543
[1]	validation_0-logloss:0.64776	validation_1-logloss:0.65100
[2]	validation_0-logloss:0.63789	validation_1-logloss:0.64083
[3]	validation_0-logloss:0.63004	validation_1-logloss:0.63360
[4]	validation_0-logloss:0.62497	validation_1-logloss:0.62954
[5]	validation_0-logloss:0.62057	validation_1-logloss:0.62539
[6]	validation_0-logloss:0.61642	validation_1-logloss:0.62099
[7]	validation_0-logloss:0.61313	validation_1-logloss:0.61764
[8]	validation_0-logloss:0.60974	validation_1-logloss:0.61536
[9]	validation_0-logloss:0.60715	validation_1-logloss:0.61276
[10]	validation_0-logloss:0.60446	validation_1-logloss:0.61095
[11]	validation_0-logloss:0.60120	validation_1-logloss:0.60783
[12]	validation_0-logloss:0.59863	validation_1-logloss:0.60504
[13]	validation_0-logloss:0.59638	validation_1-logloss:0.60304
[14]	validation_0-logloss:0.59414	validation_1-logloss:0.60073
[15]	validation_0-logloss:0.59200	validation_1-logloss:0.59861
[1

In [238]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_testc)
# Harmless if predict() already returns 0/1 labels; thresholds them otherwise.
y_pred = np.where(y_pred > 0.5, 1, 0)

# Score against y_testc, the labels produced by the same split as X_testc.
# The original compared against y_test, which comes from the padded-sequence split.
accuracy = accuracy_score(y_testc, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7283788843129914


In [239]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Gradient-boosted trees on the TF-IDF features. Both splits are passed as eval sets,
# so the log reports train (validation_0) and test (validation_1) loss per round.
# NOTE(review): passing early_stopping_rounds to fit() may be rejected by newer
# xgboost releases; confirm against the installed version.
eval_sets = [(X_traint, y_traint), (X_testt, y_testt)]
model = xgb.XGBClassifier()
# The trailing ';' keeps the cell from echoing the fitted estimator.
model.fit(X_traint, y_traint, eval_set=eval_sets, early_stopping_rounds=10);




[0]	validation_0-logloss:0.66298	validation_1-logloss:0.66517
[1]	validation_0-logloss:0.64681	validation_1-logloss:0.65042
[2]	validation_0-logloss:0.63605	validation_1-logloss:0.64031
[3]	validation_0-logloss:0.62802	validation_1-logloss:0.63267
[4]	validation_0-logloss:0.62195	validation_1-logloss:0.62769
[5]	validation_0-logloss:0.61759	validation_1-logloss:0.62450
[6]	validation_0-logloss:0.61323	validation_1-logloss:0.62137
[7]	validation_0-logloss:0.60965	validation_1-logloss:0.61774
[8]	validation_0-logloss:0.60620	validation_1-logloss:0.61448
[9]	validation_0-logloss:0.60307	validation_1-logloss:0.61184
[10]	validation_0-logloss:0.60016	validation_1-logloss:0.60958
[11]	validation_0-logloss:0.59760	validation_1-logloss:0.60794
[12]	validation_0-logloss:0.59502	validation_1-logloss:0.60557
[13]	validation_0-logloss:0.59221	validation_1-logloss:0.60309
[14]	validation_0-logloss:0.58959	validation_1-logloss:0.60068
[15]	validation_0-logloss:0.58743	validation_1-logloss:0.59853
[1

In [37]:
from sklearn.metrics import accuracy_score

# NOTE: `model` must be the estimator fitted on the TF-IDF features; the recorded
# NameError shows this cell was once run before `model` existed in the kernel.
y_pred = model.predict(X_testt)
# Harmless if predict() already returns 0/1 labels; thresholds them otherwise.
y_pred = np.where(y_pred > 0.5, 1, 0)

# Score against y_testt, the labels produced by the same split as X_testt.
# The original compared against y_test, which comes from the padded-sequence split.
accuracy = accuracy_score(y_testt, y_pred)
print(f"Accuracy: {accuracy}")

NameError: name 'model' is not defined

In [241]:

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score

# Gaussian Naive Bayes baseline on the count features.
model = GaussianNB().fit(X_trainc, y_trainc)
y_pred = model.predict(X_testc)

# score() reports mean accuracy on the given test data and labels.
print(model.score(X_testc, y_testc))

0.675215275177836


In [242]:

from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, f1_score, precision_score

# Bernoulli Naive Bayes baseline on the count features.
model = BernoulliNB()
model.fit(X_trainc, y_trainc)

# The model is fitted on the count features, so it is evaluated on the matching
# count-based test split. The original predicted and scored on the TF-IDF split.
y_pred = model.predict(X_testc)

print(model.score(X_testc, y_testc))

0.7869711718457506


In [41]:
# NOTE(review): the recorded output below is the return value of dump(), so this cell
# was run with these lines active at some point; presumably they were commented out
# afterwards to avoid overwriting the saved file. Confirm before re-enabling.
# from joblib import dump
# dump(model_glove,"NNmodelsarcasm.joblib")

['NNmodelsarcasm.joblib']

In [42]:
from joblib import load

# joblib.load takes the filename (plus an optional mmap_mode) and returns the stored
# object. The original passed model_glove as the second argument and discarded the
# return value, so the loaded model was never bound to a name.
# SECURITY: joblib files are pickle-based; only load files from a trusted source.
model_glove = load("NNmodelsarcasm.joblib")
model_glove

<keras.engine.sequential.Sequential at 0x2216499bd10>

In [21]:
# NOTE(review): the recorded output below is the return value of dump(), so this cell
# was run with these lines active at some point; presumably they were commented out
# afterwards to avoid overwriting the saved file. Confirm before re-enabling.
# from joblib import dump
# dump(model_glove,"NNtestemodelsarcasm.joblib")

['NNtestemodelsarcasm.joblib']

In [245]:
def predict_sarcasm(s):
    """Classify a single sentence as sarcastic or not with ``model_glove``.

    The sentence goes through the same pipeline as the training data:
    ``clean_text``, the fitted ``tokenizer_obj``, and post-padding to
    ``max_length`` tokens.

    Parameters
    ----------
    s : str
        The sentence to classify.

    Returns
    -------
    str
        ``"It's a sarcasm!"`` when the predicted probability is at least 0.5,
        otherwise ``"It's not a sarcasm."``.
    """
    x_final = pd.DataFrame({"headline": [s]})
    test_lines = clean_text(x_final)
    # These two lines were over-indented in the original, which is an IndentationError.
    test_sequences = tokenizer_obj.texts_to_sequences(test_lines)
    test_review_pad = pad_sequences(test_sequences, maxlen=max_length, padding='post')
    pred = model_glove.predict(test_review_pad)
    # pred has one row per input and one sigmoid output per row.
    if pred[0][0] >= 0.5:
        return "It's a sarcasm!"
    return "It's not a sarcasm."

In [246]:
# Manual spot check of predict_sarcasm on a hand-written sentence.
predict_sarcasm("I was depressed. He asked me to be happy. I am not depressed anymore.")




"It's not a sarcasm."

In [247]:
# Manual spot check of predict_sarcasm on a hand-written sentence.
predict_sarcasm("You just broke my car window. Great job.")



"It's a sarcasm!"

In [248]:
# Manual spot check of predict_sarcasm on a hand-written sentence.
predict_sarcasm("You just saved my dog's life. Thanks a million.")



"It's not a sarcasm."

In [249]:
# Manual spot check of predict_sarcasm on a hand-written sentence.
predict_sarcasm("I want a million dollars!")



"It's not a sarcasm."

In [250]:
# Manual spot check of predict_sarcasm on a hand-written sentence.
predict_sarcasm("I just won a million dollars!")




"It's a sarcasm!"

In [251]:
# Manual spot check of predict_sarcasm on a hand-written sentence.
predict_sarcasm("His name is Bob. He is a nice person.")




"It's a sarcasm!"

In [252]:
# Manual spot check of predict_sarcasm on a hand-written sentence.
predict_sarcasm("Sarcasm is very easy to detect.")




"It's a sarcasm!"

In [253]:
# Manual spot check of predict_sarcasm on a hand-written sentence.
predict_sarcasm("That's just what I needed today!")



"It's not a sarcasm."

In [254]:
# Manual spot check of predict_sarcasm on a hand-written sentence.
predict_sarcasm("I work 40 hours a week for me to be this poor.")



"It's not a sarcasm."