## Imports

In [2]:
# loading data
import pandas as pd
import string
#visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
# numeric pro
import numpy as np
from keras import callbacks

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Regular Expression for text cleaning
import re

# keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import GlobalMaxPooling1D, Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from wordcloud import WordCloud, STOPWORDS

#nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords



# Word2vec
import gensim
#sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import auc

## Data exploration & Analysis

In [3]:
df_train = pd.read_csv("/kaggle/input/sarcasm/train-balanced-sarcasm.csv")


In [4]:
df_train.head(4)

In [5]:
df_train.describe()

In [6]:
df_train.columns

In [7]:
len(df_train)

In [8]:
df_train.isnull().sum()

In [9]:
df_train = df_train.drop(columns={'author','date','created_utc','subreddit','score','ups','downs','parent_comment'})

In [10]:
df_train

In [11]:
df_train['label'].value_counts()

#### Data Distribution

In [12]:
# Class balance: number of comments for each label value.
# The frame is built with explicit column names so the plot does not depend on
# how the installed pandas version names the result of value_counts().
cum = (
    df_train['label']
    .value_counts()
    .rename_axis('label')
    .reset_index(name='count')
)
cumfig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(data=cum, x='label', y='count', ax=ax)
ax.set(title='Comments per label', xlabel='label', ylabel='number of comments');

#### Length of sarcastic and non-sarcastic comments

In [13]:
sns.boxplot(x= df_train.loc[df_train['label'] == 1, 'comment'].str.len()).set(title = 'Length of Sarcastic Comments', xlabel = 'Length')
sns.despine(offset=5, trim=True)
plt.show()

In [14]:
sns.boxplot(x= df_train.loc[df_train['label'] == 0, 'comment'].str.len()).set(title = 'Length of No Sarcastic Comments', xlabel = 'Length')
sns.despine(offset=5, trim=True)
plt.show()

#### word cloud

In [15]:
wordcloud = WordCloud(background_color='grey', stopwords=STOPWORDS,
                      max_words=500, max_font_size=100,
                      random_state=17, width=800, height=400)

# Join the actual comment texts. str(Series) would only yield the truncated
# console repr (index values, "...", dtype footer), not the comment corpus.
sarcastic_text = " ".join(
    df_train.loc[df_train['label'] == 1, 'comment'].dropna().astype(str)
)

plt.figure(figsize=(12, 12))
wordcloud.generate(sarcastic_text)
# Pass the flag positionally: the keyword was renamed from `b` to `visible`
# in newer Matplotlib releases, the positional form works with both.
plt.grid(False)
plt.imshow(wordcloud);

## Data preprocessing

Machine learning models do not handle abbreviations and contractions well, so we replace contracted words (e.g. "isn't") with their expanded form ("is not").

In [16]:
# Keep only the first half of the rows (presumably to keep preprocessing and
# training time manageable - TODO confirm).
# NOTE: .loc slices by label and includes the end label, so this keeps the rows
# labelled 0 .. len/2 inclusive.
# NOTE: df_train is reassigned from itself, so re-running this cell halves the
# frame again.
df_train = df_train.loc[0:int(len(df_train)/2)]
len(df_train)

In [17]:
contractions = pd.read_csv("/kaggle/input/contractions/contractions.csv")
contractions.head(4)

In [18]:
# we have some emojis
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

In [19]:
word = "isn't"
ww = word in contractions.Contraction.values
contractions[contractions.Contraction==word].Meaning.values[0]

In [20]:
def _contraction_lookup(frame):
    """Build a {contraction: meaning} dict from a frame with `Contraction` and `Meaning` columns.

    When a contraction is listed more than once, the first row wins.
    """
    lookup = {}
    for short, full in zip(frame.Contraction.values, frame.Meaning.values):
        lookup.setdefault(short, full)
    return lookup


def cleaning_text(s, contraction_map=None, emoji_map=None):
    """Normalise one comment for vectorisation / tokenisation.

    Steps, in order: cast to ``str``, lowercase and strip; expand contractions
    word by word; replace emoticons by their textual name; delete punctuation;
    collapse runs of spaces.

    Parameters
    ----------
    s : object
        The raw comment. It is passed through ``str()``, so non-string values
        (e.g. ``None``) are cleaned as their string form.
    contraction_map : dict, optional
        ``{contraction: expansion}``. Defaults to a lookup built from the
        notebook-level ``contractions`` frame.
    emoji_map : dict, optional
        ``{emoticon: name}``. Defaults to the notebook-level ``emojis`` dict.

    Returns
    -------
    str
        The cleaned comment.
    """
    if contraction_map is None:
        # Build the dict once per `contractions` object instead of scanning the
        # DataFrame for every single word of every comment.
        cached = getattr(cleaning_text, "_contraction_cache", None)
        if cached is None or cached[0] is not contractions:
            cached = (contractions, _contraction_lookup(contractions))
            cleaning_text._contraction_cache = cached
        contraction_map = cached[1]
    if emoji_map is None:
        emoji_map = emojis
    # The text is lowercased before matching, so emoticon keys containing
    # uppercase letters (':P', ':-D', ...) must be lowercased to ever match.
    emoji_lookup = {key.lower(): value for key, value in emoji_map.items()}

    s = str(s).lower().strip()
    # Expansions may contain several words, hence the re-split before the
    # emoticon pass.
    s = " ".join(contraction_map.get(word, word) for word in s.split())
    s = " ".join(emoji_lookup.get(word, word) for word in s.split())

    # Drop any newline that a replacement value may have introduced.
    s = s.replace('\n', '')
    # Delete punctuation characters (they are removed, not replaced by spaces).
    s = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-]", "", s)
    # Collapse runs of two or more spaces left behind by the deletions.
    s = re.sub('[ ]{2,}', ' ', s).strip()
    return s

In [21]:
df_train.comment = df_train.comment.apply(cleaning_text)
comments = df_train['comment'].values
labels = df_train['label'].values

In [22]:
comments[3]

In [23]:
comments

In [24]:
df_train

## Vectorize

In [25]:
# settings that you use for count vectorizer will go here 
vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
 
data_train= vectorizer.fit_transform(comments)
data_train

In [26]:
# Fixed seed so the baseline scores are reproducible from run to run;
# stratify keeps the label ratio the same in the train and test parts.
c_t, c_test, l_t, l_test = train_test_split(
    data_train, labels, test_size=0.20, random_state=42, stratify=labels
)

In [27]:
data_train

In [28]:
print(vectorizer.idf_)

# Baseline models

## Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression

classifier1 = LogisticRegression(solver='lbfgs', max_iter=1000)
classifier1.fit(c_t, l_t)
score1 = classifier1.score(c_test, l_test)

print("Accuracy:", score1)

In [30]:
l_pred1 = classifier1.predict(c_test)

In [31]:
from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(l_test, l_pred1)
cm1

In [32]:
# plot_confusion_matrix was deprecated and later removed from scikit-learn;
# ConfusionMatrixDisplay draws the same figure from the matrix computed above.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay(confusion_matrix=cm1, display_labels=classifier1.classes_).plot();

## Support vector

In [33]:
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
 
classifier2 = LinearSVC()
classifier2.fit(c_t, l_t)
 #score
score2 = classifier2.score(c_test, l_test)
print("Accuracy:", score2)
# predicting test set results
l_pred2 = classifier2.predict(c_test)
 
# making the confusion matrix
cm2 = confusion_matrix(l_test, l_pred2)
cm2

## Naive bayes

In [34]:
from sklearn.naive_bayes import MultinomialNB
 
classifier3 = MultinomialNB().fit(c_t, l_t)
 
# predicting test set results
l_pred3 = classifier3.predict(c_test)
#score
score3 = classifier3.score(c_test, l_test)
print("Accuracy:", score3)
# making the confusion matrix
cm3 = confusion_matrix(l_test, l_pred3)
cm3


## Stochastic gradient descent

In [35]:
from sklearn.linear_model import SGDClassifier
 
classifier4 = SGDClassifier().fit(c_t, l_t)
 
# predicting test set results
l_pred4 = classifier4.predict(c_test)
#score
score4 = classifier4.score(c_test, l_test)
print("Accuracy:", score4)
# making the confusion matrix
cm4 = confusion_matrix(l_test, l_pred4)
cm4


# Neural Network model

#### Without pretrained word embeddings

In [36]:
# Fraction of the comments held out to evaluate the neural models.
validation_split = 0.2
# Seeded, stratified split so the evaluation numbers are reproducible and both
# parts keep the same label ratio.
com_tr, com_val, lab_tr, lab_val = train_test_split(
    df_train['comment'],
    df_train['label'],
    test_size=validation_split,
    random_state=42,
    stratify=df_train['label'],
)


In [37]:
# Learn the word -> integer vocabulary from the training comments only.
tok = Tokenizer()
tok.fit_on_texts(com_tr)

# Encode each split with that vocabulary, then pad / truncate every sequence
# to 250 entries so all rows share one length.
com_tr_seq = pad_sequences(tok.texts_to_sequences(com_tr), maxlen=250)
com_val_seq = pad_sequences(tok.texts_to_sequences(com_val), maxlen=250)

In [38]:
word_index = tok.word_index
print("unique tokens - ",len(word_index))
vocab_size = len(word_index) + 1
print('vocab size -', vocab_size)

In [39]:
from tensorflow.keras.models import Sequential


In [48]:
def training(model, nepch, bsize):
    """Fit `model` on the padded training sequences with early stopping.

    Reads the notebook-level `com_tr_seq` (inputs) and `lab_tr` (targets).
    20% of that data is held out by Keras as validation data, and training
    stops early once `val_loss` has not improved for 3 consecutive epochs.

    Parameters
    ----------
    model : compiled Keras model; it is trained in place.
    nepch : maximum number of epochs.
    bsize : batch size.

    Returns
    -------
    The History object produced by `model.fit`, so the loss / accuracy curves
    can be inspected or plotted afterwards. End the calling cell with `;` to
    keep its repr out of the cell output.
    """
    early = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=1, mode='auto')
    with tf.device('/gpu:0'):
        history = model.fit(com_tr_seq,
                            lab_tr,
                            epochs=nepch,
                            batch_size=bsize,
                            validation_split=0.2,
                            callbacks=[early],
                            verbose=1)
    return history

**Model 1**

In [41]:
#embedding_dim = 16
#max_length = 100
#model1= Sequential()
#model1.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
#model1.add(Dropout(0.3))
#model1.add(Bidirectional(LSTM(100)))
#model1.add(Dropout(0.3))
#model1.add(Flatten())
#model1.add(Dense(1,activation='sigmoid'))
#model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
#model1.summary()

In [42]:
#training(model1,5,64)

In [43]:
#acc1= model1.evaluate(com_val_seq,lab_val)
#acc1

**model 5**

In [46]:
embedding_dim = 16
# The declared input length must equal the length of the sequences actually
# fed to the network. They are padded to com_tr_seq.shape[1] columns, so read
# the value from the data instead of hard-coding a different number.
max_length = com_tr_seq.shape[1]

# Trainable embedding -> single LSTM -> dense head with a sigmoid output.
model5 = Sequential()
model5.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model5.add(LSTM(64, recurrent_dropout=0.2))
model5.add(Dense(64, activation='relu'))

model5.add(Dense(1, activation='sigmoid'))
model5.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

model5.summary()

In [50]:
training(model5,5,128)

In [51]:
acc5= model5.evaluate(com_val_seq,lab_val)
acc5

### With pretrained word embeddings (GloVe)

In [52]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip glove*.zip
[n]

In [56]:
glove_input_file = 'glove.6B.300d.txt'

In [57]:
# Build a dictionary mapping each word of the GloVe file to its vector.
embeddings_dic = {}
# The context manager closes the file even if parsing a line fails; the
# explicit encoding keeps decoding independent of the platform default.
with open(glove_input_file, encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ...": first field is the word,
        # the remaining fields are the vector components.
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], 'float32')
        embeddings_dic[word] = vector

In [58]:
# Build the weight matrix for the embedding layer: row i holds the pretrained
# vector of the word whose tokenizer index is i.
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))
h = 0  # number of vocabulary words that have a pretrained vector
for word, i in word_index.items():
    embedding_vector = embeddings_dic.get(word)
    if embedding_vector is None:
        # Word not found in the pretrained vectors: its row stays all zeros.
        continue
    embedding_matrix[i] = embedding_vector
    h += 1

print(h)

In [59]:
# building of the embedding layer
embedding_layer = Embedding(
    vocab_size,
    embedding_dim,
    weights=[embedding_matrix],
    trainable=False
)

**model 2**

In [None]:
#model2 = Sequential()
#input layer
#model2.add(embedding_layer)

# BI LSTM layer
#model2.add(Bidirectional(LSTM(64,return_sequences=True)))
#model2.add(Bidirectional(LSTM(16)))


#Full connected layers
#model2.add(Dense(64, activation='relu'))
#model2.add(Dropout(0.5))
#model2.add(Dense(16, activation='relu'))
#model2.add(Dropout(0.5))

# Output layer
#model2.add(Dense(1, activation='sigmoid'))

#model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
#print('Summary of the built model...')
#print(model2.summary())

In [None]:
#training(model2,5,32)
#acc2=model2.evaluate(com_val_seq,lab_val)
#acc2

**model3**

In [None]:
#model3 = Sequential()
#model3.add(embedding_layer)
#model3.add(LSTM(64, dropout=0.2, recurrent_dropout=0.25))
#model3.add(Dense(1, activation='sigmoid'))

#model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

#print('Summary of the built model...')
#print(model3.summary())

In [None]:
#training(model3,5,32)
#acc3 = model3.evaluate(com_val_seq,lab_val)
#acc3

**model4**

In [None]:

#model4 = Sequential()
#model4.add(embedding_layer)
#model4.add(LSTM(64, recurrent_dropout=0.2))
#model4.add(MaxPooling1D(2))
#model4.add(Dense(64, activation='relu'))

#model4.add(Dense(1, activation='sigmoid'))
#model4.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

#model4.summary()

In [None]:
#training(model4,5,32)

In [None]:
#accr4 = model4.evaluate(com_val_seq,lab_val)
#accr4

**model6: CNN+LSTM**

In [63]:
# Frozen pretrained embedding -> 1D convolution + max pooling -> LSTM ->
# sigmoid output.
model6 = Sequential()
model6.add(embedding_layer)
model6.add(keras.layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model6.add(keras.layers.MaxPooling1D(pool_size=2))
model6.add(LSTM(100))
model6.add(Dense(1, activation='sigmoid'))
model6.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line.
model6.summary()

In [64]:
training(model6,5,32)

In [65]:
accr6 = model6.evaluate(com_val_seq,lab_val)
accr6

## Final tests

In [66]:
def predict_sarcasm(comment, model, maxlen=250):
    """Classify a single raw comment with a trained model.

    The comment goes through the same pipeline as the training data:
    `cleaning_text`, the already fitted tokenizer `tok`, then padding to
    `maxlen` with the default (pre) padding used for the training sequences.

    Parameters
    ----------
    comment : str
        Raw comment text.
    model : trained Keras model whose output is one sigmoid probability.
    maxlen : int, default 250
        Padded sequence length; the default matches the length used when the
        training sequences were padded.

    Returns
    -------
    str
        "It's a sarcasm!" when the predicted probability is at least 0.5,
        otherwise "It's not a sarcasm.".
    """
    # Clean the text itself; wrapping it in a DataFrame would make
    # cleaning_text operate on the frame's printed repr.
    cleaned = cleaning_text(comment)
    # The tokenizer is only used, never refitted here: refitting at inference
    # time would alter the vocabulary the model was trained with. The text is
    # wrapped in a list because the tokenizer expects a collection of texts;
    # a bare string would be processed character by character.
    com_seq = tok.texts_to_sequences([cleaned])
    com_pad = pad_sequences(com_seq, maxlen=maxlen)
    prob = float(model.predict(com_pad)[0, 0])
    if prob >= 0.5:
        return "It's a sarcasm!"
    return "It's not a sarcasm."

In [68]:
predict_sarcasm("At least you tried your best",model6)

In [69]:
predict_sarcasm("I am busy right now, can I ignore you some other time?",model6)