# Detecting Sarcasm on Reddit comments
##### Lynda Attouche & Sami Benyahia

# Imports

In [None]:
# loading data
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sn

# word cloud
from wordcloud import WordCloud, STOPWORDS

# numeric + string 
import numpy as np
import string

# Regular Expression for text cleaning
import re

# nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


# word2vec
import gensim
import gensim.downloader as gensim_api

#bert 
import transformers
import torch
import plotly.express as px

# tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import preprocessing as kprocessing


# keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import MaxPooling1D,GlobalMaxPooling1D,Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from keras import callbacks

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer 
from sklearn.metrics import roc_auc_score, accuracy_score,roc_curve, auc, plot_confusion_matrix, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from sklearn.manifold import TSNE
from sklearn.naive_bayes import MultinomialNB



# Data exploration & Analysis

In [None]:
#loading data 
df_train = pd.read_csv("/kaggle/input/sarcasm/train-balanced-sarcasm.csv")

In [None]:
df_train.head(4)

In [None]:
df_train.describe()

In [None]:
df_train.columns

In [None]:
len(df_train)

In [None]:
df_train.isnull().sum()

In [None]:
df_train

In [None]:
df_train['label'].value_counts()

## Data Distribution

In [None]:
# Count how many comments carry each label (0 = not sarcastic, 1 = sarcastic).
# rename_axis/reset_index give explicit column names, so the plot does not depend on
# how the installed pandas version names the value_counts() result.
cum = (
    df_train['label']
    .value_counts()
    .rename_axis('label')
    .reset_index(name='count')
)
cumfig, ax = plt.subplots(figsize=(5, 5))
sn.barplot(data=cum, x='label', y='count', ax=ax)
ax.set(title='Number of comments per label', xlabel='label', ylabel='number of comments');

## Length of sarcastic and non-sarcastic comments

In [None]:
# Character length of every comment labelled as sarcastic (label == 1).
sarcastic_lengths = df_train.loc[df_train['label'] == 1, 'comment'].str.len()
sarcastic_ax = sn.boxplot(x=sarcastic_lengths)
sarcastic_ax.set(title='Length of Sarcastic Comments', xlabel='Length')
sn.despine(offset=5, trim=True)
plt.show()

In [None]:
# Character length of every comment labelled as not sarcastic (label == 0).
plain_lengths = df_train.loc[df_train['label'] == 0, 'comment'].str.len()
plain_ax = sn.boxplot(x=plain_lengths)
plain_ax.set(title='Length of No Sarcastic Comments', xlabel='Length')
sn.despine(offset=5, trim=True)
plt.show()

## Word cloud

In [None]:
wordcloud = WordCloud(background_color='grey', stopwords = STOPWORDS,
                max_words = 500, max_font_size = 100, 
                random_state = 17, width=800, height=400)

# Build the cloud from the full text of every sarcastic comment.
# str(series) would only give the truncated repr (a few rows plus index numbers and
# the dtype footer), so the comments are joined explicitly instead.
sarcastic_text = " ".join(
    df_train.loc[df_train['label'] == 1, 'comment'].dropna().astype(str)
)

plt.figure(figsize=(12, 12))
wordcloud.generate(sarcastic_text)
# Positional form: the `b=` keyword is no longer accepted by recent matplotlib releases.
plt.grid(False)
plt.imshow(wordcloud);

# Data preprocessing

## Removing columns

In [None]:
# Metadata columns that are removed from the training frame.
columns_to_drop = ['author', 'date', 'created_utc', 'subreddit',
                   'score', 'ups', 'downs', 'parent_comment']
df_train = df_train.drop(columns=columns_to_drop)
df_train

## Cleaning text (comments)

In [None]:
# loading contractions
contractions = pd.read_csv("../input/d/ishivinal/contractions/contractions.csv")
contractions.head(4)

In [None]:
# Emoticon lookup table: text emoticon -> short description.
# cleaning_text only tests membership of a token in the keys (matching tokens are
# dropped); the description values are not read anywhere in this notebook.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

In [None]:
# Sanity check of the contractions table: look up the expansion of "isn't".
word = "isn't"
# True when the word is present in the Contraction column (not used further in this cell).
ww = word in contractions.Contraction.values
# First matching expansion from the Meaning column; displayed as the cell output.
contractions[contractions.Contraction==word].Meaning.values[0]

In [None]:
# Lookup tables derived from the notebook-level `contractions` / `emojis` objects.
# They are built on first use and reused by every later cleaning_text call.
_CLEANING_LOOKUPS = {}


def _cleaning_lookup(kind):
    """Return the cached lower-cased lookup for 'contractions' (dict) or 'emojis' (frozenset)."""
    if kind not in _CLEANING_LOOKUPS:
        if kind == 'contractions':
            table = {}
            for short, full in zip(contractions.Contraction, contractions.Meaning):
                # setdefault keeps the first row of a duplicated contraction,
                # like the former `.values[0]` lookup did.
                table.setdefault(str(short).lower(), str(full).lower())
        else:
            table = frozenset(str(key).lower() for key in emojis)
        _CLEANING_LOOKUPS[kind] = table
    return _CLEANING_LOOKUPS[kind]


def cleaning_text(s, contraction_map=None, emoji_map=None):
    """
    Normalise one comment for the models.

    Steps: lower-case and strip, expand contractions, drop emoticon tokens,
    remove punctuation characters, collapse repeated spaces.

    @params:
            - s: the comment (anything accepted by str())
            - contraction_map: optional {contraction: expansion} mapping; defaults to the
              table built from the notebook-level `contractions` DataFrame
            - emoji_map: optional container of emoticons to drop; defaults to the keys of
              the notebook-level `emojis` dict
    @return:
            the cleaned, lower-cased string
    """
    if contraction_map is None:
        expansions = _cleaning_lookup('contractions')
    else:
        expansions = {str(k).lower(): str(v).lower() for k, v in contraction_map.items()}
    if emoji_map is None:
        emoticons = _cleaning_lookup('emojis')
    else:
        emoticons = frozenset(str(k).lower() for k in emoji_map)

    # The text is lower-cased first, which is why both lookups are lower-cased too:
    # keys such as ':P' or 'O.o' could otherwise never match.
    text = str(s).lower().strip()
    # Expand contractions with a constant-time dict lookup per word.
    text = " ".join(expansions.get(token, token) for token in text.split())
    # Drop emoticon tokens. split()/join also normalises every whitespace run
    # (newlines included) to a single space, so no separate "\n" removal is needed.
    text = " ".join(token for token in text.split() if token not in emoticons)
    # Remove punctuation characters (they are deleted, not replaced by spaces).
    text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-]", "", text)
    # Replace runs of two or more spaces with a single space.
    return re.sub('[ ]{2,}', ' ', text).strip()

In [None]:
# The cleaning step itself is left commented out; a previously cleaned DataFrame is
# loaded from a pickle file instead to speed up preprocessing.
#df_train.comment = df_train.comment.apply(cleaning_text)
# pickle5 is installed here and used below to read the serialized file.
!pip3 install pickle5
path_to_file = '../input/preproquick/clean_text.pkl'

import pickle5 as p
import pickle


# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
with open(path_to_file, "rb") as fh:
    data = p.load(fh)

# From here on df_train is the object stored in the pickle file.
df_train = data


# Raw arrays of the comment texts and their labels.
comments = df_train['comment'].values
labels = df_train['label'].values

In [None]:
comments[3]

In [None]:
comments

In [None]:
df_train

## Word Embedding

## Word2Vec

In [None]:
df_test = df_train[800000:]
df_train_w = df_train[:800000]

In [None]:
#On recupere le model pretrained de google
#nlp = gensim_api.load("word2vec-google-news-300")

In [None]:
# Split every comment into its list of words: one token list per comment,
# which is the input format expected by gensim's Word2Vec.
# The loop variable is named `comment` so that it does not shadow the `string`
# module imported at the top of the notebook.
corpus = df_train_w["comment"]
lst_corpus = [comment.split() for comment in corpus]

In [None]:
#On crée notre propre modele au lieu d'utiliser le pre-trained ci dessus
nlp = gensim.models.word2vec.Word2Vec(lst_corpus, vector_size=300,window=8, min_count=1)

In [None]:
# Look up, in the Word2Vec model trained above, the vectors of the first 5000 vocabulary entries.
vocab = list(nlp.wv.key_to_index)
X = nlp.wv[vocab[:5000]]

In [None]:
# t-SNE reduces the word vectors to 2 components so they can be drawn as 2-D points.
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)

In [None]:
# DataFrame holding, for each of the first 5000 vocabulary entries, its 2-D t-SNE
# coordinates (NOTE(review): presumably the most frequent words - confirm the vocabulary order).
dfW2V = pd.DataFrame(X_tsne, index=vocab[:5000], columns=['x', 'y'])
# Duplicate the index (the word itself) into a regular column so it can be used as plot text.
dfW2V["name"] = list(dfW2V.index)
dfW2V


In [None]:
#Representation des 500 mots les plus courants selon leurs coordonnées

fig = px.scatter(dfW2V[:500], x="x", y="y", text ="name", size_max=100)
fig.update_traces(textposition='top center')
fig.update_layout(title_text='Representation des mots', title_x=0.5)


fig.show()

## Bert

In [None]:
# BERT only handles inputs of limited length, so comments longer than 512 characters are dropped.
# NOTE(review): this filter counts characters, while the model limit is presumably expressed in tokens - confirm.
df_train_b = df_train
# Index labels of the rows whose comment is longer than 512 characters.
indexNames = df_train_b[df_train_b["comment"].map(len) > 512 ].index
indexNames
df_train_b = df_train_b.drop(indexNames)
# Only the first 1000 remaining rows are embedded with BERT.
df_trainX = df_train_b[:1000] 

In [None]:
# Load a pre-trained BERT model.
# DistilBERT is a smaller and much faster variant of BERT.
# The tokenizer is loaded once, through the class matching the model (a preceding
# AutoTokenizer load was immediately overwritten and has been removed).
model_class, tokenizer_class, pretrained_weights = (transformers.DistilBertModel, transformers.DistilBertTokenizer, 'distilbert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Tokenize every comment into the id sequence expected by the BERT model.
# truncation/max_length cap each sequence at 512 ids (special tokens included) so an
# unusually token-dense comment cannot exceed what the model accepts.
# Padding is applied in the next cell, once the longest sequence is known.
tokenized = df_trainX["comment"].apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=512)
)

In [None]:
tokenized

In [None]:
# Right-pad every id sequence with zeros up to the length of the longest one, then
# stack them into a single 2-D array (per the guide that was followed, much faster to
# process than a list of variable-length lists).
token_lists = tokenized.values
max_len = max((len(ids) for ids in token_lists), default=0)

padded = np.array([ids + [0] * (max_len - len(ids)) for ids in token_lists])


In [None]:
# Build a mask (1 where the id is non-zero, 0 on padding) so the padding does not confuse the model.
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

In [None]:
# Run the pre-trained model on the padded ids to obtain an embedding for every sentence.
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

# Gradients are not tracked during this forward pass.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)


In [None]:
# Keep only position 0 of every sequence from the model's first output; it is used as
# the embedding of the whole sentence.
embedding = last_hidden_states[0][:,0,:].numpy()
# NOTE: this rebinds `labels`, which was previously set from df_train when the pickle was loaded.
labels = df_trainX["label"]

In [None]:
embedding

## TF-IDF

In [None]:
vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
 
data_train= vectorizer.fit_transform(df_train.comment)
data_train

In [None]:
# Split the data into training, validation and test sets.
# The validation ratio is applied to what remains after the test split
# (0.2 of the remaining 90%, i.e. 18% of all rows).
test_ratio = 0.1
val_ratio = 0.2
# Fixed seed: the models below are compared on these sets, so the split must be the same on every run.
split_seed = 42
X_train, X_test, Y_train, Y_test = train_test_split(data_train, df_train.label, test_size=test_ratio, random_state=split_seed)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=val_ratio, random_state=split_seed)

In [None]:
data_train

In [None]:
print(vectorizer.idf_)

# Models

## Baseline models

In [None]:
s_accuracy= []

### Logistic Regression

In [None]:
classifier1 = LogisticRegression(solver='lbfgs', max_iter=1000)
#training the model
classifier1.fit(X_train,Y_train)

#score
score1 = classifier1.score(X_val,Y_val)
s_accuracy.append(score1)
print("Accuracy:", score1)

In [None]:
# predicting val set results
Ypred1 = classifier1.predict(X_val)
Ypred1

In [None]:
#confusion matrix
plot_confusion_matrix(classifier1, X_val,Y_val)  

In [None]:
#metrics
Ypred_proba1 = classifier1.predict_proba(X_val)[::,1]
fpr1, tpr1, _ = roc_curve(Y_val,  Ypred_proba1)

#ROC curve
plt.plot(fpr1,tpr1,'m')
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve')
plt.show()

### Support vector

In [None]:
from sklearn.svm import SVC
# Linear-kernel support vector classifier; probability=True is required for the
# predict_proba call used by the ROC cell further down.
# NOTE(review): kernel SVC with probability estimates may be very slow on a training set
# of this size; LinearSVC is imported at the top of the notebook but not used - confirm the choice.
classifier2 = SVC(kernel='linear',probability=True)
classifier2.fit(X_train,Y_train)

 # accuracy on the validation set
score2 = classifier2.score(X_val,Y_val)
print("Accuracy:", score2)
s_accuracy.append(score2)

In [None]:
# predicting val set results
Ypred2 = classifier2.predict(X_val)
Ypred2

In [None]:
# confusion matrix
plot_confusion_matrix(classifier2, X_val,Y_val)  

In [None]:
# Probability of the positive class for every validation sample.
Ypred_proba2 = classifier2.predict_proba(X_val)[::,1]
# roc_curve is imported directly from sklearn.metrics at the top of the notebook;
# there is no `metrics` name in scope.
fpr2, tpr2, _ = roc_curve(Y_val,  Ypred_proba2)

#ROC curve
plt.plot(fpr2,tpr2)
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve')
plt.show()

### Naive bayes

In [None]:
 
classifier3 = MultinomialNB().fit(X_train,Y_train)
 
#score
score3 = classifier3.score(X_val, Y_val)
s_accuracy.append(score3)
print("Accuracy:", score3)


In [None]:
# predicting test set results
Ypred3 = classifier3.predict(X_val)
Ypred3

In [None]:
#  confusion matrix
plot_confusion_matrix(classifier3, X_val,Y_val)  

In [None]:
from sklearn.metrics import roc_curve
#metrics
Ypred_proba3 = classifier3.predict_proba(X_val)[::,1]
fpr3, tpr3, _ = roc_curve(Y_val,  Ypred_proba3)

#ROC curve
plt.plot(fpr3,tpr3)
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve')
plt.show()

### Comparison

In [None]:
# One row per baseline model, holding its validation accuracy in a "Score" column.
model_name = ["Logistic regression", "SVM", "Naive Bayes"]
d = {'Accuracy': s_accuracy} 

sd = pd.DataFrame(s_accuracy, index=list(model_name)).rename(columns={0: "Score"})
sd

### Final Model

In [None]:
# idxmax() returns the *name* of the best model (an index label of `sd`), so the name
# is mapped back to the fitted estimator before scoring on the held-out test set.
fitted_classifiers = {
    "Logistic regression": classifier1,
    "SVM": classifier2,
    "Naive Bayes": classifier3,
}
best_name = sd["Score"].idxmax()
clf = fitted_classifiers[best_name]
# score() needs both the features and the true labels.
score = clf.score(X_test, Y_test)
print("Le classifier final est:"+best_name)
print("Le score obtenu = "+str('%.2f'%score))

## Neural Network models

### Functions

In [None]:
def training(model,nepch,bsize):
    """
    Fit a neural network on the notebook-level training sequences.

    The model is fitted on `com_tr_seq` / `lab_tr` with 20% of the data held out
    for validation and early stopping on the validation loss (patience of 3 epochs).
    @params:
            - model: neural network
            - nepch: number of epochs
            - bsize: batch size
    @return: 
            history object returned by model.fit
    """
    stopper = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=1, mode='auto')
    fit_options = dict(
        epochs=nepch,
        batch_size=bsize,
        validation_split=0.2,
        callbacks=[stopper],
        verbose=1,
    )
    with tf.device('/gpu:0'): # remove this line if not using Kaggle gpu
        return model.fit(com_tr_seq, lab_tr, **fit_options)

In [None]:
def plot_acc_loss(history):
    """
    Plot the accuracy and the loss recorded during training, side by side.
    @params:
            - history: history object of the model; its `.history` dict maps metric
              names to per-epoch values
    @return:
            nothing, the two panels are drawn on a new figure
    @raises:
            KeyError if the history holds neither an 'accuracy' nor an 'acc' metric
    """
    records = history.history
    # Metrics are selected by name, not by position: Keras stores the loss first, so
    # positional access puts the loss curves on the accuracy panel and vice versa.
    # Both spellings are accepted because the models are compiled with 'accuracy' or 'acc'.
    acc_key = next((key for key in ('accuracy', 'acc') if key in records), None)
    if acc_key is None:
        raise KeyError("history contains neither 'accuracy' nor 'acc'")

    fig,ax = plt.subplots(1,2,figsize=(10,5))
    panels = ((ax[0], acc_key, 'accuracy'), (ax[1], 'loss', 'loss'))
    for axis, key, label in panels:
        axis.plot(records[key])
        legend = ['train']
        # The second curve comes from the validation split used while fitting.
        if 'val_' + key in records:
            axis.plot(records['val_' + key])
            legend.append('validation')
        axis.set_title('model ' + label)
        axis.set_ylabel(label)
        axis.set_xlabel('epoch')
        axis.legend(legend, loc='upper left')

In [None]:
def predicted_label(model):
    """
    Turn the model's outputs on the notebook-level test sequences into 0/1 labels.
    @params:
            - model: neural network model
    @return:
            - list with 1 where the output is strictly above 0.5, else 0
    """
    scores = model.predict(com_test_seq)
    return [1 if score > 0.5 else 0 for score in scores]

In [None]:
# Splitting data : train and test set (80% / 20%).
# Fixed seed: every network below is trained and compared on the same split, run after run.
com_tr,com_test, lab_tr,lab_test = train_test_split(df_train['comment'],df_train['label'],test_size=0.2,random_state=42)


In [None]:
def plot_cm(model):
    """
    Draw the confusion matrix of a model on the test set as a heatmap.
    @params:
            - model: neural network model
    @return:
            nothing, the heatmap is drawn
    """
    # predicted_label already runs the model on the test sequences, so no separate
    # predict call is made here.
    lab_pred = predicted_label(model)
    cm = confusion_matrix(lab_test,lab_pred)
    # fmt='d' prints the counts as plain integers instead of scientific notation.
    ax = sn.heatmap(cm, annot=True, fmt='d')
    ax.set(xlabel='Predicted label', ylabel='True label')

In [None]:
def plot_roc_curve(model):
    """
    Plot the ROC curve of a model on the test set, with its area under the curve.
    @params:
            - model: neural network model
    @return:
            nothing, the figure is shown
    """
    # The curve is built from the model's continuous outputs. Thresholded 0/1 labels
    # would collapse it to a single operating point and distort the AUC.
    scores = np.asarray(model.predict(com_test_seq)).ravel()
    # false and true positive rates
    fpr,tpr,_ = roc_curve(lab_test,scores)
    # area under roc curve
    roc_auc = auc(fpr, tpr)
    #ROC curve
    plt.figure()
    lw = 2
    plt.plot(fpr,tpr,color="darkorange",lw=lw,label="ROC curve (area = %0.2f)" % roc_auc,)
    # diagonal = chance level
    plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic example")
    plt.legend(loc="lower right")
    plt.show()

In [None]:
def compare_roc(models):
    """
    Draw the ROC curves of several models on the same axes.
    @params:
            - models: iterable of neural network models; curves are labelled
              'model1', 'model2', ... in iteration order
    @return:
            nothing, the curves are drawn on a new figure
    """
    fig,ax = plt.subplots()
    # Defined before the loop so the chance-level diagonal can be drawn for an empty list too.
    lw = 2
    # enumerate numbers the curves; a counter reset inside the loop would label every
    # curve 'model1'.
    for i, model in enumerate(models, start=1):
        # Continuous outputs, not thresholded labels, are needed for a real ROC curve.
        scores = np.asarray(model.predict(com_test_seq)).ravel()
        # false and true positive rates
        fpr,tpr,_ = roc_curve(lab_test,scores)
        ax.plot(fpr,tpr,lw=lw,label='model'+str(i))
    # diagonal = chance level
    ax.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("Receiver operating characteristic")
    ax.legend(loc="lower right")

In [None]:
accuracy=[]

### Without Pretrained Word Embedding

#### Tokenization

In [None]:
# Fit the word -> integer vocabulary on the training comments only.
tok = Tokenizer()
tok.fit_on_texts(com_tr)

# text -> integer sequence -> integer sequence of fixed length 100
com_tr_seq = pad_sequences(tok.texts_to_sequences(com_tr), maxlen=100)
com_test_seq = pad_sequences(tok.texts_to_sequences(com_test), maxlen=100)

In [None]:
word_index = tok.word_index
print("unique tokens - ",len(word_index))
vocab_size = len(word_index) + 1
print('vocab size -', vocab_size)

**Model 1: Bidirectional LSTM**

In [None]:
embedding_dim = 16
model1= Sequential()
#input layer/ embedding layer (trained from scratch)
model1.add(Embedding(vocab_size, embedding_dim, input_length=100))
#bidirectional lstm layer
model1.add(Bidirectional(LSTM(128)))
#fc layers
model1.add(Dropout(0.3))
model1.add(Flatten())
#output layer: single sigmoid unit for the binary label
model1.add(Dense(1,activation='sigmoid'))
# compiling model
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print('Summary of the built model:')
# summary() prints the table itself and returns None, so it is not wrapped in print().
model1.summary()

In [None]:
# training the model
hist1 = training(model1,5,64)
# evaluating the model 
acc1= model1.evaluate(com_test_seq,lab_test)
print("Accuracy")
accuracy.append(acc1[1])
acc1[1]

In [None]:
# accuracy and loss
plot_acc_loss(hist1)

In [None]:
# confusion matrix
plot_cm(model1)

In [None]:
# roc curve
plot_roc_curve(model1)

**model 2: LSTM**

In [None]:
model2 = Sequential()
#input layer
model2.add(Embedding(vocab_size, embedding_dim, input_length=100))
#lstm layer
model2.add(LSTM(64, dropout=0.5))
#fc layer
model2.add(Dense(64, activation='relu'))
#output layer
model2.add(Dense(1, activation='sigmoid'))
#compiling model
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

print('Summary of the built model:')
print(model2.summary())

In [None]:
hist2 = training(model2,5,64)

In [None]:
acc2= model2.evaluate(com_test_seq,lab_test)
accuracy.append(acc2[1])
print("Accuracy")
acc2[1]

In [None]:
# accuracy and loss
plot_acc_loss(hist2)

In [None]:
# confusion matrix
plot_cm(model2)

In [None]:
# roc curve
plot_roc_curve(model2)

### With Pretrained Word Embedding

**Glove**

In [None]:
#upload the pretrained word embedding
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
#unzip
!unzip glove*.zip

In [None]:
#choosing the dimension, here we chose 300
glove_input_file = 'glove.6B.300d.txt'

In [None]:
# Build a dictionary mapping every GloVe word to its vector.
# Each line of the file is "<word> <v1> <v2> ...".
embeddings_dic={}
# The context manager closes the file even if parsing fails; the encoding is stated
# explicitly so reading does not depend on the platform default.
with open(glove_input_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:],'float32')
        embeddings_dic[word]=vector

In [None]:
# Build the embedding matrix for our vocabulary; it is used to initialise the
# embedding layer. Row i holds the GloVe vector of the word whose tokenizer index is i.
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))
h = 0  # number of vocabulary words found in GloVe
for word, i in word_index.items():
    if word not in embeddings_dic:
        # word not found => its row stays all zeros
        continue
    embedding_matrix[i] = embeddings_dic[word]
    h += 1

print(h)

In [None]:
# building of the embedding layer
embedding_layer = Embedding(
    vocab_size,
    embedding_dim,
    weights=[embedding_matrix],
    trainable=False
)

**model 3: bidirectional lstm**

In [None]:
model3 = Sequential()

#input layer
model3.add(embedding_layer)

# 1st bi-LSTM layer
model3.add(Bidirectional(LSTM(128,return_sequences=True)))
# 2nd bi-LSTM layer
model3.add(Bidirectional(LSTM(64)))


#FC layers
model3.add(Dense(128, activation='relu'))
model3.add(Dropout(0.3))
model3.add(Dense(64, activation='relu'))
model3.add(Dropout(0.5))

# Output layer
model3.add(Dense(1, activation='sigmoid'))
# compiling the model
model3.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

print('Summary of the built model:')
print(model3.summary())

In [None]:
hist3 = training(model3,5,64)
acc3=model3.evaluate(com_test_seq,lab_test)
accuracy.append(acc3[1])
print("Accuracy")
acc3[1]

In [None]:
#accuracy and loss
plot_acc_loss(hist3)

In [None]:
#confusion matrix
plot_cm(model3)

In [None]:
#roc curve
plot_roc_curve(model3)

**model 4: lstm**

In [None]:
model4 = Sequential()
#input layer: the frozen pretrained embedding layer built above
model4.add(embedding_layer)
#lstm layer
model4.add(LSTM(64, dropout=0.2))
#fc layer
model4.add(Dense(64, activation='relu'))
#output layer
model4.add(Dense(1, activation='sigmoid'))
#compiling the model; the metric is spelled 'accuracy' like in the other models so
#every training history uses the same key names
model4.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model:')
# summary() prints the table itself and returns None, so it is not wrapped in print().
model4.summary()

In [None]:
hist4 = training(model4,5,64)
acc4 = model4.evaluate(com_test_seq,lab_test)
accuracy.append(acc4[1])

print("Accuracy")
acc4[1]

In [None]:
#accuracy and loss
plot_acc_loss(hist4)

In [None]:
#confusion matrix
plot_cm(model4)

In [None]:
#roc curve
plot_roc_curve(model4)

**model 5: CNN+LSTM**

In [None]:
model5 = Sequential()
#input layer
model5.add(embedding_layer)
#convolutional layer
model5.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model5.add(MaxPooling1D(pool_size=2))
#lstm layer
model5.add(LSTM(128))
#output layer
model5.add(Dense(1, activation='sigmoid'))
#compiling the model 
model5.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Summary of the built model:')
print(model5.summary())

In [None]:
hist5 = training(model5,5,64)
# evaluate returns [loss, accuracy]; the result is named acc5 like acc1..acc4
# (it was stored as `accr5` while the next line read the undefined `acc5`).
acc5 = model5.evaluate(com_test_seq,lab_test)
accuracy.append(acc5[1])

print("Accuracy")
acc5[1]

In [None]:
#accuracy and loss
plot_acc_loss(hist5)

In [None]:
#confusion matrix
plot_cm(model5)

In [None]:
#roc curve
plot_roc_curve(model5)

### Comparison

In [None]:
# One row per neural network, holding its test accuracy in a "Score" column.
model_name_ = ["BiLSTM", "LSTM", "BiLSTM+Glove","LSTM+Glove","LSTM+CNN+Glove"]
d_ = {'Accuracy': accuracy} 

sd_ = pd.DataFrame(accuracy, index=list(model_name_)).rename(columns={0: "Score"})
sd_

In [None]:
models = [model1,model2,model3,model4,model5]
compare_roc(models)

### Final model

In [None]:
mod = sd_.idxmax().values[0]
score_ = sd_.max().values[0]
print("Le modèle final est:"+mod)
print("Le score obtenu = "+str('%.2f'%score_))

## Final tests

In [None]:
def predict_sarcasm(comment,model):
    """
    Predict whether a comment is sarcastic or not
    @param:
            - comment: string representing the text contained in the comment
            - model: model used for prediction
    @return: 
            - string: "It's a sarcasm!" when the model output is at least 0.5,
              "It's not a sarcasm." otherwise
    """
    # Clean the raw string itself. Wrapping it in a DataFrame first would make
    # cleaning_text work on the frame's printed representation.
    cleaned = [cleaning_text(comment)]
    # The tokenizer is only *applied*, never refitted: refitting would alter the
    # vocabulary the model was trained with. The text is passed as a one-element list
    # because a bare string would be iterated character by character.
    com_seq = tok.texts_to_sequences(cleaned)
    # Same padding as the training sequences (default side, length 100).
    com_pad = pad_sequences(com_seq, maxlen=100)
    pred = model.predict(com_pad)
    if pred[0,0]>=0.5: return "It's a sarcasm!" 
    else: return "It's not a sarcasm."

In [None]:
#no sarcasm
predict_sarcasm("At least you tried your best",model3)

In [None]:
#sarcasm
predict_sarcasm("I am busy right now, can I ignore you some other time?",model3)

In [None]:
#no sarcasm
predict_sarcasm("OMG! How dare are u?",model3)

In [None]:
#sarcasm
predict_sarcasm("oh politics, what fun it is",model3)