# Sarcasm Classifier

### Imports:

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences



import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns

import nltk
from nltk.corpus import nps_chat
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.corpus import stopwords
nltk.download('stopwords')

import pandas as pd 
import re
import pickle
import datetime
import io


import os
from os import makedirs
from os import chdir
from os import path
cwd = os.getcwd()


Using TensorFlow backend.


### Data Import:

In [None]:
# Read the three sarcasm corpora stored under <cwd>/sarcasmCorpora.
GENdata = pd.read_csv(cwd + '/sarcasmCorpora/GEN-sarc-notsarc.csv')
HYPdata = pd.read_csv(cwd + '/sarcasmCorpora/HYP-sarc-notsarc.csv')
RQdata = pd.read_csv(cwd + '/sarcasmCorpora/RQ-sarc-notsarc.csv')

# Tag every row with the corpus it came from; the 'Type' column is the label
# the statement-type classifier is trained on.
for _frame, _kind in ((GENdata, 'general'),
                      (HYPdata, 'hyperbole'),
                      (RQdata, 'rhetorical')):
    _frame['Type'] = _kind

### Data splitting:

In [None]:
def split_data(text, labels, split_size):
    """Split texts and labels into train/test parts and return them as arrays.

    Args:
        text: sequence of input sentences.
        labels: sequence of labels aligned with ``text``.
        split_size: value forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(sentences_train, sentences_test, label_train, label_test)``
        of NumPy arrays. The split uses a fixed ``random_state`` of 42.
    """
    parts = train_test_split(text, labels, test_size=split_size, random_state=42)
    train_x, test_x, train_y, test_y = (np.array(part) for part in parts)
    return train_x, test_x, train_y, test_y

### Padding Sequences:

In [None]:
# Every sequence is padded or truncated to this many tokens.
max_length = 200


def make_sequences(tok, train_text, test_text):
    """Convert train and test texts into fixed-length integer sequences.

    Args:
        tok: fitted tokenizer exposing ``texts_to_sequences``.
        train_text: training sentences.
        test_text: testing sentences.

    Returns:
        Tuple ``(training_padded, testing_padded)``; both are padded and
        truncated at the end ('post') to ``max_length`` tokens.
    """
    def _encode(texts):
        # Tokenize first, then pad/truncate at the tail of each sequence.
        return pad_sequences(tok.texts_to_sequences(texts), maxlen=max_length,
                             padding='post', truncating='post')

    return _encode(train_text), _encode(test_text)

### Cleaning text:

In [None]:
def clean_text(text):
    """Lower-case ``text``, expand contractions and strip basic punctuation.

    The substitutions run in the listed order, which matters: "n't" has to be
    handled before the dropped-g rule "n'". Note that "'s" is always expanded
    to " is", including for possessives.

    Args:
        text: raw input string.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r"i'm", "i am"),
        (r"\'s", " is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"n't", " not"),
        (r"n'", "ng"),
        (r"'bout", "about"),
        (r"'til", "until"),
        (r"\?", ""),
        (r"\!", ""),
        (r"\.", ""),
        (r"\,", ""),
        (r"\r\n", ""),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

## NN1: Type of input

In [None]:
# Shift the HYP and RQ ids so they do not collide with earlier corpora once
# the frames are stacked. The offsets are presumably the GEN (6520) and HYP
# (1164) row counts -- TODO confirm against the CSVs.
# NOTE: these assignments modify the frames in place, so re-running this cell
# shifts the ids a second time.
HYPdata['id'] = HYPdata['id'] + 6520
RQdata['id'] = RQdata['id'] + (6520 + 1164)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in
# the same order and works on older pandas versions as well.
sarcasm_dataset = pd.concat([GENdata, HYPdata, RQdata]).set_index('id')

type_labels = sarcasm_dataset['Type'].tolist()
sentencesRaw = sarcasm_dataset['text'].tolist()
sentences = list(map(clean_text, sentencesRaw))

# Fraction of the combined corpus held out for validation.
Type_test_size = 0.5

Type_train, Type_test, Type_label_train, Type_label_test = split_data(sentences, type_labels, Type_test_size)

In [None]:
# Output lists that receive the integer-encoded type labels.
nl_Type_train, nl_Type_test = [], []


def number_label(l, new_l):
    """Append the integer code of each recognised type name in ``l`` to ``new_l``.

    Codes: 'general' -> 0, 'hyperbole' -> 1, 'rhetorical' -> 2. Elements that
    match none of the three names are skipped, so ``new_l`` can end up shorter
    than ``l``.

    Args:
        l: iterable of type-name labels.
        new_l: list that is extended in place.
    """
    known_types = ('general', 'hyperbole', 'rhetorical')
    for element in l:
        for code, name in enumerate(known_types):
            if element == name:
                new_l.append(code)
                break
            
# Fill the two output lists with integer codes for the train/test type labels.
number_label(Type_label_train, nl_Type_train)
number_label(Type_label_test, nl_Type_test)

# Convert the filled lists to arrays before they are passed to fit().
nl_Type_train = np.array(nl_Type_train)
nl_Type_test = np.array(nl_Type_test)

In [None]:
# Build the vocabulary for the type classifier from the training sentences
# only; words not seen here are mapped to the "<OOV>" token.
Type_tok = Tokenizer(oov_token="<OOV>")
Type_tok.fit_on_texts(Type_train)
Type_word_index = Type_tok.word_index
# One more than the number of known words; presumably the extra slot is the
# index used for padding -- confirm against the Tokenizer documentation.
Type_vocab_size = len(Type_word_index) + 1

In [None]:
# Tokenize and pad the type-classifier train/test sentences to max_length.
Type_train_padded, Type_test_padded = make_sequences(Type_tok, Type_train, Type_test)

In [None]:
# Notebook display: shape of the padded training matrix.
Type_train_padded.shape

In [None]:
Type_embedding_dim = 32

# Statement-type classifier: embedding -> pooling -> one hidden layer ->
# three outputs, one per type (general / hyperbole / rhetorical).
Type_model = tf.keras.Sequential([
        tf.keras.layers.Embedding(Type_vocab_size, Type_embedding_dim, input_length = max_length, name = 'TYPEembed'),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(24, activation = 'relu'),
        # The three types are mutually exclusive and the loss is
        # sparse_categorical_crossentropy, which expects one probability
        # distribution over the classes; softmax provides that, whereas
        # independent sigmoid units do not sum to 1.
        tf.keras.layers.Dense(3, activation = 'softmax'),
    ])

Type_model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

Type_model.summary()

# The held-out half of the corpus doubles as the validation set.
Type_history = Type_model.fit(Type_train_padded, nl_Type_train, epochs = 30, 
                              validation_data = (Type_test_padded, nl_Type_test), verbose = 2)

## NN2: Sarcasm or not

In [39]:
def _texts_and_labels(frame):
    """Return (cleaned 'text' column, raw 'class' column) of ``frame`` as lists."""
    return list(map(clean_text, frame['text'].tolist())), frame['class'].tolist()


GEN_text, GEN_labels = _texts_and_labels(GENdata)
HYP_text, HYP_labels = _texts_and_labels(HYPdata)
RQ_text, RQ_labels = _texts_and_labels(RQdata)

# Each corpus gets its own held-out fraction.
GEN_test_size = 0.33
HYP_test_size = 0.1
RQ_test_size = 0.4

GEN_train, GEN_test, GEN_label_train, GEN_label_test = split_data(
    GEN_text, GEN_labels, GEN_test_size)
HYP_train, HYP_test, HYP_label_train, HYP_label_test = split_data(
    HYP_text, HYP_labels, HYP_test_size)
RQ_train, RQ_test, RQ_label_train, RQ_label_test = split_data(
    RQ_text, RQ_labels, RQ_test_size)

In [40]:
# Output lists that receive the 0/1 encoded sarcasm labels.
blGEN_train, blGEN_test, blHYP_train, blHYP_test, blRQ_train, blRQ_test = [],[],[],[],[],[]

def make_binery(l, new_l):
    """Append the binary code of each label in ``l`` to ``new_l`` in place.

    'notsarc' is encoded as 0 and 'sarc' as 1. Any other value is skipped, so
    ``new_l`` can end up shorter than ``l``.

    Args:
        l: iterable of 'sarc' / 'notsarc' labels.
        new_l: list that is extended in place; it stays a plain list, and the
            caller is responsible for any conversion to an array.

    Returns:
        None.
    """
    for element in l:
        if(element == 'notsarc'):
            new_l.append(0)
        elif(element == 'sarc'):
            new_l.append(1)
    # The former trailing ``new_l = np.array(new_l)`` only rebound the local
    # name and never reached the caller, so it has been dropped.
            
# Encode every train/test label set into its matching output list.
for _raw_labels, _encoded in (
        (GEN_label_train, blGEN_train),
        (GEN_label_test, blGEN_test),
        (HYP_label_train, blHYP_train),
        (HYP_label_test, blHYP_test),
        (RQ_label_train, blRQ_train),
        (RQ_label_test, blRQ_test)):
    make_binery(_raw_labels, _encoded)

In [41]:
# Convert the encoded label lists to arrays for fit()/evaluation.
blGEN_train = np.array(blGEN_train)
blGEN_test = np.array(blGEN_test)
blHYP_train = np.array(blHYP_train)
blHYP_test = np.array(blHYP_test)
blRQ_train = np.array(blRQ_train)
blRQ_test = np.array(blRQ_test)

### Tokenizers:

In [42]:
def _fit_tokenizer(train_texts):
    """Return a Tokenizer with an "<OOV>" token, fitted on ``train_texts``."""
    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.fit_on_texts(train_texts)
    return tokenizer


# One tokenizer per corpus, each fitted on that corpus' training split only.
GENtok = _fit_tokenizer(GEN_train)
GEN_word_index = GENtok.word_index
GEN_vocab_size = len(GEN_word_index) + 1

HYPtok = _fit_tokenizer(HYP_train)
HYP_word_index = HYPtok.word_index
HYP_vocab_size = len(HYP_word_index) + 1

RQtok = _fit_tokenizer(RQ_train)
RQ_word_index = RQtok.word_index
RQ_vocab_size = len(RQ_word_index) + 1

In [43]:
# Tokenize and pad each corpus' train/test split with that corpus' tokenizer.
GEN_train_padded, GEN_test_padded = make_sequences(GENtok, GEN_train, GEN_test)
HYP_train_padded, HYP_test_padded = make_sequences(HYPtok, HYP_train, HYP_test)
RQ_train_padded, RQ_test_padded = make_sequences(RQtok, RQ_train, RQ_test)

## General NN:

In [80]:
GEN_embedding_dim = 16
GEN_n_epochs = 15

# Hidden Dense stack as (units, activation) pairs, in order.
_gen_hidden = [(16, 'sigmoid'), (24, 'relu'), (32, 'sigmoid'), (12, 'relu')]

# Binary sarcasm classifier for general statements:
# embedding -> pooling -> hidden Dense stack -> one sigmoid output unit.
GEN_model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(GEN_vocab_size, GEN_embedding_dim,
                                  input_length=max_length, name='GENembed'),
        tf.keras.layers.GlobalAveragePooling1D(),
    ]
    + [tf.keras.layers.Dense(units, activation=act) for units, act in _gen_hidden]
    + [tf.keras.layers.Dense(1, activation='sigmoid')]
)

GEN_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
GEN_model.summary()

# The held-out split doubles as the validation set.
GEN_history = GEN_model.fit(
    GEN_train_padded, blGEN_train,
    epochs=GEN_n_epochs,
    validation_data=(GEN_test_padded, blGEN_test),
    verbose=2,
)

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
GENembed (Embedding)         (None, 200, 16)           259760    
_________________________________________________________________
global_average_pooling1d_19  (None, 16)                0         
_________________________________________________________________
dense_68 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_69 (Dense)             (None, 24)                408       
_________________________________________________________________
dense_70 (Dense)             (None, 32)                800       
_________________________________________________________________
dense_71 (Dense)             (None, 12)                396       
_________________________________________________________________
dense_72 (Dense)             (None, 1)               

## Hyperbolic NN:

In [77]:
HYP_embedding_dim = 32
HYP_n_epochs = 20

# Binary sarcasm classifier for hyperbole: embedding -> pooling -> two
# L2-regularised ReLU layers of 16 units -> one sigmoid output unit.
_hyp_layers = [
    tf.keras.layers.Embedding(HYP_vocab_size, HYP_embedding_dim,
                              input_length=max_length, name='HYPembed'),
    tf.keras.layers.GlobalAveragePooling1D(),
]
for _ in range(2):
    _hyp_layers.append(
        tf.keras.layers.Dense(16, activation='relu',
                              kernel_regularizer=keras.regularizers.l2(0.001)))
_hyp_layers.append(tf.keras.layers.Dense(1, activation='sigmoid'))

HYP_model = tf.keras.Sequential(_hyp_layers)
HYP_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
HYP_model.summary()

# The held-out split doubles as the validation set.
HYP_history = HYP_model.fit(
    HYP_train_padded, blHYP_train,
    epochs=HYP_n_epochs,
    validation_data=(HYP_test_padded, blHYP_test),
    verbose=2,
)

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
HYPembed (Embedding)         (None, 200, 32)           250528    
_________________________________________________________________
global_average_pooling1d_16  (None, 32)                0         
_________________________________________________________________
dense_57 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_58 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_59 (Dense)             (None, 1)                 17        
Total params: 251,345
Trainable params: 251,345
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
33/33 - 0s - loss: 0.7269 - accuracy: 0.5215 - val_loss: 0.7226 - val_accuracy: 0.5128
Epoch 2/20
33/3

## Rhetorical NN:

In [78]:
RQ_embedding_dim = 3
RQ_n_epochs = 50

# Binary sarcasm classifier for rhetorical questions: embedding -> pooling ->
# two L2-regularised ReLU layers of 16 units -> one sigmoid output unit.
_rq_layers = [
    tf.keras.layers.Embedding(RQ_vocab_size, RQ_embedding_dim,
                              input_length=max_length, name='RQembed'),
    tf.keras.layers.GlobalAveragePooling1D(),
]
for _ in range(2):
    _rq_layers.append(
        tf.keras.layers.Dense(16, activation='relu',
                              kernel_regularizer=keras.regularizers.l2(0.001)))
_rq_layers.append(tf.keras.layers.Dense(1, activation='sigmoid'))

RQ_model = tf.keras.Sequential(_rq_layers)
RQ_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
RQ_model.summary()

# The held-out split doubles as the validation set.
RQ_history = RQ_model.fit(
    RQ_train_padded, blRQ_train,
    epochs=RQ_n_epochs,
    validation_data=(RQ_test_padded, blRQ_test),
    verbose=2,
)

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
RQembed (Embedding)          (None, 200, 3)            26796     
_________________________________________________________________
global_average_pooling1d_17  (None, 3)                 0         
_________________________________________________________________
dense_60 (Dense)             (None, 16)                64        
_________________________________________________________________
dense_61 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_62 (Dense)             (None, 1)                 17        
Total params: 27,149
Trainable params: 27,149
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
32/32 - 0s - loss: 0.7118 - accuracy: 0.4966 - val_loss: 0.7097 - val_accuracy: 0.4934
Epoch 2/50
32/32 

## Tensorboard

In [53]:
# IPython magic: load the TensorBoard notebook extension.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [90]:
# Shell escape: delete the ./logs/ directory left by earlier runs.
!rm -rf ./logs/ 

In [91]:
# One timestamped sub-directory per run under logs/fit/.
_run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + _run_stamp

In [92]:
# Write per-epoch histograms and scalars to log_dir for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# NOTE: this calls fit() again on the existing RQ_model object and overwrites
# RQ_history; the model is not rebuilt first.
RQ_history = RQ_model.fit(
    RQ_train_padded, blRQ_train,
    epochs=50,
    validation_data=(RQ_test_padded, blRQ_test),
    verbose=2,
    callbacks=[tensorboard_callback],
)

Epoch 1/50
32/32 - 0s - loss: 0.0828 - accuracy: 1.0000 - val_loss: 0.8192 - val_accuracy: 0.6843
Epoch 2/50
32/32 - 0s - loss: 0.0819 - accuracy: 1.0000 - val_loss: 0.8202 - val_accuracy: 0.6872
Epoch 3/50
32/32 - 0s - loss: 0.0810 - accuracy: 1.0000 - val_loss: 0.8245 - val_accuracy: 0.6769
Epoch 4/50
32/32 - 0s - loss: 0.0795 - accuracy: 1.0000 - val_loss: 0.8225 - val_accuracy: 0.6799
Epoch 5/50
32/32 - 0s - loss: 0.0790 - accuracy: 1.0000 - val_loss: 0.8239 - val_accuracy: 0.6725
Epoch 6/50
32/32 - 0s - loss: 0.0780 - accuracy: 1.0000 - val_loss: 0.8231 - val_accuracy: 0.6784
Epoch 7/50
32/32 - 0s - loss: 0.0770 - accuracy: 1.0000 - val_loss: 0.8237 - val_accuracy: 0.6784
Epoch 8/50
32/32 - 0s - loss: 0.0764 - accuracy: 1.0000 - val_loss: 0.8227 - val_accuracy: 0.6784
Epoch 9/50
32/32 - 0s - loss: 0.0754 - accuracy: 1.0000 - val_loss: 0.8220 - val_accuracy: 0.6872
Epoch 10/50
32/32 - 0s - loss: 0.0746 - accuracy: 1.0000 - val_loss: 0.8320 - val_accuracy: 0.6740
Epoch 11/50
32/32 -

In [93]:
# IPython magic: open TensorBoard on the runs stored under logs/fit.
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6007 (pid 27606), started 0:20:15 ago. (Use '!kill 27606' to kill it.)

# Input functions

In [115]:
def type_classifier(txt):
    """Predict the statement type of ``txt`` with ``Type_model``.

    Args:
        txt: raw input sentence.

    Returns:
        'general', 'hyperbole' or 'rhetorical', or ``None`` when the cleaned
        text is the empty string.
    """
    text = clean_text(txt)
    if text == "":
        return None

    padded = pad_sequences(Type_tok.texts_to_sequences([text]), maxlen=max_length,
                           padding='post', truncating='post')
    scores = Type_model.predict(padded)

    # The position of the largest output selects the type name.
    return ('general', 'hyperbole', 'rhetorical')[np.argmax(scores)]

In [116]:
# Try the type classifier on a single example sentence.
type_classifier("That man is as tall as a house.")

'general'

In [127]:
# Activation above which each model's output is reported as sarcastic.
Gthreshold = 0.7
Hthreshold = 0.7
Rthreshold = 0.7

def sarcasm_classifier(txt, Type):
    """Print a sarcasm prediction for ``txt`` using the model for ``Type``.

    Args:
        txt: raw input sentence.
        Type: 'general', 'hyperbole' or 'rhetorical'; selects the tokenizer,
            model and threshold. ``None`` (as returned by ``type_classifier``
            for empty text) prints a notice and returns.

    Returns:
        None. The cleaned input, type, prediction and raw activation are
        printed.

    Raises:
        ValueError: if ``Type`` is neither ``None`` nor one of the three
            known type names.
    """
    if Type is None:
        print("No input text was found")
        return

    # Pick the pipeline lazily, so only the selected corpus' objects have to
    # exist and the current values of the module-level thresholds are used.
    if Type == 'general':
        tokenizer, model, threshold = GENtok, GEN_model, Gthreshold
    elif Type == 'hyperbole':
        tokenizer, model, threshold = HYPtok, HYP_model, Hthreshold
    elif Type == 'rhetorical':
        tokenizer, model, threshold = RQtok, RQ_model, Rthreshold
    else:
        # Previously an unknown type fell through to the print statements and
        # failed there on the unbound name ``prediction``.
        raise ValueError("Unknown statement type: " + repr(Type))

    text = clean_text(txt)
    input_sequence = tokenizer.texts_to_sequences([text])
    input_padded = pad_sequences(input_sequence, maxlen=max_length,
                                 padding = 'post', truncating = 'post')

    predicted_probability = model.predict(input_padded)

    # A single sentence is passed in, so element [0][0] is its activation.
    if predicted_probability[0][0] > threshold:
        prediction = "sarcastic"
    else:
        prediction = "not sarcastic"

    print("INPUT: " + text)
    print("Type: " + Type + " statement")
    print("prediction: " + prediction)
    print(str(predicted_probability[0][0]) + " activation")

In [128]:
def Kappa(text):
    """Classify the statement type of ``text``, then run the matching sarcasm model.

    Returns whatever ``sarcasm_classifier`` returns for the predicted type.
    """
    predicted_type = type_classifier(text)
    return sarcasm_classifier(text, predicted_type)

In [135]:
# Run the full pipeline (type classifier, then sarcasm model) on one sentence.
Kappa("Hello my name is Jason")

INPUT: hello my name is jason
Type: general statement
prediction: sarcastic
0.99082863 activation


## Analysis

In [142]:
def update1(df, word, column):
    """Add one to ``column`` for the first row whose 'Words' value is ``word``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a 'Words' column and a numeric ``column``.
        word: value to look up in 'Words'.
        column: name of the counter column to increment.

    Returns:
        The same ``df`` object.

    Raises:
        IndexError: if no row has 'Words' equal to ``word``.
    """
    matches = df.index[(df['Words'] == word).values]
    row = matches[0]
    # Read and write the single cell directly. The previous version called
    # int() on a one-element Series (deprecated in pandas 2.x) and pushed the
    # new value back through a temporary Series and DataFrame.update.
    df.at[row, column] = int(df.at[row, column]) + 1
    return df

def binDecode(x, thres):
    """Return 0 when ``x`` is below ``thres`` and 1 otherwise."""
    return 0 if x < thres else 1


def Count_Correct(model, test_data, test_padded, test_label, word_index, thres):
    """Tally, per vocabulary word, how the model fared on sentences containing it.

    Every whitespace-separated word of every test sentence contributes to the
    row of that word, or to the "<OOV>" row when the word is not in
    ``word_index``.

    Args:
        model: object whose ``predict(test_padded)`` returns one score per
            test sentence.
        test_data: sequence of test sentences (strings).
        test_padded: model input aligned with ``test_data``.
        test_label: true labels aligned with ``test_data``; they are passed
            through ``binDecode`` with the same ``thres`` as the predictions.
        word_index: mapping whose keys are the vocabulary words.
        thres: decision threshold handed to ``binDecode``.

    Returns:
        DataFrame with one row per vocabulary word and the columns 'Words',
        'Correct', 'Total', 'Rightly Sarcastic' and 'Incorrectly Sarcastic'.

    Raises:
        KeyError: if a word outside the vocabulary is met and ``word_index``
            has no "<OOV>" entry.
    """
    predicted_labels = model.predict(test_padded)
    words = list(word_index.keys())
    counters = ['Correct', 'Total', 'Rightly Sarcastic', 'Incorrectly Sarcastic']

    # Count in plain dicts and build the frame once at the end; updating a
    # DataFrame cell per word occurrence scanned the whole vocabulary each time.
    counts = {counter: dict.fromkeys(words, 0) for counter in counters}

    for i, element in enumerate(test_data):
        predicted = binDecode(predicted_labels[i], thres)
        is_correct = predicted == binDecode(test_label[i], thres)
        for word in element.split():
            bucket = word if word in word_index else "<OOV>"
            counts['Total'][bucket] += 1
            if is_correct:
                counts['Correct'][bucket] += 1
                if predicted == 1:
                    counts['Rightly Sarcastic'][bucket] += 1
            elif predicted == 1:
                counts['Incorrectly Sarcastic'][bucket] += 1

    table = {'Words': words}
    for counter in counters:
        table[counter] = [counts[counter][word] for word in words]
    return pd.DataFrame(table, columns=['Words'] + counters)

In [143]:
# Per-word prediction tallies for the general model on its held-out split.
GENdf = Count_Correct(GEN_model, GEN_test, GEN_test_padded, blGEN_test, GEN_word_index, Gthreshold)

In [144]:
# Per-word prediction tallies for the hyperbole model on its held-out split.
HYPdf = Count_Correct(HYP_model, HYP_test, HYP_test_padded, blHYP_test, HYP_word_index, Hthreshold)

In [145]:
# Per-word prediction tallies for the rhetorical model on its held-out split.
RQdf = Count_Correct(RQ_model, RQ_test, RQ_test_padded, blRQ_test, RQ_word_index, Rthreshold)

# Exporting the models:

In [None]:
# Save the four models and their tokenizers into <cwd>/ModelsSARC.
# Nothing is written when the folder already exists, so an earlier export is
# never overwritten. On export the working directory is left inside the folder.
chdir(cwd)
if (os.path.exists(cwd+"/ModelsSARC")):
    print("File exists")
else:
    makedirs("ModelsSARC")
    chdir(cwd + "/ModelsSARC")

    Type_model.save('Type_model.h5')
    GEN_model.save('GEN_model.h5')
    HYP_model.save('HYP_model.h5')
    RQ_model.save('RQ_model.h5')

    # Each tokenizer goes into the file that carries its own name.
    # Type_tok.pickle used to receive GENtok, which would pair Type_model
    # with the wrong vocabulary after reloading.
    with open('Type_tok.pickle', 'wb') as handle:
        pickle.dump(Type_tok, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('GENtok.pickle', 'wb') as handle:
        pickle.dump(GENtok, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('HYPtok.pickle', 'wb') as handle:
        pickle.dump(HYPtok, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('RQtok.pickle', 'wb') as handle:
        pickle.dump(RQtok, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [146]:
# Write the three per-word tally tables into <cwd>/dfSARC, unless that folder
# already exists (in which case nothing is written).
chdir(cwd)
if not os.path.exists(cwd+"/dfSARC"):
    makedirs("dfSARC")
    chdir(cwd + "/dfSARC")

    for _csv_name, _frame in (("GENdf.csv", GENdf),
                              ("HYPdf.csv", HYPdf),
                              ("RQdf.csv", RQdf)):
        _frame.to_csv(_csv_name)
else:
    print("File exists")

### Word Embedding Export

In [25]:
# Pull the weight matrix (first weight array) out of each model's named
# embedding layer.
GENweights = GEN_model.get_layer('GENembed').get_weights()[0]
HYPweights = HYP_model.get_layer('HYPembed').get_weights()[0]
RQweights = RQ_model.get_layer('RQembed').get_weights()[0]

In [26]:
GEN_vocab = GEN_word_index

# Write one embedding vector per line (Gvecs.tsv) and the matching word per
# line (Gmeta.tsv).
# The model looks words up by the tokenizer's index, so the vector for a word
# is the row at GEN_vocab[word]. Using the enumerate() position instead paired
# every word with the vector of the word before it.
with io.open('Gvecs.tsv', 'w', encoding='utf-8') as Gout_v, \
        io.open('Gmeta.tsv', 'w', encoding='utf-8') as Gout_m:
    for word, num in GEN_vocab.items():
        vec = GENweights[num]
        Gout_m.write(word + "\n")
        Gout_v.write('\t'.join([str(x) for x in vec]) + "\n")

# Offer the files for download when running inside Google Colab.
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('Gvecs.tsv')
    files.download('Gmeta.tsv')

In [27]:
HYP_vocab = HYP_word_index

# Write one embedding vector per line (Hvecs.tsv) and the matching word per
# line (Hmeta.tsv).
# The model looks words up by the tokenizer's index, so the vector for a word
# is the row at HYP_vocab[word]. Using the enumerate() position instead paired
# every word with the vector of the word before it.
with io.open('Hvecs.tsv', 'w', encoding='utf-8') as Hout_v, \
        io.open('Hmeta.tsv', 'w', encoding='utf-8') as Hout_m:
    for word, num in HYP_vocab.items():
        vec = HYPweights[num]
        Hout_m.write(word + "\n")
        Hout_v.write('\t'.join([str(x) for x in vec]) + "\n")

# Offer the files for download when running inside Google Colab.
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('Hvecs.tsv')
    files.download('Hmeta.tsv')

In [28]:
RQ_vocab = RQ_word_index

# Write one embedding vector per line (Rvecs.tsv) and the matching word per
# line (Rmeta.tsv).
# The model looks words up by the tokenizer's index, so the vector for a word
# is the row at RQ_vocab[word]. Using the enumerate() position instead paired
# every word with the vector of the word before it.
with io.open('Rvecs.tsv', 'w', encoding='utf-8') as Rout_v, \
        io.open('Rmeta.tsv', 'w', encoding='utf-8') as Rout_m:
    for word, num in RQ_vocab.items():
        vec = RQweights[num]
        Rout_m.write(word + "\n")
        Rout_v.write('\t'.join([str(x) for x in vec]) + "\n")

# Offer the files for download when running inside Google Colab.
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('Rvecs.tsv')
    files.download('Rmeta.tsv')