## Data Gathering

In [17]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [18]:
# Load the essay corpus; the file is decoded with the Windows-1252 (cp1252) codec.
sd = pd.read_csv("essays.csv", encoding="cp1252", sep=",")
sd.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",n,y,y,n,y
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",n,n,y,n,n
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,n,y,n,y,y
3,1997_568848.txt,I can't believe it! It's really happening! M...,y,n,y,y,n
4,1997_688160.txt,"Well, here I go with the good old stream of co...",y,n,y,n,y


In [19]:
# Preview the first rows of the loaded frame (author id, essay text, five y/n trait labels).
sd.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",n,y,y,n,y
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",n,n,y,n,n
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,n,y,n,y,y
3,1997_568848.txt,I can't believe it! It's really happening! M...,y,n,y,y,n
4,1997_688160.txt,"Well, here I go with the good old stream of co...",y,n,y,n,y


In [20]:
# Per-column summary: row count, number of distinct values, most frequent value and its frequency.
sd.describe()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
count,2468,2468,2468,2468,2468,2468,2468
unique,2468,2468,2,2,2,2,2
top,2003_216.txt,It was very frustrating to get on to the web s...,y,n,y,y,y
freq,1,1,1277,1235,1310,1254,1272


## Preprocessing

In [21]:
# Row positions where TEXT is missing; an empty array means there are no NaN essays.
sd['TEXT'].isnull().values.nonzero()[0]

array([], dtype=int64)

In [22]:
# Replace any missing essay with a '.' placeholder so the tokenizer never receives NaN.
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# column selection: that form is chained assignment, which pandas 2.2 flags with a
# FutureWarning and which leaves `sd` untouched once Copy-on-Write is enabled.
sd['TEXT'] = sd['TEXT'].fillna('.')

In [23]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing import sequence
# Word-level tokenizer: lower-cases the essays and splits them on single spaces.
t = Tokenizer(lower=True, split=' ')
# Learn the vocabulary from every essay in the corpus.
t.fit_on_texts(sd['TEXT'].values)
# Encode each essay as a list of word indices, then pad the lists into one 2-D array.
X = sequence.pad_sequences(t.texts_to_sequences(sd['TEXT'].values))

In [24]:
# Largest word index present in the padded matrix (sanity check against the vocabulary size
# used for the Embedding layers below).
np.amax(X)

31268

In [25]:
from sklearn.model_selection import train_test_split
# One-hot encode the extraversion label (one column per distinct value of cEXT).
# The dtype is pinned: pandas >= 2.0 returns boolean dummy columns by default, whereas the
# models and the printed outputs below expect numeric 0/1 rows (the pre-2.0 default, uint8).
Y = pd.get_dummies(sd['cEXT'], dtype=np.uint8).values
# Hold out 25% of the essays for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
X_train

array([[    0,     0,     0, ...,    17,   511,  1798],
       [    0,     0,     0, ...,  1564, 23748,   577],
       [    0,     0,     0, ...,   439,    45,   688],
       ...,
       [    0,     0,     0, ...,     9,   177,   406],
       [    0,     0,     0, ...,   462,    16,    11],
       [    0,     0,     0, ...,   107,   720,    10]], dtype=int32)

In [26]:
# NOTE(review): dead code, apparently left over from a different dataset. The columns it
# references ('sarcastic', 'sarcasm', 'irony', ...) are not in the frame loaded in this
# notebook, and `Xm` is not defined in any visible cell, so it cannot run as written.
# from tensorflow.keras.utils import to_categorical
# Ymc = to_categorical(sd.loc[sd['sarcastic']==1][['sarcasm', 'irony', 'satire', 'understatement', 'overstatement', 'rhetorical_question']])
# Xm_train, Xm_test, ymc_train, ymc_test = train_test_split(Xm, Ymc, test_size=0.25, random_state=42)

# Metrics

In [27]:
def call_predictions(predictions_multi, y_test, verbose=True):
    """Turn per-class score rows and one-hot label rows into class-index arrays.

    Args:
        predictions_multi: sequence of per-sample score rows (e.g. softmax output).
        y_test: sequence of per-sample label rows (e.g. one-hot vectors).
        verbose: when True (the default, which keeps the original behaviour) both
            inputs are printed before conversion; pass False to silence the dump.

    Returns:
        A ``(real, predictions)`` tuple of integer NumPy arrays holding, for every
        row, the index of the largest entry of ``y_test`` and ``predictions_multi``
        respectively. Empty inputs give empty integer arrays.
    """
    if verbose:
        print(predictions_multi, y_test)

    # argmax is taken row by row so that plain nested lists work as well as 2-D arrays.
    # The explicit dtype keeps the result integer-typed even when there are no rows.
    predictions = np.array([np.argmax(row) for row in predictions_multi], dtype=np.intp)
    real = np.array([np.argmax(row) for row in y_test], dtype=np.intp)

    return real, predictions

In [28]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

def binary_scores(real, predictions):
    """Print accuracy, confusion-matrix counts, precision, recall and F1 for a binary task.

    Args:
        real: true class indices (0 or 1) for every sample.
        predictions: predicted class indices (0 or 1) for every sample.

    Returns:
        None; all results are written to stdout.
    """
    # Pin the label set so the matrix is always 2x2. Without it, inputs in which only
    # one class occurs produce a 1x1 matrix and the four-way unpack below raises.
    matrix = confusion_matrix(real, predictions, labels=[0, 1])
    print("Accuracy ",accuracy_score(real, predictions))
    tn, fp, fn, tp = matrix.ravel()
    print("tn ", tn," fp ", fp , " fn ", fn, " tp ", tp)
    # zero_division=1: report 1 instead of warning when a score's denominator is zero
    # (e.g. precision when the positive class is never predicted).
    print("Precision ",precision_score(real, predictions, zero_division=1))
    print("Recall ",recall_score(real, predictions, zero_division=1))
    print("F1 Score ",f1_score(real, predictions, zero_division=1))
    print('-' * 70)
    print()

def multi_scores(real, predictions):
    """Print accuracy, the confusion matrix and micro/macro precision, recall and F1.

    Args:
        real: true class indices for every sample.
        predictions: predicted class indices for every sample.

    Returns:
        None; all results are written to stdout.
    """
    matrix = confusion_matrix(real, predictions)
    print("accuracy_score: ", accuracy_score(real, predictions))
    if matrix.size == 4:
        # Exactly two classes: keep the compact tn/fp/fn/tp line.
        tn, fp, fn, tp = matrix.ravel()
        print("tn ", tn," fp ", fp , " fn ", fn, " tp ", tp)
    else:
        # A k-class matrix has k*k cells, so it cannot be unpacked into four counts
        # (doing so raises ValueError); show the full matrix instead.
        print("confusion matrix (rows = true class, columns = predicted class):")
        print(matrix)
    print("micro average scores:")
    print("precision_score: ", precision_score(real, predictions, average='micro', zero_division=1))
    print("recall_score: ", recall_score(real, predictions, average='micro'))
    print("f1_score: ", f1_score(real, predictions, average='micro'))
    print("macro average scores:")
    print("precision_score: ", precision_score(real, predictions, average='macro', zero_division=1))
    print("recall_score: ", recall_score(real, predictions, average='macro'))
    print("f1_score: ", f1_score(real, predictions, average='macro'))
    print('-' * 70)
    print()

In [29]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import *

# RNN

In [30]:
from keras.layers import *
def run_RNN(X_train, X_test, y_test, y_train, drop, no_of_layers, embedding_input_length, output_dimension, epochs=5, batch_size=32):
    """Train a stacked SimpleRNN classifier and print its test-set metrics.

    Note the positional order: ``y_test`` comes BEFORE ``y_train``.

    Args:
        X_train: padded integer sequences used for training.
        X_test: padded integer sequences used for evaluation.
        y_test: one-hot label rows for ``X_test``.
        y_train: one-hot label rows for ``X_train``.
        drop: dropout rate applied after every recurrent layer.
        no_of_layers: 3 builds three recurrent layers; any other value builds two.
        embedding_input_length: sequence length passed to the Embedding layer.
        output_dimension: number of softmax output units (classes).
        epochs: number of training epochs (default 5, the previously hard-coded value).
        batch_size: mini-batch size (default 32, the previously hard-coded value).

    Returns:
        None; training progress and metrics are printed.
    """
    model = keras.Sequential()
    # Vocabulary size is read from the notebook-level tokenizer `t`; the +1 leaves
    # room for index 0, which the padded sequences use as filler.
    model.add(Embedding(len(t.word_counts)+1, 128, input_length = embedding_input_length))
    if no_of_layers == 3:
        model.add(SimpleRNN(128, return_sequences=True))
        model.add(Dropout(drop))
    model.add(SimpleRNN(128, return_sequences=True))
    model.add(Dropout(drop))
    # The last recurrent layer returns only its final state, which feeds the classifier.
    model.add(SimpleRNN(128))
    model.add(Dropout(drop))
    model.add(Dense(output_dimension, activation='softmax'))

    # Binary cross-entropy scores every output unit as an independent yes/no problem,
    # which does not match a softmax over more than two classes; use the categorical
    # loss there. The two-class configuration is left exactly as it was.
    loss = 'binary_crossentropy' if output_dimension == 2 else 'categorical_crossentropy'
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)
    predictions_double = model.predict(X_test)
    real, predictions = call_predictions(predictions_double,y_test)
    if output_dimension == 2:
        binary_scores(real, predictions)
    else:
        multi_scores(real, predictions)

In [34]:
# Two recurrent layers, dropout 0.1, two output classes. y_test precedes y_train by signature.
run_RNN(
    X_train, X_test, y_test, y_train,
    drop=0.1,
    no_of_layers=2,
    embedding_input_length=X.shape[1],
    output_dimension=2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[[0.34628984 0.65371007]
 [0.31081352 0.68918645]
 [0.41040498 0.589595  ]
 ...
 [0.32424608 0.67575395]
 [0.28210023 0.71789974]
 [0.34617102 0.65382904]] [[0 1]
 [1 0]
 [0 1]
 ...
 [0 1]
 [1 0]
 [0 1]]
Accuracy  0.539708265802269
tn  0  fp  284  fn  0  tp  333
Precision  0.539708265802269
Recall  1.0
F1 Score  0.7010526315789474
----------------------------------------------------------------------



In [29]:
# Same architecture with a much heavier dropout of 0.7. y_test precedes y_train by signature.
run_RNN(
    X_train, X_test, y_test, y_train,
    drop=0.7,
    no_of_layers=2,
    embedding_input_length=X.shape[1],
    output_dimension=2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[[0.68586177 0.31413823]
 [0.6815812  0.31841883]
 [0.68035376 0.31964627]
 ...
 [0.67795324 0.3220467 ]
 [0.6796351  0.32036495]
 [0.68579036 0.31420967]] [[0 1]
 [1 0]
 [0 1]
 ...
 [0 1]
 [1 0]
 [0 1]]
Accuracy  0.46029173419773095
tn  284  fp  0  fn  333  tp  0
Precision  1.0
Recall  0.0
F1 Score  0.0
----------------------------------------------------------------------



# LSTM

In [36]:
def run_LSTM(X_train, X_test, y_test, y_train, drop, no_of_layers, embedding_input_length, output_dimension, epochs=5, batch_size=32):
    """Train a stacked LSTM classifier and print its test-set metrics.

    Note the positional order: ``y_test`` comes BEFORE ``y_train``.

    Args:
        X_train: padded integer sequences used for training.
        X_test: padded integer sequences used for evaluation.
        y_test: one-hot label rows for ``X_test``.
        y_train: one-hot label rows for ``X_train``.
        drop: dropout rate applied after every LSTM layer.
        no_of_layers: 3 builds three LSTM layers; any other value builds two.
        embedding_input_length: sequence length passed to the Embedding layer.
        output_dimension: number of softmax output units (classes).
        epochs: number of training epochs (default 5, the previously hard-coded value).
        batch_size: mini-batch size (default 32, the previously hard-coded value).

    Returns:
        None; training progress and metrics are printed.
    """
    model = keras.Sequential()

    # Embedding layer. Vocabulary size is read from the notebook-level tokenizer `t`;
    # the +1 leaves room for index 0, which the padded sequences use as filler.
    model.add(Embedding(len(t.word_counts)+1, 128, input_length = embedding_input_length))

    # LSTM layers. All but the last return full sequences so they can be stacked.
    if no_of_layers == 3:
        model.add(LSTM(128, return_sequences=True))
        model.add(Dropout(drop))
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(drop))
    model.add(LSTM(128))
    model.add(Dropout(drop))

    # Dense classification head.
    model.add(Dense(output_dimension, activation='softmax'))

    # Binary cross-entropy scores every output unit as an independent yes/no problem,
    # which does not match a softmax over more than two classes; use the categorical
    # loss there. The two-class configuration is left exactly as it was.
    loss = 'binary_crossentropy' if output_dimension == 2 else 'categorical_crossentropy'
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])

    # Train, then score the held-out set.
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)
    predictions_double = model.predict(X_test)
    real, predictions = call_predictions(predictions_double,y_test)
    if output_dimension == 2:
        binary_scores(real, predictions)
    else:
        multi_scores(real, predictions)

In [37]:
# Two LSTM layers, dropout 0.3, two output classes. y_test precedes y_train by signature.
run_LSTM(
    X_train, X_test, y_test, y_train,
    drop=0.3,
    no_of_layers=2,
    embedding_input_length=X.shape[1],
    output_dimension=2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[[5.17094553e-01 4.82905537e-01]
 [3.31640661e-01 6.68359339e-01]
 [9.99995291e-01 4.69449424e-06]
 ...
 [8.82149875e-01 1.17850184e-01]
 [9.99997914e-01 2.05987908e-06]
 [6.61219747e-05 9.99933839e-01]] [[0 1]
 [1 0]
 [0 1]
 ...
 [0 1]
 [1 0]
 [0 1]]
Accuracy  0.5526742301458671
tn  136  fp  148  fn  128  tp  205
Precision  0.5807365439093485
Recall  0.6156156156156156
F1 Score  0.5976676384839651
----------------------------------------------------------------------



# GRU

In [31]:
def run_GRU(X_train, X_test, y_test, y_train, drop, no_of_layers, embedding_input_length, output_dimension, epochs=5, batch_size=32):
    """Train a stacked GRU classifier and print its test-set metrics.

    Note the positional order: ``y_test`` comes BEFORE ``y_train``.

    Args:
        X_train: padded integer sequences used for training.
        X_test: padded integer sequences used for evaluation.
        y_test: one-hot label rows for ``X_test``.
        y_train: one-hot label rows for ``X_train``.
        drop: dropout rate applied after every GRU layer.
        no_of_layers: 3 builds three GRU layers; any other value builds two.
        embedding_input_length: sequence length passed to the Embedding layer.
        output_dimension: number of softmax output units (classes).
        epochs: number of training epochs (default 5, the previously hard-coded value).
        batch_size: mini-batch size (default 32, the previously hard-coded value).

    Returns:
        None; training progress and metrics are printed.
    """
    model = keras.Sequential()
    # Vocabulary size is read from the notebook-level tokenizer `t`; the +1 leaves
    # room for index 0, which the padded sequences use as filler.
    model.add(Embedding(len(t.word_counts)+1, 128, input_length = embedding_input_length))
    if no_of_layers == 3:
        model.add(GRU(128, return_sequences=True))
        model.add(Dropout(drop))
    model.add(GRU(128, return_sequences=True))
    model.add(Dropout(drop))
    # The last recurrent layer returns only its final state, which feeds the classifier.
    model.add(GRU(128))
    model.add(Dropout(drop))
    model.add(Dense(output_dimension, activation='softmax'))

    # Binary cross-entropy scores every output unit as an independent yes/no problem,
    # which does not match a softmax over more than two classes; use the categorical
    # loss there. The two-class configuration is left exactly as it was.
    loss = 'binary_crossentropy' if output_dimension == 2 else 'categorical_crossentropy'
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)
    predictions_double = model.predict(X_test)
    real, predictions = call_predictions(predictions_double,y_test)
    if output_dimension == 2:
        binary_scores(real, predictions)
    else:
        multi_scores(real, predictions)

In [32]:
# Two GRU layers, dropout 0.3, two output classes. y_test precedes y_train by signature.
run_GRU(
    X_train, X_test, y_test, y_train,
    drop=0.3,
    no_of_layers=2,
    embedding_input_length=X.shape[1],
    output_dimension=2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[[6.8774825e-04 9.9931222e-01]
 [3.9440945e-01 6.0559058e-01]
 [9.9996489e-01 3.5081517e-05]
 ...
 [9.8953044e-01 1.0469511e-02]
 [9.9375749e-01 6.2425495e-03]
 [2.1870380e-02 9.7812963e-01]] [[0 1]
 [1 0]
 [0 1]
 ...
 [0 1]
 [1 0]
 [0 1]]
Accuracy  0.5413290113452188
tn  137  fp  147  fn  136  tp  197
Precision  0.5726744186046512
Recall  0.5915915915915916
F1 Score  0.5819793205317578
----------------------------------------------------------------------

