In [None]:
from tensorflow.keras.callbacks import TensorBoard, CSVLogger
from  tensorflow.keras.preprocessing.text import text_to_word_sequence
from  tensorflow.keras.preprocessing import sequence
from  tensorflow.keras.preprocessing.text import Tokenizer
from  tensorflow.keras.models import Sequential
from  tensorflow.keras.layers import Dense,Flatten,LSTM,Conv1D,GlobalMaxPool1D,Dropout,Bidirectional,GRU
from  tensorflow.keras.layers import Embedding
from  tensorflow.keras import optimizers
from  tensorflow.keras.layers import Input
from  tensorflow.keras.models import Model
from  tensorflow.keras.utils import plot_model
from  tensorflow.keras.models import load_model
import pandas as pd

import pickle

####
# Load datasets from previously saved pickle files
####
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only point these paths at files you produced yourself.
# Each file is opened in a context manager so its handle is closed as soon as
# the object has been read (the bare open() calls used before leaked handles).
with open('/content/augmemted_train2.pkl', 'rb') as train_file:
    liar_df_train = pickle.load(train_file)
with open('/content/augmemted_val.pkl', 'rb') as val_file:
    liar_df_val = pickle.load(val_file)
with open('/content/augmemted_test.pkl', 'rb') as test_file:
    liar_df_test = pickle.load(test_file)

import numpy as np

####
# Load GloVe 100 dimension
####
# Maps the lower-cased token (first field of each line) to a float32 vector
# built from the remaining whitespace-separated fields.
embeddings = {}
with open("/content/glove.6B.100d.txt", encoding="utf8") as file_object:
    for line in file_object:
        word_embed = line.split()
        if not word_embed:
            # Blank line (e.g. a trailing newline): nothing to parse.
            continue
        word = word_embed[0]
        embed = np.array(word_embed[1:], dtype="float32")
        embeddings[word.lower()] = embed

print('Found %s word vectors.' % len(embeddings))
# Vectors are stored under the lower-cased token, so look the last one up the
# same way; indexing by the raw token fails when it contains upper case.
# Skipped entirely when nothing was loaded, since `word` would be undefined.
if embeddings:
    print(len(embeddings[word.lower()]), " : Embedding Dimension")

In [None]:
####
# Create embedding matrix
####
# NOTE(review): `vocabulary_dict` is not defined in the visible cells; it is
# iterated below as a token -> row-index mapping. Confirm where it is created.
EMBED_DIM = 100

# One extra row beyond the vocabulary size; rows without a matching
# pre-trained vector keep their all-zero initialisation.
num_words = 1 + len(vocabulary_dict)
embedding_matrix = np.zeros((num_words, EMBED_DIM))

for word, i in vocabulary_dict.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this token: leave the row as zeros.
        continue
    embedding_matrix[i] = embedding_vector

# NOTE(review): `embeddings_index` is not read anywhere in the visible cells;
# this may have been meant to release the `embeddings` dict. Confirm.
embeddings_index = None

In [None]:
from tensorflow import keras

# Size of the vocabulary mapping; the models add 1 to it for the embedding rows.
vocab_length = len(vocabulary_dict.keys())
hidden_size = EMBED_DIM #Has to be same as EMBED_DIM

#### Convolution settings: one branch per kernel size, 128 filters each
kernel_sizes = [2,4,5]
filter_size = 128

#### Set maximum for CNN
MAX_SEQUENCE_LENGTH=3196
print(MAX_SEQUENCE_LENGTH)

#### One-hot encode the two-class labels for train and validation
Y_train = keras.utils.to_categorical(liar_df_train['numer_truth'], num_classes=2)
Y_val = keras.utils.to_categorical(liar_df_val['numer_truth'], num_classes=2)

#### Introduce padding
# Every split is padded / truncated at the end to the same fixed length.
pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_train = sequence.pad_sequences(liar_df_train['word_id'], **pad_options)
X_val = sequence.pad_sequences(liar_df_val['word_id'], **pad_options)
X_test = sequence.pad_sequences(liar_df_test['word_id'], **pad_options)

In [None]:
#####
# Define CNN model
####

# Integer token ids go through a frozen embedding initialised from embedding_matrix.
statement_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')
embedding_layer = Embedding(vocab_length+1,
                            EMBED_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
x_stmt = embedding_layer(statement_input)

# One Conv1D + global-max-pool branch per kernel size, all reading the same embedding.
kernel_stmt = []
for kernel in kernel_sizes:
    conv_features = Conv1D(filters=filter_size, kernel_size=kernel, activation='relu')(x_stmt)
    kernel_stmt.append(GlobalMaxPool1D()(conv_features))

# Concatenate the branches, apply dropout, then a dense hidden layer.
merged_features = keras.layers.concatenate(kernel_stmt)
regularised_features = Dropout(0.5)(merged_features)
conv_in1 = Dense(128, activation='relu')(regularised_features)

# Two-way softmax over the classes.
main_output = Dense(2, activation='softmax', name='main_output')(conv_in1)

model_cnn = Model(inputs=[statement_input], outputs=[main_output])
model_cnn.summary()

In [None]:
####
# Create folder to save checkpoints in
####
import os

# exist_ok=True makes the cell safe to re-run when the folder already exists.
os.makedirs('modelweights2', exist_ok=True)

In [None]:
####
# Prepare model for training
####
import math  # used by the training cell to derive the number of steps

INIT_LR = 1e-4
EPOCHS = 20
BS = 128

# `learning_rate` is the current argument name; `lr` is only a deprecated alias.
opt = optimizers.Adam(learning_rate=INIT_LR, decay=INIT_LR / EPOCHS)
model_cnn.compile(loss="binary_crossentropy", optimizer=opt, metrics=['accuracy'])

print("[INFO] training...")

# One weights file per epoch; {epoch:02d} is filled in by the callback.
checkpoint_filepath = 'modelweights2/weights.{epoch:02d}.hdf5'
checkpoint_dir = os.path.dirname(checkpoint_filepath)

# With metrics=['accuracy'] Keras logs validation accuracy as 'val_accuracy'
# (not 'val_acc'). save_best_only=False keeps every epoch's weights on disk.
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=False)

In [None]:
#### Train model

# Number of batches needed to cover each split once at batch size BS.
train_steps = math.ceil(len(X_train) / BS)
val_steps = math.ceil(len(X_val) / BS)

# H keeps the returned training history object.
H = model_cnn.fit(
    X_train,
    Y_train,
    batch_size=BS,
    epochs=EPOCHS,
    steps_per_epoch=train_steps,
    validation_data=(X_val, Y_val),
    validation_steps=val_steps,
    callbacks=[model_checkpoint_callback],
)

In [None]:
#### Load model with best validation accuracy

# Pick the epoch from the recorded training history instead of a fixed number,
# so the loaded checkpoint really is the one with the highest validation
# accuracy of this run. Older Keras versions log the metric as 'val_acc'.
val_acc_per_epoch = H.history.get('val_accuracy') or H.history.get('val_acc')
if val_acc_per_epoch:
    # History is 0-indexed, checkpoint file names start at epoch 01.
    best_epoch = int(np.argmax(val_acc_per_epoch)) + 1
else:
    # No validation accuracy recorded: keep the previously hard-coded epoch.
    best_epoch = 10

model_cnn.load_weights('modelweights2/weights.%02d.hdf5' % best_epoch)

In [None]:
#### Predict test set and display classification report

from sklearn.metrics import classification_report

# One-hot test labels; later cells also read Y_test in this form.
Y_test = keras.utils.to_categorical(liar_df_test['numer_truth'], num_classes=2)

# Per-class scores from the softmax output.
predictions = model_cnn.predict([X_test], batch_size=BS)

# Compare class indices: argmax of the one-hot labels vs argmax of the scores.
true_classes = Y_test.argmax(axis=1)
predicted_classes = np.argmax(predictions, axis=1)
print(classification_report(true_classes, predicted_classes, digits=4))

In [None]:
#### Display confusion matrix

from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns
import matplotlib.pyplot as plt

true_classes = Y_test.argmax(axis=1)
predicted_classes = np.argmax(predictions, axis=1)
mat_CNN = confusion_matrix(true_classes, predicted_classes)

# The matrix is transposed before plotting so the x axis carries the actual
# label and the y axis the predicted label, matching the axis captions.
sns.heatmap(mat_CNN.T,
            annot=True,
            fmt='d',
            cbar=False,
            square=False,
            xticklabels=True,
            yticklabels=True)
plt.xlabel('actual label')
plt.ylabel('predicted label')
plt.show()

In [None]:
#### Display ROC-Curve

def plot_roc_curve(fpr, tpr, label=None):
    """Draw an ROC curve on the current matplotlib figure.

    Args:
        fpr: False positive rates (x values).
        tpr: True positive rates (y values).
        label: Optional legend label for the curve.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed diagonal for visual reference.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

# roc_curve sweeps a threshold over a continuous score. Passing the argmax
# labels (only 0/1) collapses the curve to a single operating point, so use
# the predicted probability of class 1 instead.
fpr, tpr, thresholds = roc_curve(Y_test.argmax(axis=1), predictions[:, 1])

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()

In [None]:
#### Set maximum for RNN based models and repeat other steps accordingly
MAX_SEQUENCE_LENGTH=40
print(MAX_SEQUENCE_LENGTH)

# One-hot encode the two-class labels again for train and validation.
Y_train = keras.utils.to_categorical(liar_df_train['numer_truth'], num_classes=2)
Y_val = keras.utils.to_categorical(liar_df_val['numer_truth'], num_classes=2)

# Re-pad every split from the stored word-id sequences to the shorter length,
# padding and truncating at the end.
pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_train = sequence.pad_sequences(liar_df_train['word_id'], **pad_options)
X_val = sequence.pad_sequences(liar_df_val['word_id'], **pad_options)
X_test = sequence.pad_sequences(liar_df_test['word_id'], **pad_options)

In [None]:
#### Define LSTM model

# Integer token ids go through a frozen embedding initialised from embedding_matrix.
statement_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')
embedding_layer = Embedding(vocab_length+1,
                            EMBED_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
x_stmt = embedding_layer(statement_input)

# Single LSTM with 50 units; only its final output feeds the classifier.
x1 = LSTM(50, return_sequences=False)(x_stmt)
main_output = Dense(2, activation='softmax', name='main_output')(x1)

model_lstm = Model(inputs=[statement_input], outputs=[main_output])
model_lstm.summary()

In [None]:
####
# Prepare model for training
####
EPOCHS = 30
BS = 128

# `learning_rate` is the current argument name; `lr` is only a deprecated alias.
opt = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.01, amsgrad=False)
model_lstm.compile(loss="binary_crossentropy", optimizer=opt, metrics=['accuracy'])

print("[INFO] training...")

#### Overwriting checkpoints from previous model!
# One weights file per epoch; {epoch:02d} is filled in by the callback.
checkpoint_filepath = 'modelweights2/weights.{epoch:02d}.hdf5'
checkpoint_dir = os.path.dirname(checkpoint_filepath)

# With metrics=['accuracy'] Keras logs validation accuracy as 'val_accuracy'
# (not 'val_acc'). save_best_only=False keeps every epoch's weights on disk.
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=False)

In [None]:
#### Training model

# Number of batches needed to cover each split once at batch size BS.
train_steps = math.ceil(len(X_train) / BS)
val_steps = math.ceil(len(X_val) / BS)

# H keeps the returned training history object.
H = model_lstm.fit(
    X_train,
    Y_train,
    batch_size=BS,
    epochs=EPOCHS,
    steps_per_epoch=train_steps,
    validation_data=(X_val, Y_val),
    validation_steps=val_steps,
    callbacks=[model_checkpoint_callback],
)

In [None]:
#### Load best model

# The checkpoint callback in this notebook writes 'weights.<epoch>.hdf5' files
# and nothing in the visible cells creates 'modelweights2/best_model', so choose
# the epoch with the highest recorded validation accuracy from this run.
# Older Keras versions log the metric as 'val_acc'.
val_acc_per_epoch = H.history.get('val_accuracy') or H.history.get('val_acc')
if val_acc_per_epoch:
    # History is 0-indexed, checkpoint file names start at epoch 01.
    best_weights_path = 'modelweights2/weights.%02d.hdf5' % (int(np.argmax(val_acc_per_epoch)) + 1)
else:
    # No validation accuracy recorded: fall back to the previously used path.
    best_weights_path = 'modelweights2/best_model'

model_lstm.load_weights(best_weights_path)

In [None]:
# One-hot test labels; later cells also read Y_test in this form.
Y_test = keras.utils.to_categorical(liar_df_test['numer_truth'], num_classes=2)

# Per-class scores from the LSTM's softmax output.
predictions = model_lstm.predict([X_test], batch_size=BS)

# Compare class indices: argmax of the one-hot labels vs argmax of the scores.
true_classes = Y_test.argmax(axis=1)
predicted_classes = np.argmax(predictions, axis=1)
print(classification_report(true_classes, predicted_classes, digits=4))

In [None]:
true_classes = Y_test.argmax(axis=1)
predicted_classes = np.argmax(predictions, axis=1)
mat_lstm = confusion_matrix(true_classes, predicted_classes)

# The matrix is transposed before plotting so the x axis carries the actual
# label and the y axis the predicted label, matching the axis captions.
sns.heatmap(mat_lstm.T,
            annot=True,
            fmt='d',
            cbar=False,
            square=False,
            xticklabels=True,
            yticklabels=True)
plt.xlabel('actual label')
plt.ylabel('predicted label')
plt.show()

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw an ROC curve on the current matplotlib figure.

    Args:
        fpr: False positive rates (x values).
        tpr: True positive rates (y values).
        label: Optional legend label for the curve.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed diagonal for visual reference.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

# roc_curve sweeps a threshold over a continuous score. Passing the argmax
# labels (only 0/1) collapses the curve to a single operating point, so use
# the predicted probability of class 1 instead.
fpr, tpr, thresholds = roc_curve(Y_test.argmax(axis=1), predictions[:, 1])

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()

In [None]:
######################################
#### Define Bi-directional LSTM model
######################################

# Integer token ids go through a frozen embedding initialised from embedding_matrix.
statement_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')
embedding_layer = Embedding(vocab_length+1,
                            EMBED_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
x_stmt = embedding_layer(statement_input)

# A 50-unit LSTM wrapped in Bidirectional; only the final output is kept.
x1 = Bidirectional(LSTM(50, return_sequences=False))(x_stmt)
main_output = Dense(2, activation='softmax', name='main_output')(x1)

model_bi_lstm = Model(inputs=[statement_input], outputs=[main_output])
model_bi_lstm.summary()

In [None]:
####
# Prepare model for training
####
EPOCHS = 30
BS = 128

# `learning_rate` is the current argument name; `lr` is only a deprecated alias.
opt = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.01, amsgrad=False)
model_bi_lstm.compile(loss="binary_crossentropy", optimizer=opt, metrics=['accuracy'])

print("[INFO] training...")

#### Overwriting checkpoints from previous model!
# One weights file per epoch; {epoch:02d} is filled in by the callback.
checkpoint_filepath = 'modelweights2/weights.{epoch:02d}.hdf5'
checkpoint_dir = os.path.dirname(checkpoint_filepath)

# With metrics=['accuracy'] Keras logs validation accuracy as 'val_accuracy'
# (not 'val_acc'). save_best_only=False keeps every epoch's weights on disk.
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=False)

In [None]:
#### Training model

# Number of batches needed to cover each split once at batch size BS.
train_steps = math.ceil(len(X_train) / BS)
val_steps = math.ceil(len(X_val) / BS)

# H keeps the returned training history object.
H = model_bi_lstm.fit(
    X_train,
    Y_train,
    batch_size=BS,
    epochs=EPOCHS,
    steps_per_epoch=train_steps,
    validation_data=(X_val, Y_val),
    validation_steps=val_steps,
    callbacks=[model_checkpoint_callback],
)

In [None]:
#### Load best model

# The checkpoint callback in this notebook writes 'weights.<epoch>.hdf5' files
# and nothing in the visible cells creates 'modelweights2/best_model', so choose
# the epoch with the highest recorded validation accuracy from this run.
# Older Keras versions log the metric as 'val_acc'.
val_acc_per_epoch = H.history.get('val_accuracy') or H.history.get('val_acc')
if val_acc_per_epoch:
    # History is 0-indexed, checkpoint file names start at epoch 01.
    best_weights_path = 'modelweights2/weights.%02d.hdf5' % (int(np.argmax(val_acc_per_epoch)) + 1)
else:
    # No validation accuracy recorded: fall back to the previously used path.
    best_weights_path = 'modelweights2/best_model'

model_bi_lstm.load_weights(best_weights_path)

# Per-class scores from the softmax output for the test split.
predictions = model_bi_lstm.predict([X_test], batch_size=BS)

# One-hot test labels; later cells also read Y_test in this form.
Y_test = liar_df_test['numer_truth']
Y_test = keras.utils.to_categorical(Y_test, num_classes=2)

print(classification_report(Y_test.argmax(axis=1), np.argmax(predictions, axis=1), digits=4))

In [None]:
true_classes = Y_test.argmax(axis=1)
predicted_classes = np.argmax(predictions, axis=1)
mat_bi_lstm = confusion_matrix(true_classes, predicted_classes)

# The matrix is transposed before plotting so the x axis carries the actual
# label and the y axis the predicted label, matching the axis captions.
sns.heatmap(mat_bi_lstm.T,
            annot=True,
            fmt='d',
            cbar=False,
            square=False,
            xticklabels=True,
            yticklabels=True)
plt.xlabel('actual label')
plt.ylabel('predicted label')
plt.show()

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw an ROC curve on the current matplotlib figure.

    Args:
        fpr: False positive rates (x values).
        tpr: True positive rates (y values).
        label: Optional legend label for the curve.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed diagonal for visual reference.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

# roc_curve sweeps a threshold over a continuous score. Passing the argmax
# labels (only 0/1) collapses the curve to a single operating point, so use
# the predicted probability of class 1 instead.
fpr, tpr, thresholds = roc_curve(Y_test.argmax(axis=1), predictions[:, 1])

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()

In [None]:
#######################################
#### Define GRU model
#######################################

# Integer token ids go through a frozen embedding initialised from embedding_matrix.
statement_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')
embedding_layer = Embedding(vocab_length+1,
                            EMBED_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
x_stmt = embedding_layer(statement_input)

# Single GRU with 100 units; only its final output feeds the classifier.
x1 = GRU(100, return_sequences=False)(x_stmt)
main_output = Dense(2, activation='softmax', name='main_output')(x1)

model_gru = Model(inputs=[statement_input], outputs=[main_output])
model_gru.summary()

In [None]:
####
# Prepare model for training
####
EPOCHS = 30
BS = 128

# `learning_rate` is the current argument name; `lr` is only a deprecated alias.
opt = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.01, amsgrad=False)
model_gru.compile(loss="binary_crossentropy", optimizer=opt, metrics=['accuracy'])

print("[INFO] training...")

#### Overwriting checkpoints from previous model!
# One weights file per epoch; {epoch:02d} is filled in by the callback.
checkpoint_filepath = 'modelweights2/weights.{epoch:02d}.hdf5'
checkpoint_dir = os.path.dirname(checkpoint_filepath)

# With metrics=['accuracy'] Keras logs validation accuracy as 'val_accuracy'
# (not 'val_acc'). save_best_only=False keeps every epoch's weights on disk.
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=False)

In [None]:
#### Training model

# Number of batches needed to cover each split once at batch size BS.
train_steps = math.ceil(len(X_train) / BS)
val_steps = math.ceil(len(X_val) / BS)

# H keeps the returned training history object.
H = model_gru.fit(
    X_train,
    Y_train,
    batch_size=BS,
    epochs=EPOCHS,
    steps_per_epoch=train_steps,
    validation_data=(X_val, Y_val),
    validation_steps=val_steps,
    callbacks=[model_checkpoint_callback],
)

In [None]:
#### Load best model

# The checkpoint callback in this notebook writes 'weights.<epoch>.hdf5' files
# and nothing in the visible cells creates 'modelweights2/best_model', so choose
# the epoch with the highest recorded validation accuracy from this run.
# Older Keras versions log the metric as 'val_acc'.
val_acc_per_epoch = H.history.get('val_accuracy') or H.history.get('val_acc')
if val_acc_per_epoch:
    # History is 0-indexed, checkpoint file names start at epoch 01.
    best_weights_path = 'modelweights2/weights.%02d.hdf5' % (int(np.argmax(val_acc_per_epoch)) + 1)
else:
    # No validation accuracy recorded: fall back to the previously used path.
    best_weights_path = 'modelweights2/best_model'

model_gru.load_weights(best_weights_path)

# Per-class scores from the softmax output for the test split.
predictions = model_gru.predict([X_test], batch_size=BS)

# One-hot test labels; later cells also read Y_test in this form.
Y_test = liar_df_test['numer_truth']
Y_test = keras.utils.to_categorical(Y_test, num_classes=2)

print(classification_report(Y_test.argmax(axis=1), np.argmax(predictions, axis=1), digits=4))

In [None]:
true_classes = Y_test.argmax(axis=1)
predicted_classes = np.argmax(predictions, axis=1)
mat_gru = confusion_matrix(true_classes, predicted_classes)

# The matrix is transposed before plotting so the x axis carries the actual
# label and the y axis the predicted label, matching the axis captions.
sns.heatmap(mat_gru.T,
            annot=True,
            fmt='d',
            cbar=False,
            square=False,
            xticklabels=True,
            yticklabels=True)
plt.xlabel('actual label')
plt.ylabel('predicted label')
plt.show()

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw an ROC curve on the current matplotlib figure.

    Args:
        fpr: False positive rates (x values).
        tpr: True positive rates (y values).
        label: Optional legend label for the curve.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed diagonal for visual reference.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

# roc_curve sweeps a threshold over a continuous score. Passing the argmax
# labels (only 0/1) collapses the curve to a single operating point, so use
# the predicted probability of class 1 instead.
fpr, tpr, thresholds = roc_curve(Y_test.argmax(axis=1), predictions[:, 1])

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()