<a href="https://colab.research.google.com/github/Kalit31/Data-Mining-Assignment/blob/main/S%2BM-LSTMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
#Import libraries
import pandas as pd
import numpy as np
import cv2
import keras
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, Callback, TensorBoard
from keras.layers import Input, Embedding, Bidirectional, LSTM, Conv1D, MaxPooling1D, BatchNormalization, Flatten, Dense, Dropout, Reshape, Concatenate, Masking
from keras.regularizers import l2
from keras.utils import Sequence, to_categorical
from keras.optimizers import Adam
from keras.backend import epsilon
from keras.models import model_from_json
from keras.models import Model, Sequential

import nltk
from nltk.tokenize import RegexpTokenizer

from sklearn.linear_model import  LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, classification_report,accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn import svm
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")

**Import dataset** 

In [3]:
trainFilePath = 'dataset/train2.tsv'
testFilePath = 'dataset/test2.tsv'
validationFilePath = 'dataset/val2.tsv'

PATH = "/content/drive/My Drive/"

In [4]:
df_train = pd.read_csv(PATH+trainFilePath, delimiter='\t',  names=["ID", "Label", "Statement", "Subject", "Speaker", "Job Title", "State", "Party",
                         "Barely True Cnt", "False Cnt", "Half True Cnt", "Mostly True Cnt", "Pants on Fire Cnt", "Context", "Justification"])

df_test = pd.read_csv(PATH+testFilePath, delimiter='\t',  names=["ID", "Label", "Statement", "Subject", "Speaker", "Job Title", "State", "Party",
                         "Barely True Cnt", "False Cnt", "Half True Cnt", "Mostly True Cnt", "Pants on Fire Cnt", "Context", "Justification"])


df_validation = pd.read_csv(PATH+validationFilePath, delimiter='\t', names=["ID", "Label", "Statement", "Subject", "Speaker", "Job Title", "State", "Party",
                         "Barely True Cnt", "False Cnt", "Half True Cnt", "Mostly True Cnt", "Pants on Fire Cnt", "Context", "Justification"])

In [5]:
df_train.head()

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Job Title,State,Party,Barely True Cnt,False Cnt,Half True Cnt,Mostly True Cnt,Pants on Fire Cnt,Context,Justification
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe..."
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,Obama said he would have voted against the ame...
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,The release may have a point that Mikulskis co...
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround start..."


**Data Preprocessing**

In [6]:
def dataCleaning(df,field):
    df[field] = df[field].str.replace(r"@\S+", "")
    df[field] = df[field].str.replace(r"[^A-Za-z0-9]", " ")
    df[field] = df[field].str.replace(r"(),!?@\'\`\"\_\n", " ")
    df[field] = df[field].str.replace(r"@", "at")
    df[field] = df[field].str.replace(r"http\S+", "")
    df[field] = df[field].str.replace(r"http", "")
    df[field] = df[field].str.lower()
    return df

def dataPreprocessing(df):
    df = df[df['ID'].notna()]
    df = df[df['Barely True Cnt'].notna()]
    df = df[df['False Cnt'].notna()]
    df = df[df['Mostly True Cnt'].notna()]
    df = df[df['Pants on Fire Cnt'].notna()]
    df = df[df['Half True Cnt'].notna()]

    df['ID'] = df['ID'].str.split(".", n = 1, expand = True) 
    
    df = dataCleaning(df,'Statement')
    df = dataCleaning(df,'Subject')
    df = dataCleaning(df,'Speaker')
    df = dataCleaning(df,'Job Title')
    df = dataCleaning(df,'State')
    df = dataCleaning(df,'Party')
    df = dataCleaning(df,'Context')
    df = dataCleaning(df,'Justification')    
    
    le_multi = LabelEncoder()
    df.loc[:, 'Multi Class Label'] = le_multi.fit_transform(df.Label)
    print("Label assignments: " + str({l: i for i, l in enumerate(le_multi.classes_)}))   
    
    df['Binary Label'] = df.Label.apply(lambda x: 1 if x in ['false','pants-fire','barely-true']  else 0)

    df = df.fillna('None')

    return df

In [7]:
df_train = dataPreprocessing(df_train)
df_validation = dataPreprocessing(df_validation)
df_test = dataPreprocessing(df_test)

Label assignments: {'barely-true': 0, 'false': 1, 'half-true': 2, 'mostly-true': 3, 'pants-fire': 4, 'true': 5}
Label assignments: {'barely-true': 0, 'false': 1, 'half-true': 2, 'mostly-true': 3, 'pants-fire': 4, 'true': 5}
Label assignments: {'barely-true': 0, 'false': 1, 'half-true': 2, 'mostly-true': 3, 'pants-fire': 4, 'true': 5}


**Embed data fields**

In [8]:
def embed_metadata(df):
  meta_data = df.values[:,8:13]
  for i in range(meta_data.shape[0]):
    for j in range(meta_data.shape[1]):
      if meta_data[i][j] == 'None':
        meta_data[i][j] = 0
  
  meta_data = np.asarray(meta_data).astype(np.float32)

  return meta_data

In [9]:
t = Tokenizer()
  
def get_embeddings(data, max_length):
  t.fit_on_texts(data)
  encoded_docs = t.texts_to_sequences(data)
  padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
  return padded_docs

In [10]:
def embed_fields(df):
  return [np.asarray(get_embeddings(df['Statement'], 50)).astype(np.int32),np.asarray(get_embeddings(df['Subject'], 10)).astype(np.int32),np.asarray(get_embeddings(df['Context'], 25)).astype(np.int32),
          np.asarray(get_embeddings(df['Speaker'], 5)).astype(np.int32),np.asarray(get_embeddings(df['Party'], 5)).astype(np.int32),np.asarray(get_embeddings(df['Job Title'], 20)).astype(np.int32),
          np.asarray(get_embeddings(df['State'], 5)).astype(np.int32),np.asarray(get_embeddings(df['Justification'], 150)).astype(np.int32),embed_metadata(df)]

In [11]:
train_stmt_x = get_embeddings(df_train['Statement'], 50)
train_sub_x = get_embeddings(df_train['Subject'], 10)
train_cxt_x = get_embeddings(df_train['Context'], 25)
train_spkr_x = get_embeddings(df_train['Speaker'], 5)
train_prty_x = get_embeddings(df_train['Party'], 5)
train_job_x= get_embeddings(df_train['Job Title'], 20)
train_state_x= get_embeddings(df_train['State'], 5)
train_metadata = embed_metadata(df_train)

val_stmt_x = get_embeddings(df_validation['Statement'], 50)
val_sub_x= get_embeddings(df_validation['Subject'], 10)
val_cxt_x = get_embeddings(df_validation['Context'], 25)
val_spkr_x = get_embeddings(df_validation['Speaker'], 5)
val_prty_x = get_embeddings(df_validation['Party'], 5)
val_job_x = get_embeddings(df_validation['Job Title'], 20)
val_state_x = get_embeddings(df_validation['State'], 5)
val_metadata = embed_metadata(df_validation)

test_stmt_x = get_embeddings(df_test['Statement'], 50)
test_sub_x= get_embeddings(df_test['Subject'], 10)
test_cxt_x= get_embeddings(df_test['Context'], 25)
test_spkr_x = get_embeddings(df_test['Speaker'], 5)
test_prty_x = get_embeddings(df_test['Party'], 5)
test_job_x = get_embeddings(df_test['Job Title'], 20)
test_state_x= get_embeddings(df_test['State'], 5)
test_metadata = embed_metadata(df_test)

In [12]:
vocab_size = len(t.word_index) + 1
print("VOCABULARY SIZE: "+str(vocab_size))

VOCABULARY SIZE: 16338


In [13]:
embeddings_index = dict()
f = open(PATH + 'dataset/glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()

In [14]:
def create_embedding_matrix():
  embedding_matrix = np.zeros((vocab_size, 100))
  for word, i in t.word_index.items():
	  embedding_vector = embeddings_index.get(word)
	  if embedding_vector is not None:
		  embedding_matrix[i] = embedding_vector
  return embedding_matrix

In [15]:
embedding_matrix = create_embedding_matrix()

In [16]:
def get_features(input_length):
  input_tensor = Input((input_length,))
  X = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=input_length, trainable=False)(input_tensor)
  X = Bidirectional(LSTM(32, return_sequences=True))(X)
  X = Bidirectional(LSTM(16, return_sequences=True))(X)
  X = Flatten()(X)
  X = Dense(1024, activation='relu', kernel_regularizer=l2(0.0))(X)
  X = Dropout(0.3)(X)
  X = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(X)
  X = Reshape((8,16))(X)
  X = Conv1D(128,3, padding='same', activation='relu', kernel_regularizer=l2(0.0))(X)
  X = MaxPooling1D(2)(X)
  X = BatchNormalization()(X)
  X = Flatten()(X)
  output_tensor = Dense(128, activation='relu', kernel_regularizer=l2(0.0))(X)
  output_tensor = Dropout(0.3)(output_tensor)
  return input_tensor, output_tensor

**Binary Classification Model**

In [17]:
stmt_input, stmt_ftrs = get_features(train_stmt_x.shape[1])
sub_input, sub_ftrs = get_features(train_sub_x.shape[1])
cxt_input, cxt_ftrs = get_features(train_cxt_x.shape[1])
spkr_input, spkr_ftrs = get_features(train_spkr_x.shape[1])
prty_input, prty_ftrs = get_features(train_prty_x.shape[1])
job_input, job_ftrs = get_features(train_job_x.shape[1])
state_input, state_ftrs = get_features(train_state_x.shape[1])

input_tensor_3 = Input((5,))
metadata = Dense(256, activation='relu')(input_tensor_3)

out_1 = Concatenate()([stmt_ftrs, sub_ftrs, cxt_ftrs, spkr_ftrs, prty_ftrs, job_ftrs, state_ftrs])
out = Dense(1028, activation='relu', kernel_regularizer=l2(0.0))(out_1)
out = Concatenate()([out, metadata])
out = Dropout(0.3)(out)
out = Dense(256, activation='relu', kernel_regularizer=l2(0.0))(out)
out = Dropout(0.3)(out)
out = Dense(2, activation='sigmoid')(out)
model_binary = Model(inputs = [stmt_input, sub_input, cxt_input, spkr_input, prty_input, job_input, state_input, input_tensor_3], outputs = out)
model_binary.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 25)]         0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 5)]          0                                            
______________________________________________________________________________________________

In [18]:
train_y = to_categorical(df_train['Binary Label'], 2)
val_y = to_categorical(df_validation['Binary Label'], 2)
test_y = to_categorical(df_test['Binary Label'], 2)

In [19]:
stop = EarlyStopping(monitor="val_acc", patience=20, mode="max")
reduce_lr = ReduceLROnPlateau(monitor="val_acc", factor=0.1, patience=10, min_lr=1e-6, verbose=1, mode="max")
optimizer = Adam(lr = 0.001)
model_binary.compile(optimizer, loss='binary_crossentropy', metrics=['acc'])

In [20]:
model_binary.fit([train_stmt_x, train_sub_x, train_cxt_x, train_spkr_x, train_prty_x, train_job_x, train_state_x, train_metadata], train_y,
          epochs=10, verbose=1, validation_data=([val_stmt_x, val_sub_x, val_cxt_x, val_spkr_x, val_prty_x, val_job_x, val_state_x,
                                                      val_metadata], val_y), callbacks=[reduce_lr, stop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f86e8c0b110>

In [21]:
val_loss, val_acc = model_binary.evaluate([val_stmt_x, val_sub_x, val_cxt_x, val_spkr_x, val_prty_x, val_job_x, val_state_x, val_metadata], val_y,
                                   verbose=1)
print('Validation Accuracy: %f' % (val_acc))
test_loss, test_acc = model_binary.evaluate([test_stmt_x, test_sub_x, test_cxt_x, test_spkr_x, test_prty_x, test_job_x, test_state_x,  test_metadata],
                                     test_y, verbose=1)
print('Test Accuracy: %f' % (test_acc))

Validation Accuracy: 0.715732
Test Accuracy: 0.734807


**Multi class classification model**

In [22]:
train_y = to_categorical(df_train['Multi Class Label'], 6)
val_y = to_categorical(df_validation['Multi Class Label'], 6)
test_y = to_categorical(df_test['Multi Class Label'], 6)

In [23]:
stmt_input, stmt_ftrs = get_features(train_stmt_x.shape[1])
sub_input, sub_ftrs = get_features(train_sub_x.shape[1])
cxt_input, cxt_ftrs = get_features(train_cxt_x.shape[1])
spkr_input, spkr_ftrs = get_features(train_spkr_x.shape[1])
prty_input, prty_ftrs = get_features(train_prty_x.shape[1])
job_input, job_ftrs = get_features(train_job_x.shape[1])
state_input, state_ftrs = get_features(train_state_x.shape[1])

input_tensor_3 = Input((5,))
metadata = Dense(256, activation='relu')(input_tensor_3)

out_1 = Concatenate()([stmt_ftrs, sub_ftrs, cxt_ftrs, spkr_ftrs, prty_ftrs, job_ftrs, state_ftrs])
out = Dense(1028, activation='relu', kernel_regularizer=l2(0.0))(out_1)
out = Concatenate()([out, metadata])
out = Dropout(0.3)(out)
out = Dense(256, activation='relu', kernel_regularizer=l2(0.0))(out)
out = Dropout(0.3)(out)
out = Dense(6, activation='softmax')(out)
model_multi = Model(inputs = [stmt_input, sub_input, cxt_input, spkr_input, prty_input, job_input, state_input, input_tensor_3], outputs = out)
model_multi.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, 10)]         0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           [(None, 25)]         0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           [(None, 5)]          0                                            
____________________________________________________________________________________________

In [24]:
stop = EarlyStopping(monitor="val_acc", patience=20, mode="max")
reduce_lr = ReduceLROnPlateau(monitor="val_acc", factor=0.1, patience=10, min_lr=1e-6, verbose=1, mode="max")
optimizer = Adam(lr = 0.001)
model_multi.compile(optimizer, loss='binary_crossentropy', metrics=['acc'])

In [25]:
model_multi.fit([train_stmt_x, train_sub_x, train_cxt_x, train_spkr_x, train_prty_x, train_job_x, train_state_x, train_metadata], train_y,
          epochs=10, verbose=1, validation_data=([val_stmt_x, val_sub_x, val_cxt_x, val_spkr_x, val_prty_x, val_job_x, val_state_x,
                                                      val_metadata], val_y), callbacks=[reduce_lr, stop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f8694f8d150>

In [26]:
val_loss, val_acc = model_multi.evaluate([val_stmt_x, val_sub_x, val_cxt_x, val_spkr_x, val_prty_x, val_job_x, val_state_x, val_metadata], val_y,
                                   verbose=1)
print('Validation Accuracy: %f' % (val_acc))
test_loss, test_acc = model_multi.evaluate([test_stmt_x, test_sub_x, test_cxt_x, test_spkr_x, test_prty_x, test_job_x, test_state_x, test_metadata],
                                     test_y, verbose=1)
print('Test Accuracy: %f' % (test_acc))

Validation Accuracy: 0.442368
Test Accuracy: 0.442778
