In [None]:
import gensim
import numpy as np
import pandas as pd
import os
import re
import csv
import codecs
import sys
import pickle
import matplotlib as mpl
%matplotlib inline
from matplotlib import pyplot as plt
from keras.utils import plot_model 
from IPython.display import Image
import pydot
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from keras import backend as K
from keras.engine.topology import Layer
#from keras import initializations
from keras import initializers, regularizers, constraints
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import callbacks
from keras import optimizers
from string import punctuation
from keras.models import Sequential, Model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, BatchNormalization, Activation, Bidirectional

In [None]:
# Fetch the project repository into the current working directory.
# NOTE(review): the config cell reads from './data' — confirm the CSVs and the
# GloVe file actually end up there after cloning. Re-running this cell once
# the checkout exists will presumably just print a git error.
!git clone https://github.com/Jayagn/Fake-news-challenge.git

In [None]:
# --- Notebook configuration ---------------------------------------------------
# Locations on disk
DATA_DIR = './data'
GloVe_DIR = './data/glove.twitter.27B.50d.txt'  # path of the GloVe vectors file (not a directory)

# Text preprocessing
MAX_VOCAB_SIZE = 28000  # vocabulary cap passed to the Tokenizer
MAX_SENT_LEN = 150      # tokens kept per example; to be tested on 150, 300 and 700

# Model / training hyper-parameters
EMBEDDING_DIM = 50
LSTM_DIM = 50
BATCH_SIZE = 512
N_EPOCHS = 20

In [None]:
# Fix the random seeds so runs are repeatable.
# NumPy drives the random initialisation of the embedding matrix; TensorFlow
# drives weight initialisation and dropout, so it has to be seeded as well.
seed = 1
np.random.seed(seed)

# TF2 exposes tf.random.set_seed; TF1 only has tf.set_random_seed.
if hasattr(tf, 'random') and hasattr(tf.random, 'set_seed'):
    tf.random.set_seed(seed)
else:
    tf.set_random_seed(seed)

In [None]:
# Load the FNC tables that live under DATA_DIR.
# (The validation_data.csv load is left disabled, as before.)
train_df, test_df, bodies = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in ('/train.csv', '/test.csv', '/body_id.csv')
)

In [None]:
# Encode the textual stance labels as integers.
# The mapping is applied to the 'Stance' column only, so a headline whose full
# text happens to equal a label is no longer rewritten, and no positional
# `inplace` flag is passed (keyword-only in recent pandas). Re-running the cell
# is harmless because already-encoded values are left untouched.
# NOTE(review): any label not listed here stays as-is — confirm the CSV does
# not leave string labels mixed with these integers before LabelEncoder runs.
stance_codes = {'agree': 2, 'disagree': 3, 'discuss': 4}
train_df['Stance'] = train_df['Stance'].replace(stance_codes)

# Attach the article body to every headline row through the shared 'Body ID' key.
bodies_by_id = bodies.set_index('Body ID')
combine_df_train = train_df.join(bodies_by_id, on='Body ID')
combine_df_test = test_df.join(bodies_by_id, on='Body ID')

In [None]:
# Pre-processing: split every headline and article body into a list of word tokens
word_seq_head_train = list(map(text_to_word_sequence, combine_df_train['Headline']))
word_seq_bodies_train = list(map(text_to_word_sequence, combine_df_train['articleBody']))
word_seq_head_test = list(map(text_to_word_sequence, combine_df_test['Headline']))
word_seq_bodies_test = list(map(text_to_word_sequence, combine_df_test['articleBody']))

In [None]:
# One flat list holding every token sequence (train + test, headlines + bodies);
# it is the corpus the tokenizer vocabulary is fitted on.
word_seq = (
    word_seq_head_train
    + word_seq_bodies_train
    + word_seq_head_test
    + word_seq_bodies_test
)

In [None]:
# Build the word -> index vocabulary from all token sequences
filter_list = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

tokenizer = Tokenizer(filters=filter_list, num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(list(word_seq))

In [None]:
# Concatenate each headline with its article body into a single token sequence.
# Head and body lists are built from the same DataFrame, so they pair up 1:1.
word_seq_train = [
    list(head) + list(body)
    for head, body in zip(word_seq_head_train, word_seq_bodies_train)
]

word_seq_test = [
    list(head) + list(body)
    for head, body in zip(word_seq_head_test, word_seq_bodies_test)
]

In [None]:
# Training inputs: keep the first MAX_SENT_LEN tokens of every example, map them
# to vocabulary indices and pad/truncate (at the end) to a fixed length.
# NOTE(review): the token lists came from text_to_word_sequence with its default
# filters, while this join + re-tokenise pass uses filter_list (which also lists
# the apostrophe) — confirm the lookups line up with the fitted vocabulary.
train_texts = [' '.join(tokens[:MAX_SENT_LEN]) for tokens in word_seq_train]
X_train = pad_sequences(
    tokenizer.texts_to_sequences(train_texts),
    maxlen=MAX_SENT_LEN,
    padding='post',
    truncating='post',
)

# Targets: the stance column of the joined training frame
y_train = combine_df_train['Stance']

In [None]:
# Test inputs: convert the sequences of words to sequences of indices, then pad
# them exactly like the training data.
test_texts = [' '.join(tokens[:MAX_SENT_LEN]) for tokens in word_seq_test]
X_test = pad_sequences(
    tokenizer.texts_to_sequences(test_texts),
    maxlen=MAX_SENT_LEN,
    padding='post',
    truncating='post',
)

In [None]:
# One-hot encode the stance labels: labels -> consecutive class ids -> indicator rows
encoder_train = LabelEncoder()
encoded_train = encoder_train.fit_transform(y_train)
dummy_y_train = np_utils.to_categorical(encoded_train)

In [None]:
# Hold out 10% of the training examples for validation.
# This rebinds X_train / y_train, so re-run the padding cell before re-running
# this one.
X_train, X_vali, y_train, y_vali = train_test_split(
    X_train,
    dummy_y_train,
    test_size=0.1,
    random_state=10,
)

In [None]:
# GloVe embeddings: rewrite the GloVe text file in word2vec format, then load it
glove_input_file = GloVe_DIR
word2vec_output_file = 'glove.50d.txt.word2vec'
glove2word2vec(
    glove_input_file=glove_input_file,
    word2vec_output_file=word2vec_output_file,
)
embeddings = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [None]:
# Embedding matrix restricted to the words of our vocabulary.
# Every row starts as small uniform noise; rows of words that have a GloVe
# vector are overwritten with it, the others keep their random initialisation.
vocab_rows = len(tokenizer.word_index) + 1
embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(vocab_rows, EMBEDDING_DIM))

for token, row in tokenizer.word_index.items():
    try:
        embeddings_matrix[row] = embeddings[token]
    except KeyError:
        # no pretrained vector for this word
        continue

# The full GloVe table is no longer needed once the matrix is filled
del embeddings

In [None]:
# Only LSTM
# Baseline network: frozen pretrained embeddings -> LSTM -> ReLU -> 4-way softmax
vocab_size = len(tokenizer.word_index) + 1  # one row per word index, plus index 0 (masked)

model_2 = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=EMBEDDING_DIM,
        weights=[embeddings_matrix],
        trainable=False,  # trainable=True results in overfitting
        mask_zero=True,
        name='word_embedding_layer',
    ),
    LSTM(LSTM_DIM, return_sequences=False, name='lstm_layer'),
    Activation(activation='relu', name='activation_1'),
    Dense(4, activation='softmax', name='output_layer'),
])
model_2.summary()

In [None]:
# Render the baseline LSTM architecture diagram to lstm.png
plot_model(model=model_2, to_file='lstm.png')

In [None]:
# Adam + categorical cross-entropy for the one-hot stance targets
optimizer = optimizers.Adam(lr=0.001)  # Tried varying learning rate
model_2.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the baseline LSTM. The epoch count comes from the N_EPOCHS config
# constant instead of a repeated literal, so the config cell stays the single
# place to tune it.
history_2 = model_2.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=N_EPOCHS,
    validation_data=(X_vali, y_vali),
)

# NOTE(review): the '700' suffix is hard-coded and does not follow MAX_SENT_LEN —
# rename by hand when the truncation length changes.
model_2.save('lstm_700.h5')

In [None]:
# Accuracy curves of the baseline LSTM.
# The title is built from the actual truncation length and the number of
# epochs recorded in the history, so it cannot drift from the configuration;
# the second curve is the validation split, not a test set.
train_acc_2 = history_2.history['accuracy']
val_acc_2 = history_2.history['val_accuracy']

plt.plot(train_acc_2)
plt.plot(val_acc_2)
plt.title('Model accuracy with only LSTM(Truncation: {} Epoch: {})'.format(
    MAX_SENT_LEN, len(train_acc_2)))
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Persist the per-epoch metrics of the baseline run as CSV
history_2_df = pd.DataFrame(data=history_2.history)
history_2_df.to_csv("history_2_700.csv", index=False, header=True)

# LSTM with BatchNormalization and Dropouts

In [None]:
# LSTM model with BatchNormalization and Dropout
# Same frozen-embedding + LSTM front end as the baseline, followed by
# BN -> Dropout -> ReLU -> BN -> Dropout before the softmax output.
# Dropout is a regularisation layer used to reduce overfitting.
# The intermediate Dense(32) / Dense(8) layers and a second ReLU stay disabled.
vocab_size = len(tokenizer.word_index) + 1  # one row per word index, plus index 0 (masked)

model_3 = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=EMBEDDING_DIM,
        weights=[embeddings_matrix],
        trainable=False,  # trainable=True results in overfitting
        mask_zero=True,
        name='word_embedding_layer',
    ),
    LSTM(LSTM_DIM, return_sequences=False, name='lstm_layer'),  # Can try Bidirectional-LSTM
    BatchNormalization(name='bn_1'),  # BN did not really help with performance
    Dropout(rate=0.4, name='dropout_1'),  # Can try varying dropout rates, paper suggests 0.2
    Activation(activation='relu', name='activation_1'),
    BatchNormalization(name='bn_2'),
    Dropout(rate=0.2, name='dropout_2'),
    Dense(4, activation='softmax', name='output_layer'),
])
model_3.summary()

In [None]:
# Render the BN/Dropout LSTM architecture diagram to lstm_with_dropout.png
plot_model(model=model_3, to_file='lstm_with_dropout.png')

In [None]:
# Same optimiser and loss setup as the baseline, with a fresh Adam instance
optimizer = optimizers.Adam(lr=0.001)
model_3.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the BN/Dropout LSTM. The epoch count comes from the N_EPOCHS config
# constant instead of a repeated literal, so the config cell stays the single
# place to tune it.
history_3 = model_3.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=N_EPOCHS,
    validation_data=(X_vali, y_vali),
)

# NOTE(review): the '700' suffix is hard-coded and does not follow MAX_SENT_LEN —
# rename by hand when the truncation length changes.
model_3.save('lstm_with_dropout_700.h5')

In [None]:
# Save model_3 once more, under a file name without the length suffix
model_3.save(filepath='lstm_with_dropout.h5')

In [None]:
# Accuracy curves of the BN/Dropout LSTM.
# The title is built from the actual truncation length and the number of
# epochs recorded in the history, so it cannot drift from the configuration;
# the second curve is the validation split, not a test set.
train_acc_3 = history_3.history['accuracy']
val_acc_3 = history_3.history['val_accuracy']

plt.plot(train_acc_3)
plt.plot(val_acc_3)
plt.title('Model accuracy of only LSTM with Dropouts(Truncation: {} Epoch: {})'.format(
    MAX_SENT_LEN, len(train_acc_3)))
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Persist the per-epoch metrics of the BN/Dropout run as CSV
history_3_df = pd.DataFrame(data=history_3.history)
history_3_df.to_csv("history_3_700.csv", index=False, header=True)