# Importing Libraries

In [None]:
import os
import re
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout, SimpleRNN
import keras_tuner

## Loading the dataset

In [None]:
import pickle 

# Load the pre-split dataset from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# unpickle files that come from a trusted source.
with open('news_final_dataset.pickle', 'rb') as file:
    news = pickle.load(file)

In [None]:
# Unpack the four splits stored in the pickled dict.
X_train, X_test, y_train, y_test = news['X_train'], news['X_test'], news['y_train'], news['y_test']

In [None]:
# filter for needed columns
X_train = X_train[['title_vect','all_text_vect']]
X_test = X_test[['title_vect','all_text_vect']]

In [None]:
# further split train set into smaller train set + validation set
# (random_state is fixed so the split is reproducible between runs)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=1, test_size=0.25) # 0.25 x 0.8 = 0.2 -> 20% validation, assuming the pickled train split is 80% of the data (TODO confirm)

In [None]:
# Preview the first rows of the training features.
X_train.head()

# Hypothesis 2: Testing Neural Network models

## Define Tokenizing, Padding, and Plotting Functions & Early Stopping

In [None]:
def tokenize(train_data, val_data, test_data):
    """Index-encode the three splits with a Tokenizer fitted on the training split.

    The vocabulary is learned from ``train_data`` only; the same fitted
    tokenizer then converts the training, validation and test texts into
    sequences of integer word indexes.

    Returns:
        A 4-tuple ``(train_sequences, val_sequences, test_sequences, vocab)``
        where ``vocab`` is the tokenizer's ``word_index`` mapping.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(train_data)

    # Encode the splits in train -> val -> test order with the one tokenizer.
    train_seqs, val_seqs, test_seqs = (
        fitted.texts_to_sequences(split)
        for split in (train_data, val_data, test_data)
    )
    return train_seqs, val_seqs, test_seqs, fitted.word_index

def pad(train_data, val_data, test_data, max_len):
    """Bring every sequence in the three splits to exactly ``max_len`` entries.

    Shorter sequences are padded at the end (``padding='post'``); the side on
    which longer sequences are truncated is left at the ``pad_sequences``
    default.

    Returns:
        A 3-tuple ``(train, val, test)`` of the padded results, in that order.
    """
    train_padded, val_padded, test_padded = (
        pad_sequences(split, padding='post', maxlen=max_len)
        for split in (train_data, val_data, test_data)
    )
    return train_padded, val_padded, test_padded

def plotHistory(history):
    """Show training-vs-validation curves for loss, then for accuracy.

    ``history`` is the object returned by ``model.fit``; its ``history`` dict
    must contain 'accuracy', 'val_accuracy', 'loss' and 'val_loss', and its
    ``epoch`` list supplies the x axis. Two separate figures are displayed.
    """
    record = history.history

    # Read every series up front so a missing metric fails before any plotting.
    curves = {
        key: record[key]
        for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
    }
    x_axis = history.epoch

    # (train key, val key, train colour, train label, val label, title, y label)
    panels = (
        ('loss', 'val_loss', 'r', 'Training loss', 'Validation loss',
         'Training and validation loss', 'Loss'),
        ('accuracy', 'val_accuracy', 'g', 'Training acc', 'Validation acc',
         'Training and validation accuracy', 'Accuracy'),
    )
    for train_key, val_key, colour, train_label, val_label, title, y_label in panels:
        plt.figure()
        plt.plot(x_axis, curves[train_key], colour, label=train_label)
        plt.plot(x_axis, curves[val_key], 'b', label=val_label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
        plt.show()

# Callback passed to the fit() calls below: stop training once val_loss has not
# improved for 2 consecutive epochs and roll the model back to its best weights.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

## 1. Training on titles only

In [None]:
# The feature frames were reduced to the `title_vect` / `all_text_vect` columns
# earlier in the notebook, so the title data has to be read from `title_vect`
# (indexing `title` raises KeyError).
# NOTE(review): presumably `title_vect` holds one token list per title - confirm.
X_train1, X_val1, X_test1, vocab1 = tokenize(X_train['title_vect'], X_val['title_vect'], X_test['title_vect'])
max_vocab1 = len(vocab1) + 1 # Adding 1 because of reserved 0 index
# The Tokenizer never assigns index 0 to a word; 0 is the value pad_sequences
# fills with, so the Embedding layer needs one extra row for it.
print("Vocab Size for Titles: {}".format(max_vocab1))

# get length of the longest title across all three splits (validation included,
# so that no title is truncated) and pad all shorter titles to match that length
max_len1 = max(len(x) for split in (X_train1, X_val1, X_test1) for x in split)
X_train1, X_val1, X_test1 = pad(X_train1, X_val1, X_test1, max_len1)
print("Max Title Length: {}".format(max_len1))

### Model 1: Training a simple RNN model on titles only

In [None]:
# Model 1: bidirectional SimpleRNN binary classifier over the padded title sequences.
rnn_title_model = Sequential([
    # 100-dimensional embeddings, no pre-trained weights supplied
    Embedding(max_vocab1, 100, input_length=max_len1),
    Bidirectional(SimpleRNN(128)),
    Dropout(0.5),
    # single sigmoid unit -> probability for the binary label
    Dense(1, activation='sigmoid')
])

rnn_title_model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

rnn_title_model.summary()

In [None]:
%%time
# Train for up to 10 epochs; early_stop may end training sooner based on val_loss.
rnn_title_history = rnn_title_model.fit(X_train1, y_train, epochs=10, 
                    validation_data=(X_val1, y_val), batch_size=30, 
                    shuffle=True, callbacks=[early_stop])

In [None]:
# Loss / accuracy curves for training vs. validation.
plotHistory(rnn_title_history)

In [None]:
# Loss and accuracy on the training split.
rnn_title_model.evaluate(X_train1, y_train)

In [None]:
# Loss and accuracy on the validation split.
rnn_title_model.evaluate(X_val1, y_val)

In [None]:
# Loss and accuracy on the held-out test split.
rnn_title_model.evaluate(X_test1, y_test)

### Model 2: Training an LSTM model on titles only

In [None]:
# Model 2: same architecture as Model 1, with the SimpleRNN swapped for an LSTM.
lstm_title_model = Sequential([
    # 100-dimensional embeddings, no pre-trained weights supplied
    Embedding(max_vocab1, 100, input_length=max_len1),
    Bidirectional(LSTM(128)),
    Dropout(0.5),
    # single sigmoid unit -> probability for the binary label
    Dense(1, activation='sigmoid')
])

lstm_title_model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

lstm_title_model.summary()

In [None]:
%%time
# Train for up to 10 epochs; early_stop may end training sooner based on val_loss.
lstm_title_history = lstm_title_model.fit(X_train1, y_train, epochs=10, 
                    validation_data=(X_val1, y_val), batch_size=30, 
                    shuffle=True, callbacks=[early_stop])

In [None]:
# Loss / accuracy curves for training vs. validation.
plotHistory(lstm_title_history)

In [None]:
# Loss and accuracy on the training split.
lstm_title_model.evaluate(X_train1, y_train)

In [None]:
# Loss and accuracy on the validation split.
lstm_title_model.evaluate(X_val1, y_val)

In [None]:
# Loss and accuracy on the held-out test split.
lstm_title_model.evaluate(X_test1, y_test)

## 2. Training on all text (title + text)

### Training the word2vec model on the dataset

In [None]:
X_train2 = X_train['all_text']
X_val2 = X_val['all_text']
X_test2 = X_test['all_text']

In [None]:
 # all tokens generated in the first article
print(X_train2[0])

In [None]:
# training word2vec on the training split of the title + text corpus (X_train2)
# to obtain the vectors used for the embedding layer
# NOTE(review): presumably each element of X_train2 is a list of tokens - confirm.
from gensim.models import Word2Vec

EMBEDDING_DIM = 100
w2v = Word2Vec(
    sentences = X_train2,
    vector_size = EMBEDDING_DIM,  # dimensionality of each word vector
    window = 5,
    min_count = 1  # keep every token, even those that occur only once
)

In [None]:
# Size of the vocabulary learned by word2vec.
num = len(w2v.wv)
print('There are a total of %d words in the vocabulary of our trained word2vec model.' % num)

In [None]:
# Inspect the learned vector for a single word.
w2v.wv["donald"]

In [None]:
# Sanity check: nearest neighbours of a word in the embedding space.
w2v.wv.most_similar("trump")

### Preparing the neural network model inputs

In [None]:
# Replace the token lists with integer index sequences (vocabulary fitted on the training split).
X_train2, X_val2, X_test2, vocab2 = tokenize(X_train2, X_val2, X_test2)
max_vocab2 = len(vocab2) + 1  # +1 for the reserved index 0
print("Vocab Size for All Text: {}".format(max_vocab2))

In [None]:
# all token indexes generated from the first article
print(X_train2[0])

In [None]:
# Distribution of article lengths (number of token indexes per training article).
article_lens = np.array([len(index) for index in X_train2])
plt.hist(article_lens, bins=500)
plt.show()

In [None]:
# Count how many training articles are shorter than 1000 tokens.
len_1000 = article_lens[article_lens < 1000]
print('%d out of %d articles have less than 1000 words' % (len(len_1000),len(article_lens)))

<ul>
<li>Since the inputs to the neural network have to be of the same size, we have to pad each article in the dataset.</li>
<li>Since the majority of articles have fewer than 1000 words, we pad shorter news articles and truncate longer ones to a fixed length of 1000 tokens.</li>
</ul>

In [None]:
# Fixed sequence length for the all-text models: shorter articles are padded,
# longer ones are cut down to this length by pad().
max_len2 = 1000
X_train2, X_val2, X_test2 = pad(X_train2, X_val2, X_test2, max_len2)
print("Max All Text Length: {}".format(max_len2))

In [None]:
# creating a weight matrix to retain weights learned by word2vec in the embedding layer later on
def get_weight_matrix(model, vocab):
    """Build an embedding weight matrix from a trained word2vec model.

    Args:
        model: Trained word2vec model; its ``wv`` attribute must support
            ``word in wv``, ``wv[word]`` and expose ``vector_size``.
        vocab: Mapping of word -> row index (the tokenizer's ``word_index``).
            Indexes are expected to lie in ``1..len(vocab)``.

    Returns:
        A ``(len(vocab) + 1, wv.vector_size)`` float array in which row ``i``
        holds the vector of the word mapped to index ``i``. Row 0 (the padding
        index) and the rows of words unknown to the word2vec model stay zero.
    """
    keyed_vectors = model.wv
    # one extra row because index 0 is reserved for padding
    vocab_size = len(vocab) + 1
    # take the width from the model itself so the matrix always matches its vectors
    weight_matrix = np.zeros((vocab_size, keyed_vectors.vector_size))
    for word, row_index in vocab.items():
        # the tokenizer may know words the word2vec model does not (for example
        # after case normalisation); leave those rows at zero instead of failing
        if word in keyed_vectors:
            weight_matrix[row_index] = keyed_vectors[word]
    return weight_matrix

In [None]:
# Embedding weights for the all-text models: one row per tokenizer index, filled from word2vec.
embedding_vectors = get_weight_matrix(w2v, vocab2)

### Model 3: Training a simple RNN model on all text

In [None]:
# Model 3: bidirectional SimpleRNN binary classifier over the padded title + text sequences.
rnn_alltext_model = Sequential([
    # embedding initialised from the word2vec weight matrix and frozen (trainable=False)
    Embedding(max_vocab2, output_dim=EMBEDDING_DIM, weights=[embedding_vectors], input_length=max_len2, trainable=False),
    Bidirectional(SimpleRNN(units=128)),
    Dropout(0.5),
    # single sigmoid unit -> probability for the binary label
    Dense(1, activation='sigmoid')
])

rnn_alltext_model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

rnn_alltext_model.summary()

In [None]:
%%time
# Train for up to 10 epochs; early_stop may end training sooner based on val_loss.
rnn_alltext_history = rnn_alltext_model.fit(X_train2, y_train, epochs=10, 
                                validation_data=(X_val2, y_val), batch_size=30, 
                                shuffle=True, callbacks=[early_stop])

In [None]:
# Loss / accuracy curves for training vs. validation.
plotHistory(rnn_alltext_history)

In [None]:
# Loss and accuracy on the training split.
rnn_alltext_model.evaluate(X_train2, y_train)

In [None]:
# Loss and accuracy on the validation split.
rnn_alltext_model.evaluate(X_val2, y_val)

In [None]:
# Loss and accuracy on the held-out test split.
rnn_alltext_model.evaluate(X_test2, y_test)

### Model 4: Training an LSTM model on all text

In [None]:
# Model 4: same architecture as Model 3, with the SimpleRNN swapped for an LSTM.
lstm_alltext_model = Sequential([
    # embedding initialised from the word2vec weight matrix and frozen (trainable=False)
    Embedding(max_vocab2, output_dim=EMBEDDING_DIM, weights=[embedding_vectors], input_length=max_len2, trainable=False),
    Bidirectional(LSTM(units=128)),
    Dropout(0.5),
    # single sigmoid unit -> probability for the binary label
    Dense(1, activation='sigmoid')
])

lstm_alltext_model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

lstm_alltext_model.summary()

In [None]:
%%time
# Train for up to 10 epochs; early_stop may end training sooner based on val_loss.
lstm_alltext_history = lstm_alltext_model.fit(X_train2, y_train, epochs=10, 
                                validation_data=(X_val2, y_val), batch_size=30, 
                                shuffle=True, callbacks=[early_stop])

In [None]:
# Loss / accuracy curves for training vs. validation.
plotHistory(lstm_alltext_history)

In [None]:
# Loss and accuracy on the training split.
lstm_alltext_model.evaluate(X_train2, y_train)

In [None]:
# Loss and accuracy on the validation split.
lstm_alltext_model.evaluate(X_val2, y_val)

In [None]:
# Loss and accuracy on the held-out test split.
lstm_alltext_model.evaluate(X_test2, y_test)

### Hyperparameter tuning for LSTM model for all text

In [None]:
def hyperparameterTuning(hp):
    """Model-building function for keras_tuner: all-text bidirectional LSTM.

    Search space (https://keras.io/api/keras_tuner/hyperparameters/):
      * 'units' - LSTM units, integer from 64 to 192 in steps of 64
      * 'rate'  - dropout rate, float from 0.3 to 0.7 in steps of 0.2

    The embedding layer is initialised from ``embedding_vectors`` and frozen.

    Returns:
        The compiled model (binary cross-entropy loss, Adam at 1e-4, accuracy metric).
    """
    # Layers are created in the same order as they are stacked.
    embedding = Embedding(max_vocab2, output_dim=EMBEDDING_DIM, weights=[embedding_vectors], input_length=max_len2, trainable=False)

    # https://keras.io/api/layers/recurrent_layers/lstm/
    lstm_units = hp.Int('units', min_value=64, max_value=192, step=64)
    recurrent = Bidirectional(LSTM(units=lstm_units))

    # https://keras.io/api/layers/regularization_layers/dropout/
    dropout_rate = hp.Float('rate', min_value=0.3, max_value=0.7, step=0.2)
    dropout = Dropout(rate=dropout_rate)

    classifier = Dense(1, activation='sigmoid')

    model = Sequential([embedding, recurrent, dropout, classifier])
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(1e-4),
        metrics=['accuracy'],
    )
    return model

In [None]:
#https://medium.com/swlh/hyperparameter-tuning-in-keras-tensorflow-2-with-keras-tuner-randomsearch-hyperband-3e212647778f
# Bayesian-optimisation search over the models built by hyperparameterTuning,
# minimising validation loss; trial results are written under 'lstm_tuner2'.
# No max_trials is given, so the keras_tuner default applies.
tuner = keras_tuner.BayesianOptimization(hyperparameterTuning,
                              objective='val_loss',
                             directory='lstm_tuner2')

In [None]:
# Same early-stopping settings as for the earlier models, re-created here so
# this cell can run on its own.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
# Each trial trains with the same fit() arguments as the hand-built all-text models.
tuner.search(X_train2, y_train, epochs=10, 
            validation_data=(X_val2, y_val), batch_size=30, 
            shuffle=True, callbacks=[early_stop])

In [None]:
# Overview of the trials run by the tuner.
tuner.results_summary()

In [None]:
# Take the single best model found by the search.
best_lstm_model = tuner.get_best_models(num_models=1)[0]

In [None]:
best_lstm_model.summary()

In [None]:
# Loss and accuracy on the training split.
best_lstm_model.evaluate(X_train2, y_train)

In [None]:
# Loss and accuracy on the validation split.
best_lstm_model.evaluate(X_val2, y_val)

In [None]:
# Loss and accuracy on the held-out test split.
best_lstm_model.evaluate(X_test2, y_test)