# **Fake News Classifier (NLP)**

110740 Mwandware Dalton Zai 

110855 Wesley Joel Odhiambo

Dataset:
https://www.kaggle.com/c/fake-news

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
# import spacy

import re as re
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
from google.colab import drive
# Mount Google Drive (Colab only) so files under /content/drive can be read.
drive.mount('/content/drive')

# **Fetch Dataset**

In [None]:
# Load the fake-news training CSV from the mounted Drive folder.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
train_data = pd.read_csv('/content/drive/MyDrive/Datasets/fake_news.csv')

In [None]:
# (rows, columns) of the raw dataset.
train_data.shape

In [None]:
# Drop the 'id' column by reassignment instead of in place; errors='ignore'
# keeps the cell re-runnable once 'id' has already been removed.
train_data = train_data.drop(columns='id', errors='ignore')

In [None]:
# Display the frame after the 'id' column has been dropped.
train_data

In [None]:
# Number of columns left in the frame.
train_data.shape[1]

# **Check for Null Values in Dataset**

In [None]:
# Count of missing entries in each column.
train_data.isna().sum()


In [None]:
# Collect the names of columns that contain at least one missing value,
# then show how many such columns there are.
features_missing_values = [
    column for column in train_data.columns if train_data[column].isna().any()
]
len(features_missing_values)

# **Drop Missing Values**

In [None]:
# Keep only rows with no missing value in any column; train_data itself is not modified.
new_train_data = train_data.dropna(how='any')

In [None]:
# (rows, columns) remaining after rows with missing values were dropped.
new_train_data.shape

In [None]:
# Sanity check: per-column null counts after dropna.
new_train_data.isnull().sum()

In [None]:
# Cleaned frame as a NumPy array.
# NOTE(review): train_final is not referenced again in the visible notebook — confirm it is still needed.
train_final=new_train_data.to_numpy()

In [None]:
# Display the cleaned frame.
new_train_data

In [None]:
# Renumber rows 0..n-1 after dropna. drop=True discards the old index instead
# of inserting it as an extra 'index' column, and reassigning (rather than
# inplace=True) keeps the cell safe to run more than once.
new_train_data = new_train_data.reset_index(drop=True)
new_train_data.head(10)

In [None]:
# Porter stemmer instance and an empty corpus list.
# NOTE(review): stemmer is not used anywhere in the visible notebook, and corpus
# is reassigned in the tokenizing cell — confirm whether this cell is still needed.
stemmer = PorterStemmer()
corpus = []

In [None]:
# Number of article texts and the type of the container holding them.
len(new_train_data["text"].values), type(new_train_data['text'].values)

# **Tokenizing**

In [None]:
# Vocabulary cap shared by the tokenizer and by the Embedding layers' input size.
UNIQUE_WORDS = 1500

# Fit the tokenizer on every article body. num_words is taken from UNIQUE_WORDS
# so the tokenizer and the Embedding layers cannot drift apart.
corpus = new_train_data["text"].tolist()
tokenizer = Tokenizer(num_words=UNIQUE_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(corpus)
word_indices = tokenizer.word_index
# NOTE: despite the name, this holds tokenizer.document_count (texts seen during fitting).
word_count = tokenizer.document_count


In [None]:
# Turn each article into a list of integer token ids using the fitted tokenizer.
encoded_corpus = tokenizer.texts_to_sequences(corpus)

In [None]:
# Bring every sequence to length 50, padding at the end ('post').
# No `truncating` argument is passed, so pad_sequences' default truncation side applies.
# The 50 here has to match input_length=50 on the Embedding layers used later.
encoded_corpus = pad_sequences(encoded_corpus, maxlen=50, padding='post')
# Labels taken from the same frame, so they line up row-for-row with encoded_corpus.
ds_y = new_train_data["label"].values

In [None]:
# Conversion to a tf.data.Dataset of (sequence, label) pairs
ds_corpus = tf.data.Dataset.from_tensor_slices((encoded_corpus, ds_y))

In [None]:
# Peek at the first five (sequence, label) pairs to confirm the element shape.
for encoded_seq, label in ds_corpus.take(5):
  print(f'Sequence shape: {encoded_seq.shape}, Label is: {label}')

# **Split Dataset**

In [None]:
BATCH_SIZE = 64

dataset_size = int(ds_corpus.cardinality().numpy())

# take()/skip() expect integer counts, so the 70% / 20% shares are rounded down
# and the remainder goes to the test split so that no example is lost.
train_size = int(dataset_size * 0.7)
val_size = int(dataset_size * 0.2)
test_size = dataset_size - train_size - val_size

# The three splits are contiguous slices of ds_corpus in its existing row order.
ds_train = ds_corpus.take(train_size).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
ds_val = ds_corpus.skip(train_size).take(val_size).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
# Batch the test split like the others: predict/evaluate need a leading batch
# dimension, which single unbatched (50,) sequences do not have.
ds_test = (
    ds_corpus.skip(train_size + val_size)
    .take(test_size)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# **Build Model**

In [None]:
from keras.models import *
from keras.layers import *
from keras.callbacks import *
from tensorflow.keras.optimizers import Adam

In [None]:
# Baseline classifier: token embedding -> two stacked GRUs -> sigmoid output.
model = keras.models.Sequential()
model.add(keras.layers.Embedding(UNIQUE_WORDS, 512, input_length=50))
# The first GRU returns the full sequence so the second GRU can consume it.
model.add(keras.layers.GRU(100, return_sequences=True, dropout=0.2, recurrent_dropout=0.3))
model.add(keras.layers.GRU(100, return_sequences=False, dropout=0.2, recurrent_dropout=0.3))
model.add(keras.layers.Dense(1, activation="sigmoid"))

**Compile and Save Model**

In [None]:
model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])
# Stop on validation loss rather than training loss, so training halts when the
# model stops improving on unseen data; this is also the quantity the
# checkpoint below tracks.
es = EarlyStopping(monitor='val_loss', verbose=1, patience=3)
# Only overwrite 'best_model' when the monitored validation loss improves.
mc = ModelCheckpoint('best_model', monitor='val_loss', save_best_only=True, verbose=1)

In [None]:
# Print the layer summary of the baseline model.
model.summary()

# **Train Model**

In [None]:
# Train the baseline model for up to 100 epochs; the es and mc callbacks are attached.
history = model.fit(
    ds_train,
    validation_data=ds_val,
    epochs=100,
    callbacks=[es, mc],
)

# **Model performance Graph**

In [None]:
# Draw the per-epoch training and validation curves on one figure.
plt.style.use("ggplot")
plt.figure()
for metric_key, legend_label in (
    ("loss", "train_loss"),
    ("val_loss", "val_loss"),
    ("accuracy", "train_acc"),
    ("val_accuracy", "val_acc"),
):
    plt.plot(history.history[metric_key], label=legend_label)
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend()

In [None]:
# Reload the model saved under 'best_model' (the path given to ModelCheckpoint).
best_model = keras.models.load_model('best_model')

In [None]:
# Predict with the reloaded checkpoint (best_model) rather than the in-memory
# `model`, which still holds the last-epoch weights; len() gives the number of
# predictions returned.
len(best_model.predict(ds_test.take(1)))

# **Hyperparameter Tuning**

In [None]:
pip install -q -U keras-tuner

In [None]:
import keras_tuner as kt

**Model Definition**

In [None]:
def model_builder(hp):
  """Build and compile a stacked-GRU classifier for one tuner trial.

  Args:
    hp: hyperparameter object supplied by the tuner. It is queried for
      'units' (32 to 512 in steps of 32, shared by both GRU layers) and
      'learning_rate' (one of 1e-2, 1e-3, 1e-4).

  Returns:
    A compiled Sequential model using binary cross-entropy loss and an
    accuracy metric.
  """
  gru_width = hp.Int('units', min_value=32, max_value=512, step=32)

  net = keras.models.Sequential()
  net.add(keras.layers.Embedding(UNIQUE_WORDS, 512, input_length=50))
  # The first GRU emits the whole sequence so the second GRU can read it.
  net.add(keras.layers.GRU(units=gru_width, return_sequences=True, dropout=0.2, recurrent_dropout=0.3))
  net.add(keras.layers.GRU(units=gru_width, return_sequences=False, dropout=0.2, recurrent_dropout=0.3))
  net.add(keras.layers.Dense(1, activation="sigmoid"))

  # Learning rate for Adam, chosen from 0.01, 0.001 or 0.0001.
  adam_lr = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

  net.compile(optimizer=Adam(learning_rate=adam_lr), loss='binary_crossentropy', metrics=['accuracy'])

  return net

**Tuner Definition**

In [None]:
# Hyperband tuner over model_builder's hyperparameters, with validation accuracy as the objective.
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3)

In [None]:
# Early-stopping callback on validation loss with a patience of 3 epochs; reused by the tuning and retraining cells.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

**Tuning**

In [None]:
# Run the hyperparameter search on the padded sequences, with 20% of the data
# used for validation inside each trial.
tuner.search(encoded_corpus, ds_y, epochs=50, validation_split=0.2, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# 'units' sets the width of both GRU layers in model_builder (there is no
# tunable dense layer), so the report names it accordingly.
print(f"""
The hyperparameter search is complete. The optimal number of units in each GRU
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

In [None]:
# Train a fresh model with the best hyperparameters and locate the epoch with
# the highest validation accuracy.
model = tuner.hypermodel.build(best_hps)
history = model.fit(
    encoded_corpus,
    ds_y,
    epochs=50,
    validation_split=0.2,
    callbacks=[stop_early],
)

val_acc_per_epoch = history.history['val_accuracy']
# List positions are 0-based while epochs are counted from 1.
best_epoch = 1 + val_acc_per_epoch.index(max(val_acc_per_epoch))
print('Best epoch: %d' % (best_epoch,))

In [None]:
# Rebuild the best configuration from scratch and retrain it for exactly best_epoch epochs.
hypermodel = tuner.hypermodel.build(best_hps)

hypermodel.fit(
    encoded_corpus,
    ds_y,
    epochs=best_epoch,
    validation_split=0.2,
)

In [None]:
# Score only the rows that fit(..., validation_split=0.2) held out of training
# (Keras takes them from the end of the arrays). Evaluating the whole corpus
# would mostly measure performance on rows the model was trained on.
holdout_start = int(len(encoded_corpus) * (1.0 - 0.2))
eval_result = hypermodel.evaluate(encoded_corpus[holdout_start:], ds_y[holdout_start:])
print("[held-out loss, held-out accuracy]:", eval_result)

# **Tuned Model**


In [None]:
# Same architecture as the baseline, with 384 units in each GRU layer.
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=UNIQUE_WORDS, output_dim=512, input_length=50),
    keras.layers.GRU(
        units=384,
        return_sequences=True,  # full sequence is passed on to the next GRU
        dropout=0.2,
        recurrent_dropout=0.3,
    ),
    keras.layers.GRU(
        units=384,
        return_sequences=False,
        dropout=0.2,
        recurrent_dropout=0.3,
    ),
    keras.layers.Dense(units=1, activation="sigmoid"),
])

In [None]:
model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])
# Stop on validation loss rather than training loss, so training halts when the
# model stops improving on unseen data; this is also the quantity the
# checkpoint below tracks.
es = EarlyStopping(monitor='val_loss', verbose=1, patience=3)
# Only overwrite 'best_model' when the monitored validation loss improves.
mc = ModelCheckpoint('best_model', monitor='val_loss', save_best_only=True, verbose=1)

In [None]:
# Print the layer summary of the tuned model.
model.summary()

In [None]:
# Train the tuned model on the train/validation splits for up to 100 epochs with the es and mc callbacks.
history = model.fit(
    ds_train,
    validation_data=ds_val,
    epochs=100,
    callbacks=[es, mc],
)

# **Model Performance Graph**

In [None]:
# Draw the tuned model's per-epoch loss and accuracy curves on one figure.
plt.style.use("ggplot")
plt.figure()
curve_labels = {
    "loss": "train_loss",
    "val_loss": "val_loss",
    "accuracy": "train_acc",
    "val_accuracy": "val_acc",
}
for history_key, curve_label in curve_labels.items():
    plt.plot(history.history[history_key], label=curve_label)
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend()