# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import string
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import tensorflow as tf
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout


import warnings
# Suppress every Python warning for the rest of the session to keep the
# notebook output readable. NOTE(review): this also hides deprecation
# warnings from pandas / TensorFlow, so disable it when debugging.
warnings.filterwarnings('ignore')

# Import and Combine Data

In [None]:
# Label convention for the combined data set: 1 = fake, 0 = real.

# Fake.csv / True.csv carry one class per file, so the label is a constant.
# The 'subject' and 'date' columns are not used and are discarded.
df1 = (
    pd.read_csv("Dataset/Fake.csv")
    .drop(columns=['subject', 'date'])
    .assign(label=1)
)
df2 = (
    pd.read_csv("Dataset/True.csv")
    .drop(columns=['subject', 'date'])
    .assign(label=0)
)

# news.csv: map the string labels 'FAKE' / 'REAL' onto 1 / 0 and drop the
# first column (presumably a saved row index - confirm against the file).
df3 = pd.read_csv("Dataset/news.csv")
df3 = (
    df3
    .assign(label=df3['label'].replace({'FAKE': 1, 'REAL': 0}))
    .drop(columns=df3.columns[0])
)

# WELFake: drop the first column, then swap the two label values (1 <-> 0).
# NOTE(review): the swap presumes this file encodes fake/real the opposite
# way round from the convention above - verify against the data set docs.
df4 = pd.read_csv("Dataset/WELFake_Dataset.csv")
df4 = (
    df4
    .drop(columns=df4.columns[0])
    .assign(label=lambda frame: frame['label'].replace({1: 0, 0: 1}))
)

In [None]:
# Stack the four sources row-wise into one frame; ignore_index=True discards
# the per-file indices and renumbers the rows 0..n-1.
df = pd.concat(
    objs=[df1, df2, df3, df4],
    ignore_index=True,
)

# Data Processing

In [None]:
# Drop every row containing at least one missing value, printing the row
# count before and after so the number of removed rows is visible.
print(len(df))
df = df.dropna()
print(len(df))

In [None]:
# Remove duplicate articles: rows sharing the same title are collapsed to
# their first occurrence. Row counts are printed before and after.
print(len(df.index))
df = df.drop_duplicates(subset='title', keep='first')
print(len(df.index))

In [None]:
# Shuffle the rows: sampling the full fraction (frac=1) returns every row in
# random order, and the fixed seed makes that order repeatable.
df = df.sample(
    frac=1,
    random_state=42,
)

In [None]:
# Combine title and body into the single text column used for modelling.
# A space is inserted between the two parts: plain concatenation would fuse
# the last word of the title with the first word of the body into one bogus
# token (e.g. "...electionWASHINGTON...").
df['final'] = df['title'] + ' ' + df['text']
df.head()

In [None]:
# Text normalisation of df['final'], applied in this order:
#   1. replace every punctuation character with a space
#   2. remove standalone numbers
#   3. lowercase, drop English stopwords and Porter-stem each token

# Build the translation table once instead of once per row.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
df['final'] = df['final'].apply(lambda x: x.translate(punct_to_space))

# Remove numbers (only digit runs that form a whole word, e.g. "2016";
# digits embedded in a word such as "covid19" are kept).
df['final'] = df['final'].str.replace(r'\b\d+\b', '', regex=True)

# Load English stopwords, fetching the NLTK corpus on first use so the
# notebook also runs on a machine where it has not been downloaded yet.
try:
    stopwords_set = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stopwords_set = set(stopwords.words('english'))

stemmer = PorterStemmer()


def _normalize(text):
    """Lowercase `text`, drop stopwords and reduce each token to its stem.

    Tokens are the whitespace-separated words of `text`; the result is the
    surviving stems joined by single spaces.
    """
    lowered = (word.lower() for word in text.split())
    return ' '.join(stemmer.stem(word) for word in lowered if word not in stopwords_set)


# One pass per document: lowercasing happens inside _normalize, so no
# separate lowercase step is needed afterwards.
df['final'] = df['final'].apply(_normalize)

In [None]:
# Checkpoint the cleaned DataFrame in the current working directory so the
# preprocessing above does not have to be re-run in a later session.
df.to_pickle(path='Dataframe_1.pkl')

In [None]:
# Reload the preprocessed DataFrame from the checkpoint file. The path is the
# same one the save step writes to ('Dataframe_1.pkl' in the working
# directory); reading from the parent directory would miss that file.
# NOTE: pickle files can execute code when loaded - only load files that
# were produced by this notebook.
df = pd.read_pickle('Dataframe_1.pkl')

# LSTM Implementation 

## Base LSTM

### Prepare the data

In [None]:
# Tokenisation settings
vocab_size = 30000       # keep only the most frequent words; the rest map to oov_tok
max_length = 150         # every sequence is padded / truncated to this many tokens
trunc_type = 'post'      # cut tokens from the end of sequences that are too long
padding_type = 'post'    # append zeros to the end of sequences that are too short
oov_tok = '<OOV>'        # placeholder token for out-of-vocabulary words

# Turn each document into a fixed-length sequence of integer token ids.
# NOTE(review): the tokenizer is fitted on the full corpus, so the vocabulary
# also reflects rows that end up in the test split.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(df['final'])
sequences = tokenizer.texts_to_sequences(df['final'])
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# 80 / 20 train-test split. The fixed seed makes the split (and therefore the
# reported metrics) reproducible between runs; stratify keeps the fake/real
# ratio identical in both parts.
labels = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(
    padded,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

### Build Base Model

In [None]:
# Base architecture: token ids -> 16-dimensional embeddings -> one LSTM layer
# with 32 units -> a single sigmoid unit that outputs a value in [0, 1].
model = Sequential([
    Embedding(vocab_size, 16, input_length=max_length),
    LSTM(32),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Configure training: Adam with its default settings, binary cross-entropy
# loss for the single sigmoid output, and accuracy reported as the metric.
model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Train Model

In [None]:
# Train for 10 epochs. The held-out split is passed as validation data, so
# its loss and accuracy are reported after every epoch.
model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

In [None]:
# Persist the trained base model to disk.
model.save(filepath='BaseLSTM.h5')

In [None]:
# Restore the base model from disk so evaluation can run without retraining.
model = load_model(filepath='BaseLSTM.h5')

### Evaluating the Model

#### Basic Evaluation

In [None]:
# Predictions
# The model has a single sigmoid output, so predict() yields one probability
# per sample; flatten to 1-D and threshold at 0.5 in one vectorised step
# instead of looping over the rows in Python.
probabilities = model.predict(X_test).ravel()
predictions = (probabilities > 0.5).astype(int)

# Evaluation: per-class precision, recall and F1 on the held-out split.
print(classification_report(y_test, predictions))

#### Confusion Matrix

In [None]:
# Raw counts of (true label, predicted label) pairs.
cm = confusion_matrix(y_test, predictions)

# Divide each row by its total so every cell shows the fraction of that true
# class which received each prediction.
normalized_cm = cm.astype(float) / cm.sum(axis=1, keepdims=True)

sns.heatmap(normalized_cm, annot=True, fmt='.2f').set(
    xlabel='Predicted',
    ylabel='True',
    title='Base LSTM Confusion Matrix',
)
plt.show()

#### Receiver Operating Characteristic (ROC) Curve and Area Under Curve (AUC)

In [None]:
# ROC curve computed from the model's raw (un-thresholded) scores on the
# held-out split, and the area under that curve.
fpr, tpr, _ = roc_curve(
    y_test,
    model.predict(X_test).ravel(),
)
roc_auc = auc(fpr, tpr)

plt.figure()
# Model curve, with the AUC shown in the legend entry.
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    linewidth=2,
    label='ROC curve (area = %0.2f)' % roc_auc,
)
# Dashed diagonal for reference.
plt.plot([0, 1], [0, 1], color='navy', linewidth=2, linestyle='--')
plt.title('ROC - Base LSTM')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()

## LSTM - Additional Layer and Dropout Function

### Prepare the data

In [None]:
# Tokenisation settings
vocab_size = 30000       # keep only the most frequent words; the rest map to oov_tok
max_length = 150         # every sequence is padded / truncated to this many tokens
trunc_type = 'post'      # cut tokens from the end of sequences that are too long
padding_type = 'post'    # append zeros to the end of sequences that are too short
oov_tok = '<OOV>'        # placeholder token for out-of-vocabulary words

# Turn each document into a fixed-length sequence of integer token ids.
# NOTE(review): the tokenizer is fitted on the full corpus, so the vocabulary
# also reflects rows that end up in the test split.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(df['final'])
sequences = tokenizer.texts_to_sequences(df['final'])
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# 80 / 20 train-test split. Without a seed every run draws a different
# split, so metrics are neither reproducible nor comparable with a model
# evaluated on another draw. A fixed seed pins the held-out rows; stratify
# keeps the fake/real ratio identical in both parts.
labels = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(
    padded,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
# Persist the preprocessing artefacts so they can be reused outside this
# notebook.

# Fitted tokenizer, pickled with the highest protocol available.
with open('tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

# Padded token-id sequences and their labels, as NumPy .npy files.
np.save('padded_sequences.npy', padded)
np.save('labels.npy', labels)

### Build Adapted Model

In [None]:
#Define Model
model_1 = Sequential()
model_1.add(Embedding(vocab_size, 16, input_length=max_length))
model_1.add(LSTM(32, return_sequences=True))
model_1.add(Dropout(0.2))
model_1.add(LSTM(32))
model_1.add(Dense(1, activation='sigmoid'))

In [None]:
# Configure training: Adam with its default settings, binary cross-entropy
# loss for the single sigmoid output, and accuracy reported as the metric.
model_1.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Train Adapted Model

In [None]:
# Train for 10 epochs. The held-out split is passed as validation data, so
# its loss and accuracy are reported after every epoch.
model_1.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

In [None]:
# Persist the trained adapted model to disk.
model_1.save(filepath='AddLSTM.h5')

In [None]:
# Restore the adapted model from disk so evaluation can run without
# retraining.
model_1 = load_model(filepath='AddLSTM.h5')

### Evaluating Model

#### Basic Evaluation

In [None]:
# Predictions
# The model has a single sigmoid output, so predict() yields one probability
# per sample; flatten to 1-D and threshold at 0.5 in one vectorised step
# instead of looping over the rows in Python.
probabilities = model_1.predict(X_test).ravel()
predictions = (probabilities > 0.5).astype(int)

# Evaluation: per-class precision, recall and F1 on the held-out split.
print(classification_report(y_test, predictions))

#### Confusion Matrix

In [None]:
# Raw counts of (true label, predicted label) pairs.
cm = confusion_matrix(y_test, predictions)

# Divide each row by its total so every cell shows the fraction of that true
# class which received each prediction.
normalized_cm = cm.astype(float) / cm.sum(axis=1, keepdims=True)

sns.heatmap(normalized_cm, annot=True, fmt='.2f').set(
    xlabel='Predicted',
    title='Adapted LSTM Confusion Matrix',
    ylabel='True',
)
plt.show()

#### Receiver Operating Characteristic (ROC) Curve and Area Under Curve (AUC)

In [None]:
# ROC curve computed from the model's raw (un-thresholded) scores on the
# held-out split, and the area under that curve.
fpr, tpr, _ = roc_curve(
    y_test,
    model_1.predict(X_test).ravel(),
)
roc_auc = auc(fpr, tpr)

plt.figure()
# Model curve, with the AUC shown in the legend entry.
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    linewidth=2,
    label='ROC curve (area = %0.2f)' % roc_auc,
)
# Dashed diagonal for reference.
plt.plot([0, 1], [0, 1], color='navy', linewidth=2, linestyle='--')
plt.title('ROC - Adapted LSTM')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()