### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import string
from tqdm.notebook import tqdm
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
# Create a Kaggle API client
import os
os.environ['KAGGLE_USERNAME'] = 'mustabshiribnamin'
os.environ['KAGGLE_KEY'] = '0eff4183442e547cfe97d8be3e922d42' 
import kaggle as kg
kg.api.dataset_download_files(dataset = "bhavikjikadara/fake-news-detection", path='dataset', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/bhavikjikadara/fake-news-detection


### View Data

In [3]:
true_df = pd.read_csv('dataset/true.csv')
fake_df = pd.read_csv('dataset/fake.csv')

In [4]:
true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
true_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [6]:
fake_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [7]:
fake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


### Data Preprocessing

In [8]:
true_df.duplicated().sum()

206

In [9]:
true_df.drop_duplicates(inplace=True)

In [10]:
fake_df.duplicated().sum()

3

In [11]:
fake_df.drop_duplicates(inplace=True)

In [12]:
true_df.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [13]:
fake_df.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [14]:
# labeling the true and false state
true_df['label'] = 0
fake_df['label'] = 1

In [15]:
# concating true and fake data
df = pd.concat([fake_df, true_df], ignore_index=True)
# shuffling the rows of the DataFrame and resetting the index
df = df.sample(frac=1).reset_index(drop=True)

In [16]:
#replace underscores and making title format
df.columns = df.columns.str.replace('_', ' ').str.title()

In [17]:
df.head()

Unnamed: 0,Title,Text,Subject,Date,Label
0,ACTOR JAMES WOODS DESTROYS Leftist TIME For Ar...,"Yesterday, the parents of the now deceased Ame...",left-news,"Sep 28, 2017",1
1,Trump Moronically Claims Entire Russia Invest...,Donald Trump went on another uncontrollable ra...,News,"June 27, 2017",1
2,"Republican senators criticize CBO, welcome hea...",WASHINGTON (Reuters) - Senate Majority Leader ...,politicsNews,"March 14, 2017",0
3,Police arrest Japanese man after body parts fo...,TOKYO (Reuters) - A Japanese man was arrested ...,worldnews,"October 31, 2017",0
4,Orlando shooter traveled to Saudi Arabia in 20...,WASHINGTON (Reuters) - Orlando gay nightclub s...,politicsNews,"June 13, 2016",0


In [18]:
# Clean text
stopword = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean(text):
    text = str(text).lower()
    # remove text within square brackets
    text = re.sub('\[.*?\]', '', text) 
    # remove http links
    text = re.sub('https?://\S+|www\.\S+', '', text)
    # remove html tags
    text = re.sub('<.*?>+', '', text)
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # remove newline chars
    text = re.sub('\n', '', text)
    # remove all word containing numbers
    text = re.sub('\w*\d\w*', '', text)
    # remove stopwords
    text = [word for word in text.split(' ') if word not in stopword]
    # applies stemming to words
    text = [stemmer.stem(word) for word in text]
    text = " ".join(text)
    return text

df['Text'] = df['Text'].apply(lambda x: clean(x))
df['Title'] = df['Title'].apply(lambda x: clean(x))

In [19]:
df.head()

Unnamed: 0,Title,Text,Subject,Date,Label
0,actor jame wood destroy leftist time articl su...,yesterday parent deceas american student cinci...,left-news,"Sep 28, 2017",1
1,trump moron claim entir russia investig lie c...,donald trump went anoth uncontrol rant media t...,News,"June 27, 2017",1
2,republican senat critic cbo welcom healthcar plan,washington reuter senat major leader mitch mc...,politicsNews,"March 14, 2017",0
3,polic arrest japanes man bodi part found apart,tokyo reuter japanes man arrest tuesday part ...,worldnews,"October 31, 2017",0
4,orlando shooter travel saudi arabia msnbc,washington reuter orlando gay nightclub shoot...,politicsNews,"June 13, 2016",0


### Sentiment Analysis

In [20]:
sia = SentimentIntensityAnalyzer()
result = {}

for i, row in tqdm(df.iterrows(), total=len(df), desc="Sentiment Analysis"):
    text = row['Text']
    my_id = row['Title'] 
    result[my_id] = sia.polarity_scores(text)

Sentiment Analysis:   0%|          | 0/44689 [00:00<?, ?it/s]

In [21]:
vaders = pd.DataFrame(result).T
vaders = vaders.reset_index().rename(columns={'index': 'Title'})
vaders = vaders.merge(df, how='left')

In [22]:
vaders.head()

Unnamed: 0,Title,neg,neu,pos,compound,Text,Subject,Date,Label
0,actor jame wood destroy leftist time articl su...,0.169,0.754,0.077,-0.9753,yesterday parent deceas american student cinci...,left-news,"Sep 28, 2017",1
1,actor jame wood destroy leftist time articl su...,0.169,0.754,0.077,-0.9753,yesterday parent deceas american student cinci...,politics,"Sep 28, 2017",1
2,trump moron claim entir russia investig lie c...,0.196,0.729,0.076,-0.9901,donald trump went anoth uncontrol rant media t...,News,"June 27, 2017",1
3,republican senat critic cbo welcom healthcar plan,0.14,0.86,0.0,-0.8225,washington reuter senat major leader mitch mc...,politicsNews,"March 14, 2017",0
4,polic arrest japanes man bodi part found apart,0.144,0.803,0.053,-0.891,tokyo reuter japanes man arrest tuesday part ...,worldnews,"October 31, 2017",0


### Model Training

In [23]:
from imblearn.over_sampling import RandomOverSampler
from keras.utils import plot_model
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import keras
from keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix


In [24]:
df1 = vaders.copy()

In [25]:
df1['content'] = df1['Title'] + ' ' + df1['Text']

In [26]:
X = df1['content']
y = df1['Label']

In [27]:
#Scaling "Label" oversampling for balanced class distribution
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(df1['content'].values.reshape(-1, 1), df1['Label'])
X_res = X_res.flatten()

In [28]:
# Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token='OOV')
tokenizer.fit_on_texts(X_res)
sequences = tokenizer.texts_to_sequences(X_res)
padded = pad_sequences(sequences, maxlen=1000)

In [29]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(padded, y_res, test_size=0.2, random_state=42)

In [117]:
model = keras.Sequential()

# converts integer-encoded word indices (from the tokenized text) into dense vectors of fixed size
model.add(keras.layers.Embedding(input_dim=10000, output_dim=32))  
model.add(keras.layers.SimpleRNN(32, activation='relu', dropout=0.2, recurrent_dropout=0.2))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


In [118]:
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

model_checkpoint = keras.callbacks.ModelCheckpoint('model.{epoch:02d}-{val_loss:.2f}.keras', monitor='val_loss', save_best_only=True)

history = model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test), batch_size=32, callbacks=[early_stopping])

Epoch 1/100
[1m1174/1174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 78ms/step - accuracy: 0.8191 - loss: 21138.0820 - val_accuracy: 0.9089 - val_loss: 0.2400
Epoch 2/100
[1m1174/1174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 78ms/step - accuracy: 0.9178 - loss: 0.2099 - val_accuracy: 0.9239 - val_loss: 0.1946
Epoch 3/100
[1m1174/1174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 110ms/step - accuracy: 0.9368 - loss: 0.1632 - val_accuracy: 0.9228 - val_loss: 0.1887
Epoch 4/100
[1m1174/1174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 146ms/step - accuracy: 0.9488 - loss: 0.1399 - val_accuracy: 0.9241 - val_loss: 0.1811
Epoch 5/100
[1m1174/1174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 160ms/step - accuracy: 0.9510 - loss: 0.1281 - val_accuracy: 0.9288 - val_loss: 0.1734
Epoch 6/100
[1m1174/1174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 168ms/step - accuracy: 0.9554 - loss: 0.1155 - val_accuracy: 0.9307 - val

In [119]:
evaluation_result = model.evaluate(X_test, y_test)

print("Test Loss:", evaluation_result[0])
print("Test Accuracy:", evaluation_result[1])

[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.9372 - loss: 0.1591
Test Loss: 0.16331534087657928
Test Accuracy: 0.9371805787086487


In [128]:
model.summary()
model.save('simple_rnn_model.keras')

In [123]:
model_lstm = keras.Sequential()
model_lstm.add(keras.layers.Embedding(input_dim=10000, output_dim=32))
model_lstm.add(keras.layers.Bidirectional(keras.layers.LSTM(32, return_sequences=True)))
model_lstm.add(keras.layers.Dropout(0.5))
model_lstm.add(keras.layers.LSTM(32))
model_lstm.add(keras.layers.Dense(1, activation='sigmoid', kernel_regularizer=keras.regularizers.l2(0.01)))
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_lstm.summary()


In [124]:
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

model_lstm_checkpoint = keras.callbacks.ModelCheckpoint('model_lstm.{epoch:02d}-{val_loss:.2f}.keras', monitor='val_loss', save_best_only=True)

history = model_lstm.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test), batch_size=32, callbacks=[early_stopping])

Epoch 1/100
[1m1174/1174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m285s[0m 234ms/step - accuracy: 0.9298 - loss: 0.1930 - val_accuracy: 0.9961 - val_loss: 0.0349
Epoch 2/100
[1m1174/1174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m278s[0m 237ms/step - accuracy: 0.9961 - loss: 0.0327 - val_accuracy: 0.9956 - val_loss: 0.0314
Epoch 3/100
[1m1174/1174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m276s[0m 235ms/step - accuracy: 0.9948 - loss: 0.0350 - val_accuracy: 0.9805 - val_loss: 0.0700
Epoch 4/100
[1m1174/1174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m280s[0m 239ms/step - accuracy: 0.9942 - loss: 0.0362 - val_accuracy: 0.9965 - val_loss: 0.0253
Epoch 5/100
[1m1174/1174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m280s[0m 238ms/step - accuracy: 0.9985 - loss: 0.0217 - val_accuracy: 0.9980 - val_loss: 0.0234
Epoch 6/100
[1m1174/1174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m286s[0m 244ms/step - accuracy: 0.9983 - loss: 0.0209 - val_accuracy: 0.9955 - val

In [126]:
evaluation_result = model_lstm.evaluate(X_test, y_test)

print("Test Loss:", evaluation_result[0])
print("Test Accuracy:", evaluation_result[1])

[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 61ms/step - accuracy: 0.9987 - loss: 0.0200
Test Loss: 0.019394686445593834
Test Accuracy: 0.9991481900215149


In [129]:
model_lstm.summary()
model_lstm.save('lstm_model.keras')

In [30]:
new_model = keras.models.load_model('lstm_model.keras')
predictions = new_model.predict(X_test)

  saveable.load_own_variables(weights_store.get(inner_path))


[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 60ms/step
