# Fake news Classification using:

1. RNN

2. Bi-RNN

3. LSTM

4. Bi-LSTM

In [None]:
import pandas as pd
import numpy as np

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Embedding, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

## Importing data

In [None]:
# Colab-only step: upload a local file into the Colab runtime so the next cell
# can read the CSV by its file name. `uploaded` holds whatever files.upload() returns.
from google.colab import files
uploaded = files.upload()

Saving cleaned_fake_news_data.csv to cleaned_fake_news_data.csv


In [None]:
# Load the uploaded dataset and preview the first rows.
# NOTE(review): the preview below shows garbled apostrophes in the first title, so the
# text is probably not genuinely ISO-8859-1 — confirm the file's real encoding.
df = pd.read_csv('cleaned_fake_news_data.csv',  encoding="ISO-8859-1")
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We DidnÃ¢ÂÂt Even See ComeyÃ...,Darrell Lucus,House Dem Aide: We DidnÃ¢ÂÂt Even See ComeyÃ...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [None]:
# Preview the last rows of the frame as a sanity check on the end of the file.
df.tail()

Unnamed: 0,id,title,author,text,label
424,495,Contaminated Food from China Now Entering the ...,noreply@blogger.com (Alexander Light),Contaminated Food from China Now Entering the ...,1
425,496,Ten Famous People on What to Read This Summer ...,T Magazine,"For his bookshop and website One Grand Books, ...",0
426,498,Hillary Clinton KNEW 5 years ago Anthony Weine...,The European Union Times,\nA WikiLeakÃ¢ÂÂs email released on Monday r...,1
427,500,A $150 Million Stairway to Nowhere on the Far ...,Ted Loos,By the look of the renderings officially unvei...,0
428,501,Cyber War ÃÂ From Trifle to Catastrophe ...,Ernest Partridge,Cyber War - From Trifle to Catastrophe By Ern...,1


In [None]:
# (rows, columns) of the raw frame.
df.shape

(429, 5)

In [None]:
# Remove every row that has a missing value in any column.
df = df.dropna(axis = 0, how = 'any')

In [None]:
# Target vector is the 'label' column; the predictors are every remaining column.
y = df['label']
X = df.drop(columns = 'label')

In [None]:
# Confirm predictors and target have the same number of rows.
X.shape, y.shape

((429, 4), (429,))

In [None]:
# Renumber the rows 0..n-1 so label-based row access lines up with row position
# even when dropna() has removed rows.
# drop=True discards the old index instead of inserting it as an extra 'index'
# feature column, and reassigning (rather than inplace=True) keeps this cell
# idempotent: re-running it no longer adds more columns or raises.
X = X.reset_index(drop = True)

In [None]:
# Re-check the shapes after resetting the index of X.
X.shape, y.shape

((429, 5), (429,))

In [None]:
# Display the predictor frame.
X

Unnamed: 0,index,id,title,author,text
0,0,0,House Dem Aide: We DidnÃ¢ÂÂt Even See ComeyÃ...,Darrell Lucus,House Dem Aide: We DidnÃ¢ÂÂt Even See ComeyÃ...
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...
...,...,...,...,...,...
424,424,495,Contaminated Food from China Now Entering the ...,noreply@blogger.com (Alexander Light),Contaminated Food from China Now Entering the ...
425,425,496,Ten Famous People on What to Read This Summer ...,T Magazine,"For his bookshop and website One Grand Books, ..."
426,426,498,Hillary Clinton KNEW 5 years ago Anthony Weine...,The European Union Times,\nA WikiLeakÃ¢ÂÂs email released on Monday r...
427,427,500,A $150 Million Stairway to Nowhere on the Far ...,Ted Loos,By the look of the renderings officially unvei...


In [None]:
# Display the target labels.
y

Unnamed: 0,label
0,1
1,0
2,1
3,1
4,1
...,...
424,1
425,0
426,1
427,0


## Preprocessing data

In [None]:
# Fetch the NLTK stop-word corpus used by the title-cleaning step.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Build the cleaned corpus: one stemmed, stop-word-free string per title.
stemmer = PorterStemmer()

# Load the stop-word list once, as a set. The previous version re-evaluated
# stopwords.words('english') for every single token and searched a list each time.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly, so the loop does not depend on the
# frame's index being exactly 0..n-1.
for title in X['title']:
  # Keep letters only, lower-case, then tokenise on whitespace.
  tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
  # Drop stop words, stem what remains, and re-join into one string.
  corpus.append(' '.join(stemmer.stem(tok) for tok in tokens if tok not in english_stopwords))

print(len(corpus))
print(corpus)

429
['hous dem aid even see comey letter jason chaffetz tweet', 'flynn hillari clinton big woman campu breitbart', 'truth might get fire', 'civilian kill singl us airstrik identifi', 'iranian woman jail fiction unpublish stori woman stone death adulteri', 'jacki mason hollywood would love trump bomb north korea lack tran bathroom exclus video breitbart', 'beno hamon win french socialist parti presidenti nomin new york time', 'back channel plan ukrain russia courtesi trump associ new york time', 'obama organ action partner soro link indivis disrupt trump agenda', 'bbc comedi sketch real housew isi caus outrag', 'russian research discov secret nazi militari base treasur hunter arctic photo', 'us offici see link trump russia', 'major leagu soccer argentin find home success new york time', 'well fargo chief abruptli step new york time', 'anonym donor pay million releas everyon arrest dakota access pipelin', 'fbi close hillari', 'chuck todd buzzfe donald trump polit favor breitbart', 'monic

NOTE: We collected the cleaned, stemmed text of the `title` column for all 429 rows into a list named `corpus`. From here on we work with this list.

# Preparing Input sequence

The model's first layer is an Embedding layer, so each title must become a fixed-length sequence of integer word indices:

```corpus -> integer (hash) encoding -> padding (pre/post) -> array -> train-test split```

In [None]:
vocab_size = 5000

# Map every token of every sentence to an integer index in [1, vocab_size).
# A stable MD5 hash is used instead of `one_hot`, which hashes with Python's built-in
# hash(); that is salted per process for strings, so the indices changed on every
# kernel restart and a trained model could not be reused in a new session.
# As with any hashing scheme, different words may still collide on the same index.
onehot_rep = [
    tf.keras.preprocessing.text.hashing_trick(sentence, vocab_size, hash_function = 'md5')
    for sentence in corpus
]
onehot_rep

[[1132, 4632, 3620, 1737, 2953, 827, 337, 2955, 3305, 474],
 [2658, 2199, 2839, 1075, 935, 4288, 2985],
 [1716, 516, 4923, 1239],
 [4053, 943, 2162, 305, 94, 2461],
 [4276, 935, 4950, 4048, 2212, 3190, 935, 956, 2979, 3903],
 [1200,
  3388,
  4010,
  887,
  2537,
  4453,
  1374,
  3649,
  2116,
  1902,
  4650,
  2786,
  1300,
  709,
  2985],
 [207, 4038, 806, 577, 4574, 2697, 1246, 2853, 1842, 3718, 1066],
 [2586, 3354, 565, 543, 2562, 4362, 4453, 2139, 1842, 3718, 1066],
 [3056, 2850, 763, 680, 3838, 549, 3032, 4234, 4453, 4405],
 [3696, 3514, 24, 555, 719, 1109, 3549, 1486],
 [2782, 3266, 4348, 3542, 4878, 4420, 4180, 4459, 117, 1180, 1949],
 [305, 3444, 2953, 549, 4453, 2562],
 [2203, 3885, 3016, 4790, 3929, 2769, 4813, 1842, 3718, 1066],
 [2839, 491, 4618, 4839, 1489, 1842, 3718, 1066],
 [2341, 590, 4999, 838, 2205, 4705, 1595, 3588, 3578, 4944],
 [2585, 5, 2199],
 [4530, 1840, 1304, 3999, 4453, 431, 1038, 2985],
 [2562, 1464, 2839, 993, 112, 4898, 1272, 1394, 3190],
 [2018, 2634, 

In [None]:
# Left-pad every encoded title with zeros up to a common length so that all
# sequences can be stacked into one 2-D array.
sent_length = 50
embedded_docs = pad_sequences(
    sequences = onehot_rep,
    maxlen = sent_length,
    padding = 'pre',
)
embedded_docs

array([[   0,    0,    0, ..., 2955, 3305,  474],
       [   0,    0,    0, ...,  935, 4288, 2985],
       [   0,    0,    0, ...,  516, 4923, 1239],
       ...,
       [   0,    0,    0, ...,  817, 2612,  903],
       [   0,    0,    0, ..., 1842, 3718, 1066],
       [   0,    0,    0, ..., 4192, 1830, 3050]], dtype=int32)

In [None]:
# Same padding as before, but with the zeros appended after the tokens, for comparison.
sent_length = 50
embedded_docs1 = pad_sequences(
    sequences = onehot_rep,
    maxlen = sent_length,
    padding = 'post',
)
embedded_docs1

array([[1132, 4632, 3620, ...,    0,    0,    0],
       [2658, 2199, 2839, ...,    0,    0,    0],
       [1716,  516, 4923, ...,    0,    0,    0],
       ...,
       [2199, 2839, 4326, ...,    0,    0,    0],
       [ 838, 4734, 2835, ...,    0,    0,    0],
       [3737, 4558, 4192, ...,    0,    0,    0]], dtype=int32)

In [None]:
# Materialise the pre-padded sequences and the label Series as NumPy arrays.
X_input, y_input = np.array(embedded_docs), np.array(y)

In [None]:
# X and y themselves are not modified by the encoding steps; the model inputs
# live in X_input / y_input.
X.shape, y.shape          # still the same as before preprocessing

((429, 5), (429,))

In [None]:
# Hold out 33% of the samples for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_input,
    y_input,
    test_size = 0.33,
    random_state = 42,
)

# Shapes of the four resulting arrays, in the order train-X, train-y, test-X, test-y.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((287, 50), (287,), (142, 50), (142,))

## Creating models

In [None]:
# SimpleRNN model: Embedding -> SimpleRNN(100) -> single sigmoid unit.

# Seed Python, NumPy and TensorFlow so weight initialisation is repeatable between runs.
tf.keras.utils.set_random_seed(42)

embed_vector_features = 50

model = Sequential()
# `input_length` is deprecated in Keras 3; the sequence length is supplied to build() instead.
model.add(Embedding(vocab_size, embed_vector_features))
model.add(SimpleRNN(100, return_sequences = False))
model.add(Dense(1, activation = 'sigmoid'))
# Use sent_length rather than a hard-coded 50 so the model follows the padding length.
model.build(input_shape = (None, sent_length))

model.summary()



In [None]:
# Bidirectional RNN model: Embedding -> Bidirectional(SimpleRNN(100)) -> single sigmoid unit.

# Seed Python, NumPy and TensorFlow so weight initialisation is repeatable between runs.
tf.keras.utils.set_random_seed(42)

embed_vector_features = 50

biRNN_model = Sequential()
# `input_length` is deprecated in Keras 3; the sequence length is supplied to build() instead.
biRNN_model.add(Embedding(vocab_size, embed_vector_features))
biRNN_model.add(Bidirectional(SimpleRNN(100, return_sequences = False)))
biRNN_model.add(Dense(1, activation = 'sigmoid'))
# Use sent_length rather than a hard-coded 50 so the model follows the padding length.
biRNN_model.build(input_shape = (None, sent_length))

biRNN_model.summary()

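NOTE: The output shape of the Bi-RNN layer is 200, whereas for the plain SimpleRNN it is 100. This is because `Bidirectional` runs the 100-unit layer over the sequence both forwards and backwards and, by default, concatenates the two outputs.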

In [None]:
# LSTM model: Embedding -> LSTM(100) -> single sigmoid unit.

# Seed Python, NumPy and TensorFlow so weight initialisation is repeatable between runs.
tf.keras.utils.set_random_seed(42)

embed_vector_features = 50

LSTM_model = Sequential()
# `input_length` is deprecated in Keras 3; the sequence length is supplied to build() instead.
LSTM_model.add(Embedding(vocab_size, embed_vector_features))
LSTM_model.add(LSTM(100, return_sequences = False))
LSTM_model.add(Dense(1, activation = 'sigmoid'))
# Use sent_length rather than a hard-coded 50 so the model follows the padding length.
LSTM_model.build(input_shape = (None, sent_length))

LSTM_model.summary()

In [None]:
# Bi-directional LSTM model: Embedding -> Bidirectional(LSTM(100)) -> single sigmoid unit.

# Seed Python, NumPy and TensorFlow so weight initialisation is repeatable between runs.
tf.keras.utils.set_random_seed(42)

embed_vector_features = 50

biLSTM_model = Sequential()
# `input_length` is deprecated in Keras 3; the sequence length is supplied to build() instead.
biLSTM_model.add(Embedding(vocab_size, embed_vector_features))
biLSTM_model.add(Bidirectional(LSTM(100, return_sequences = False)))
biLSTM_model.add(Dense(1, activation = 'sigmoid'))
# Use sent_length rather than a hard-coded 50 so the model follows the padding length.
biLSTM_model.build(input_shape = (None, sent_length))

biLSTM_model.summary()

NOTE: As with the RNN, the Bi-LSTM layer's output shape is 200 whereas the plain LSTM's is 100, again because the forward and backward outputs are concatenated.

In [None]:
# Compile all four models with the same optimiser, loss and metric.
for net in (model, biRNN_model, LSTM_model, biLSTM_model):
    net.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

# Training models

In [None]:
# Train the SimpleRNN for 5 epochs. The test split is also passed as validation data,
# so the val_* metrics are computed on the same samples used for the final evaluation.
model.fit(
    X_train,
    y_train,
    validation_data = (X_test, y_test),
    epochs = 5,
    batch_size = 64,
)

Epoch 1/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 469ms/step - accuracy: 0.5857 - loss: 0.6640 - val_accuracy: 0.7394 - val_loss: 0.5764
Epoch 2/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.8060 - loss: 0.5315 - val_accuracy: 0.7746 - val_loss: 0.4954
Epoch 3/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.9073 - loss: 0.4043 - val_accuracy: 0.8169 - val_loss: 0.4211
Epoch 4/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9764 - loss: 0.2854 - val_accuracy: 0.8592 - val_loss: 0.3618
Epoch 5/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9893 - loss: 0.2001 - val_accuracy: 0.8592 - val_loss: 0.3295


<keras.src.callbacks.history.History at 0x79ec7412f320>

In [None]:
# Train the bidirectional RNN for 5 epochs. The test split is also passed as validation
# data, so the val_* metrics are computed on the same samples used for the final evaluation.
biRNN_model.fit(
    X_train,
    y_train,
    validation_data = (X_test, y_test),
    epochs = 5,
    batch_size = 64,
)

Epoch 1/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 545ms/step - accuracy: 0.5259 - loss: 0.6333 - val_accuracy: 0.8028 - val_loss: 0.5042
Epoch 2/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.8670 - loss: 0.4179 - val_accuracy: 0.8380 - val_loss: 0.3710
Epoch 3/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.9894 - loss: 0.1877 - val_accuracy: 0.8803 - val_loss: 0.3151
Epoch 4/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.9934 - loss: 0.0922 - val_accuracy: 0.8803 - val_loss: 0.2789
Epoch 5/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.9988 - loss: 0.0404 - val_accuracy: 0.8803 - val_loss: 0.2875


<keras.src.callbacks.history.History at 0x79ec7412da90>

In [None]:
# Train the LSTM for 5 epochs. The test split is also passed as validation data,
# so the val_* metrics are computed on the same samples used for the final evaluation.
LSTM_model.fit(
    X_train,
    y_train,
    validation_data = (X_test, y_test),
    epochs = 5,
    batch_size = 64,
)

Epoch 1/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 77ms/step - accuracy: 0.6155 - loss: 0.6877 - val_accuracy: 0.5915 - val_loss: 0.6682
Epoch 2/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.6048 - loss: 0.6558 - val_accuracy: 0.5915 - val_loss: 0.6312
Epoch 3/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.5905 - loss: 0.6146 - val_accuracy: 0.6690 - val_loss: 0.5722
Epoch 4/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.6834 - loss: 0.5197 - val_accuracy: 0.7887 - val_loss: 0.5155
Epoch 5/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.8970 - loss: 0.4525 - val_accuracy: 0.7887 - val_loss: 0.4398


<keras.src.callbacks.history.History at 0x79ec508f9970>

In [None]:
# Train the bidirectional LSTM for 5 epochs. The test split is also passed as validation
# data, so the val_* metrics are computed on the same samples used for the final evaluation.
biLSTM_model.fit(
    X_train,
    y_train,
    validation_data = (X_test, y_test),
    epochs = 5,
    batch_size = 64,
)

Epoch 1/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 101ms/step - accuracy: 0.6188 - loss: 0.6873 - val_accuracy: 0.5915 - val_loss: 0.6670
Epoch 2/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.6250 - loss: 0.6536 - val_accuracy: 0.5915 - val_loss: 0.6449
Epoch 3/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.5675 - loss: 0.6424 - val_accuracy: 0.6268 - val_loss: 0.6023
Epoch 4/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.6315 - loss: 0.5723 - val_accuracy: 0.7042 - val_loss: 0.5399
Epoch 5/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.7427 - loss: 0.4822 - val_accuracy: 0.8169 - val_loss: 0.4528


<keras.src.callbacks.history.History at 0x79ec57527530>

## Prediction

In [None]:
# Threshold the SimpleRNN's sigmoid outputs at 0.5 to obtain hard 0/1 labels.
y_pred1 = np.greater(model.predict(X_test), 0.5).astype("int64")
y_pred1

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 87ms/step


array([[0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
    

In [None]:
# Hard 0/1 predictions (sigmoid output thresholded at 0.5) for the remaining three
# models, evaluated in the order Bi-RNN, LSTM, Bi-LSTM.
y_pred2, y_pred3, y_pred4 = (
    (net.predict(X_test) > 0.5).astype("int64")
    for net in (biRNN_model, LSTM_model, biLSTM_model)
)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 124ms/step




[1m1/5[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 93ms/step



[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step


# Performance metrics

## 1. SimpleRNN

In [None]:
# Score the SimpleRNN predictions against the held-out labels.
cm, ac_sc, cl_report = (
    score(y_test, y_pred1)
    for score in (confusion_matrix, accuracy_score, classification_report)
)

# Print each metric under its heading.
for heading, value in (
    ("Confusion matrix: \n", cm),
    ("\nAccuracy score: ", ac_sc),
    ("\nClassification report: \n", cl_report),
):
    print(heading, value)

Confusion matrix: 
 [[74 10]
 [10 48]]

Accuracy score:  0.8591549295774648

Classification report: 
               precision    recall  f1-score   support

           0       0.88      0.88      0.88        84
           1       0.83      0.83      0.83        58

    accuracy                           0.86       142
   macro avg       0.85      0.85      0.85       142
weighted avg       0.86      0.86      0.86       142



## 2. Bi-directional RNN

In [None]:
# Score the bidirectional-RNN predictions against the held-out labels.
cm, ac_sc, cl_report = (
    score(y_test, y_pred2)
    for score in (confusion_matrix, accuracy_score, classification_report)
)

# Print each metric under its heading.
for heading, value in (
    ("Confusion matrix: \n", cm),
    ("\nAccuracy score: ", ac_sc),
    ("\nClassification report: \n", cl_report),
):
    print(heading, value)

Confusion matrix: 
 [[77  7]
 [10 48]]

Accuracy score:  0.8802816901408451

Classification report: 
               precision    recall  f1-score   support

           0       0.89      0.92      0.90        84
           1       0.87      0.83      0.85        58

    accuracy                           0.88       142
   macro avg       0.88      0.87      0.88       142
weighted avg       0.88      0.88      0.88       142



## 3. LSTM

In [None]:
# Score the LSTM predictions against the held-out labels.
cm, ac_sc, cl_report = (
    score(y_test, y_pred3)
    for score in (confusion_matrix, accuracy_score, classification_report)
)

# Print each metric under its heading.
for heading, value in (
    ("Confusion matrix: \n", cm),
    ("\nAccuracy score: ", ac_sc),
    ("\nClassification report: \n", cl_report),
):
    print(heading, value)

Confusion matrix: 
 [[81  3]
 [27 31]]

Accuracy score:  0.7887323943661971

Classification report: 
               precision    recall  f1-score   support

           0       0.75      0.96      0.84        84
           1       0.91      0.53      0.67        58

    accuracy                           0.79       142
   macro avg       0.83      0.75      0.76       142
weighted avg       0.82      0.79      0.77       142



## 4. Bi-LSTM

In [None]:
# Score the bidirectional-LSTM predictions against the held-out labels.
cm, ac_sc, cl_report = (
    score(y_test, y_pred4)
    for score in (confusion_matrix, accuracy_score, classification_report)
)

# Print each metric under its heading.
for heading, value in (
    ("Confusion matrix: \n", cm),
    ("\nAccuracy score: ", ac_sc),
    ("\nClassification report: \n", cl_report),
):
    print(heading, value)

Confusion matrix: 
 [[68 16]
 [10 48]]

Accuracy score:  0.8169014084507042

Classification report: 
               precision    recall  f1-score   support

           0       0.87      0.81      0.84        84
           1       0.75      0.83      0.79        58

    accuracy                           0.82       142
   macro avg       0.81      0.82      0.81       142
weighted avg       0.82      0.82      0.82       142



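# Results:

Confusion matrices for each model (rows = true label, columns = predicted label). Note: these figures do not match the cell outputs in the "Performance metrics" section above (for example, SimpleRNN shows `[[74 10] [10 48]]` there); they appear to come from a different training run.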

1. RNN
```
[[57 27]
 [ 2 56]]
```

2. BiRNN
```
 [[78  6]
 [ 9 49]]
```

3. LSTM
```
[[63 21]
 [ 7 51]]
```

4. BiLSTM
```
 [[81  3]
 [24 34]]
```


# Analysis:
