<a href="https://colab.research.google.com/github/Najaf-Ali12/DL-Projects/blob/main/Fake_News_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#! pip install tensorflow

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot

**Loading the Dataset



In [3]:
data=pd.read_csv("FakeNewsNet.csv")
data.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42.0,1.0
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0.0,1.0
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63.0,1.0
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20.0,1.0
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38.0,1.0


*Preprocessing*

In [4]:
data.columns
data.dtypes

Unnamed: 0,0
title,object
news_url,object
source_domain,object
tweet_num,float64
real,float64


In [5]:
data.isnull().sum()
# we will not remove the null values b/c we will only use title and the real column in our model which have zero null value

Unnamed: 0,0
title,0
news_url,157
source_domain,158
tweet_num,1
real,1


In [6]:
# To check whether the outputs/data is balanced or not
data['real'].value_counts()

# We got a big issue which is imbalanced data that will compel our model to predict majority values multiple times and minority values very few times.


Unnamed: 0_level_0,count
real,Unnamed: 1_level_1
1.0,8555
0.0,2740


***Handling Imbalanced Data Issue***

In [7]:
minority_class=data[data['real']==0]
majority_class=data[data['real']==1]
print(majority_class.value_counts().sum())
print(minority_class.value_counts().sum())

8521
2617


***Resampling Techniques (using scikit-learn for basic methods) ---***
This increases the number of samples in the minority class by randomly duplicating them.

In [8]:
from sklearn.utils import resample
print("OverSampling minority class")
over_sampled_minority=resample(minority_class,replace=True,n_samples=len(majority_class),random_state=42)
print(over_sampled_minority.value_counts().sum())
print(majority_class.value_counts().sum())
print(minority_class.value_counts().sum())
data.value_counts().sum()

# Resample method returns a dataframe here name is over_sampled_minority

OverSampling minority class
8174
8521
2617


np.int64(11138)

In [9]:
# Concatenating over_sampled_minority and majority class to get new balanced dataframe
concatenated_df=pd.concat([majority_class,over_sampled_minority])
print(type(concatenated_df))

# Shuffling the concatenated df so that our model doesnot learn an Artificial Order
concatenated_df=concatenated_df.sample(frac=1,random_state=42).reset_index(drop=True)

<class 'pandas.core.frame.DataFrame'>


In [10]:
# Creating X and y
X=concatenated_df['title']
y=concatenated_df['real']
print(X.head())
print(y.head())

0    Justin Timberlake Stops Concert for Epic Pregn...
1    Mandy Moore Celebrates Engagement Party With F...
2    Sarah Jessica Parker reflects on Sex and the C...
3    Angelina Jolie, Fragile At 79 Lbs, Battles Bra...
4    Kylie Jenner & Travis Scott Fighting Over Baby...
Name: title, dtype: object
0    1.0
1    1.0
2    1.0
3    0.0
4    0.0
Name: real, dtype: float64


In [22]:
import nltk
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [23]:

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
corpus=[]
for i in range(1,len(X)):
    review=re.sub("[^a-zA-Z]"," ",X[i])
    review=review.lower()
    review=review.split()
    # Using WordNetLemmatizer to lemmatize the words
    review=[WordNetLemmatizer().lemmatize(word) for word in review if word not in stopwords.words("english")]
    review=" ".join(review)
    corpus.append(review)

In [27]:
# Finding the vocabulory size
voc=[]
for each in corpus:
  for word in each.split(" "):
    if word not in voc:
      voc.append(word)
print(voc)
voc_size=len(voc)
print("Total number of unique words=",voc_size)

Total number of unique words= 11345


In [32]:
for i in range(0,5):
  print(corpus[i])

mandy moore celebrates engagement party friend pic
sarah jessica parker reflects sex city th anniversary
angelina jolie fragile lb battle brad pitt career co star fear fate jennifer aniston
kylie jenner travis scott fighting baby name set choice pushing away
taylor momsen


In [33]:
# One-Hot Encoding
one_hot_repr=[one_hot(words,voc_size) for words in corpus]
print(one_hot_repr)

[[4544, 7327, 3491, 2775, 7318, 1228, 7507], [7719, 4806, 4882, 7050, 4025, 3536, 2248, 3793], [1378, 8818, 1373, 385, 1219, 7692, 3411, 3908, 2798, 2717, 4373, 6873, 8195, 7556], [5164, 9371, 6205, 1023, 5457, 4284, 858, 7474, 6981, 2630, 8480], [1121, 3298], [505, 7679, 1619, 10863, 1897, 11198, 4967], [1845, 102, 9903, 4645, 3747, 2216, 9102, 3747, 6206, 9535, 6998], [1386, 6473, 7880, 868, 6612, 4505, 2810], [9389, 5330, 370, 7140, 3562, 596, 10781], [3232, 10290], [3077, 11180, 10992, 6005, 11106, 1386, 2970, 5623, 11092, 7328], [7896, 10940, 6525, 2669, 9149, 4574, 3207, 5737, 9589, 154, 10166, 6531], [9389, 308, 5592, 7900, 4866, 7168, 7716, 4071, 6919, 3320, 4107, 6266, 8064], [2873, 6324, 9869, 3578, 9371, 3940, 243, 10440, 5848], [6215, 5999, 370, 7828, 7073, 11136, 8457, 2173], [7794, 3887, 5162, 8394, 2438, 9851, 6298, 7154], [4807, 460, 6142, 6196, 1778, 11294, 6635, 9682, 9911], [2319, 7031, 7388, 2962, 8967, 6317, 8373, 8711], [9688, 7992, 10408, 6123, 6570, 10722, 6548,

In [71]:
# Finding the max length sentence in corpus
max=len(one_hot_repr[1])
for each in one_hot_repr:
  if len(each)>max:
    max=len(each)

print("The largest sentence in the corpus has",max,"words")

The largest sentence in the corpus has 26 words


In [72]:
# Padding so that values can be passed to model
from tensorflow.keras.preprocessing.sequence import pad_sequences
equal_sized_corpus=pad_sequences(one_hot_repr,padding="post",maxlen=max)
print("Equal_sized_corpus: \n")
print(equal_sized_corpus)

Equal_sized_corpus: 

[[4544 7327 3491 ...    0    0    0]
 [7719 4806 4882 ...    0    0    0]
 [1378 8818 1373 ...    0    0    0]
 ...
 [3380 9689 3232 ...    0    0    0]
 [4806  988 6899 ...    0    0    0]
 [1121 5984 4778 ...    0    0    0]]


In [73]:
# Deep learning and Model Initialization
from tensorflow.keras.layers import LSTM
model=Sequential()
model.add(Embedding(input_dim=voc_size,output_dim=20,input_length=max))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(1,activation='sigmoid'))
model.build((None,20))
model.summary()



In [74]:
# Compiling Model
model.compile(optimizer="adam",metrics=['accuracy'],loss="categorical_crossentropy")

In [75]:
# Printing no of rows in X and y
print(equal_sized_corpus.shape)
print(y.shape)

(17109, 26)
(17109,)


In [76]:
# Removing one element from y so that the no of rows in both X and y should be same
y.drop(y.iloc[0],inplace=True)

KeyError: '[1.0] not found in axis'

In [77]:
# Train-test-split before training
X_train,X_test,y_train,y_test=train_test_split(equal_sized_corpus,y,test_size=0.2,random_state=42)



In [78]:
# Training Model
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20,batch_size=40)

Epoch 1/20


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5058 - loss: 0.0000e+00

  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.5058 - loss: 0.0000e+00 - val_accuracy: 0.4863 - val_loss: 0.0000e+00
Epoch 2/20
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - accuracy: 0.5043 - loss: 0.0000e+00 - val_accuracy: 0.4863 - val_loss: 0.0000e+00
Epoch 3/20
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.5043 - loss: 0.0000e+00 - val_accuracy: 0.4863 - val_loss: 0.0000e+00
Epoch 4/20
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.5032 - loss: 0.0000e+00 - val_accuracy: 0.4863 - val_loss: 0.0000e+00
Epoch 5/20
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.5009 - loss: 0.0000e+00 - val_accuracy: 0.4863 - val_loss: 0.0000e+00
Epoch 6/20
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13

<keras.src.callbacks.history.History at 0x7c23f63235d0>

In [80]:
# Predication
y_pred=model.predict(X_test)

[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [81]:
# Accuracy score of model
accuracy=accuracy_score(y_test,y_pred.round())
print("Accuracy of model=",accuracy)

Accuracy of model= 0.4862653419053185


In [82]:
# Confusion matrix
confusion_matrix(y_test,y_pred.round())

array([[1664,    0],
       [1758,    0]])

In [84]:
# Classification report
print(classification_report(y_test,y_pred.round()))

              precision    recall  f1-score   support

         0.0       0.49      1.00      0.65      1664
         1.0       0.00      0.00      0.00      1758

    accuracy                           0.49      3422
   macro avg       0.24      0.50      0.33      3422
weighted avg       0.24      0.49      0.32      3422



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
