# Importing the Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the Dataset

In [2]:
dataset = pd.read_csv('FakeNewsNet.csv' ,  quoting = 3, on_bad_lines='skip')

# Data Analysis

In [3]:
dataset.isnull().sum()

Unnamed: 0,0
title,0
news_url,328
source_domain,328
tweet_num,0
real,0


In [4]:
dataset.shape

(19896, 5)

In [5]:
dataset.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


# Modelling on Balanced data

In [6]:
from sklearn.utils import resample

print(dataset['real'].value_counts())

real
1    15017
0     4879
Name: count, dtype: int64


In [7]:
from sklearn.utils import resample

# Separate majority and minority classes
real_news = dataset[dataset['real'] == 1]
fake_news = dataset[dataset['real'] == 0]

# Upsample fake news to match real news count
fake_news_upsampled = resample(fake_news,
                               replace=True,
                               n_samples=len(real_news),
                               random_state=42)

# Combine upsampled fake news with real news
upsampled_dataset = pd.concat([real_news, fake_news_upsampled])
upsampled_dataset = upsampled_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into features and target
X_balanced = upsampled_dataset.drop('real', axis=1)
y_balanced = upsampled_dataset['real']


In [8]:
print(X_balanced.head())

                                               title  \
0  7 Celebrity Summer Fashion Trends That Don't L...   
1  Meghan Markle Is Invited to Pippa Middleton’s ...   
2  Selena Gomez: Miley Cyrus Publicly Rejects Ple...   
3                              General Gabbery: DWTS   
4  Rihanna Sports Possible Baby Bump In Secret Ph...   

                                            news_url         source_domain  \
0  http://bojtv.com/7-celebrity-summer-fashion-tr...             bojtv.com   
1  https://www.theknotnews.com/meghan-markle-harr...   www.theknotnews.com   
2  hollywoodlife.com/2015/04/15/selena-gomez-mile...     hollywoodlife.com   
3  http://forums.previously.tv/topic/3512-general...  forums.previously.tv   
4      now100fm.com/rihanna-baby-bump-hassan-jameel/          now100fm.com   

   tweet_num  
0         82  
1         94  
2         45  
3         59  
4         16  


In [9]:
print(y_balanced.head())

0    1
1    1
2    0
3    1
4    0
Name: real, dtype: int64


# Cleaning the Dataset

In [10]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(X_balanced.shape[0]):
  text = str(X_balanced['title'][i]) + ' ' + str(X_balanced['source_domain'][i])
  review = re.sub('[^a-zA-Z]',' ',text)
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
print(corpus)

['celebr summer fashion trend look obviou bojtv com', 'meghan markl invit pippa middleton wed www theknotnew com', 'selena gomez miley cyru publicli reject plea end feud hollywoodlif com', 'gener gabberi dwt forum previous tv', 'rihanna sport possibl babi bump secret photo shoot pic fm com', 'robert pattinson charli hunnam lost citi z nan', 'deep throat download hq porn glabellar rssing com', 'nicol kidman prove clap normal seal like oscar mishap internet hyster www thesun co uk', 'skinni jean hater love bella hadid look www whowhatwear com', 'watch grammi award onlin www cb com', 'vanderpump rule cheat drama time vanderpump rule cast cheat www cosmopolitan com', 'democrat nation convent web archiv org', 'kim kardashian kany west secret pact fake marriag debunk www inquisitr com', 'tv show movi nomin golden globe www vultur com', 'tiffani haddish issa rae star jay z friend inspir moonlight video watch www etonlin com', 'madonna exhaust overwhelm book googl com', 'ashton kutcher celebr 

# Creating the bag of words model

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
y = y_balanced.values

In [13]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 313150 stored elements and shape (30034, 13249)>
  Coords	Values
  (0, 1864)	0.25738724233205645
  (0, 11272)	0.38681565245294486
  (0, 3992)	0.3403241031481714
  (0, 12062)	0.39730560828533357
  (0, 6849)	0.28141046168979217
  (0, 8225)	0.5247012681806217
  (0, 1241)	0.3889040668790725
  (0, 2271)	0.06858261390191382
  (1, 2271)	0.07379921872276067
  (1, 7336)	0.27300974352680707
  (1, 7151)	0.2782187874967749
  (1, 5770)	0.42664566691275
  (1, 8859)	0.4080032683032982
  (1, 7445)	0.3191975485536433
  (1, 12798)	0.2719929462637921
  (1, 13110)	0.09216135242360167
  (1, 11693)	0.5565604340923833
  (2, 2271)	0.06177430556304451
  (2, 10282)	0.23282321280635432
  (2, 4747)	0.2352482898286267
  (2, 7475)	0.2979228077354235
  (2, 2726)	0.29506316548532147
  (2, 9243)	0.4104267976509394
  (2, 9580)	0.39916470183099323
  (2, 8917)	0.4026467846651289
  :	:
  (30031, 9336)	0.36712937830576653
  (30031, 9985)	0.4056184811178999
  (30

In [14]:
print(y)

[1 1 0 ... 1 0 0]


# Splitting the Data set into the Traning set and Test set

In [15]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 ,stratify=y, random_state = 2)

# Training The Model Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(random_state = 0, C= 10, max_iter= 100, penalty= 'l2', solver= 'liblinear')
LR.fit(X_train , y_train)

# Evaluation

In [17]:
from sklearn.metrics import confusion_matrix , accuracy_score
LR_y_pred = LR.predict(X_test)
cm = confusion_matrix(y_test , LR_y_pred)
print(cm)
accuracy_score(y_test , LR_y_pred)

[[2804  200]
 [ 356 2647]]


0.9074413184617945

# K - Fold Cross Validation

In [18]:
from sklearn.model_selection import cross_val_score
LR_accuracies = cross_val_score(estimator = LR, X = X_train, y = y_train , cv = 10)
print("Accuracy : {:.2f}%".format(LR_accuracies.mean()*100))
print("Standard Deviation : {:.2f}%".format(LR_accuracies.std()*100))

Accuracy : 90.44%
Standard Deviation : 0.52%


# Applying Grid Search

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define the parameter grid
params = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear'],
    'max_iter': [100, 200]
}

# Create base model
lr = LogisticRegression(random_state=0)

# Set up Grid Search
grid = GridSearchCV(
    estimator=lr,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    error_score='raise'
)

# Fit on training data
grid.fit(X_train, y_train)

# Show best parameters and score
print("Best Parameters:", grid.best_params_)
print("Best Accuracy (CV):", grid.best_score_)

# Use best model to predict
best_lr = grid.best_estimator_
y_pred = best_lr.predict(X_test)

# Evaluate on test data
print("Classification Report:\n", classification_report(y_test, y_pred))


Best Parameters: {'C': 10, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best Accuracy (CV): 0.9055229956657541
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.92      3004
           1       0.94      0.89      0.91      3003

    accuracy                           0.91      6007
   macro avg       0.92      0.91      0.91      6007
weighted avg       0.92      0.91      0.91      6007



# Training the Random Forest model

In [20]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators = 100, criterion = 'entropy',random_state = 0)
RF.fit(X_train , y_train)

In [21]:
RF_y_pred = RF.predict(X_test)
print(RF_y_pred)

[1 1 0 ... 0 0 1]


# Evaluation

In [22]:
RF_y_pred = RF.predict(X_test)
cm = confusion_matrix(y_test , RF_y_pred)
print(cm)
accuracy_score(y_test , RF_y_pred)

[[2893  111]
 [ 214 2789]]


0.9458964541368403

# Training the model with Aritficial Neural Network

In [23]:
#importing the libraries
import tensorflow as tf
from tensorflow import keras
tf.random.set_seed(3)
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping

#Building the Neural Network

In [24]:
#initializing neural network
ann = tf.keras.models.Sequential()

In [25]:
#adding the input layer
ann.add(tf.keras.layers.Dense(units = 32 , activation = 'relu'))
ann.add(Dropout(0.5))

In [26]:
#adding the second hidden layer
ann.add(tf.keras.layers.Dense(units = 32, activation = 'relu'))
ann.add(Dropout(0.5))

In [27]:
#adding the third hidden layer
ann.add(tf.keras.layers.Dense(units = 64, activation = 'relu'))
ann.add(Dropout(0.5))

In [28]:
#adding the output layer
ann.add(tf.keras.layers.Dense(units = 1,activation = 'sigmoid'))

# Training the Neural network

In [29]:
#compiling the neural network
ann.compile(optimizer = 'adam' , loss = 'binary_crossentropy' ,metrics = ['accuracy'])

In [30]:
early_stop = EarlyStopping(
    monitor='val_accuracy',
    patience=6,
    restore_best_weights=True,
    mode='max'
)

In [None]:
#training the neural network
history = ann.fit(X_train, y_train, validation_split=0.2, epochs=20 ,batch_size=32, callbacks=[early_stop], verbose=1)

Epoch 1/20
[1m601/601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.6518 - loss: 0.5978 - val_accuracy: 0.8700 - val_loss: 0.3208
Epoch 2/20
[1m601/601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.8710 - loss: 0.3181 - val_accuracy: 0.8926 - val_loss: 0.2690
Epoch 3/20
[1m601/601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.9212 - loss: 0.2113 - val_accuracy: 0.9109 - val_loss: 0.2494
Epoch 4/20
[1m601/601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.9421 - loss: 0.1591 - val_accuracy: 0.9153 - val_loss: 0.2530
Epoch 5/20
[1m601/601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.9543 - loss: 0.1254 - val_accuracy: 0.9249 - val_loss: 0.2407
Epoch 6/20
[1m601/601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.9628 - loss: 0.1043 - val_accuracy: 0.9238 - val_loss: 0.2673
Epoch 7/20
[1m601/6

In [None]:
loss, accuracy = ann.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy )

In [None]:
#visualizing training accuracy and validation accuracy
h = history

plt.plot(h.history['accuracy'],label='train_accuracy')
plt.plot(h.history['val_accuracy'],label='val_accuracy')
plt.legend()
plt.show()

In [None]:
#visualizing training loss and validation loss

plt.plot(h.history['loss'] , label = 'train_loss')
plt.plot(h.history['val_loss'] , label = 'validation_loss')
plt.legend()
plt.show()

# Save , Load and building a predictive system

In [None]:
#saving the model
ann.save("FakeNewsNet.keras")

import joblib
joblib.dump(vectorizer, "vectorizer.joblib")

In [None]:
#loading the model
from tensorflow.keras.models import load_model
model = load_model("FakeNewsNet.keras")

vectorizer = joblib.load("vectorizer.joblib")

In [None]:
#building a prediction system
input_title = 'Gwen Stefani Got Dumped by Blake Shelton Over "Jealousy and Drama" (EXCLUSIVE)'
input_domain = 'www.intouchweekly.com'
user_input = input_title + ' ' + input_domain
user_input = re.sub('[^a-zA-Z]',' ',user_input)
user_input = user_input.lower()
user_input = user_input.split()
user_input = [ps.stem(word) for word in user_input if not word in set(stopwords.words('english'))]
user_input = ' '.join(user_input)
user_input_transformed = vectorizer.transform([user_input])
prediction = model.predict(user_input_transformed)
if float(prediction[0]) >= 0.5:
  print('It is a Fake News')
else:
  print('The News is Real')