# Dataset Link

https://github.com/pillai-ashwin/Fake-news-Detection/blob/master/datasets/train.csv

In [64]:
import pandas as pd

In [65]:
df = pd.read_csv(
    "train.csv",
    engine="python",
    quoting=3,      # Ignore quotes
    on_bad_lines="skip"
)

In [66]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,Economic Growth Data Released Today,Times of India,The study was conducted over five years and re...,1
1,1,Viral Claim Spreads Across Internet,Anonymous Source,This claim has spread rapidly on social media ...,0
2,2,Health Ministry Announces New Guidelines,Reuters,The study was conducted over five years and re...,1
3,3,Breaking News That Media Is Hiding,Independent Reporter,This claim has spread rapidly on social media ...,0
4,4,Scientists Publish Findings in Peer-Reviewed J...,Associated Press,The study was conducted over five years and re...,1


In [67]:
df.isnull()

Unnamed: 0,id,title,author,text,label
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
17991,False,False,False,False,False
17992,False,False,False,False,False
17993,False,False,False,False,False
17994,False,False,False,False,False


# Drop the Missing Values

In [68]:
df=df.dropna()

# Get the Independent Values and Dependent Values

In [69]:
X = df.drop("label", axis=1)
Y = df['label']

In [70]:
X.shape

(17996, 4)

In [71]:
Y.shape

(17996,)

In [72]:
import tensorflow as tf

In [73]:
tf.__version__

'2.19.0'

In [74]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM, Input
from tensorflow.keras.layers import Dense

# OneHotEncoding Representation

In [75]:
voc_size = 5000

In [76]:
messages = X.copy()

In [77]:
messages['title'][1]

'Viral Claim Spreads Across Internet'

In [78]:
messages

Unnamed: 0,id,title,author,text
0,0,Economic Growth Data Released Today,Times of India,The study was conducted over five years and re...
1,1,Viral Claim Spreads Across Internet,Anonymous Source,This claim has spread rapidly on social media ...
2,2,Health Ministry Announces New Guidelines,Reuters,The study was conducted over five years and re...
3,3,Breaking News That Media Is Hiding,Independent Reporter,This claim has spread rapidly on social media ...
4,4,Scientists Publish Findings in Peer-Reviewed J...,Associated Press,The study was conducted over five years and re...
...,...,...,...,...
17991,19993,Viral Claim Spreads Across Internet,Independent Reporter,Experts warn that this misleading information ...
17992,19994,Government Releases Official Policy Report,Times of India,Data released today shows a steady increase co...
17993,19995,Hidden Facts Revealed About Elections,Social Media Post,Experts warn that this misleading information ...
17994,19997,Hidden Facts Revealed About Elections,Anonymous Source,This claim has spread rapidly on social media ...


In [79]:
messages.reset_index(inplace=True)

In [80]:
messages

Unnamed: 0,index,id,title,author,text
0,0,0,Economic Growth Data Released Today,Times of India,The study was conducted over five years and re...
1,1,1,Viral Claim Spreads Across Internet,Anonymous Source,This claim has spread rapidly on social media ...
2,2,2,Health Ministry Announces New Guidelines,Reuters,The study was conducted over five years and re...
3,3,3,Breaking News That Media Is Hiding,Independent Reporter,This claim has spread rapidly on social media ...
4,4,4,Scientists Publish Findings in Peer-Reviewed J...,Associated Press,The study was conducted over five years and re...
...,...,...,...,...,...
17991,17991,19993,Viral Claim Spreads Across Internet,Independent Reporter,Experts warn that this misleading information ...
17992,17992,19994,Government Releases Official Policy Report,Times of India,Data released today shows a steady increase co...
17993,17993,19995,Hidden Facts Revealed About Elections,Social Media Post,Experts warn that this misleading information ...
17994,17994,19997,Hidden Facts Revealed About Elections,Anonymous Source,This claim has spread rapidly on social media ...


# Stopwards and Stemming

In [81]:
import nltk
import re
from nltk.corpus import stopwords

In [82]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [83]:
from nltk.stem.porter import PorterStemmer ##stemming purpose
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [84]:
corpus

['econom growth data releas today',
 'viral claim spread across internet',
 'health ministri announc new guidelin',
 'break news media hide',
 'scientist publish find peer review journal',
 'shock truth govern polici',
 'econom growth data releas today',
 'break news media hide',
 'shock truth govern polici',
 'hidden fact reveal elect',
 'health ministri announc new guidelin',
 'break news media hide',
 'econom growth data releas today',
 'viral claim spread across internet',
 'scientist publish find peer review journal',
 'shock truth govern polici',
 'econom growth data releas today',
 'break news media hide',
 'break news media hide',
 'scientist publish find peer review journal',
 'shock truth govern polici',
 'scientist publish find peer review journal',
 'hidden fact reveal elect',
 'health ministri announc new guidelin',
 'viral claim spread across internet',
 'scientist publish find peer review journal',
 'viral claim spread across internet',
 'govern releas offici polici repo

In [85]:
corpus[1]

'viral claim spread across internet'

In [86]:
onehot_repr=[one_hot(words,voc_size)for words in corpus]
onehot_repr

[[2724, 758, 1818, 1192, 1787],
 [1640, 2524, 1936, 2626, 2808],
 [2088, 572, 4468, 4768, 3942],
 [748, 473, 88, 194],
 [2798, 1083, 331, 768, 1809, 4695],
 [4901, 1695, 2024, 1170],
 [2724, 758, 1818, 1192, 1787],
 [748, 473, 88, 194],
 [4901, 1695, 2024, 1170],
 [4464, 3971, 1729, 2430],
 [2088, 572, 4468, 4768, 3942],
 [748, 473, 88, 194],
 [2724, 758, 1818, 1192, 1787],
 [1640, 2524, 1936, 2626, 2808],
 [2798, 1083, 331, 768, 1809, 4695],
 [4901, 1695, 2024, 1170],
 [2724, 758, 1818, 1192, 1787],
 [748, 473, 88, 194],
 [748, 473, 88, 194],
 [2798, 1083, 331, 768, 1809, 4695],
 [4901, 1695, 2024, 1170],
 [2798, 1083, 331, 768, 1809, 4695],
 [4464, 3971, 1729, 2430],
 [2088, 572, 4468, 4768, 3942],
 [1640, 2524, 1936, 2626, 2808],
 [2798, 1083, 331, 768, 1809, 4695],
 [1640, 2524, 1936, 2626, 2808],
 [2024, 1192, 4971, 1170, 4439],
 [1640, 2524, 1936, 2626, 2808],
 [2798, 1083, 331, 768, 1809, 4695],
 [748, 473, 88, 194],
 [1291, 2798, 321],
 [1640, 2524, 1936, 2626, 2808],
 [1291, 2

In [87]:
corpus[1]

'viral claim spread across internet'

In [88]:
onehot_repr[1]

[1640, 2524, 1936, 2626, 2808]

# Padding

In [89]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 1818 1192 1787]
 [   0    0    0 ... 1936 2626 2808]
 [   0    0    0 ... 4468 4768 3942]
 ...
 [   0    0    0 ... 3971 1729 2430]
 [   0    0    0 ... 3971 1729 2430]
 [   0    0    0 ... 1695 2024 1170]]


In [90]:
embedded_docs[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0, 1640, 2524, 1936, 2626, 2808], dtype=int32)

# Embedding and LSTM Model

In [91]:
embedding_vector_features = 40

model = Sequential()
model.add(Input(shape=(sent_length,)))
model.add(Embedding(voc_size, embedding_vector_features))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

# Testing and Evaluating

In [92]:
len(embedded_docs),Y.shape

(17996, (17996,))

In [93]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(Y)

In [94]:
X_final.shape,y_final.shape

((17996, 20), (17996,))

In [95]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [97]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=1,batch_size=64)

[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 52ms/step - accuracy: 1.0000 - loss: 2.4248e-05 - val_accuracy: 1.0000 - val_loss: 1.9239e-05


<keras.src.callbacks.history.History at 0x7856055a5a90>

Performance Metrics and Accuracy

In [98]:
y_pred=model.predict(X_test)

[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step


In [99]:
y_pred=np.where(y_pred > 0.6, 1,0) ##AUC ROC Curve

In [100]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[3352,    0],
       [   0, 2587]])

In [101]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

1.0

In [102]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3352
           1       1.00      1.00      1.00      2587

    accuracy                           1.00      5939
   macro avg       1.00      1.00      1.00      5939
weighted avg       1.00      1.00      1.00      5939

