Importing Libraries

In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import re
from wordcloud import WordCloud

In [32]:
from sklearn.utils.class_weight import compute_class_weight

In [33]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [35]:
from tensorflow.keras.callbacks import EarlyStopping

In [36]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM,MaxPool1D,Conv1D



Importing Datasets

In [37]:
fake_news = pd.read_csv('/content/Fake.csv')
true_news = pd.read_csv('/content/True.csv')

WordCloud


In [None]:

def plot_wordcloud(text, title):
    wordcloud = WordCloud(width=1920, height=1080).generate(" ".join(text))
    plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title)
    plt.show()

plot_wordcloud(fake_news['title'], "Fake News WordCloud")
plot_wordcloud(true_news['title'], "True News WordCloud")

Dataset preprocessing

In [39]:
true_news['class'] = 1
fake_news['class'] = 0



data = pd.concat([fake_news, true_news], axis=0).sample(frac=1).reset_index(drop=True)



def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower().split()
    text = [word for word in text if word not in stop_words]
    return " ".join(text)

data['title'] = data['title'].fillna('').apply(clean_text)

Visualizing data distribution

In [None]:
label_counts = data['class'].value_counts()
labels = ['Fake News', 'Real News']
sizes = [label_counts[0], label_counts[1]]
colors = ['lightcoral', 'lightgreen']
plt.figure(figsize=(6, 6))
plt.pie(sizes, labels=labels, colors=colors,autopct='%1.1f%%', startangle=90, shadow=True)
plt.title('Fake vs Real News Distribution')
plt.axis('equal')
plt.show()


In [41]:
data.head()

Unnamed: 0,title,text,subject,date,class
0,donald trump diet lifelong liberal tells lefti...,"Every single day, liberals provide more and mo...",politics,"Dec 8, 2017",0
1,largest gun control study history completed nr...,While Republicans and their NRA puppet masters...,News,"March 2, 2016",0
2,pope makes visit nuns obama regime suing confo...,Leave it to our Community Organizer In Chief t...,left-news,"Sep 24, 2015",0
3,trump speaks qatar emir gulf unity terrorism f...,WASHINGTON (Reuters) - President Donald Trump ...,worldnews,"September 8, 2017",1
4,exclusive post election liberal meltdown expla...,If you re like me . . . it s probably been an ...,politics,"Feb 23, 2017",0


In [42]:
df = pd.DataFrame(data)
X = df['title'].values
y = df['class'].values

Tokenization

In [43]:
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
maxlen = 20
X_padded = pad_sequences(sequences, maxlen=maxlen, padding='post')

Using and importing LSTM model

In [44]:
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=300),
    LSTM(128, dropout=0.3, recurrent_dropout=0.3),
    Dense(1, activation='sigmoid')
])
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.3, random_state=42)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=y_train)
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.1, callbacks=[early_stop],class_weight={0: class_weights[0], 1: class_weights[1]})


Epoch 1/10




[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 125ms/step - accuracy: 0.8057 - loss: 0.3754 - val_accuracy: 0.9399 - val_loss: 0.1538
Epoch 2/10
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 125ms/step - accuracy: 0.9599 - loss: 0.1147 - val_accuracy: 0.9453 - val_loss: 0.1490
Epoch 3/10
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 123ms/step - accuracy: 0.9712 - loss: 0.0859 - val_accuracy: 0.9380 - val_loss: 0.1527
Epoch 4/10
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 119ms/step - accuracy: 0.9765 - loss: 0.0728 - val_accuracy: 0.9418 - val_loss: 0.1562


<keras.src.callbacks.history.History at 0x7963027cf8d0>

In [61]:
from tensorflow.keras.layers import Bidirectional,Embedding
# Sequential Model
model = Sequential()
total_words = 5000
# embeddidng layer
model.add(Embedding(total_words, output_dim = 128))
# model.add(Embedding(total_words, output_dim = 240))


# Bi-Directional RNN and LSTM
model.add(Bidirectional(LSTM(128))) # no of neurons

# Dense layers
model.add(Dense(128, activation = 'relu'))
model.add(Dense(1,activation= 'sigmoid')) # reason: we do binary classification here
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.3, random_state=42)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=y_train)
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.1, callbacks=[early_stop],class_weight={0: class_weights[0], 1: class_weights[1]})


Epoch 1/10
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 163ms/step - accuracy: 0.8351 - loss: 0.3329 - val_accuracy: 0.9475 - val_loss: 0.1330
Epoch 2/10
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 157ms/step - accuracy: 0.9699 - loss: 0.0838 - val_accuracy: 0.9472 - val_loss: 0.1297
Epoch 3/10
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 169ms/step - accuracy: 0.9836 - loss: 0.0499 - val_accuracy: 0.9475 - val_loss: 0.1434
Epoch 4/10
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 156ms/step - accuracy: 0.9866 - loss: 0.0385 - val_accuracy: 0.9497 - val_loss: 0.1589


<keras.src.callbacks.history.History at 0x7962906f59d0>

In [62]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)

[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 23ms/step
Accuracy Score: 0.9525612472160356


Predicting

In [64]:
def predict_news(text):

    tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    maxlen = 20
    padded = pad_sequences(sequences, maxlen=maxlen, padding='post')
    pred = model.predict(padded)[0][0]



    label = "Real News" if pred > 0.5 else "Fake News"
    print(f"Prediction: {label} ({pred:.2f})")


In [69]:
predict_news("NASA Confirms Presence of Water on the Moon")
predict_news("World Health Organization Approves Malaria Vaccine")
predict_news("Apple Unveils Next-Generation iPhone with New AI Features")
predict_news("UN Climate Agreement Signed by 150 Nations")
predict_news("Pfizer Announces New Drug to Treat Lung Cancer")

predict_news("Bill Gates Installs Tracking Chips in COVID Vaccines")
predict_news("Aliens Found Working at Go cgjvhbnklhxdtybhonjbfvyuogle Headquarters")
predict_news("5G Towers Responsible for Birhvjhkjcfh kgjbhkjkld Deaths Across the Globe")
predict_news("The Earth is Flat and NASA Faked Allfgxitgxcyktv Space Missions")
predict_news("Drinking Bleach Can Cure Coronavirus, Experts Say")


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step 
Prediction: Real News (0.71)
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Prediction: Real News (0.70)
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
Prediction: Real News (0.98)
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
Prediction: Real News (0.89)
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Prediction: Fake News (0.49)
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Prediction: Fake News (0.36)
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Prediction: Real News (0.97)
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Prediction: Real News (0.84)
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Prediction: Real News (0.97)
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Prediction: Fake N