In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import tensorflow as tf 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense 
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.utils import to_categorical
# Alternative: you can also use tf.one_hot directly
from tensorflow.keras.layers import Dropout 
from tensorflow.keras.layers import Bidirectional 

In [2]:
df = pd.read_csv('WELFake_Dataset.csv')
df.head()  

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [3]:
df = df.dropna()

In [4]:
x = df.drop('label', axis=1)

In [5]:
y = df['label']

In [6]:
y.value_counts()

label
1    36509
0    35028
Name: count, dtype: int64

In [7]:
vocab_size = 10000
msgs = x.copy()

In [8]:
msgs['title'].iloc[1]

'UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MOST CHARLOTTE RIOTERS WERE “PEACEFUL” PROTESTERS…In Her Home State Of North Carolina [VIDEO]'

In [9]:
msgs.reset_index(inplace=True)

In [10]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Import the one_hot function from TensorFlow (this is for text-to-integer encoding)
from tensorflow.keras.preprocessing.text import one_hot


In [13]:
ps = PorterStemmer()
corpus = []
for i in range(0, len(msgs)):
    review = re.sub('[^a-zA-Z]', ' ', msgs['title'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [14]:
one_hot_repr = [one_hot(words, vocab_size) for words in corpus]

In [15]:
sent_length = 20
embedded_docs = pad_sequences(one_hot_repr, padding='pre', maxlen=sent_length)

In [16]:
print(embedded_docs)

[[   0    0    0 ... 3821 8776 1091]
 [   0    0    0 ... 6165 6029 1091]
 [   0    0    0 ... 4977 6849 2620]
 ...
 [   0    0    0 ... 4559 3331 9183]
 [   0    0    0 ... 4902 2182 4472]
 [   0    0    0 ... 8185 6545 3926]]


In [17]:
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [18]:
embedding_vector_features = 40 
model1 = Sequential()
model1.add(Embedding(vocab_size, embedding_vector_features, input_length=sent_length))
model1.add(Bidirectional(LSTM(100)))
model1.add(Dropout(0.3))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [19]:
x_final = np.array(embedded_docs)
y_final = np.array(y)

In [20]:
x_final.shape, y_final.shape

((71537, 20), (71537,))

In [21]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x_final, y_final, test_size=0.33, random_state=42)

In [22]:
model1.fit(xtrain, ytrain, validation_data=(xtest, ytest), epochs=10, batch_size=64)

Epoch 1/10
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 18ms/step - accuracy: 0.8094 - loss: 0.3951 - val_accuracy: 0.9035 - val_loss: 0.2440
Epoch 2/10
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.9228 - loss: 0.1977 - val_accuracy: 0.9053 - val_loss: 0.2329
Epoch 3/10
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.9392 - loss: 0.1594 - val_accuracy: 0.9057 - val_loss: 0.2397
Epoch 4/10
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.9528 - loss: 0.1260 - val_accuracy: 0.8997 - val_loss: 0.2626
Epoch 5/10
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.9614 - loss: 0.1049 - val_accuracy: 0.9002 - val_loss: 0.2845
Epoch 6/10
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.9710 - loss: 0.0824 - val_accuracy: 0.8986 - val_loss: 0.3206
Epoch 7/10
[1m7

<keras.src.callbacks.history.History at 0x15f74043530>

In [23]:
model.fit(xtrain, ytrain, validation_data=(xtest, ytest), epochs=10, batch_size=64)

Epoch 1/10
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 15ms/step - accuracy: 0.8155 - loss: 0.3902 - val_accuracy: 0.8975 - val_loss: 0.2429
Epoch 2/10
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.9188 - loss: 0.2025 - val_accuracy: 0.9044 - val_loss: 0.2323
Epoch 3/10
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.9364 - loss: 0.1635 - val_accuracy: 0.9037 - val_loss: 0.2455
Epoch 4/10
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.9503 - loss: 0.1313 - val_accuracy: 0.8994 - val_loss: 0.2725
Epoch 5/10
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.9606 - loss: 0.1063 - val_accuracy: 0.8996 - val_loss: 0.2959
Epoch 6/10
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.9675 - loss: 0.0860 - val_accuracy: 0.8936 - val_loss: 0.3197
Epoch 7/10
[1m7

<keras.src.callbacks.history.History at 0x15f757bd610>

In [24]:
y_pred1 = model1.predict(xtest)

[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step


In [25]:
print(y_pred1)

[[9.99872029e-01]
 [1.04180435e-05]
 [1.09248513e-05]
 ...
 [1.00000000e+00]
 [9.99991775e-01]
 [1.56748338e-05]]


In [26]:
from sklearn.metrics import confusion_matrix

In [27]:
y_pred_classes = (y_pred1 > 0.5).astype(int)

In [28]:
confusion_matrix(ytest, y_pred_classes)

array([[10339,  1354],
       [ 1151, 10764]], dtype=int64)

In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(ytest, y_pred_classes)


0.8938919010504913

In [30]:
from sklearn.metrics import classification_report
print(classification_report(ytest, y_pred_classes))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89     11693
           1       0.89      0.90      0.90     11915

    accuracy                           0.89     23608
   macro avg       0.89      0.89      0.89     23608
weighted avg       0.89      0.89      0.89     23608



In [31]:
# Save the trained model
model1.save('fake_news_detector.h5')
print("Model saved successfully!")




Model saved successfully!


In [32]:
# Create a prediction function for new text
def predict_fake_news(text):
    """
    Predict if a news article is fake or real
    Args:
        text (str): The news article text/title
    Returns:
        tuple: (prediction, confidence)
    """
    # Preprocess the text (same steps as training)
    review = re.sub('[^a-zA-Z]', ' ', text)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    
    # Convert to one-hot representation
    one_hot_words = one_hot(review, vocab_size)
    
    # Pad the sequence
    embedded_words = pad_sequences([one_hot_words], padding='pre', maxlen=sent_length)
    
    # Make prediction
    prediction = model1.predict(embedded_words)[0][0]
    
    # Convert to human readable format
    if prediction > 0.5:
        result = "FAKE"
        confidence = prediction * 100
    else:
        result = "REAL"
        confidence = (1 - prediction) * 100
    
    return result, confidence

# Test the function
test_text = "Breaking: Scientists discover amazing new technology that will change everything!"
result, confidence = predict_fake_news(test_text)
print(f"Text: {test_text}")
print(f"Prediction: {result} (Confidence: {confidence:.2f}%)")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Text: Breaking: Scientists discover amazing new technology that will change everything!
Prediction: FAKE (Confidence: 100.00%)
