In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [86]:
data = pd.read_csv("WELFake_Dataset.csv")

In [87]:
data = data[['text', 'label']]

In [88]:
data.dropna(inplace=True)  # Remove rows with missing values

# Preprocessing
max_features = 5000  # Vocabulary size
max_len = 200  # Maximum length of sequences
tokenizer = Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(data['text'].values)

In [89]:
# Convert text to sequences and pad them
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X, maxlen=max_len)
y = data['label'].values

In [90]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [91]:
model = Sequential([
    Embedding(input_dim=max_features, output_dim=128, input_length=max_len),  # Specify input_length
    Bidirectional(LSTM(128, return_sequences=False)),  # BiLSTM layer
    Dropout(0.5),  # Dropout for regularization
    Dense(64, activation='relu'),  # Fully connected layer
    Dropout(0.5),  # Additional dropout
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])



In [92]:
model.build(input_shape=(None, max_len)) 

In [93]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [94]:
# Print the model summary
model.summary()


In [96]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.4)

Epoch 1/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 214ms/step - accuracy: 0.8998 - loss: 0.2556 - val_accuracy: 0.8593 - val_loss: 0.2938
Epoch 2/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 204ms/step - accuracy: 0.9315 - loss: 0.1794 - val_accuracy: 0.9492 - val_loss: 0.1551
Epoch 3/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 202ms/step - accuracy: 0.9655 - loss: 0.0998 - val_accuracy: 0.9506 - val_loss: 0.1451
Epoch 4/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 204ms/step - accuracy: 0.9767 - loss: 0.0725 - val_accuracy: 0.9539 - val_loss: 0.1526
Epoch 5/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 203ms/step - accuracy: 0.9850 - loss: 0.0452 - val_accuracy: 0.9544 - val_loss: 0.1535
Epoch 6/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 203ms/step - accuracy: 0.9894 - loss: 0.0315 - val_accuracy: 0.9453 - val_loss:

In [97]:
# Evaluate the model on the test set
y_pred = (model.predict(X_test) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

[1m451/451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 60ms/step


In [98]:
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Accuracy: 0.9426451210208753
Precision: 0.9299712568591586
Recall: 0.9607234444594412
F1-Score: 0.945097258182301


In [140]:
# Function to predict if a given news is real or fake with a fixed threshold
def predict_news(news_text, threshold=0.4):  # Set a fixed threshold
    # Preprocess the input text
    seq = tokenizer.texts_to_sequences([news_text])
    padded_seq = pad_sequences(seq, maxlen=max_len)
    
    # Predict
    prediction = model.predict(padded_seq)[0][0]  # Extract the scalar value from the prediction array
    
    # Convert prediction into a confidence percentage
    confidence = round(prediction * 100, 2) if prediction > threshold else round((1 - prediction) * 100, 2)
    
    # Determine label
    label = "Fake" if prediction > threshold else "Real"
    
    return label, confidence
example_news_list = [
    # Fake News Examples
    "Did they post their votes for Hillary already?",
    "VP Joe Biden: Yeah, I’m going to run in 2020.",
    "The Atlantic said that The NRA has a new favorite toy, but there are no bullets involved.",
    
    # Real News Examples
    "ESPN’s Wilbon on Kaepernick: ’This Is a  - There’s No Question About It’ - Breitbart",
    "U.S. senator: Cuba ambassador will not be approved this year",
    "Travel Ban enhances Persian New Year Celebration - The New York Times"
]

# Function to test multiple examples with a fixed threshold
def test_examples(news_list, threshold=0.4):
    print("Testing Examples:\n")
    results = []
    for i, news_text in enumerate(news_list, 1):
        label, confidence = predict_news(news_text, threshold)
        results.append((news_text, label, confidence))
        print(f"Example {i}: \"{news_text}\" -> Classified as: {label} (Confidence: {confidence}%)")
    return results

# Test the examples with the fixed threshold
threshold = 0.9  # Experiment with this value
test_results = test_examples(example_news_list, threshold)

# Optional: Summarize results
real_count = sum(1 for _, label, _ in test_results if label == "Real")
fake_count = sum(1 for _, label, _ in test_results if label == "Fake")

print("\nSummary:")
print(f"Total Examples Tested: {len(test_results)}")
print(f"Classified as Real: {real_count}")
print(f"Classified as Fake: {fake_count}")

Testing Examples:

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
Example 1: "Did they post their votes for Hillary already?" -> Classified as: Fake (Confidence: 99.98%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
Example 2: "VP Joe Biden: Yeah, I’m going to run in 2020." -> Classified as: Fake (Confidence: 99.99%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
Example 3: "The Atlantic said that The NRA has a new favorite toy, but there are no bullets involved." -> Classified as: Real (Confidence: 99.54%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
Example 4: "ESPN’s Wilbon on Kaepernick: ’This Is a  - There’s No Question About It’ - Breitbart" -> Classified as: Fake (Confidence: 100.0%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
Example 5: "U.S. senator: Cuba ambassador will not be approved this year" -> Classified as: Real (Confidence: 99.91%)
[1m1/1

In [139]:
model.save("model.h5")




In [131]:
import pickle

# Assuming you used a tokenizer (e.g., from TensorFlow/Keras)
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
