<a href="https://colab.research.google.com/github/LeeMinQi-25/Fake-News-Detection-with-Deep-Learning/blob/main/LSTM(Fine_Tuned).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

# === 1. Load and Clean Data ===
df = pd.read_csv("WELFake_Dataset.csv")  # Replace with your actual CSV filename

# Drop rows missing any of the three fields used below, then renumber the index
df = df.dropna(subset=['title', 'text', 'label']).reset_index(drop=True)

# Combine Title and Text for a stronger signal
df['content'] = df['title'] + " " + df['text']
df = df[['content', 'label']].rename(columns={'label': 'Label'})

# Keep only labels that are 0 or 1.
# Compare numerically instead of as strings: if the column was parsed as float
# (e.g. because the raw file had missing labels), the values stringify as
# '0.0'/'1.0' and a string comparison against '0'/'1' would discard every row.
# Values that cannot be parsed as numbers become NaN and are filtered out.
df['Label'] = pd.to_numeric(df['Label'], errors='coerce')
# .copy() gives an independent frame, so the column assignment below does not
# write into a filtered view (avoids pandas' SettingWithCopyWarning).
df = df[df['Label'].isin([0, 1])].copy()
df['Label'] = df['Label'].astype(int)

# Check dataset shape
print("✅ Dataset shape after cleaning:", df.shape)

print("Dataset shape after cleaning:", df.shape)

# --- Preprocess Text ---
def clean_text(text):
    """Return `text` with disallowed characters removed, then lowercased.

    Only ASCII letters, digits and whitespace survive the filter; everything
    else (punctuation, symbols, non-ASCII letters) is deleted without
    inserting a replacement. Lowercasing happens after the filter.
    """
    disallowed = re.compile(r"[^a-zA-Z0-9\s]")
    return disallowed.sub("", text).lower()

# Normalise every document in place with clean_text (element-wise)
df['content'] = df['content'].map(clean_text)

✅ Dataset shape after cleaning: (71537, 2)
Dataset shape after cleaning: (71537, 2)


In [None]:
# === 2. Tokenization ===
MAX_WORDS = 10000  # vocabulary cap for the tokenizer; also the embedding's input_dim
MAX_LEN = 200      # every sequence is padded/truncated to this many tokens
SEED = 42          # shared by the train/test split and the framework seeding

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling start from the same state on every run.
set_random_seed(SEED)

# --- Train/Test Split ---
# Split row positions first (same test_size and random_state as before) so the
# tokenizer vocabulary can be learned from the training rows only. Fitting it
# on the full corpus would let test-set words influence preprocessing.
row_positions = np.arange(len(df))
train_idx, test_idx = train_test_split(row_positions, test_size=0.2, random_state=SEED)

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(df['content'].iloc[train_idx])

# Encode the whole corpus once with the train-fitted vocabulary; words the
# tokenizer has not kept are mapped to the "<OOV>" token.
sequences = tokenizer.texts_to_sequences(df['content'])
X = pad_sequences(sequences, maxlen=MAX_LEN)
y = df['Label'].values

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# --- LSTM Model ---
# Embedding -> single LSTM layer -> sigmoid unit for binary classification.
# `input_length` is not passed: it is deprecated in Keras 3, and the sequence
# length is taken from the data the model is fitted on.
model = Sequential()
model.add(Embedding(input_dim=MAX_WORDS, output_dim=64))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# --- Model Checkpoint ---
# Writes the weights of the epoch with the highest validation accuracy.
checkpoint = ModelCheckpoint("lstm_model.h5", save_best_only=True, monitor="val_accuracy", mode="max")

# --- Train Model ---
# validation_split holds out a fraction of the training rows for validation.
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
    callbacks=[checkpoint]
)

# --- Save Final Model ---
# This is the model as of the last epoch, which may differ from the best
# checkpoint saved above.
model.save("lstm_final_model.h5")

Epoch 1/5




[1m714/716[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - accuracy: 0.8487 - loss: 0.3327



[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 12ms/step - accuracy: 0.8489 - loss: 0.3323 - val_accuracy: 0.9266 - val_loss: 0.1865
Epoch 2/5
[1m713/716[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - accuracy: 0.9561 - loss: 0.1242



[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.9561 - loss: 0.1242 - val_accuracy: 0.9454 - val_loss: 0.1555
Epoch 3/5
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9754 - loss: 0.0749



[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.9754 - loss: 0.0749 - val_accuracy: 0.9510 - val_loss: 0.1413
Epoch 4/5
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.9837 - loss: 0.0496 - val_accuracy: 0.9465 - val_loss: 0.1669
Epoch 5/5
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9892 - loss: 0.0325



[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.9892 - loss: 0.0325 - val_accuracy: 0.9513 - val_loss: 0.1639




In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# --- Evaluate on test set ---
# evaluate() returns the loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy:.4f}")

# --- Predict classes ---
# Flatten the sigmoid outputs to 1-D, then threshold at 0.5 to get 0/1 labels.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob.flatten() > 0.5).astype(int)

# --- Classification Report ---
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# --- Confusion Matrix ---
# Rows are true labels, columns are predicted labels.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Test Accuracy: 0.9490
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      7081
           1       0.95      0.95      0.95      7227

    accuracy                           0.95     14308
   macro avg       0.95      0.95      0.95     14308
weighted avg       0.95      0.95      0.95     14308

Confusion Matrix:
[[6711  370]
 [ 360 6867]]
