In [13]:
import os
import re

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Bidirectional, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


In [14]:
# Load the cleaned dataset (path is relative to the notebook's working
# directory) and drop every row that has a missing value in any column.
# Chaining read_csv -> dropna avoids mutating a frame in place.
data = pd.read_csv("data/processed/combined_fake_news_dataset.csv").dropna()

# Map string labels to integers: real -> 0, fake -> 1.
# Series.map turns every value that is not a key of the mapping into NaN, and
# a NaN target would silently flow into training, so fail loudly instead.
label_map = {"real": 0, "fake": 1}
mapped_labels = data["label"].map(label_map)
unmapped_mask = mapped_labels.isna()
if unmapped_mask.any():
    unexpected = sorted(data.loc[unmapped_mask, "label"].astype(str).unique())
    raise ValueError(
        f"Unexpected label values (expected one of {sorted(label_map)}): {unexpected}"
    )

# All labels mapped cleanly, so the column can be stored as plain integers.
data = data.assign(label=mapped_labels.astype("int64"))


In [15]:
# Fit a word-level tokenizer on the article text. num_words=10000 caps the
# vocabulary by word frequency; words outside it are encoded as "<OOV>".
# NOTE(review): the tokenizer is fitted on the full dataset before the
# train/test split, so its vocabulary also sees the held-out text. Consider
# fitting on the training portion only.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(data["text"])

# Persist the fitted tokenizer so the same vocabulary can be reused later.
# The target directory is created first: joblib.dump opens the path for
# writing and fails if the parent directory does not exist.
os.makedirs("models", exist_ok=True)
joblib.dump(tokenizer, "models/tokenizer.pkl")

# Encode every text as a sequence of token ids, then pad/truncate each
# sequence to exactly 300 ids (pad_sequences' default padding settings).
X = pad_sequences(tokenizer.texts_to_sequences(data["text"]), maxlen=300)

# Target vector as a NumPy array, aligned row-for-row with X.
y = data["label"].values


In [16]:
# Hold out 30% of the samples; the fixed random_state makes the shuffle
# (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [17]:
#
# Architecture: token ids -> embedding -> BiLSTM -> Conv1D -> global max pool
# -> dropout -> single sigmoid unit.

# Each sample is one sequence of 300 token ids (same length as the maxlen used
# when padding); input_dim=10000 matches the tokenizer's num_words.
input_layer = Input(shape=(300,))
embedding = Embedding(input_dim=10000, output_dim=128)(input_layer)

# return_sequences=True keeps the per-timestep outputs so the convolution
# below still has a sequence axis to slide over.
bilstm = Bidirectional(layer=LSTM(units=64, return_sequences=True))(embedding)
cnn = Conv1D(filters=64, kernel_size=5, activation="relu")(bilstm)

# Collapse the sequence axis, regularise, and emit one sigmoid score.
pool = GlobalMaxPooling1D()(cnn)
drop = Dropout(rate=0.5)(pool)
output = Dense(units=1, activation="sigmoid")(drop)

model = Model(inputs=input_layer, outputs=output)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()


In [None]:
#
# Train for 5 epochs in mini-batches of 64. The (X_test, y_test) split is
# passed as validation data, so it is scored at the end of every epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test, y_test),
)


Epoch 1/5
219/600 ━━━━━━━━━━━━━━━━━━━━ 3:06 490ms/step - accuracy: 0.7763 - loss: 0.4134

In [None]:
#
# Write the trained model to a single ".h5" file. The ".h5" extension selects
# Keras' legacy HDF5 format rather than the native ".keras" format.
model.save(filepath="models/final_news_label_model.h5")


In [None]:
#
def predict_news(text, model, tokenizer, maxlen=300, threshold=0.5):
    """Classify a single news text as "FAKE" or "REAL".

    Args:
        text: The text to classify; it is wrapped in a one-element list and
            passed to ``tokenizer.texts_to_sequences``.
        model: Object exposing ``predict``; its result is indexed as
            ``[0][0]`` to obtain the score for the single input row.
        tokenizer: Object exposing ``texts_to_sequences``.
        maxlen: Length every encoded sequence is padded/truncated to.
            Defaults to 300, the value this function previously hard-coded.
            Presumably this must equal the model's input length; confirm
            against the model definition before overriding.
        threshold: Decision boundary for the score. Defaults to 0.5, the
            value this function previously hard-coded.

    Returns:
        "FAKE" when the score is strictly greater than ``threshold``,
        otherwise "REAL" (a score exactly equal to the threshold is "REAL").
    """
    sequences = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequences, maxlen=maxlen)
    # predict returns one row per input; the single row holds a single score.
    score = model.predict(padded)[0][0]
    return "FAKE" if score > threshold else "REAL"
