In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, LSTM, Bidirectional, TimeDistributed, Concatenate, Flatten, Attention, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
import matplotlib.pyplot as plt
import tensorflow as tf

# Load and preprocess the data
all_news = "D:/Dataset/Fake_News_Dataset_Malayalam/mal_fake_train.csv"
all_df = pd.read_csv(all_news)

# Keep only rows labelled "original" or "Fake"; any other label value is
# dropped by these filters. Explicit copies are taken because writing into a
# boolean-mask slice of all_df is chained assignment (SettingWithCopyWarning).
true_df = all_df[all_df["label"] == "original"].copy()
fake_df = all_df[all_df["label"] == "Fake"].copy()
# Replace the whole column instead of writing through .loc[:, "label"], so the
# labels become a numeric column rather than staying in the string column.
true_df["label"] = 1  # 1 for true news
fake_df["label"] = 0  # 0 for fake news

# Combine the datasets (true rows first, then fake rows)
df = pd.concat([true_df, fake_df], ignore_index=True)

# Split the data: 80% train / 20% test, shuffled with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Tokenize the text
max_words = 15000  # Vocabulary size cap (increased from 10000)
max_len = 250  # Tokens kept per document (increased from 200)

# Fit the vocabulary on the training texts only, so test-only words map to
# the '<OOV>' token instead of leaking into the vocabulary.
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad/truncate at the end so every sequence has exactly max_len entries
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Convert to float32 (the training and evaluation code below builds float32
# tensors from these arrays)
X_train_pad = X_train_pad.astype('float32')
X_test_pad = X_test_pad.astype('float32')
y_train = y_train.astype('float32')
y_test = y_test.astype('float32')

# TextCNN model
def create_textcnn_model(filter_sizes=(3, 4, 5), num_filters=128, embedding_dim=128,
                         dense_units=64, dropout_rate=0.5, l2_strength=0.01):
    """Build an uncompiled TextCNN binary classifier over padded token ids.

    The network embeds each token, runs one Conv1D branch per filter size
    over the embedded sequence, global-max-pools every branch, concatenates
    the pooled features and feeds them through Dropout, BatchNormalization
    and a small dense head ending in a single sigmoid unit.

    The vocabulary size and sequence length are read from the module-level
    ``max_words`` and ``max_len`` at call time.

    Args:
        filter_sizes: Convolution kernel widths, one branch per entry.
        num_filters: Number of filters in every convolution branch.
        embedding_dim: Size of the token embedding vectors.
        dense_units: Width of the hidden dense layer.
        dropout_rate: Dropout rate applied to the pooled features.
        l2_strength: L2 penalty applied to the Conv1D and hidden Dense kernels.

    Returns:
        A Keras ``Model`` mapping ``(batch, max_len)`` inputs to
        ``(batch, 1)`` sigmoid outputs.

    Raises:
        ValueError: If ``filter_sizes`` is empty.
    """
    filter_sizes = tuple(filter_sizes)
    if not filter_sizes:
        raise ValueError("filter_sizes must contain at least one kernel size")

    inputs = Input(shape=(max_len,))
    # The sequence length is already fixed by the Input shape, so the
    # deprecated `input_length` argument of Embedding is not passed.
    embedding = Embedding(max_words, embedding_dim)(inputs)

    pooled_branches = []
    for filter_size in filter_sizes:
        conv = Conv1D(num_filters, filter_size, activation='relu',
                      kernel_regularizer=l2(l2_strength))(embedding)
        pooled_branches.append(GlobalMaxPooling1D()(conv))

    # With a single branch there is nothing to concatenate.
    if len(pooled_branches) > 1:
        features = Concatenate()(pooled_branches)
    else:
        features = pooled_branches[0]

    dropout = Dropout(dropout_rate)(features)
    bn = BatchNormalization()(dropout)
    dense = Dense(dense_units, activation='relu', kernel_regularizer=l2(l2_strength))(bn)
    outputs = Dense(1, activation='sigmoid')(dense)

    return Model(inputs=inputs, outputs=outputs)

# RCNN model
def create_rcnn_model(embedding_dim=128, lstm_units=64, num_filters=128, kernel_size=3,
                      dense_units=64, dropout_rate=0.5, l2_strength=0.01):
    """Build an uncompiled recurrent-convolutional binary classifier.

    Token ids are embedded, passed through a bidirectional LSTM that returns
    the full sequence, convolved with a single Conv1D layer, global-max-pooled
    and classified by a Dropout / BatchNormalization / Dense head ending in a
    single sigmoid unit.

    The vocabulary size and sequence length are read from the module-level
    ``max_words`` and ``max_len`` at call time.

    Args:
        embedding_dim: Size of the token embedding vectors.
        lstm_units: Units per LSTM direction.
        num_filters: Number of Conv1D filters.
        kernel_size: Width of the Conv1D kernel.
        dense_units: Width of the hidden dense layer.
        dropout_rate: Dropout rate applied to the pooled features.
        l2_strength: L2 penalty applied to the LSTM, Conv1D and hidden Dense
            kernels.

    Returns:
        A Keras ``Model`` mapping ``(batch, max_len)`` inputs to
        ``(batch, 1)`` sigmoid outputs.
    """
    inputs = Input(shape=(max_len,))
    # The sequence length is already fixed by the Input shape, so the
    # deprecated `input_length` argument of Embedding is not passed.
    embedding = Embedding(max_words, embedding_dim)(inputs)

    # return_sequences=True keeps one vector per time step for the convolution
    lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, kernel_regularizer=l2(l2_strength))
    )(embedding)
    conv = Conv1D(num_filters, kernel_size, activation='relu',
                  kernel_regularizer=l2(l2_strength))(lstm)
    pool = GlobalMaxPooling1D()(conv)

    dropout = Dropout(dropout_rate)(pool)
    bn = BatchNormalization()(dropout)
    dense = Dense(dense_units, activation='relu', kernel_regularizer=l2(l2_strength))(bn)
    outputs = Dense(1, activation='sigmoid')(dense)

    return Model(inputs=inputs, outputs=outputs)

# HAN model
def create_han_model(embedding_dim=128, lstm_units=64, dense_units=64,
                     dropout_rate=0.5, l2_strength=0.01):
    """Build an uncompiled BiLSTM + self-attention binary classifier.

    Despite the "HAN" name, this network has no word/sentence hierarchy: it
    embeds the token ids, runs one bidirectional LSTM over the whole sequence
    and applies a single Keras ``Attention`` layer with the LSTM output used
    as both query and value. The attended sequence is flattened and classified
    by a Dropout / BatchNormalization / Dense head ending in a single sigmoid
    unit.

    The vocabulary size and sequence length are read from the module-level
    ``max_words`` and ``max_len`` at call time.

    Args:
        embedding_dim: Size of the token embedding vectors.
        lstm_units: Units per LSTM direction.
        dense_units: Width of the hidden dense layer.
        dropout_rate: Dropout rate applied to the flattened features.
        l2_strength: L2 penalty applied to the LSTM and hidden Dense kernels.

    Returns:
        A Keras ``Model`` mapping ``(batch, max_len)`` inputs to
        ``(batch, 1)`` sigmoid outputs.
    """
    inputs = Input(shape=(max_len,))
    # The sequence length is already fixed by the Input shape, so the
    # deprecated `input_length` argument of Embedding is not passed.
    embedding = Embedding(max_words, embedding_dim)(inputs)

    lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, kernel_regularizer=l2(l2_strength))
    )(embedding)
    # Self-attention: the same sequence is passed as query and value
    attention_layer = Attention()([lstm, lstm])

    # Flatten keeps every time step, so the hidden Dense layer below sees
    # max_len * (feature width of the attention output) inputs.
    flatten = Flatten()(attention_layer)
    dropout = Dropout(dropout_rate)(flatten)
    bn = BatchNormalization()(dropout)
    dense = Dense(dense_units, activation='relu', kernel_regularizer=l2(l2_strength))(bn)
    outputs = Dense(1, activation='sigmoid')(dense)

    return Model(inputs=inputs, outputs=outputs)

# Create ensemble model
def create_ensemble_model(models):
    """Stack several classifiers behind one shared input and a dense head.

    Every model in ``models`` is called on the same ``(max_len,)`` input, the
    individual outputs are concatenated, and the result goes through Dropout,
    BatchNormalization, a 64-unit ReLU layer and a final sigmoid unit. The
    sub-models are used as layers of the returned model, so their weights are
    part of it.

    Args:
        models: Iterable of Keras models accepting ``(batch, max_len)`` input.

    Returns:
        An uncompiled Keras ``Model`` producing one sigmoid score per sample.
    """
    shared_input = Input(shape=(max_len,))

    # Run each member on the shared input and collect its prediction tensor
    member_outputs = []
    for member in models:
        member_outputs.append(member(shared_input))

    x = Concatenate()(member_outputs)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x)
    x = Dense(1, activation='sigmoid')(x)

    return Model(inputs=shared_input, outputs=x)

# Build the three base networks (in this order), then the ensemble on top
textcnn_model, rcnn_model, han_model = (
    create_textcnn_model(),
    create_rcnn_model(),
    create_han_model(),
)
ensemble_model = create_ensemble_model([textcnn_model, rcnn_model, han_model])

# Binary cross-entropy on the single sigmoid output, tracked with accuracy
ensemble_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stop once val_loss has not improved for 5 epochs (restoring the best
# weights); cut the learning rate by 5x after 3 stagnant epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.00001)

# Training data as float32 TensorFlow tensors
X_train_tensor, y_train_tensor = (
    tf.convert_to_tensor(data, dtype=tf.float32) for data in (X_train_pad, y_train)
)

# validation_split holds out part of the training tensors for val_* metrics
history = ensemble_model.fit(
    X_train_tensor,
    y_train_tensor,
    epochs=30,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr],
)

# Held-out test set evaluation
X_test_tensor, y_test_tensor = (
    tf.convert_to_tensor(data, dtype=tf.float32) for data in (X_test_pad, y_test)
)
loss, accuracy = ensemble_model.evaluate(X_test_tensor, y_test_tensor)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {accuracy:.4f}")

# Training curves: accuracy on the left panel, loss on the right panel
plt.figure(figsize=(12, 4))
for _panel, (_train_key, _val_key, _train_label, _val_label, _title, _ylabel) in enumerate(
    (
        ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
         'Model Accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
         'Model Loss', 'Loss'),
    ),
    start=1,
):
    plt.subplot(1, 2, _panel)
    plt.plot(history.history[_train_key], label=_train_label)
    plt.plot(history.history[_val_key], label=_val_label)
    plt.title(_title)
    plt.xlabel('Epoch')
    plt.ylabel(_ylabel)
    plt.legend()

plt.tight_layout()
plt.show()

# Function to predict on new data
def predict_fake_news(text, threshold=0.5):
    """Classify a single text as "Fake" or "Original" with the ensemble.

    The text is tokenized with the module-level ``tokenizer``, padded or
    truncated at the end to ``max_len`` tokens, and scored by
    ``ensemble_model``.

    Args:
        text: The raw news text to classify.
        threshold: Decision boundary on the model score; scores strictly
            below it are labelled "Fake". Defaults to 0.5.

    Returns:
        A ``(label, score)`` tuple. ``label`` is "Fake" or "Original".
        ``score`` is the model's sigmoid output as a Python float; training
        labels use 1 for original news, so it is the score for the "Original"
        class and NOT the confidence of the returned label.
    """
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
    padded = tf.convert_to_tensor(padded, dtype=tf.float32)
    # predict returns a (1, 1) array for the single input; unwrap it to a
    # plain float so callers can format or serialize it freely.
    score = float(ensemble_model.predict(padded)[0][0])
    label = "Fake" if score < threshold else "Original"
    return label, score

# Example usage
sample_text = "Your sample Malayalam text here"
result, probability_original = predict_fake_news(sample_text)
# predict_fake_news returns the sigmoid score for the "Original" class
# (label 1), so for a "Fake" prediction the confidence is its complement.
confidence = probability_original if result == "Original" else 1.0 - probability_original
print(f"Prediction: {result}")
print(f"Confidence: {confidence:.4f}")



Epoch 1/30


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30

KeyboardInterrupt: 