In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import torch
from transformers import BertTokenizer, BertModel
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from sklearn.ensemble import GradientBoostingClassifier  # Import this

# Load and preprocess data
all_news = "D:/Dataset/Fake_News_Dataset_Malayalam/mal_fake_train.csv"

all_df = pd.read_csv(all_news)

true_df = all_df[all_df["label"] == "original"]
fake_df = all_df[all_df["label"] == "Fake"]
true_df.loc[:, "label"] = 1  # 1 for true news
fake_df.loc[:, "label"] = 0  # 0 for fake news

combined_df = pd.concat([true_df, fake_df], ignore_index=True)
combined_df = combined_df.sample(frac=1, random_state=43).reset_index(drop=True)

X_train, X_test, y_train, y_test = train_test_split(combined_df['text'], combined_df['label'], test_size=0.2, random_state=43)

In [11]:
# Load mBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model_bert = BertModel.from_pretrained('bert-base-multilingual-cased')

# Tokenize and encode text
def bert_encode(texts, tokenizer, max_len=128):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',  # Updated to avoid the deprecation warning
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

X_train_bert, attention_masks_train = bert_encode(X_train, tokenizer)
X_test_bert, attention_masks_test = bert_encode(X_test, tokenizer)

with torch.no_grad():
    X_train_bert = model_bert(X_train_bert, attention_mask=attention_masks_train)[0][:, 0, :]
    X_test_bert = model_bert(X_test_bert, attention_mask=attention_masks_test)[0][:, 0, :]


KeyboardInterrupt: 

In [None]:
# TF-IDF Vectorizer for SVM
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train SVM
svm = SVC(kernel='linear', probability=True)
svm.fit(X_train_tfidf, y_train)
svm_pred = svm.predict(X_test_tfidf)


In [None]:
# Tokenize and pad sequences for LSTM
max_len = 128
tokenizer_lstm = tf.keras.preprocessing.text.Tokenizer()
tokenizer_lstm.fit_on_texts(X_train)

X_train_lstm = tokenizer_lstm.texts_to_sequences(X_train)
X_test_lstm = tokenizer_lstm.texts_to_sequences(X_test)

X_train_lstm = pad_sequences(X_train_lstm, maxlen=max_len)
X_test_lstm = pad_sequences(X_test_lstm, maxlen=max_len)

# Build LSTM model
lstm_model = Sequential()
lstm_model.add(Embedding(len(tokenizer_lstm.word_index) + 1, 128, input_length=max_len))
lstm_model.add(SpatialDropout1D(0.2))
lstm_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])

# Train LSTM
lstm_model.fit(X_train_lstm, y_train, epochs=3, batch_size=64, validation_data=(X_test_lstm, y_test))
lstm_pred = (lstm_model.predict(X_test_lstm) > 0.5).astype(int).flatten()



Epoch 1/3


Epoch 2/3
Epoch 3/3


In [None]:
# Concatenate predictions
X_test_features = np.column_stack([X_test_bert.numpy(), svm_pred, lstm_pred])

# Train Gradient Boosting Classifier as the final model
gb = GradientBoostingClassifier()
gb.fit(X_test_features, y_test)

# Get final predictions
final_pred = gb.predict(X_test_features)

# Calculate accuracy
accuracy = accuracy_score(y_test, final_pred)
print("Final accuracy:", accuracy)

# Print confusion matrix
print("Confusion Matrix for Ensemble Classifier:")
print(confusion_matrix(y_test, final_pred))

# Precision, Recall, F1-Score
precision, recall, f1, _ = precision_recall_fscore_support(y_test, final_pred, average='binary')
print(f"Precision: {precision}\nRecall: {recall}\nF1-Score: {f1}")


Final accuracy: 1.0
Confusion Matrix for Ensemble Classifier:
[[326   0]
 [  0 326]]
Precision: 1.0
Recall: 1.0
F1-Score: 1.0
