# Imports 

In [None]:
from __future__ import print_function
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
import torch
import warnings
import pandas as pd
warnings.filterwarnings('ignore')

In [None]:
# Pick the torch device: the GPU is used only when it is both requested
# (use_cuda) and actually available; otherwise everything runs on the CPU.
use_cuda = True
cuda_available = torch.cuda.is_available()
print("CUDA is available:", cuda_available)
if use_cuda and cuda_available:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load Dataset
The WELFake dataset with stylometric features, loaded from the HuggingFace Hub.

In [None]:
from datasets import load_dataset

# Fetch the WELFake stylometric-feature dataset through `load_dataset`
# and print the returned object to inspect it.
welfake_repo_id = "lelexuanzz/WELFake_stylo_feats"
dataset = load_dataset(welfake_repo_id)
print(dataset)

## Split dataset

In [None]:
# The "train" split is turned into a pandas DataFrame because the
# scikit-learn code in the following cells works on DataFrames.
train_split = dataset["train"]
df = train_split.to_pandas()
print(df)

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split (and the searches that reuse it) is reproducible
seed = 45

# Target is the "label" column; the features are every other column except
# the raw "text". errors="ignore" keeps this cell re-runnable after the
# optional cleaning cell below, which may already have removed "text"
# (a plain drop would then raise KeyError).
y = df["label"]
x = df.drop(columns=["label", "text"], errors="ignore")

# Hold out 20% of the rows for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)

display(x_train)
display(y_train)

MultinomialNB needs numeric input, so non-numeric columns are removed here.

Only run this cell if later cells raise errors about non-numeric (unclean) input.

In [None]:
# Rebuild the DataFrame from the "train" split and report its layout
df = dataset["train"].to_pandas()
print("Dataset columns:", df.columns)
print("Data types:", df.dtypes)

# Columns stored with the generic 'object' dtype (e.g. strings) are
# announced one by one and then removed together
object_columns = [col for col in df.columns if df[col].dtype == 'object']
for col in object_columns:
    print(f"Column {col} contains non-numeric data and will be dropped")
df = df.drop(columns=object_columns)

# Every remaining column is pushed through pd.to_numeric; a column that
# fails with ValueError is reported and, when coercion reveals offending
# rows, dropped.
for col in list(df.columns):
    try:
        converted = pd.to_numeric(df[col])
    except ValueError as e:
        print(f"Error converting column {col}: {e}")
        # Rows whose value becomes NaN under coercion are the offenders
        unparseable = pd.to_numeric(df[col], errors='coerce').isna()
        problematic_rows = df[unparseable]
        if problematic_rows.empty:
            continue
        print(f"Sample of problematic values in {col}:")
        print(problematic_rows[col].head())
        # The column cannot be converted, so it is removed
        df = df.drop(columns=[col])
    else:
        df[col] = converted

print("Final columns after cleaning:", df.columns)

### Training

no further preprocessing

In [None]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

Scale every feature to [0, 1] with MinMaxScaler so that the training matrix contains no negative values (MultinomialNB requires non-negative features).

In [None]:
# Learn the per-feature min/max on the training split only, then apply the
# same fitted transform to both splits.
scaler = MinMaxScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

Initialize and train the MultinomialNB classifier

In [None]:
# Baseline model: MultinomialNB with its default settings, fitted on the
# scaled training features.
nb_multinomial = MultinomialNB()
nb_multinomial.fit(X=x_train_scaled, y=y_train)

In [None]:
# Predict labels for the held-out split with the baseline model
y_pred = nb_multinomial.predict(x_test_scaled)

# Fraction of held-out samples whose predicted label matches the true one
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: ", accuracy)

In [None]:
# Text report built from the current y_pred against the held-out labels
print("\nClassification Report:")
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

In [None]:
# Confusion matrix of the current y_pred against the held-out labels
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("\nConfusion Matrix:", conf_matrix, sep="\n")

## Visualize feature importance

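Examine the per-feature log probabilities of the naive Bayes model (it is not a tree model, so the difference in log probability between the two classes is used as the importance measure).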

In [None]:
# Importance proxy: per-feature log probability under class index 1 minus
# the same quantity under class index 0.
class_log_probs = nb_multinomial.feature_log_prob_
feature_importance = class_log_probs[1] - class_log_probs[0]
feature_names = x_train.columns

# Ascending order of importance; the bars are drawn in that order
indices = np.argsort(feature_importance)
bar_positions = range(len(indices))
sorted_labels = feature_names[indices].tolist()

plt.figure(figsize=(10, 6))
plt.barh(bar_positions, feature_importance[indices], color='b', align='center')
plt.yticks(bar_positions, sorted_labels)
plt.title('Feature Importance for MultinomialNB')
plt.xlabel('Difference in Log Probability')
plt.tight_layout()
plt.show()

## Cross Validation

## Hyperparameter tuning

In [None]:
# RandomizedSearchCV for MultinomialNB
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Define parameter distribution for MultinomialNB.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so the scale
# is the interval width: alpha is drawn from [0.001, 10].
alpha_low, alpha_high = 0.001, 10
param_dist = {'alpha': uniform(loc=alpha_low, scale=alpha_high - alpha_low)}

# Use RandomizedSearchCV to find the best hyperparameters: 20 sampled alphas,
# 5-fold cross-validation, all CPU cores. The scoring is stated explicitly
# so it matches the grid search run later in this notebook.
rand_search = RandomizedSearchCV(MultinomialNB(),
                                 param_distributions=param_dist,
                                 n_iter=20,
                                 cv=5,
                                 scoring='accuracy',
                                 n_jobs=-1,
                                 random_state=seed)

# Fit the random search object to the scaled training data
rand_search.fit(x_train_scaled, y_train)


## Tuning alpha parameter with cross-validation

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate smoothing values tried exhaustively by the grid search
alpha_candidates = [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
param_grid = {'alpha': alpha_candidates}

# 5-fold cross-validated search scored by accuracy on the scaled training data
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(x_train_scaled, y_train)

Compare the best parameters found by the grid search and the randomized search

In [None]:
# Report the winning grid-search configuration and its mean CV accuracy
grid_best_params = grid_search.best_params_
grid_best_score = grid_search.best_score_
print("\nBest parameters found: ", grid_best_params)
print(f"Best cross-validation accuracy: {grid_best_score:.4f}")

In [None]:
# Report the winning randomized-search configuration and its mean CV accuracy
rand_best_params = rand_search.best_params_
rand_best_score = rand_search.best_score_
print("\nBest parameters found by RandomizedSearchCV: ", rand_best_params)
print(f"Best cross-validation accuracy: {rand_best_score:.4f}")

Evaluate model with best parameters

In [None]:
# The estimator selected by the grid search, scored on the scaled
# held-out split.
best_nb = grid_search.best_estimator_
y_pred_best = best_nb.predict(x_test_scaled)
accuracy_best = accuracy_score(y_true=y_test, y_pred=y_pred_best)
print(f"Test accuracy with best parameters: {accuracy_best:.4f}")

In [None]:
# Generate predictions with the best model. The estimator was fitted on the
# MinMax-scaled training matrix, so it must be given the scaled test
# features as well (the raw x_test is on a different scale).
y_pred = best_nb.predict(x_test_scaled)

# Create the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Trailing semicolon suppresses the notebook's textual echo of the plot object
ConfusionMatrixDisplay(confusion_matrix=cm).plot();

Save the results

In [None]:
import json
import os
from datetime import datetime

# Take one timestamp so the recorded time and the file name always agree
run_time = datetime.now()

# Create a dictionary to store all results.
# NOTE(review): `accuracy` and `conf_matrix` are whatever earlier cells left
# in those variables — confirm they refer to the model you intend to record.
results = {
    "model_name": "MultinomialNB_iter2",
    "timestamp": run_time.strftime("%Y-%m-%d %H:%M:%S"),
    "dataset": "WELFake_stylo_feats",
    "metrics": {
        "accuracy": float(accuracy),
        "best_accuracy": float(accuracy_best),
        # float() keeps the value JSON-serialisable even if it is a numpy scalar
        "best_alpha": float(grid_search.best_params_["alpha"]),
        "confusion_matrix": conf_matrix.tolist(),
        "classification_report": {}
    },
    "feature_importance": {feature: float(importance) for feature, importance in
                          zip(feature_names, feature_importance)}
}

# Get classification report metrics. Only dict-valued entries are copied;
# non-dict entries of the report are skipped. The comprehension keeps its
# loop variables local, so the imported `metrics` module (from
# `from sklearn import metrics`) is no longer overwritten.
report = classification_report(y_test, y_pred, output_dict=True)
results["metrics"]["classification_report"] = {
    class_label: {k: float(v) for k, v in class_metrics.items()}
    for class_label, class_metrics in report.items()
    if isinstance(class_metrics, dict)
}

# Ensure the results directory exists (no error if it already does)
os.makedirs('model_results', exist_ok=True)

# Save the results to a JSON file named after the run timestamp
filename = f"model_results/nb_multinomial_results_{run_time.strftime('%Y%m%d_%H%M%S')}.json"
with open(filename, 'w') as f:
    json.dump(results, f, indent=4)

print(f"Results saved to {filename}")

# K-Fold Validation

In [None]:
from sklearn.model_selection import cross_val_score, KFold

# Shuffled 5-fold splitter with its own fixed random state
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Fresh classifier using the alpha chosen by the grid search
best_alpha = grid_search.best_params_['alpha']
nb_cv = MultinomialNB(alpha=best_alpha)

# Accuracy of that classifier on each fold of the scaled training data
cv_scores = cross_val_score(
    estimator=nb_cv,
    X=x_train_scaled,
    y=y_train,
    scoring='accuracy',
    cv=kf,
)
mean_score = cv_scores.mean()
score_spread = cv_scores.std()

print(f"K-fold Cross-Validation Results ({k_folds} folds):")
print(f"CV Accuracy: {mean_score:.4f} ± {score_spread:.4f}")
print(f"Individual fold scores: {cv_scores}")

# Final model: same alpha, fitted on the whole scaled training split
nb_final = MultinomialNB(alpha=best_alpha)
nb_final.fit(X=x_train_scaled, y=y_train)

# Test on the Gossipcop/Politifact dataset

In [None]:
# Load the external test dataset
from datasets import load_dataset
import pandas as pd
import numpy as np

# External evaluation data, fetched through `load_dataset`
external_repo_id = "lelexuanzz/Gossipcop_Politifact_Test_Stylo"
test_dataset = load_dataset(external_repo_id)

In [None]:
# The external data lives in the "train" split of the loaded dataset;
# convert it to a DataFrame and show it.
test_df = test_dataset["train"].to_pandas()
display(test_df)

# Target is "label"; features are every column except the label and raw text
y_test_set = test_df["label"]
x_test_set = test_df.drop(columns=["label", "text"])

for external_part in (x_test_set, y_test_set):
    display(external_part)

In [None]:
# best_nb was fitted on MinMax-scaled features, so the external features
# must go through the same fitted scaler before prediction; feeding the raw
# values would put them on a different scale from the training data.
# Columns are selected by the training column names so the layout matches
# the one the scaler was fitted on. This assumes `scaler` still holds the
# fit made on x_train.
x_test_set_scaled = scaler.transform(x_test_set[x_train.columns])
y_test_pred = best_nb.predict(x_test_set_scaled)

# Metrics on the external set
accuracy = accuracy_score(y_test_set, y_test_pred)
precision = precision_score(y_test_set, y_test_pred)
recall = recall_score(y_test_set, y_test_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

# Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Remove the text column if present in your x_train dataset
features_for_selection = x_train.drop('text', axis=1, errors='ignore')

# Define the number of top features to select
num_features = 5  # You can adjust this number

# Use SelectKBest (scored with f_classif) to identify top features
selector = SelectKBest(f_classif, k=num_features)
selector.fit(features_for_selection, y_train)

# Get selected feature indices and names
selected_indices = selector.get_support(indices=True)
selected_features = features_for_selection.columns[selected_indices]

print(f"\nTop {num_features} selected features:")
for position, (feature, column_index) in enumerate(
        zip(selected_features, selected_indices), start=1):
    print(f"{position}. {feature} (Score: {selector.scores_[column_index]:.2f})")

# Create datasets with only selected features. The test columns are chosen
# by name, so they match the training columns even if the column order of
# x_test were to differ.
X_train_selected = features_for_selection[selected_features]
X_test_selected = x_test[selected_features]

# Scale the selected features with a dedicated scaler. Refitting the shared
# `scaler` here would overwrite the fit on the full feature set that other
# cells rely on when they transform data for the full-feature models.
selected_scaler = MinMaxScaler()
X_train_selected_scaled = selected_scaler.fit_transform(X_train_selected)
X_test_selected_scaled = selected_scaler.transform(X_test_selected)

# Train and evaluate model with selected features
nb_selected = MultinomialNB(alpha=best_alpha)
nb_selected.fit(X_train_selected_scaled, y_train)
selected_predictions = nb_selected.predict(X_test_selected_scaled)

# Evaluate performance with selected features
selected_accuracy = accuracy_score(y_test, selected_predictions)
selected_report = classification_report(y_test, selected_predictions)

print("\nPerformance with selected features:")
print(f"Accuracy: {selected_accuracy:.4f}")
print("Classification Report:")
print(selected_report)

## Recursive feature Elimination

In [None]:
from sklearn.feature_selection import RFE
import pandas as pd

def recursive_feature_pruning(model, x, y, num_features_to_select=None):
    """Run recursive feature elimination and report what was kept.

    Args:
        model: Estimator handed to ``RFE`` as its ``estimator``.
        x: Feature table; its ``columns`` supply the feature names.
        y: Target values passed to ``RFE.fit``.
        num_features_to_select: Number of features to keep. A falsy value
            (``None`` or ``0``) falls back to keeping a single feature.

    Returns:
        A tuple ``(selected, ranking)`` where ``selected`` is the list of
        column names flagged by ``RFE.support_`` and ``ranking`` is a
        DataFrame with columns ``feature`` and ``rank`` (taken from
        ``RFE.ranking_``), sorted by ascending rank.
    """
    keep_count = num_features_to_select if num_features_to_select else 1

    # One feature is eliminated per iteration (step=1)
    eliminator = RFE(estimator=model, n_features_to_select=keep_count, step=1)
    eliminator.fit(x, y)

    kept_columns = x.columns[eliminator.support_].tolist()

    ranking_table = pd.DataFrame({'feature': x.columns, 'rank': eliminator.ranking_})
    ranking_table = ranking_table.sort_values(by='rank')

    return kept_columns, ranking_table


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression drives the elimination; the shared seed is used as
# its random state.
logistic_model = LogisticRegression(max_iter=1000, random_state=seed)

# Keep the 10 best features of the training split
top_features, ranking_df = recursive_feature_pruning(
    model=logistic_model,
    x=x_train,
    y=y_train,
    num_features_to_select=10,
)

print("Selected Top Features:", top_features, sep="\n")
print("\nFeature Rankings:", ranking_df, sep="\n")


## Train with top features after Recursive Feature Elimination

In [None]:
# Show the training split restricted to the columns kept by RFE
x_train_top = x_train[top_features]
display(x_train_top)

In [None]:
from sklearn.pipeline import Pipeline

# Search alpha again, this time on the RFE-selected columns only.
# - A dedicated search object is used so the earlier `grid_search` results
#   (fitted on all features) are not overwritten.
# - Scaling is part of the pipeline, so the returned estimator accepts the
#   raw selected columns (e.g. x_test[top_features]) and MultinomialNB is
#   fitted on values scaled to [0, 1].
rfe_pipeline = Pipeline([("scale", MinMaxScaler()), ("nb", MultinomialNB())])
rfe_param_grid = {f"nb__{name}": values for name, values in param_grid.items()}
rfe_search = GridSearchCV(rfe_pipeline, rfe_param_grid, cv=5, scoring='accuracy')
rfe_search.fit(x_train[top_features], y_train)

# Take the estimator from the search that was just fitted. The previous code
# read `rand_search`, whose estimator was trained on the full feature set and
# cannot predict on the reduced columns.
best_nb_multinomial_rfe = rfe_search.best_estimator_
print('Best hyperparameters:',  rfe_search.best_params_)

In [None]:
# Predict on the held-out split restricted to the RFE-selected columns
y_pred_rfe = best_nb_multinomial_rfe.predict(x_test[top_features])

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rfe)
precision = precision_score(y_true=y_test, y_pred=y_pred_rfe)
recall = recall_score(y_true=y_test, y_pred=y_pred_rfe)

rfe_metrics = (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
)
for metric_label, metric_value in rfe_metrics:
    print(metric_label, metric_value)