# Imports 

### Iteration 3: NB_multinomial with word embeddings

In [None]:

import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from datasets import load_dataset
import time
from datetime import datetime
import json
import base64
import struct
import warnings
import torch
import os
warnings.filterwarnings('ignore')

In [None]:
# Select the torch device: CUDA only when it is both requested and available
use_cuda = True
print("CUDA is available:", torch.cuda.is_available())
if use_cuda and torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Global variables
# Embedding column names (presumably the embedding columns of the datasets
# used below -- confirm against the dataset schema).
EMBEDDING_COLUMN_NAMES = [
    "tfidf_embedding",
    "bow_embedding",
    "w2v_embedding",
    "roberta_embedding",
]

In [None]:
# Function to decode the custom embedding format
def decode_embedding(encoded_str):
    """
    Decode the custom (base64) embedding format to a numeric vector.

    The input is base64-decoded; if that fails, the string is right-padded
    with '=' to a multiple of four characters and decoded again. The decoded
    bytes are then interpreted by length:

    * multiple of 8 bytes -> big-endian 8-byte floats
    * multiple of 4 bytes -> big-endian 4-byte floats (returned as float64)
    * anything else       -> raw bytes as a uint8 array

    NOTE: the length test is a heuristic. A payload of 4-byte floats whose
    total length is also a multiple of 8 is read as 8-byte floats.

    Args:
        encoded_str: Base64 text, e.g.
            'AQAAAAAAAABAAAAAAAAAAQAAAAAAAAEAAAAAAAAAAQA...'

    Returns:
        numpy.ndarray holding the decoded values. If decoding fails
        altogether, a message is printed and an array with the code point of
        every character of the (possibly re-padded) input is returned as a
        best-effort fallback.
    """
    try:
        try:
            # Standard base64 first
            decoded = base64.b64decode(encoded_str)
        except ValueError:
            # binascii.Error (a ValueError subclass) is what b64decode raises
            # for malformed input. Catching only that, instead of a bare
            # `except:`, lets KeyboardInterrupt/SystemExit propagate.
            padding_needed = len(encoded_str) % 4
            if padding_needed:
                encoded_str += '=' * (4 - padding_needed)
            decoded = base64.b64decode(encoded_str)

        n_bytes = len(decoded)

        # Try to interpret as floats (8 bytes per float)
        if n_bytes % 8 == 0:
            return np.array(struct.unpack(f'>{n_bytes // 8}d', decoded))

        # Try to interpret as floats (4 bytes per float)
        if n_bytes % 4 == 0:
            return np.array(struct.unpack(f'>{n_bytes // 4}f', decoded))

        # Otherwise expose the payload as a plain sequence of bytes
        return np.frombuffer(decoded, dtype=np.uint8)

    except Exception as e:
        print(f"Error decoding embedding: {e}")
        # As a fallback, convert each character to its code point
        return np.array([ord(c) for c in encoded_str])

### Load Training Dataset

Load the training dataset with CBOW-W2V embeddings

In [None]:
# 1. Try loading the training dataset
# Builds X_train (feature matrix) and y_train (labels) from the Hugging Face
# dataset. If anything in the try block fails, random synthetic data is
# generated instead; metrics computed on that data are meaningless and only
# demonstrate that the pipeline runs.
print("\nLoading training dataset...")
try:
    train_dataset = load_dataset("Paulozs/WELFake_embeddings", split="train")
    train_df = pd.DataFrame(train_dataset)
    print(f"Training dataset loaded successfully! Shape: {train_df.shape}")

    # The test-set loading step reads the "bow_embedding" column, so prefer
    # the same column here: training and test features must come from the
    # same embedding. Only when it is missing fall back to the first column
    # whose name looks like an embedding.
    preferred_col = "bow_embedding"
    if preferred_col in train_df.columns:
        embedding_col = preferred_col
    else:
        embedding_col = next(
            (
                col for col in train_df.columns
                if any(tag in col.lower() for tag in ('bow', 'w2v', 'embed'))
            ),
            None,
        )

    if embedding_col:
        print(f"Found embedding column: {embedding_col}")
        # Sample an embedding to understand its format
        sample_embedding = train_df[embedding_col].iloc[0]
        print(f"Sample embedding type: {type(sample_embedding)}")

        if isinstance(sample_embedding, list):
            print(f"Embedding appears to be a list with {len(sample_embedding)} items")
            X_train = np.array(train_df[embedding_col].tolist())
        elif isinstance(sample_embedding, str):
            print("Embedding appears to be a string, will decode each embedding")
            X_train = np.array([decode_embedding(emb) for emb in train_df[embedding_col]])
        else:
            # Slice the string form: slicing the value itself raises for
            # non-sliceable types and would divert into the synthetic-data
            # fallback below.
            print(f"Unknown embedding format, will try to convert: {str(sample_embedding)[:100]}")
            X_train = np.array([np.array(emb) for emb in train_df[embedding_col]])
    else:
        print("No embedding column found, will use all available numeric features")
        # Use all numeric columns as features
        numeric_cols = train_df.select_dtypes(include=['number']).columns.tolist()
        # Remove label column if present
        if 'label' in numeric_cols:
            numeric_cols.remove('label')

        print(f"Using {len(numeric_cols)} numeric features: {numeric_cols}")
        X_train = train_df[numeric_cols].values

    # Extract labels
    y_train = np.array(train_df['label'])
    print(f"Prepared training features with shape {X_train.shape} and labels with shape {y_train.shape}")

except Exception as e:
    print(f"Error loading training dataset: {e}")
    # Create synthetic data for demonstration
    print("Creating synthetic data for demonstration...")
    n_samples = 1000
    n_features = 50
    X_train = np.random.rand(n_samples, n_features)
    y_train = np.random.randint(0, 2, size=n_samples)
    print(f"Created synthetic data with {n_samples} samples and {n_features} features")

### Load test set
Load the test dataset with CBOW-W2V embeddings.

In [None]:
# 2. Try loading the Test dataset
# Builds X_test / y_test from the Hugging Face test dataset. If anything in
# the try block fails, the existing training data is split 80/20 instead.
print("\nLoading Test dataset...")
try:
    test_dataset = load_dataset("lelexuanzz/Gossipcop_Politifact_Test", split="train")
    test_df = pd.DataFrame(test_dataset)
    print(f"Testing dataset loaded successfully! Shape: {test_df.shape}")

    # The test features are read from this fixed column
    embedding_col = "bow_embedding"

    if embedding_col not in test_df.columns:
        print(f"Specified column '{embedding_col}' not found in test dataset. Available columns:", test_df.columns.tolist())
        print("Will use all available numeric features")
        # Every numeric column except the label becomes a feature
        numeric_cols = test_df.select_dtypes(include=['number']).columns.tolist()
        try:
            numeric_cols.remove('label')
        except ValueError:
            # No numeric 'label' column to drop
            pass

        print(f"Using {len(numeric_cols)} numeric features: {numeric_cols}")
        X_test = test_df[numeric_cols].values
    else:
        print(f"Found embedding column: {embedding_col}")
        # Peek at the first row to find out how embeddings are stored
        sample_embedding = test_df[embedding_col].iloc[0]
        print(f"Sample embedding type: {type(sample_embedding)}")

        if isinstance(sample_embedding, str):
            print("Embedding appears to be a string, will decode each embedding")
            X_test = np.array(list(map(decode_embedding, test_df[embedding_col])))
        elif isinstance(sample_embedding, list):
            print(f"Embedding appears to be a list with {len(sample_embedding)} items")
            X_test = np.array(test_df[embedding_col].tolist())
        else:
            print("Unknown embedding format, will try to convert")
            X_test = np.array(list(map(np.array, test_df[embedding_col])))

    # Labels come from the 'label' column
    y_test = np.array(test_df['label'])
    print(f"Prepared testing features with shape {X_test.shape} and labels with shape {y_test.shape}")

except Exception as e:
    print(f"Error loading testing dataset: {e}")
    # Without a usable test dataset, hold out 20% of the training data
    print("Creating train/test split from training data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        random_state=42,
    )
    print(f"Split created with shapes: X_train {X_train.shape}, X_test {X_test.shape}")

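Multinomial NB needs numeric feature matrices with the same number of columns for training and testing. The cell below checks the feature dimensions and, if they differ, creates a new train/test split from the training data.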

In [None]:
# Both matrices must have the same number of feature columns; if they do
# not, the loaded test set is replaced by a hold-out from the training data.
train_dim = X_train.shape[1]
test_dim = X_test.shape[1]
if train_dim != test_dim:
    print(f"WARNING: Feature dimensions don't match! Training: {train_dim}, Testing: {test_dim}")

    # Option 1: Create a new train/test split from the training data
    print("Creating new train/test split from training data due to dimension mismatch...")
    X_train, X_test, y_train, y_test = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        random_state=42,
    )
    print(f"New split created with shapes: X_train {X_train.shape}, X_test {X_test.shape}")

## Scale the features

In [None]:
print("\nScaling features...")
# The scaler is fit on the training data only. clip=True keeps transformed
# test values inside [0, 1]: a test value below the training minimum would
# otherwise be mapped to a negative number, and MultinomialNB is defined for
# non-negative features only.
scaler = MinMaxScaler(clip=True)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Replace NaN values that are still present after scaling with 0.0
X_train_scaled = np.nan_to_num(X_train_scaled, nan=0.0)
X_test_scaled = np.nan_to_num(X_test_scaled, nan=0.0)
print("Features scaled successfully")

## Hyper Parameter Optimization

In [None]:
print("\nPerforming hyperparameter optimization...")
# Candidate values for MultinomialNB's alpha, scored by accuracy over 5 CV folds
param_grid = dict(alpha=[0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0])
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)

print("Fitting grid search...")
grid_search.fit(X_train_scaled, y_train)

# Keep the winning alpha and its mean CV score for the following cells
best_alpha = grid_search.best_params_['alpha']
best_score = grid_search.best_score_
print(f"Best alpha parameter: {best_alpha}")
print(f"Best cross-validation score: {best_score:.4f}")

## K-fold cross validation
Use the best alpha found by the grid search.

In [None]:
print("\nPerforming k-fold cross-validation with best alpha...")
# Shuffled 5-fold split with a fixed seed
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
nb_cv = MultinomialNB(alpha=best_alpha)
cv_scores = cross_val_score(
    estimator=nb_cv,
    X=X_train_scaled,
    y=y_train,
    scoring='accuracy',
    cv=kf,
)
print(f"CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
print(f"Individual fold scores: {cv_scores}")

## Train final model with best parameters

In [None]:
print("\nTraining the final model...")
# fit() returns the estimator itself, so construction and fitting can be chained
nb_final = MultinomialNB(alpha=best_alpha).fit(X_train_scaled, y_train)
print("Model training complete")

## Evaluate on test set

In [None]:
print("\nEvaluating model performance on test set...")
y_pred = nb_final.predict(X_test_scaled)

# The report is built twice: as text for printing and as a dict that the
# results-saving step stores.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_report = classification_report(y_true=y_test, y_pred=y_pred)
class_report_dict = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)

print(
    f"Test Accuracy: {accuracy:.4f}",
    f"Confusion Matrix:\n{conf_matrix}",
    "Classification Report:",
    class_report,
    sep="\n",
)

## Feature importance analysis

In [None]:
# Show the per-class report for the test predictions
print("\nClassification Report:")
detailed_report = classification_report(y_test, y_pred)
print(detailed_report)

In [None]:
print("\nAnalyzing feature importance...")
try:
    # Importance = log-probability under class index 1 minus class index 0;
    # the ten largest differences are reported, largest first.
    class_log_probs = nb_final.feature_log_prob_
    feature_importance = class_log_probs[1] - class_log_probs[0]
    top_features_idx = np.argsort(feature_importance)[-10:]

    print("Top 10 most important features (by difference in log probability):")
    for i, idx in enumerate(reversed(top_features_idx), start=1):
        print(f"{i}. Feature {idx}: {feature_importance[idx]:.4f}")
except Exception as e:
    print(f"Could not analyze feature importance: {e}")

# 9. Save results
# Collects the tuning, cross-validation and test metrics into one dict and
# writes it as JSON under model_results/. Saving is best-effort: a failure is
# reported but does not stop the notebook.
print("\nSaving results...")
# One timestamp for both the record and the file name, so they cannot disagree
run_time = datetime.now()
results = {
    "model_name": "MultinomialNB with CBOW-W2V (custom encoding)",
    "timestamp": run_time.strftime("%Y-%m-%d %H:%M:%S"),
    "best_alpha": float(best_alpha),
    "best_cv_score": float(best_score),
    "k_fold_cv_scores": {
        "mean": float(cv_scores.mean()),
        "std": float(cv_scores.std()),
        "individual_folds": cv_scores.tolist()
    },
    "test_metrics": {
        "accuracy": float(accuracy),
        "confusion_matrix": conf_matrix.tolist(),
        "classification_report": class_report_dict
    },
}

results_filename = f"model_results/nb_multinomial_cbow_w2v_results_{run_time.strftime('%Y%m%d_%H%M%S')}.json"
try:
    # exist_ok=True avoids the check-then-create race; creating the directory
    # inside the try means a directory failure is reported like a write failure.
    os.makedirs('model_results', exist_ok=True)
    with open(results_filename, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4)
    print(f"Results saved to {results_filename}")
except Exception as e:
    print(f"Could not save results to file: {e}")

print("\nImplementation complete!")