# Imports 

### Iteration 3: NB_multinomial with word embeddings

In [None]:

import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from datasets import load_dataset
import time
from datetime import datetime
import json
import base64
import struct
import warnings
import torch
import os
warnings.filterwarnings('ignore')

In [None]:
# Pick the torch device: CUDA only when it is both requested and actually present.
use_cuda = True
print("CUDA is available:", torch.cuda.is_available())
if use_cuda and torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Dataset columns holding the embeddings; main() trains one model per entry, in this order.
EMBEDDING_COLUMN_NAMES = [
    "tfidf_embedding",
    "bow_embedding",
    "w2v_embedding",
    "roberta_embedding",
]

In [None]:
# Function to decode the custom embedding format
def decode_embedding(encoded_str):
    """
    Decode a base64-encoded embedding string into a numeric vector.

    The binary layout of the payload is not declared anywhere, so it is
    guessed from the decoded byte length, in this order:

    1. a multiple of 8 bytes -> big-endian 64-bit floats
    2. a multiple of 4 bytes -> big-endian 32-bit floats
    3. anything else         -> raw unsigned bytes

    Args:
        encoded_str: Base64 text, with or without trailing '=' padding.
            Example input: 'AQAAAAAAAABAAAAAAAAAAQAAAAAAAAEAAAAAAAAAAQA...'

    Returns:
        A 1-D numpy array: float64 for the two float layouts, uint8 for the
        raw-byte layout. If decoding fails entirely, an array holding the
        code point of every character of the input string exactly as it was
        passed in (without any padding added during the retry).
    """
    try:
        try:
            # Try standard base64 first.
            decoded = base64.b64decode(encoded_str)
        except ValueError:
            # binascii.Error is a ValueError subclass; catching it (rather
            # than using a bare except) lets KeyboardInterrupt/SystemExit
            # propagate. Retry once with the missing '=' padding restored,
            # on a copy so the fallback below still sees the caller's input.
            remainder = len(encoded_str) % 4
            padded = encoded_str + '=' * (4 - remainder) if remainder else encoded_str
            decoded = base64.b64decode(padded)

        # Interpret as 8-byte floats. The astype() call converts from the
        # big-endian wire dtype to a native, writable float64 array.
        if len(decoded) % 8 == 0:
            return np.frombuffer(decoded, dtype='>f8').astype(np.float64)

        # Interpret as 4-byte floats, widened to float64.
        if len(decoded) % 4 == 0:
            return np.frombuffer(decoded, dtype='>f4').astype(np.float64)

        # Neither float width fits: expose the raw bytes.
        return np.frombuffer(decoded, dtype=np.uint8)

    except Exception as e:
        print(f"Error decoding embedding: {e}")
        # As a fallback, convert each character to its ASCII value
        return np.array([ord(c) for c in encoded_str])

def _embedding_column_to_matrix(df, embedding_name, split_label):
    """Turn one embedding column into a 2-D feature matrix, or None if that is impossible."""
    if len(df) == 0:
        print(f"WARNING: The {split_label} dataset is empty!")
        return None

    # The first row decides how the whole column is interpreted.
    sample = df[embedding_name].iloc[0]
    print(f"Sample {split_label} embedding type: {type(sample)}")
    title = split_label.capitalize()

    try:
        if isinstance(sample, list):
            print(f"{title} embedding appears to be a list with {len(sample)} items")
            features = np.array(df[embedding_name].tolist())
        elif isinstance(sample, str):
            print(f"{title} embedding appears to be a string, will decode each embedding")
            features = np.array([decode_embedding(emb) for emb in df[embedding_name]])
        else:
            print(f"Unknown {split_label} embedding format, will try to convert")
            features = np.array([np.array(emb) for emb in df[embedding_name]])
    except ValueError as e:
        # Recent NumPy versions refuse to stack rows of different lengths.
        print(f"WARNING: Could not build a {split_label} feature matrix: {e}")
        return None

    # Anything that is not (samples, features) cannot be compared or scaled
    # downstream (scalar cells or ragged rows end up 1-D).
    if features.ndim != 2:
        print(f"WARNING: {title} embeddings did not form a 2-D matrix (shape {features.shape})")
        return None

    return features


def process_embedding(train_df, test_df, embedding_name):
    """
    Build feature matrices and label vectors for one embedding column.

    The column's format is detected from its first row in each dataset:
    lists are stacked directly, strings are passed through
    decode_embedding, and anything else is converted with np.array.

    Args:
        train_df: Training DataFrame with `embedding_name` and 'label' columns.
        test_df: Test DataFrame with `embedding_name` and 'label' columns.
        embedding_name: Name of the embedding column to process.

    Returns:
        (X_train, X_test, y_train, y_test) as numpy arrays, or
        (None, None, None, None) when the column is missing from either
        dataset, a dataset is empty, the embeddings cannot be stacked into a
        2-D matrix, or the train and test feature dimensions differ.
    """
    print(f"\n{'='*50}")
    print(f"PROCESSING {embedding_name} EMBEDDING")
    print(f"{'='*50}")

    # Check if embedding exists in both datasets
    if embedding_name not in train_df.columns or embedding_name not in test_df.columns:
        print(f"Warning: {embedding_name} not found in both datasets!")
        print(f"Train columns: {train_df.columns}")
        print(f"Test columns: {test_df.columns}")
        return None, None, None, None

    print(f"\nExtracting {embedding_name} embeddings...")

    X_train = _embedding_column_to_matrix(train_df, embedding_name, "training")
    if X_train is None:
        print("Cannot proceed with this embedding type.")
        return None, None, None, None

    X_test = _embedding_column_to_matrix(test_df, embedding_name, "test")
    if X_test is None:
        print("Cannot proceed with this embedding type.")
        return None, None, None, None

    # Extract labels
    y_train = np.array(train_df['label'])
    y_test = np.array(test_df['label'])

    print(f"Prepared training features with shape {X_train.shape} and labels with shape {y_train.shape}")
    print(f"Prepared testing features with shape {X_test.shape} and labels with shape {y_test.shape}")

    # A model fitted on the training features can only score test rows of the same width.
    if X_train.shape[1] != X_test.shape[1]:
        print(f"WARNING: Feature dimensions don't match! Training: {X_train.shape[1]}, Testing: {X_test.shape[1]}")
        print("Cannot proceed with this embedding type.")
        return None, None, None, None

    return X_train, X_test, y_train, y_test

def train_and_evaluate_model(X_train, X_test, y_train, y_test, embedding_name):
    """
    Tune, train and evaluate a MultinomialNB classifier on one embedding type.

    Pipeline:
      1. Min-max scale the features (scaler fitted on the training set only)
         and replace any NaN in the scaled matrices with 0.
      2. Grid-search the smoothing parameter `alpha` with 5-fold CV.
      3. Re-score the best alpha with a shuffled 5-fold CV (random_state=42).
      4. Fit a final model on the full training set and evaluate it on the
         test set (accuracy, confusion matrix, classification report).
      5. Print the ten features with the largest log-probability difference
         between class index 1 and class index 0.
      6. Write the metrics as JSON under 'model_results/'.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix with the same number of columns.
        y_train: Training labels.
        y_test: Test labels.
        embedding_name: Embedding identifier used in the report and file name.

    Returns:
        The results dict (returned even when writing the JSON file fails),
        or None if scaling, training or evaluation raised.
    """
    try:
        print("\nScaling features...")
        # NOTE(review): the scaler is fitted on the training data only, so
        # test values outside the training range may scale outside [0, 1].
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        print("Features scaled successfully")

        # Remove NaN values
        X_train_scaled = np.nan_to_num(X_train_scaled, nan=0.0)
        X_test_scaled = np.nan_to_num(X_test_scaled, nan=0.0)

        print("\nPerforming hyperparameter optimization...")
        param_grid = {'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]}
        grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')

        print("Fitting grid search...")
        grid_search.fit(X_train_scaled, y_train)

        best_alpha = grid_search.best_params_['alpha']
        best_score = grid_search.best_score_
        print(f"Best alpha parameter: {best_alpha}")
        print(f"Best cross-validation score: {best_score:.4f}")

        print("\nPerforming k-fold cross-validation with best alpha...")
        k_folds = 5
        kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
        nb_cv = MultinomialNB(alpha=best_alpha)
        cv_scores = cross_val_score(nb_cv, X_train_scaled, y_train, cv=kf, scoring='accuracy')
        print(f"CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
        print(f"Individual fold scores: {cv_scores}")

        print("\nTraining the final model...")
        nb_final = MultinomialNB(alpha=best_alpha)
        nb_final.fit(X_train_scaled, y_train)
        print("Model training complete")

        print("\nEvaluating model performance on test set...")
        y_pred = nb_final.predict(X_test_scaled)

        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        class_report = classification_report(y_test, y_pred)
        class_report_dict = classification_report(y_test, y_pred, output_dict=True)

        print(f"Test Accuracy: {accuracy:.4f}")
        print(f"Confusion Matrix:\n{conf_matrix}")
        print("Classification Report:")
        print(class_report)

        print("\nAnalyzing feature importance...")
        try:
            # Only class indices 0 and 1 are compared; this is best-effort and
            # is skipped with a message if those rows are unavailable.
            feature_importance = nb_final.feature_log_prob_[1] - nb_final.feature_log_prob_[0]
            top_features_idx = np.argsort(feature_importance)[-10:]  # Get indices of top 10 most important features

            print("Top 10 most important features (by difference in log probability):")
            for i, idx in enumerate(top_features_idx[::-1], 1):
                print(f"{i}. Feature {idx}: {feature_importance[idx]:.4f}")
        except Exception as e:
            print(f"Could not analyze feature importance: {e}")

        # Save results
        print("\nSaving results...")
        # One timestamp for both the record and the file name keeps them in sync.
        run_time = datetime.now()
        results = {
            "model_name": f"MultinomialNB with {embedding_name}",
            "timestamp": run_time.strftime("%Y-%m-%d %H:%M:%S"),
            "best_alpha": float(best_alpha),
            "best_cv_score": float(best_score),
            "k_fold_cv_scores": {
                "mean": float(cv_scores.mean()),
                "std": float(cv_scores.std()),
                "individual_folds": cv_scores.tolist()
            },
            "test_metrics": {
                "accuracy": float(accuracy),
                "confusion_matrix": conf_matrix.tolist(),
                "classification_report": class_report_dict
            },
        }

        results_filename = f"model_results/nb_multinomial_{embedding_name}_results_{run_time.strftime('%Y%m%d_%H%M%S')}.json"
        try:
            # Creating the directory inside this try means a filesystem error
            # only costs the file on disk, not the metrics computed above.
            os.makedirs('model_results', exist_ok=True)
            with open(results_filename, 'w') as f:
                json.dump(results, f, indent=4)
            print(f"Results saved to {results_filename}")
        except (OSError, TypeError, ValueError) as e:
            print(f"Could not save results to file: {e}")

        return results

    except Exception as e:
        print(f"Error during model training/evaluation: {e}")
        return None

def main():
    """
    Run the full experiment: load both datasets, train one model per embedding
    type listed in EMBEDDING_COLUMN_NAMES, then print and save a comparison.

    Returns early (after printing the error) if either dataset fails to load.
    Embeddings that cannot be preprocessed or trained are skipped; the
    comparison covers only the embeddings that produced results.
    """
    # Load Training Dataset
    print("\nLoading training dataset...")
    try:
        train_dataset = load_dataset("Paulozs/WELFake_embeddings", split="train")
        train_df = pd.DataFrame(train_dataset)
        print(f"Training dataset loaded successfully! Shape: {train_df.shape}")
    except Exception as e:
        print(f"Error loading training dataset: {e}")
        return

    # Load Test Dataset
    print("\nLoading Test dataset...")
    try:
        test_dataset = load_dataset("lelexuanzz/Gossipcop_Politifact_Test", split="train")
        test_df = pd.DataFrame(test_dataset)
        print(f"Testing dataset loaded successfully! Shape: {test_df.shape}")
    except Exception as e:
        print(f"Error loading testing dataset: {e}")
        return

    # Summary dict to store results for all embeddings
    all_results = {}

    # Process each embedding type
    for embedding_name in EMBEDDING_COLUMN_NAMES:
        start_time = time.time()

        # Process the embedding
        X_train, X_test, y_train, y_test = process_embedding(train_df, test_df, embedding_name)

        if X_train is not None:
            # Train and evaluate model on this embedding
            results = train_and_evaluate_model(X_train, X_test, y_train, y_test, embedding_name)

            if results:
                elapsed_time = time.time() - start_time
                results["processing_time"] = elapsed_time
                all_results[embedding_name] = results
                print(f"\nCompleted processing {embedding_name} in {elapsed_time:.2f} seconds")
            else:
                print(f"\nFailed to process {embedding_name}")
        else:
            print(f"\nSkipping {embedding_name} due to preprocessing issues")

    # Compare all embedding results
    print("\n" + "="*80)
    print("EMBEDDING COMPARISON SUMMARY")
    print("="*80)

    if all_results:
        print("\nAccuracy comparison:")
        for embedding, results in all_results.items():
            test_metrics = results["test_metrics"]
            weighted_avg = test_metrics["classification_report"]["weighted avg"]
            accuracy = test_metrics["accuracy"]
            precision = weighted_avg["precision"]
            recall = weighted_avg["recall"]
            # cv_score must be bound before the summary line that formats it.
            cv_score = results["best_cv_score"]
            print(f"{embedding}: Test Accuracy = {accuracy:.4f}, CV Score = {cv_score:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}")

        # Save comparison results
        comparison_filename = f"model_results/embedding_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        try:
            # The directory may be missing if every per-model save failed.
            os.makedirs('model_results', exist_ok=True)
            with open(comparison_filename, 'w') as f:
                json.dump(all_results, f, indent=4)
            print(f"\nComparison results saved to {comparison_filename}")
        except (OSError, TypeError, ValueError) as e:
            print(f"\nCould not save comparison results to file: {e}")
    else:
        print("No results to compare. All embedding processing failed.")

    print("\nImplementation complete!")

In [None]:
# Run the experiment only when executed directly (script or notebook cell),
# so importing this module does not trigger dataset downloads and training.
if __name__ == "__main__":
    main()