In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from scipy import sparse
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
CSV_PATH = "elections_with_perks_no_special.csv"  # spreadsheet read by load_data
RANDOM_STATE = 42   # base seed for the forests and the CV splitter
N_ESTIMATORS = 300  # trees per random forest
N_SPLITS = 4        # folds used by StratifiedKFold

# ---------------------------------------------------------------------------
# Iterative random forest (iRF) settings
# ---------------------------------------------------------------------------
IRF_ITERATIONS = 3   # Number of iRF iterations
K_INTERACTIONS = 2   # Order of interactions to discover (2 = pairwise)
TOP_K_FEATURES = 11  # Number of top features to consider for interactions

# Fixed domains as provided. The order matters: it defines the column order of
# the one-hot candidate / perk features built from each spreadsheet row.
CANDIDATES_DOMAIN = [
    'Aatrox',
    'Cole',
    'Diana',
    'Diaz',
    'Finnegan',
    'Foxy',
    'Marina',
    'Paul',
]
PERKS_DOMAIN = [
    'Perks.ATimeForGiving',
    'Perks.Benediction',
    'Perks.BloomingBusiness',
    'Perks.ChivalrousCarnival',
    'Perks.DoubleTrouble',
    'Perks.EZPZ',
    'Perks.ExtraEvent',
    'Perks.ExtraEventFishing_Festival',
    'Perks.ExtraEventMining_Fiesta',
    'Perks.ExtraEventSpooky_Festival',
    'Perks.ExtraEventSweet_Tooth',
    'Perks.FishingFestival',
    'Perks.FishingXPBuff',
    'Perks.GOATed',
    'Perks.Marauder',
    'Perks.MiningFiesta',
    'Perks.MiningXPBuff',
    'Perks.MoltenForge',
    'Perks.MythologicalRitual',
    'Perks.Pathfinder',
    'Perks.PeltPocalypse',
    'Perks.Perkpocalypse',
    'Perks.PestEradicator',
    'Perks.PetXPBuff',
    'Perks.Prospection',
    'Perks.SharingIsCaring',
    'Perks.ShoppingSpree',
    'Perks.SlashedPricing',
    'Perks.SlayerXPBuff',
    'Perks.StockExchange',
    'Perks.SweetBenevolence',
    'Perks.VolumeTrading',
]


def sparse_scale_features(X, feature_names, indicator_start_idx):
    """Scale the non-zero indicator values of X and leave everything else alone.

    Columns before ``indicator_start_idx`` are copied through unchanged.
    Columns from ``indicator_start_idx`` onwards are treated as one pooled
    distribution: a single ``RobustScaler`` is fitted on all of their values
    (flattened into one column, zeros included) and then applied only to the
    entries that are non-zero, so zero entries remain exactly zero.

    Args:
        X: 2-D array-like of features. It is never modified; a float copy is
            scaled and returned.
        feature_names: Unused; kept so existing callers keep working.
        indicator_start_idx: Index of the first indicator column.

    Returns:
        Tuple ``(X_scaled, scaler_info)``. ``scaler_info`` is:
          * ``{}`` when there are no indicator columns,
          * ``{'scaler': None, 'sparsity': 1.0}`` when every indicator value
            is zero,
          * otherwise a dict with the fitted ``scaler``, its ``center`` and
            ``scale``, ``non_zero_count``, ``total_count`` and ``sparsity``.
    """
    # One float copy is enough; the caller's array stays untouched.
    X_scaled = np.array(X, dtype=float)

    # Basic slicing returns a view, so writes below land in X_scaled.
    indicator_features = X_scaled[:, indicator_start_idx:]

    scaler_info = {}
    if indicator_features.shape[1] == 0:
        return X_scaled, scaler_info

    non_zero_mask = indicator_features != 0
    # Boolean indexing copies, so these are the pre-scaling values.
    non_zero_values = indicator_features[non_zero_mask]

    if len(non_zero_values) == 0:
        return X_scaled, {'scaler': None, 'sparsity': 1.0}

    # NOTE(review): the scaler is fitted on every indicator value, zeros
    # included, while only the non-zero entries are transformed. Confirm this
    # is intended rather than fitting on `non_zero_values` alone.
    scaler = RobustScaler()
    scaler.fit(indicator_features.reshape(-1, 1))

    indicator_features[non_zero_mask] = scaler.transform(
        non_zero_values.reshape(-1, 1)
    ).flatten()

    total_count = indicator_features.size
    scaler_info = {
        'scaler': scaler,
        'center': scaler.center_[0],
        'scale': scaler.scale_[0],
        'non_zero_count': len(non_zero_values),
        'total_count': total_count,
        'sparsity': 1 - (len(non_zero_values) / total_count)
    }

    return X_scaled, scaler_info

def load_data(csv_filename):
    """Read the election CSV and build the feature matrix and target vector.

    The CSV must contain ``candidates``, ``perks``, ``mayor`` and ``minister``
    columns. Every column to the right of ``minister`` (except ``mayor``) is
    used as a numeric indicator feature, with missing values read as 0.0.
    Rows whose mayor is empty, ``nan`` or not in ``CANDIDATES_DOMAIN`` are
    skipped.

    Returns:
        Tuple ``(X, y, feature_names, label_encoder, indicator_start_idx)``:
        the feature array, the array of mayor names, the column names of X,
        a ``LabelEncoder`` fitted on ``CANDIDATES_DOMAIN``, and the index of
        the first indicator column in X.

    Raises:
        ValueError: if a required column is missing.
    """
    frame = pd.read_csv(csv_filename)
    columns = list(frame.columns)

    missing = {"candidates", "perks", "mayor"} - set(columns)
    if missing:
        raise ValueError(f"Missing required columns in CSV: {missing}")
    if "minister" not in columns:
        raise ValueError("Expected 'minister' column not found in CSV.")

    # Indicator columns are everything right of 'minister', minus the target.
    after_minister = columns[columns.index("minister") + 1:]
    indicator_cols = [name for name in after_minister if name != "mayor"]

    def split_field(raw):
        """Split a comma-separated cell into stripped, non-empty tokens."""
        return [token.strip() for token in str(raw).split(',') if token.strip()]

    encoder = LabelEncoder()
    encoder.fit(CANDIDATES_DOMAIN)

    X, y = [], []
    for _, record in frame.iterrows():
        winner = str(record['mayor']).strip()
        # A missing mayor is stringified as 'nan'; such rows have no target.
        if not winner or winner == 'nan':
            continue

        indicator_values = [
            0.0 if pd.isna(record[name]) else float(record[name])
            for name in indicator_cols
        ]

        if winner not in CANDIDATES_DOMAIN:
            continue

        running = split_field(record['candidates'])
        offered = split_field(record['perks'])

        candidate_bits = [1 if name in running else 0 for name in CANDIDATES_DOMAIN]
        perk_bits = [1 if name in offered else 0 for name in PERKS_DOMAIN]

        X.append(candidate_bits + perk_bits + indicator_values)
        y.append(winner)

    feature_names = [f"cand_{name}" for name in CANDIDATES_DOMAIN]
    feature_names += [f"perk_{name}" for name in PERKS_DOMAIN]
    feature_names += [f"indicator_{name}" for name in indicator_cols]

    indicator_start_idx = len(CANDIDATES_DOMAIN) + len(PERKS_DOMAIN)

    return np.array(X), np.array(y), feature_names, encoder, indicator_start_idx

def create_interaction_features(X, feature_names, important_indices, k=2):
    """Build product features for every k-sized combination of columns.

    Args:
        X: 2-D array of features.
        feature_names: Names of the columns of X, indexed like X's columns.
        important_indices: Column indices to combine.
        k: Number of columns multiplied together per interaction.

    Returns:
        Tuple ``(X_interactions, interaction_names)``. The array has one
        column per combination (zero columns when there is none) and the
        names are the member names joined by `` * `` with an
        ``interaction_`` prefix.
    """
    columns = []
    labels = []

    for members in combinations(important_indices, k):
        # Row-wise product of the selected columns.
        columns.append(X[:, members].prod(axis=1))
        labels.append("interaction_" + " * ".join(feature_names[i] for i in members))

    if not columns:
        return np.empty((X.shape[0], 0)), []

    return np.column_stack(columns), labels

def drop_zero_features(X):
    """Return a float copy of X in which every zero entry is replaced by NaN.

    The input is not modified.
    """
    values = np.array(X, dtype=float)
    return np.where(values == 0, np.nan, values)

class IterativeRandomForest:
    """Iterative Random Forest that augments the data with interaction features.

    Each iteration fits a ``RandomForestClassifier``. After every iteration
    except the last, the ``top_k_features`` most important columns are
    combined ``k_interactions`` at a time (row-wise products) and appended to
    the feature matrix used by the next iteration. Predictions come from the
    model of the final iteration, so inputs to ``predict`` / ``predict_proba``
    must already contain the same appended interaction columns.

    Attributes set by ``fit``:
        models_: fitted forest of each iteration.
        feature_names_history_: feature names seen by each iteration.
        importance_history_: feature importances of each iteration.
        interaction_features_: interaction columns created during fitting.
        interaction_names_: names of those interaction columns.
    """

    def __init__(self, n_estimators=100, random_state=42, n_iterations=3,
                 importance_threshold=0.01, k_interactions=2, top_k_features=20):
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.n_iterations = n_iterations
        # Stored for callers; not consulted by any method of this class.
        self.importance_threshold = importance_threshold
        self.k_interactions = k_interactions
        self.top_k_features = top_k_features

        self._reset_state()

    def _reset_state(self):
        """Clear everything learned by a previous call to ``fit``."""
        self.models_ = []
        self.feature_names_history_ = []
        self.importance_history_ = []
        self.interaction_features_ = []
        self.interaction_names_ = []

    def fit(self, X, y, feature_names):
        """Fit the iRF model with iterative interaction discovery.

        Args:
            X: 2-D array-like of training features (not modified).
            y: Training labels.
            feature_names: Names of the columns of X (not modified).

        Returns:
            self
        """
        # Refitting must start clean; otherwise histories from an earlier fit
        # would be mixed with the new ones.
        self._reset_state()

        X_current = np.array(X)
        feature_names_current = list(feature_names)

        for iteration in range(self.n_iterations):
            # A different seed per iteration keeps the forests distinct.
            rf = RandomForestClassifier(n_estimators=self.n_estimators,
                                        random_state=self.random_state + iteration)
            rf.fit(X_current, y)

            self.models_.append(rf)
            self.feature_names_history_.append(feature_names_current.copy())

            importances = rf.feature_importances_
            self.importance_history_.append(importances.copy())

            # The last iteration's model is the final one: nothing to add after it.
            if iteration < self.n_iterations - 1:
                top_indices = np.argsort(importances)[-self.top_k_features:]

                X_interactions, interaction_names = create_interaction_features(
                    X_current, feature_names_current, top_indices, self.k_interactions
                )

                if X_interactions.shape[1] > 0:
                    X_current = np.column_stack([X_current, X_interactions])
                    feature_names_current.extend(interaction_names)

                    self.interaction_features_.extend(X_interactions.T)
                    self.interaction_names_.extend(interaction_names)

        return self

    def predict(self, X):
        """Make predictions using the final model.

        Raises:
            ValueError: if the model has not been fitted.
        """
        if not self.models_:
            raise ValueError("Model not fitted yet!")
        # NOTE(review): zeros are turned into NaN here, but `fit` trains on the
        # zero-valued data as given. Confirm this train/predict difference is
        # intended and supported by the installed scikit-learn version.
        X = drop_zero_features(X)
        return self.models_[-1].predict(X)

    def predict_proba(self, X):
        """Predict class probabilities using the final model.

        Raises:
            ValueError: if the model has not been fitted.
        """
        if not self.models_:
            raise ValueError("Model not fitted yet!")
        # Same zero -> NaN masking as `predict`.
        X = drop_zero_features(X)
        return self.models_[-1].predict_proba(X)

    def get_feature_importance(self):
        """Get feature importance from the final model.

        Raises:
            ValueError: if the model has not been fitted.
        """
        if not self.models_:
            raise ValueError("Model not fitted yet!")
        return self.models_[-1].feature_importances_

    def get_final_feature_names(self):
        """Get feature names from the final iteration.

        Raises:
            ValueError: if the model has not been fitted.
        """
        if not self.feature_names_history_:
            raise ValueError("Model not fitted yet!")
        return self.feature_names_history_[-1]

def print_performance_metrics(y_true, y_pred, class_names, title="Performance Metrics"):
    """Print overall, per-class and macro metrics and return them as a dict.

    Args:
        y_true: True labels, encoded as integers ``0 .. len(class_names) - 1``
            (the per-class report is requested for exactly those labels).
        y_pred: Predicted labels, encoded the same way.
        class_names: Display name of each class, indexed by encoded label.
        title: Heading printed above the report.

    Returns:
        Dict with ``accuracy``, weighted and macro precision/recall/F1, and
        the per-class precision, recall, F1 and support arrays.
    """
    # Arrays make the element-wise comparison below valid for list inputs too.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    print(f"\n{'='*50}")
    print(f"{title}")
    print(f"{'='*50}")

    # zero_division=0 gives the same values as the default but without a
    # warning for every class that has no predicted or true samples.
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    accuracy = np.mean(y_true == y_pred)

    print(f"Overall: Accuracy={accuracy:.4f}, Precision={precision:.4f}, Recall={recall:.4f}, F1={f1:.4f}")

    precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(
        y_true, y_pred, average=None, labels=range(len(class_names)), zero_division=0
    )

    print(f"\nPer-Class:")
    print(f"{'Class':<12} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Support':<8}")
    print("-" * 60)
    for i, class_name in enumerate(class_names):
        print(f"{class_name:<12} {precision_per_class[i]:<10.4f} {recall_per_class[i]:<10.4f} "
              f"{f1_per_class[i]:<10.4f} {support_per_class[i]:<8}")

    # Unweighted mean over all classes listed in class_names.
    macro_precision = np.mean(precision_per_class)
    macro_recall = np.mean(recall_per_class)
    macro_f1 = np.mean(f1_per_class)

    print(f"\nMacro: Precision={macro_precision:.4f}, Recall={macro_recall:.4f}, F1={macro_f1:.4f}")

    return {
        'accuracy': accuracy,
        'precision_weighted': precision,
        'recall_weighted': recall,
        'f1_weighted': f1,
        'precision_macro': macro_precision,
        'recall_macro': macro_recall,
        'f1_macro': macro_f1,
        'precision_per_class': precision_per_class,
        'recall_per_class': recall_per_class,
        'f1_per_class': f1_per_class,
        'support_per_class': support_per_class
    }

def plot_confusion_matrix(y_true, y_pred, class_names, title="Confusion Matrix"):
    """Draw the confusion matrix of y_true vs. y_pred as an annotated heatmap.

    Rows are labelled 'Actual' and columns 'Predicted'; ``class_names`` is
    used for the tick labels of both axes. The figure is shown immediately.
    """
    matrix = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(10, 8))
    heatmap_options = {
        'annot': True,
        'fmt': 'd',
        'cmap': 'Blues',
        'xticklabels': class_names,
        'yticklabels': class_names,
    }
    sns.heatmap(matrix, **heatmap_options)

    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    # Slanted column labels, upright row labels.
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)

    plt.tight_layout()
    plt.show()

# Load data and apply sparse scaling
X_raw, y_irf, feature_names_irf, le_irf, indicator_start_idx = load_data(CSV_PATH)
y_irf_enc = le_irf.transform(y_irf)
class_names = le_irf.classes_

# Apply sparse scaling
X_irf, scaler_info = sparse_scale_features(X_raw, feature_names_irf, indicator_start_idx)

# Store scaling information for later use
normalization_results = {
    'X_normalized': X_irf,
    'y': y_irf_enc,
    'feature_names': feature_names_irf,
    'label_encoder': le_irf,
    'scaler_info': scaler_info,
    'indicator_start_idx': indicator_start_idx
}

# Create and fit iRF model
irf_model = IterativeRandomForest(
    n_estimators=N_ESTIMATORS,
    random_state=RANDOM_STATE,
    n_iterations=IRF_ITERATIONS,
    k_interactions=K_INTERACTIONS,
    top_k_features=TOP_K_FEATURES
)

irf_model.fit(X_irf, y_irf_enc, feature_names_irf)

# Get final feature importance
final_importances = irf_model.get_feature_importance()
final_feature_names = irf_model.get_final_feature_names()

feature_importance_df = pd.DataFrame({
    'feature': final_feature_names,
    'importance': final_importances
}).sort_values('importance', ascending=False)

# Analyze feature types
candidate_features = feature_importance_df[feature_importance_df['feature'].str.startswith('cand_')]
perk_features = feature_importance_df[feature_importance_df['feature'].str.startswith('perk_')]
indicator_features = feature_importance_df[feature_importance_df['feature'].str.startswith('indicator_')]
interaction_features = feature_importance_df[feature_importance_df['feature'].str.startswith('interaction_')]

# Feature-name prefix -> human-readable type, checked in this order.
_FEATURE_TYPE_BY_PREFIX = (
    ('cand_', "Candidate"),
    ('perk_', "Perk"),
    ('indicator_', "Indicator"),
    ('interaction_', "Interaction"),
)


def _feature_type(feature_name):
    """Return the display type of a feature based on its name prefix."""
    for prefix, label in _FEATURE_TYPE_BY_PREFIX:
        if feature_name.startswith(prefix):
            return label
    return "Other"


# Print top 15 most useful features
print("="*80)
print("TOP 15 MOST USEFUL FEATURES FOR THE MODEL")
print("="*80)
print(f"{'Rank':<4} {'Feature':<40} {'Importance':<12} {'Type':<15}")
print("-" * 80)

top_15_features = feature_importance_df.head(15)
for i, (_, row) in enumerate(top_15_features.iterrows(), 1):
    feature_name = row['feature']
    importance = row['importance']
    feature_type = _feature_type(feature_name)

    print(f"{i:<4} {feature_name:<40} {importance:<12.6f} {feature_type:<15}")

print("\n" + "="*80)
print("FEATURE TYPE BREAKDOWN IN TOP 15")
print("="*80)

# Count feature types in top 15
type_counts = {}
for feature_name in top_15_features['feature']:
    feature_type = _feature_type(feature_name)
    type_counts[feature_type] = type_counts.get(feature_type, 0) + 1

# Percentages are relative to the rows actually listed, which can be fewer
# than 15 when the model has fewer features.
n_listed = len(top_15_features)
for feature_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
    percentage = (count / n_listed) * 100
    print(f"{feature_type:<15}: {count:>2} features ({percentage:>5.1f}%)")

print("\n" + "="*80)

# MODEL TESTING AND EVALUATION

def create_test_features(X_test, irf_model):
    """Rebuild on new data the interaction columns a fitted iRF model added.

    For every iteration except the last, the importances recorded by
    ``irf_model`` select the top features, and the row-wise product of each
    combination of them is appended to the data, in the same order used
    while fitting.

    The number of top features and the interaction order are read from
    ``irf_model`` itself (``top_k_features`` / ``k_interactions``), so the
    result lines up with the model even when it was built with settings that
    differ from the module-level defaults.

    Args:
        X_test: 2-D array with the same base columns the model was fitted on.
        irf_model: Fitted ``IterativeRandomForest``.

    Returns:
        Copy of ``X_test`` with the interaction columns appended.
    """
    X_test_augmented = X_test.copy()
    top_k = irf_model.top_k_features
    order = irf_model.k_interactions

    # The final model adds no interactions, hence the "- 1".
    for iteration in range(len(irf_model.models_) - 1):
        importances = irf_model.importance_history_[iteration]
        top_indices = np.argsort(importances)[-top_k:]

        n_available = X_test_augmented.shape[1]
        X_interactions_test = [
            np.prod(X_test_augmented[:, combo], axis=1)
            for combo in combinations(top_indices, order)
            # Skip combinations pointing at columns this input does not have.
            if all(idx < n_available for idx in combo)
        ]

        if X_interactions_test:
            X_test_augmented = np.column_stack(
                [X_test_augmented, np.column_stack(X_interactions_test)]
            )

    return X_test_augmented

def _match_feature_count(X, n_features):
    """Truncate or zero-pad the columns of X so it has exactly n_features."""
    n_current = X.shape[1]
    if n_current > n_features:
        return X[:, :n_features]
    if n_current < n_features:
        padding = np.zeros((X.shape[0], n_features - n_current))
        return np.column_stack([X, padding])
    return X


def evaluate_irf_with_cv():
    """Evaluate iRF using stratified cross-validation.

    For each fold a fresh ``IterativeRandomForest`` is fitted on the training
    part of the module-level ``X_irf`` / ``y_irf_enc``; the held-out part is
    given the same interaction columns via ``create_test_features`` before
    prediction.

    Returns:
        Dict mapping each metric name (``accuracy``, weighted and macro
        precision/recall/F1) to the list of per-fold scores.
    """
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

    cv_scores = {
        'accuracy': [],
        'precision_weighted': [],
        'recall_weighted': [],
        'f1_weighted': [],
        'precision_macro': [],
        'recall_macro': [],
        'f1_macro': []
    }

    for fold, (train_idx, test_idx) in enumerate(skf.split(X_irf, y_irf_enc)):
        X_train_fold, X_test_fold = X_irf[train_idx], X_irf[test_idx]
        y_train_fold, y_test_fold = y_irf_enc[train_idx], y_irf_enc[test_idx]

        irf_fold = IterativeRandomForest(
            n_estimators=N_ESTIMATORS,
            random_state=RANDOM_STATE + fold,
            n_iterations=IRF_ITERATIONS,
            k_interactions=K_INTERACTIONS,
            top_k_features=TOP_K_FEATURES
        )

        irf_fold.fit(X_train_fold, y_train_fold, feature_names_irf)

        X_test_fold_augmented = create_test_features(X_test_fold, irf_fold)

        # Safety net: force the column count the final model was fitted with.
        X_test_fold_augmented = _match_feature_count(
            X_test_fold_augmented, len(irf_fold.get_final_feature_names())
        )

        y_pred_fold = irf_fold.predict(X_test_fold_augmented)

        # zero_division=0 keeps the default values but avoids a warning for
        # every class that is missing from a fold's predictions.
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test_fold, y_pred_fold, average='weighted', zero_division=0
        )
        accuracy = np.mean(y_test_fold == y_pred_fold)

        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
            y_test_fold, y_pred_fold, average='macro', zero_division=0
        )

        cv_scores['accuracy'].append(accuracy)
        cv_scores['precision_weighted'].append(precision)
        cv_scores['recall_weighted'].append(recall)
        cv_scores['f1_weighted'].append(f1)
        cv_scores['precision_macro'].append(precision_macro)
        cv_scores['recall_macro'].append(recall_macro)
        cv_scores['f1_macro'].append(f1_macro)

    return cv_scores

def compare_with_baseline_rf():
    """Print a cross-validated comparison of a plain random forest and iRF.

    The baseline forest is scored with ``cross_validate`` on the module-level
    ``X_irf`` / ``y_irf_enc``; the iRF scores come from
    ``evaluate_irf_with_cv``. A summary table then shows both means and the
    relative improvement for accuracy, weighted F1 and macro F1.
    """
    print(f"\n{'='*60}")
    print("COMPARISON: iRF vs BASELINE RANDOM FOREST")
    print(f"{'='*60}")

    scoring = [
        'accuracy',
        'precision_weighted', 'recall_weighted', 'f1_weighted',
        'precision_macro', 'recall_macro', 'f1_macro',
    ]

    folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    baseline_model = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
    baseline_cv_results = cross_validate(
        baseline_model, X_irf, y_irf_enc, cv=folds, scoring=scoring, n_jobs=-1
    )

    print(f"\nBaseline Random Forest ({N_SPLITS}-Fold CV):")
    for metric in scoring:
        scores = baseline_cv_results[f'test_{metric}']
        print(f"{metric.replace('_', ' ').title()}: {scores.mean():.4f} ± {scores.std():.4f}")

    irf_cv_scores = evaluate_irf_with_cv()

    print(f"\n{'='*50}")
    print("PERFORMANCE COMPARISON SUMMARY")
    print(f"{'='*50}")

    print(f"{'Metric':<20} {'Baseline RF':<15} {'iRF':<15} {'Improvement':<12}")
    print("-" * 65)

    summary_metrics = ('accuracy', 'f1_weighted', 'f1_macro')
    for metric in summary_metrics:
        baseline_score = np.mean(baseline_cv_results[f'test_{metric}'])
        irf_score = np.mean(irf_cv_scores[metric])
        # Relative change of iRF over the baseline, in percent.
        gain = irf_score - baseline_score
        improvement = (gain / baseline_score) * 100

        print(f"{metric.replace('_', ' ').title():<20} {baseline_score:<15.4f} {irf_score:<15.4f} {improvement:+.2f}%")

# Run the evaluation. If the cross-validated comparison fails for any reason,
# report why and fall back to a single stratified 70/30 train/test split.
try:
    compare_with_baseline_rf()
except Exception as e:
    # Do not hide the failure: the fallback numbers are not comparable to CV.
    print(f"Cross-validated comparison failed ({type(e).__name__}: {e}); "
          f"falling back to a single train/test split.")

    X_train_simple, X_test_simple, y_train_simple, y_test_simple = train_test_split(
        X_irf, y_irf_enc, test_size=0.3, random_state=RANDOM_STATE, stratify=y_irf_enc
    )

    irf_simple = IterativeRandomForest(
        n_estimators=N_ESTIMATORS,
        random_state=RANDOM_STATE,
        n_iterations=IRF_ITERATIONS,
        k_interactions=K_INTERACTIONS,
        top_k_features=TOP_K_FEATURES
    )

    irf_simple.fit(X_train_simple, y_train_simple, feature_names_irf)

    # Give the test split the same interaction columns the model was fitted with.
    X_test_augmented = create_test_features(X_test_simple, irf_simple)

    # Safety net: truncate or zero-pad to the column count of the final model.
    n_features_needed = len(irf_simple.get_final_feature_names())
    if X_test_augmented.shape[1] > n_features_needed:
        X_test_augmented = X_test_augmented[:, :n_features_needed]
    elif X_test_augmented.shape[1] < n_features_needed:
        n_missing = n_features_needed - X_test_augmented.shape[1]
        padding = np.zeros((X_test_augmented.shape[0], n_missing))
        X_test_augmented = np.column_stack([X_test_augmented, padding])

    y_pred_simple = irf_simple.predict(X_test_augmented)

    print_performance_metrics(y_test_simple, y_pred_simple, class_names, "iRF Simple Test")

    # Baseline forest on the same split, using the un-augmented features.
    rf_baseline_simple = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
    rf_baseline_simple.fit(X_train_simple, y_train_simple)
    y_pred_baseline = rf_baseline_simple.predict(X_test_simple)

    print_performance_metrics(y_test_simple, y_pred_baseline, class_names, "Baseline RF Test")

TOP 15 MOST USEFUL FEATURES FOR THE MODEL
Rank Feature                                  Importance   Type           
--------------------------------------------------------------------------------
1    interaction_perk_Perks.SlashedPricing * interaction_perk_Perks.SlayerXPBuff * perk_Perks.SlashedPricing 0.010973     Interaction    
2    interaction_interaction_perk_Perks.Prospection * perk_Perks.MiningFiesta * perk_Perks.MiningFiesta 0.010387     Interaction    
3    interaction_perk_Perks.Prospection * perk_Perks.MiningFiesta 0.010387     Interaction    
4    perk_Perks.MiningFiesta                  0.010222     Perk           
5    interaction_interaction_cand_Aatrox * perk_Perks.SlayerXPBuff * interaction_cand_Aatrox * perk_Perks.SlashedPricing 0.009950     Interaction    
6    interaction_perk_Perks.EZPZ * interaction_perk_Perks.Benediction * perk_Perks.EZPZ 0.009731     Interaction    
7    interaction_perk_Perks.FishingXPBuff * perk_Perks.FishingFestival 0.009582     Interactio

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



PERFORMANCE COMPARISON SUMMARY
Metric               Baseline RF     iRF             Improvement 
-----------------------------------------------------------------
Accuracy             0.5020          0.5945          +18.42%
F1 Weighted          0.4745          0.5736          +20.89%
F1 Macro             0.4367          0.5281          +20.93%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [3]:
# ============================================================================
# TEST MODEL ON LAST ROW OF SPREADSHEET
# ============================================================================

def test_model_on_last_row():
    """Predict the mayor for the last row of the spreadsheet with the iRF model.

    The last row of ``CSV_PATH`` is encoded like the rows in ``load_data``
    (candidate one-hots, perk one-hots, indicator columns), scaled, extended
    with the interaction columns of the module-level ``irf_model`` and passed
    to ``irf_model.predict_proba``. Progress and results are printed.

    Scaling: if a global ``scaling_analysis_results`` exists, its best
    strategy is applied. Otherwise the scaler stored in the global
    ``normalization_results`` is applied to the non-zero indicator values,
    the same way ``sparse_scale_features`` scaled the data the model was
    fitted on. Raw features are used only when neither is available.

    Returns:
        Dict with ``predicted_mayor``, ``confidence``, ``all_probabilities``
        and ``top_3``; or ``None`` if the prediction step raised.
    """
    print("="*80)
    print("TESTING MODEL ON LAST ROW OF SPREADSHEET")
    print("="*80)

    # Load the CSV and get the last row
    df = pd.read_csv(CSV_PATH)
    last_row = df.iloc[-1]

    print(f"Last row data:")
    print(f"  Election number: {last_row['election_number']}")
    print(f"  Date: {last_row['date']}")
    print(f"  Candidates: {last_row['candidates']}")
    print(f"  Perks: {last_row['perks']}")
    print(f"  Mayor: {last_row['mayor']} (empty - perfect for testing!)")
    print(f"  Minister: {last_row['minister']}")

    # Process the last row through the same feature engineering pipeline
    candidates = [c.strip() for c in str(last_row['candidates']).split(',') if c.strip()]
    perks = [p.strip() for p in str(last_row['perks']).split(',') if p.strip()]

    # Get perk indicator values (the columns after 'minister')
    minister_index = list(df.columns).index("minister")
    perk_indicator_cols = [c for c in list(df.columns)[minister_index + 1:] if c != "mayor"]
    perk_indicators = [float(last_row[col]) if not pd.isna(last_row[col]) else 0.0
                      for col in perk_indicator_cols]

    print(f"\nProcessed data:")
    print(f"  Candidates: {candidates}")
    print(f"  Perks: {perks}")
    print(f"  Perk indicators: {len(perk_indicators)} values")

    # Perks outside the fixed domain have no feature column and are dropped
    # by the encoding below; say so instead of losing them silently.
    unknown_perks = [p for p in perks if p not in PERKS_DOMAIN]
    if unknown_perks:
        print(f"  Perks not in PERKS_DOMAIN (ignored): {unknown_perks}")

    # Create feature vector (same as in load_data function)
    # Binary vector for candidates (using fixed domain)
    candidate_vec = [1 if c in candidates else 0 for c in CANDIDATES_DOMAIN]
    # Binary vector for perks (using fixed domain)
    perk_vec = [1 if p in perks else 0 for p in PERKS_DOMAIN]
    # Perk indicator features
    perk_indicators_vec = perk_indicators

    # Combine all features
    X_test_row = np.array([candidate_vec + perk_vec + perk_indicators_vec])

    print(f"\nFeature vector created:")
    print(f"  Shape: {X_test_row.shape}")
    print(f"  Candidate features: {sum(candidate_vec)}/{len(candidate_vec)} active")
    print(f"  Perk features: {sum(perk_vec)}/{len(perk_vec)} active")
    print(f"  Perk indicator features: {len(perk_indicators_vec)} values")

    # Show which candidates and perks are active
    active_candidates = [CANDIDATES_DOMAIN[i] for i, val in enumerate(candidate_vec) if val == 1]
    active_perks = [PERKS_DOMAIN[i] for i, val in enumerate(perk_vec) if val == 1]

    print(f"\nActive features:")
    print(f"  Active candidates: {active_candidates}")
    print(f"  Active perks: {active_perks}")

    # Check if we have scaling results and apply the same scaling
    try:
        if 'scaling_analysis_results' in globals():
            print(f"\nApplying scaling from scaling analysis...")
            best_strategy = scaling_analysis_results['best_strategy']
            best_strategy_data = scaling_analysis_results['scaling_strategies'][best_strategy]
            indicator_start_idx = scaling_analysis_results['indicator_start_idx']

            print(f"  Using scaling strategy: {best_strategy}")
            print(f"  Description: {best_strategy_data['description']}")

            # Apply the same scaling to test data
            X_test_scaled = X_test_row.copy()
            if best_strategy_data['scaler'] is not None:
                # Apply scaling to indicator features only
                indicator_features = X_test_row[:, indicator_start_idx:]
                X_test_scaled[:, indicator_start_idx:] = best_strategy_data['scaler'].transform(indicator_features)
                print(f"  Applied {type(best_strategy_data['scaler']).__name__} scaling to indicator features")
            else:
                print(f"  No scaler available for this strategy")

            X_test_row = X_test_scaled
        elif 'normalization_results' in globals():
            # Fall back to the scaler that produced the model's training data,
            # so the row is not predicted on an unscaled representation.
            print(f"\nApplying scaling from normalization results...")
            fitted_scaler = normalization_results['scaler_info'].get('scaler')
            first_indicator = normalization_results['indicator_start_idx']

            X_test_scaled = X_test_row.astype(float)
            if fitted_scaler is not None:
                # View into X_test_scaled: writes below update it in place.
                indicator_block = X_test_scaled[:, first_indicator:]
                non_zero_mask = indicator_block != 0
                # Only non-zero entries are scaled; zeros stay zero.
                if non_zero_mask.any():
                    indicator_block[non_zero_mask] = fitted_scaler.transform(
                        indicator_block[non_zero_mask].reshape(-1, 1)
                    ).ravel()
                print(f"  Applied {type(fitted_scaler).__name__} scaling to non-zero indicator features")
            else:
                print(f"  No scaler available in normalization results")

            X_test_row = X_test_scaled
        else:
            print(f"\nNo scaling analysis results found. Using raw features.")
    except Exception as e:
        print(f"\nError applying scaling: {e}")
        print(f"Continuing with raw features...")

    # Create test features with same interaction structure as training
    print(f"\nCreating interaction features for test data...")
    X_test_augmented = create_test_features(X_test_row, irf_model)

    print(f"  Original features: {X_test_row.shape[1]}")
    print(f"  Augmented features: {X_test_augmented.shape[1]}")
    print(f"  Interaction features added: {X_test_augmented.shape[1] - X_test_row.shape[1]}")

    # Ensure test features match training features
    if X_test_augmented.shape[1] != len(irf_model.get_final_feature_names()):
        print(f"  Adjusting feature dimensions...")
        n_features_needed = len(irf_model.get_final_feature_names())
        if X_test_augmented.shape[1] > n_features_needed:
            X_test_augmented = X_test_augmented[:, :n_features_needed]
            print(f"    Truncated to {n_features_needed} features")
        elif X_test_augmented.shape[1] < n_features_needed:
            n_missing = n_features_needed - X_test_augmented.shape[1]
            padding = np.zeros((X_test_augmented.shape[0], n_missing))
            X_test_augmented = np.column_stack([X_test_augmented, padding])
            print(f"    Padded with {n_missing} zero features")

    # Make prediction
    print(f"\nMaking prediction...")
    try:
        # Get prediction probabilities
        prediction_proba = irf_model.predict_proba(X_test_augmented)
        predicted_class_idx = np.argmax(prediction_proba[0])
        predicted_mayor = le_irf.inverse_transform([predicted_class_idx])[0]
        confidence = prediction_proba[0][predicted_class_idx]

        print(f"\n{'='*50}")
        print(f"PREDICTION RESULTS")
        print(f"{'='*50}")
        print(f"Predicted Mayor: {predicted_mayor}")
        print(f"Confidence: {confidence:.4f} ({confidence*100:.2f}%)")

        # Show all candidate probabilities
        print(f"\nAll candidate probabilities:")
        for i, class_name in enumerate(class_names):
            prob = prediction_proba[0][i]
            print(f"  {class_name}: {prob:.4f} ({prob*100:.2f}%)")

        # Show top 3 predictions (highest probability first)
        top_3_indices = np.argsort(prediction_proba[0])[-3:][::-1]
        print(f"\nTop 3 predictions:")
        for i, idx in enumerate(top_3_indices, 1):
            class_name = class_names[idx]
            prob = prediction_proba[0][idx]
            print(f"  {i}. {class_name}: {prob:.4f} ({prob*100:.2f}%)")

        return {
            'predicted_mayor': predicted_mayor,
            'confidence': confidence,
            'all_probabilities': dict(zip(class_names, prediction_proba[0])),
            'top_3': [(class_names[idx], prediction_proba[0][idx]) for idx in top_3_indices]
        }

    except Exception as e:
        print(f"Error making prediction: {e}")
        return None

# Run the test. `test_results` receives the summary dict returned by
# test_model_on_last_row(), or None if the prediction step failed.
print("Testing the trained iRF model on the last row of the spreadsheet...")
test_results = test_model_on_last_row()

Testing the trained iRF model on the last row of the spreadsheet...
TESTING MODEL ON LAST ROW OF SPREADSHEET
Last row data:
  Election number: 444
  Date: September 22, 2025
  Candidates: Aatrox,Cole,Diaz,Finnegan,Marina
  Perks: Perks.SlashedPricing,Perks.Pathfinder,Perks.Prospection,Perks.MiningFiesta,Perks.LongTermInvestment,Perks.PestEradicator,Perks.GOATed,Perks.BloomingBusiness,Perks.FishingXPBuff,Perks.LuckOfTheSea
  Mayor: nan (empty - perfect for testing!)
  Minister: nan

Processed data:
  Candidates: ['Aatrox', 'Cole', 'Diaz', 'Finnegan', 'Marina']
  Perks: ['Perks.SlashedPricing', 'Perks.Pathfinder', 'Perks.Prospection', 'Perks.MiningFiesta', 'Perks.LongTermInvestment', 'Perks.PestEradicator', 'Perks.GOATed', 'Perks.BloomingBusiness', 'Perks.FishingXPBuff', 'Perks.LuckOfTheSea']
  Perk indicators: 37 values

Feature vector created:
  Shape: (1, 77)
  Candidate features: 5/8 active
  Perk features: 8/32 active
  Perk indicator features: 37 values

Active features:
  Active c