In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np
import csv

# Fixed domains as provided
CANDIDATES_DOMAIN = [
    'Aatrox', 'Barry', 'Cole', 'Derpy', 'Diana', 'Diaz', 'Finnegan', 'Foxy',
    'Jerry', 'Marina', 'Paul', 'Scorpius'
]
PERKS_DOMAIN = [
    'Perks.ATimeForGiving', 'Perks.ArcaneCatalyst', 'Perks.AstralNegotiator', 'Perks.Benediction',
    'Perks.BloomingBusiness', 'Perks.Bribe', 'Perks.ChivalrousCarnival', 'Perks.DarkerAuctions',
    'Perks.DoubleMobsHP', 'Perks.DoubleTrouble', 'Perks.EZPZ', 'Perks.ExtraEvent',
    'Perks.ExtraEventFishing_Festival', 'Perks.ExtraEventMining_Fiesta', 'Perks.ExtraEventSpooky_Festival',
    'Perks.ExtraEventSweet_Tooth', 'Perks.FishingFestival', 'Perks.FishingXPBuff', 'Perks.GOATed',
    'Perks.Jerrypocalypse', 'Perks.LongTermInvestment', 'Perks.LuckOfTheSea', 'Perks.Lucky',
    'Perks.MagicXPBoost', 'Perks.Marauder', 'Perks.MiningFiesta', 'Perks.MiningXPBuff', 'Perks.MoarSkillz',
    'Perks.MoltenForge', 'Perks.MythologicalRitual', 'Perks.Pathfinder', 'Perks.PeltPocalypse',
    'Perks.Perkpocalypse', 'Perks.PestEradicator', 'Perks.PetXPBuff', 'Perks.Prospection',
    'Perks.QuadTaxes', 'Perks.SharingIsCaring', 'Perks.ShoppingSpree', 'Perks.SlashedPricing',
    'Perks.SlayerXPBuff', 'Perks.Statspocalypse', 'Perks.StockExchange', 'Perks.SweetBenevolence',
    'Perks.TurboMinions', 'Perks.VolumeTrading'
]
MAYOR_DOMAIN = [
    'Aatrox', 'Barry', 'Cole', 'Derpy', 'Diana', 'Diaz', 'Finnegan', 'Foxy',
    'Jerry', 'Marina', 'Paul', 'Scorpius'
]

def load_data(csv_filename):
    """
    Loads the data and returns X (features) and y (target).
    Features are binary indicators for each candidate and perk, using the provided domains.
    Target is the mayor.
    """
    rows = []
    with open(csv_filename, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            candidates = [c.strip() for c in row['candidates'].split(',')]
            perks = [p.strip() for p in row['perks'].split(',')]
            mayor = row['mayor'].strip()
            # Only include rows with non-empty mayor values
            if mayor:  # This filters out empty strings
                rows.append((candidates, perks, mayor))

    X = []
    y = []
    for candidates, perks, mayor in rows:
        # Binary vector for candidates (using fixed domain)
        candidate_vec = [1 if c in candidates else 0 for c in CANDIDATES_DOMAIN]
        # Binary vector for perks (using fixed domain)
        perk_vec = [1 if p in perks else 0 for p in PERKS_DOMAIN]
        X.append(candidate_vec + perk_vec)
        y.append(mayor)
    feature_names = [f"cand_{c}" for c in CANDIDATES_DOMAIN] + [f"perk_{p}" for p in PERKS_DOMAIN]
    return np.array(X), np.array(y), feature_names


# Load data
X_rf_cv, y_rf_cv, feature_names_rf_cv = load_data('elections_with_perks_no_special.csv')
le_rf_cv = LabelEncoder()
y_rf_cv_enc = le_rf_cv.fit_transform(y_rf_cv)

# Define model (baseline)
rf_cv_model = RandomForestClassifier(n_estimators=100, random_state=42)

# 5-fold stratified CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf_cv_model, X_rf_cv, y_rf_cv_enc, cv=skf, scoring='accuracy', n_jobs=-1)
print(f"RandomForest 5-fold CV accuracy: mean={cv_scores.mean():.3f}, std={cv_scores.std():.3f}")

# Also fit on standard 80/20 split for comparison
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
    X_rf_cv, y_rf_cv_enc, test_size=0.2, train_size=0.8, random_state=42
)
clf_cv = RandomForestClassifier(n_estimators=100, random_state=42)
clf_cv.fit(X_train_cv, y_train_cv)
acc_cv_split = clf_cv.score(X_test_cv, y_test_cv)
print(f"Held-out 80/20 split accuracy (post-CV training): {acc_cv_split:.3f}")




RandomForest 5-fold CV accuracy: mean=0.628, std=0.068
Held-out 80/20 split accuracy (post-CV training): 0.642


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Configuration
CSV_PATH = "elections_with_perks.csv"
RANDOM_STATE = 42
N_ESTIMATORS = 100
N_SPLITS = 5

# Load data
_df = pd.read_csv(CSV_PATH)
# Use only rows after 179 (0-based indexing), i.e., start from row 180
_df = _df.iloc[180:].reset_index(drop=True)

# Basic validation
required_columns = {"candidates", "perks", "mayor"}
missing = required_columns - set(_df.columns)
if missing:
    raise ValueError(f"Missing required columns in CSV: {missing}")

# Determine feature columns: all columns to the right of 'minister' (exclusive), excluding target 'mayor'
if "minister" not in _df.columns:
    raise ValueError("Expected 'minister' column not found in CSV.")
minister_index = list(_df.columns).index("minister")
_right_cols = list(_df.columns)[minister_index + 1 :]
_perk_indicator_cols = [c for c in _right_cols if c != "mayor"]

print(f"Using {len(_perk_indicator_cols)} columns to the right of 'minister' as features.")

# Feature matrix and target
_feature_names = ["candidates", "perks"] + _perk_indicator_cols
X = _df[_feature_names].copy()
y = _df["mayor"].copy()

# Drop rows with NaN target
mask = y.notna()
X = X.loc[mask]
y = y.loc[mask]

# Coerce features to numeric continuous values and handle missing values
X = X.apply(pd.to_numeric, errors='coerce').astype(float).fillna(0.0)

n_samples = len(y)

# 5-Fold Cross-Validation Test
print("\n" + "="*50)
print("5-FOLD CROSS-VALIDATION TEST")
print("="*50)

# Determine effective CV splits based on smallest class count
class_counts = y.value_counts(dropna=False)
min_class_count = int(class_counts.min()) if len(class_counts) > 0 else 0
effective_splits = max(2, min(N_SPLITS, min_class_count)) if min_class_count > 1 else 0

# Model
_trained_model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    random_state=RANDOM_STATE,
)

if effective_splits >= 2:
    # 5-Fold Stratified Cross-Validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    cv_scores = cross_val_score(_trained_model, X, y, cv=cv, scoring="accuracy")
    
    print(f"5-Fold CV Results:")
    print(f"Mean Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    print(f"Individual fold scores: {np.round(cv_scores, 4)}")
    print(f"Min score: {cv_scores.min():.4f}")
    print(f"Max score: {cv_scores.max():.4f}")
    
    # Additional metrics for 5-fold CV
    cv_precision = cross_val_score(_trained_model, X, y, cv=cv, scoring="precision_macro")
    cv_recall = cross_val_score(_trained_model, X, y, cv=cv, scoring="recall_macro")
    cv_f1 = cross_val_score(_trained_model, X, y, cv=cv, scoring="f1_macro")
    
    print(f"\nAdditional 5-Fold CV Metrics:")
    print(f"Precision (macro): {cv_precision.mean():.4f} ± {cv_precision.std():.4f}")
    print(f"Recall (macro): {cv_recall.mean():.4f} ± {cv_recall.std():.4f}")
    print(f"F1-Score (macro): {cv_f1.mean():.4f} ± {cv_f1.std():.4f}")
    
else:
    print("Skipping 5-fold CV: not enough samples per class after filtering.")

# Train-Test Split for Final Evaluation
print("\n" + "="*50)
print("TRAIN-TEST SPLIT EVALUATION")
print("="*50)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

print(f"Train set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Train model on training set
_trained_model.fit(X_train, y_train)

# Make predictions on test set
y_pred = _trained_model.predict(X_test)

# Calculate test accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# Detailed classification report
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print(f"Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature Importance
feature_importance = _trained_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'feature': _feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print(f"\nTop 10 Most Important Features:")
print(feature_importance_df.head(10))

# Expose variables for other cells
feature_names = _feature_names
perk_indicator_cols = _perk_indicator_cols
trained_model = _trained_model
X_test = X_test
y_test = y_test
y_pred = y_pred

for _name in [
    "_df",
    "_right_cols",
    "_perk_indicator_cols",
    "_feature_names",
    "_trained_model",
    "minister_index",
]:
    if _name in globals():
        del globals()[_name]


Using 46 columns to the right of 'minister' as features.
Total samples: 85
Class distribution:
mayor
Diaz        12
Aatrox      12
Paul        11
Diana        9
Marina       9
Cole         9
Finnegan     7
Foxy         5
Scorpius     4
Derpy        4
Jerry        3
Name: count, dtype: int64

5-FOLD CROSS-VALIDATION TEST




5-Fold CV Results:
Mean Accuracy: 0.6824 ± 0.0880
Individual fold scores: [0.5294 0.7059 0.6471 0.7647 0.7647]
Min score: 0.5294
Max score: 0.7647


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



Additional 5-Fold CV Metrics:
Precision (macro): 0.5665 ± 0.0973
Recall (macro): 0.6155 ± 0.0855
F1-Score (macro): 0.5690 ± 0.0889

TRAIN-TEST SPLIT EVALUATION
Train set size: 68
Test set size: 17
Test Set Accuracy: 0.5882

Classification Report:
              precision    recall  f1-score   support

      Aatrox       0.50      1.00      0.67         2
        Cole       1.00      0.50      0.67         2
       Derpy       0.00      0.00      0.00         1
       Diana       1.00      0.50      0.67         2
        Diaz       0.50      1.00      0.67         2
    Finnegan       0.00      0.00      0.00         1
        Foxy       0.00      0.00      0.00         1
       Jerry       0.00      0.00      0.00         1
      Marina       0.33      0.50      0.40         2
        Paul       1.00      1.00      1.00         2
    Scorpius       1.00      1.00      1.00         1

    accuracy                           0.59        17
   macro avg       0.48      0.50      0.46     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [3]:
# Test model on the last row of the dataset
last_row_features = X.iloc[-1:].values
last_row_actual = y.iloc[-1]

prediction = trained_model.predict(last_row_features)[0]
prediction_proba = trained_model.predict_proba(last_row_features)[0]

print(f"Last row prediction: {prediction}")
print(f"Actual mayor: {last_row_actual}")
print(f"Prediction confidence: {max(prediction_proba):.3f}")


Last row prediction: Cole
Actual mayor: Cole
Prediction confidence: 0.360


