In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np
import csv

# Fixed vocabularies used to build the one-hot feature columns.
# The order of each list defines the column order, so do not reorder entries.

# Every candidate name that may appear in the 'candidates' column.
CANDIDATES_DOMAIN = [
    'Aatrox',
    'Barry',
    'Cole',
    'Derpy',
    'Diana',
    'Diaz',
    'Finnegan',
    'Foxy',
    'Jerry',
    'Marina',
    'Paul',
    'Scorpius',
]

# Every perk identifier that may appear in the 'perks' column.
PERKS_DOMAIN = [
    'Perks.ATimeForGiving',
    'Perks.ArcaneCatalyst',
    'Perks.AstralNegotiator',
    'Perks.Benediction',
    'Perks.BloomingBusiness',
    'Perks.Bribe',
    'Perks.ChivalrousCarnival',
    'Perks.DarkerAuctions',
    'Perks.DoubleMobsHP',
    'Perks.DoubleTrouble',
    'Perks.EZPZ',
    'Perks.ExtraEvent',
    'Perks.ExtraEventFishing_Festival',
    'Perks.ExtraEventMining_Fiesta',
    'Perks.ExtraEventSpooky_Festival',
    'Perks.ExtraEventSweet_Tooth',
    'Perks.FishingFestival',
    'Perks.FishingXPBuff',
    'Perks.GOATed',
    'Perks.Jerrypocalypse',
    'Perks.LongTermInvestment',
    'Perks.LuckOfTheSea',
    'Perks.Lucky',
    'Perks.MagicXPBoost',
    'Perks.Marauder',
    'Perks.MiningFiesta',
    'Perks.MiningXPBuff',
    'Perks.MoarSkillz',
    'Perks.MoltenForge',
    'Perks.MythologicalRitual',
    'Perks.Pathfinder',
    'Perks.PeltPocalypse',
    'Perks.Perkpocalypse',
    'Perks.PestEradicator',
    'Perks.PetXPBuff',
    'Perks.Prospection',
    'Perks.QuadTaxes',
    'Perks.SharingIsCaring',
    'Perks.ShoppingSpree',
    'Perks.SlashedPricing',
    'Perks.SlayerXPBuff',
    'Perks.Statspocalypse',
    'Perks.StockExchange',
    'Perks.SweetBenevolence',
    'Perks.TurboMinions',
    'Perks.VolumeTrading',
]

# Target labels: the same names as CANDIDATES_DOMAIN, kept as an independent
# list object so mutating one never affects the other.
MAYOR_DOMAIN = list(CANDIDATES_DOMAIN)

def load_data(csv_filename, candidates_domain=None, perks_domain=None):
    """
    Load election rows from a CSV file and encode them as binary features.

    The CSV must have 'candidates', 'perks' and 'mayor' columns. 'candidates'
    and 'perks' are comma-separated lists inside a single field; surrounding
    whitespace around each item is ignored. Rows whose 'mayor' field is empty
    or missing are skipped.

    Parameters
    ----------
    csv_filename : str or path-like
        Path of the UTF-8 encoded CSV file to read.
    candidates_domain : sequence of str, optional
        Ordered candidate vocabulary. Defaults to the module-level
        CANDIDATES_DOMAIN.
    perks_domain : sequence of str, optional
        Ordered perk vocabulary. Defaults to the module-level PERKS_DOMAIN.

    Returns
    -------
    X : numpy.ndarray of int, shape (n_rows, n_candidates + n_perks)
        One row per kept CSV row; a 1 marks a candidate/perk that is present.
        The shape stays two-dimensional even when no row is kept.
    y : numpy.ndarray of str, shape (n_rows,)
        Mayor name for each kept row.
    feature_names : list of str
        Column names for X: "cand_<name>" entries followed by "perk_<name>".

    Raises
    ------
    KeyError
        If a data row is read from a file lacking one of the required columns.
    """
    if candidates_domain is None:
        candidates_domain = CANDIDATES_DOMAIN
    if perks_domain is None:
        perks_domain = PERKS_DOMAIN
    candidates_domain = list(candidates_domain)
    perks_domain = list(perks_domain)

    X = []
    y = []
    with open(csv_filename, newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            # DictReader fills the missing fields of a short row with None,
            # so normalise to '' before calling string methods.
            mayor = (row['mayor'] or '').strip()
            if not mayor:
                continue  # no target label -> unusable for training
            # Sets make the per-domain membership tests O(1).
            candidates = {c.strip() for c in (row['candidates'] or '').split(',')}
            perks = {p.strip() for p in (row['perks'] or '').split(',')}
            X.append(
                [int(c in candidates) for c in candidates_domain]
                + [int(p in perks) for p in perks_domain]
            )
            y.append(mayor)

    feature_names = [f"cand_{c}" for c in candidates_domain] + [f"perk_{p}" for p in perks_domain]
    # reshape keeps X two-dimensional when no row survived the filtering.
    X_arr = np.array(X, dtype=int).reshape(len(X), len(feature_names))
    return X_arr, np.array(y, dtype=str), feature_names


# Baseline experiment: random forest on the binary candidate/perk features,
# scored with 5-fold stratified cross-validation and with a single 80/20 split.

# Load data
# load_data returns the feature matrix, the mayor names and the column names.
X_rf_cv, y_rf_cv, feature_names_rf_cv = load_data('elections_better_mayors.csv')
# Encode mayor names as integer class ids; le_rf_cv keeps the mapping so
# predictions can be translated back with inverse_transform.
le_rf_cv = LabelEncoder()
y_rf_cv_enc = le_rf_cv.fit_transform(y_rf_cv)

# Define model (baseline)
# cross_val_score fits clones of this estimator, so rf_cv_model itself is
# still unfitted after this cell.
rf_cv_model = RandomForestClassifier(n_estimators=100, random_state=42)

# 5-fold stratified CV
# Shuffling with a fixed seed keeps the fold assignment reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf_cv_model, X_rf_cv, y_rf_cv_enc, cv=skf, scoring='accuracy', n_jobs=-1)
print(f"RandomForest 5-fold CV accuracy: mean={cv_scores.mean():.3f}, std={cv_scores.std():.3f}")

# Also fit on standard 80/20 split for comparison
# NOTE(review): this split does not pass `stratify`, unlike the CV above, so
# class proportions in the test part may differ from the full data.
# train_size=0.8 is the complement of test_size=0.2 and therefore redundant.
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
    X_rf_cv, y_rf_cv_enc, test_size=0.2, train_size=0.8, random_state=42
)
# A fresh forest trained only on the 80% part; it shares nothing with the
# cross-validation run except the hyperparameters.
clf_cv = RandomForestClassifier(n_estimators=100, random_state=42)
clf_cv.fit(X_train_cv, y_train_cv)
acc_cv_split = clf_cv.score(X_test_cv, y_test_cv)
# NOTE(review): the "post-CV training" label only reflects cell order; this
# model is not derived from the CV fits.
print(f"Held-out 80/20 split accuracy (post-CV training): {acc_cv_split:.3f}")




RandomForest 5-fold CV accuracy: mean=0.548, std=0.156
Held-out 80/20 split accuracy (post-CV training): 0.500


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Train a random forest on the pre-computed indicator columns of
# elections_with_perks.csv, report stratified CV accuracy when the class sizes
# allow it, fit a final model on all rows and expose it to later cells.

# Configuration
CSV_PATH = "elections_with_perks.csv"
RANDOM_STATE = 42
N_ESTIMATORS = 100
N_SPLITS = 5  # upper bound; the actual fold count may be lowered below

# Load data
_df = pd.read_csv(CSV_PATH)
# Keep only rows at 0-based positions 180 and later (the first 180 data rows
# are discarded), then renumber the index from 0.
# NOTE(review): the reason for the 180-row cutoff is not recorded here.
_df = _df.iloc[180:].reset_index(drop=True)

# Basic validation
required_columns = {"candidates", "perks", "mayor"}
missing = required_columns - set(_df.columns)
if missing:
    raise ValueError(f"Missing required columns in CSV: {missing}")

# Determine feature columns: all columns to the right of 'minister' (exclusive), excluding target 'mayor'
if "minister" not in _df.columns:
    raise ValueError("Expected 'minister' column not found in CSV.")
minister_index = list(_df.columns).index("minister")
_right_cols = list(_df.columns)[minister_index + 1 :]
_perk_indicator_cols = [c for c in _right_cols if c != "mayor"]

# This count covers only the indicator columns; the feature matrix built below
# additionally contains 'candidates' and 'perks'.
print(f"Using {len(_perk_indicator_cols)} columns to the right of 'minister' as features.")

# Feature matrix and target
_feature_names = ["candidates", "perks"] + _perk_indicator_cols
X = _df[_feature_names].copy()
y = _df["mayor"].copy()

# Drop rows with NaN target
mask = y.notna()
X = X.loc[mask]
y = y.loc[mask]

# Coerce features to numeric continuous values and handle missing values
# Anything that cannot be parsed as a number becomes NaN and is then set to 0.0.
# NOTE(review): 'candidates' and 'perks' go through the same coercion. If those
# columns hold text (e.g. comma-separated names; not verifiable from this cell)
# they turn into two constant all-zero features. Confirm this is intended.
X = X.apply(pd.to_numeric, errors='coerce').astype(float).fillna(0.0)

# Ensure we have samples
n_samples = len(y)
if n_samples == 0:
    raise ValueError("No samples available after filtering (rows > 179) and NaN removal. Check data.")

# Determine effective CV splits based on smallest class count
# dropna=False has no effect here because NaN targets were removed above.
class_counts = y.value_counts(dropna=False)
min_class_count = int(class_counts.min()) if len(class_counts) > 0 else 0
# When the rarest class has at least 2 samples this equals
# min(N_SPLITS, min_class_count) (the max(2, ...) is then a no-op);
# otherwise it is 0, which disables CV below.
effective_splits = max(2, min(N_SPLITS, min_class_count)) if min_class_count > 1 else 0

# Model
_trained_model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    random_state=RANDOM_STATE,
)

if effective_splits >= 2:
    # Stratified CV with adjusted number of splits
    cv = StratifiedKFold(n_splits=effective_splits, shuffle=True, random_state=RANDOM_STATE)
    cv_scores = cross_val_score(_trained_model, X, y, cv=cv, scoring="accuracy")
    print(f"{effective_splits}-fold CV accuracy: mean={cv_scores.mean():.4f}, std={cv_scores.std():.4f}")
    print(f"Per-fold scores: {np.round(cv_scores, 4)}")
else:
    print("Skipping CV: not enough samples per class after filtering.")

# Fit on the full dataset for later use
# This runs whether or not CV was performed; y is passed as the raw 'mayor'
# column without label encoding.
_trained_model.fit(X, y)


# Expose variables for other cells
feature_names = _feature_names
perk_indicator_cols = _perk_indicator_cols
trained_model = _trained_model

# Remove the temporaries listed below from the notebook namespace. The public
# aliases assigned above keep the underlying objects alive. Names not listed
# (X, y, mask, cv, cv_scores, _name, ...) stay defined.
for _name in [
    "_df",
    "_right_cols",
    "_perk_indicator_cols",
    "_feature_names",
    "_trained_model",
    "minister_index",
]:
    if _name in globals():
        del globals()[_name]


Using 46 columns to the right of 'minister' as features.
3-fold CV accuracy: mean=0.6905, std=0.0937
Per-fold scores: [0.8214 0.6429 0.6071]
