In [44]:
import csv
from collections import Counter

def gather_domains(csv_filename):
    """
    Reads the CSV and prints the unique values (domains) for candidates, perks, and mayor.
    """
    candidates_set = set()
    perks_set = set()
    mayor_set = set()

    with open(csv_filename, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Candidates: comma-separated
            candidates = [c.strip() for c in row['candidates'].split(',')]
            candidates_set.update(candidates)
            # Perks: comma-separated
            perks = [p.strip() for p in row['perks'].split(',')]
            perks_set.update(perks)
            # Mayor: single value
            mayor_set.add(row['mayor'].strip())

    print("Candidates domain:", sorted(candidates_set))
    print("Perks domain:", sorted(perks_set))
    print("Mayor domain:", sorted(mayor_set))

gather_domains('elections.csv')

Candidates domain: ['Aatrox', 'Barry', 'Cole', 'Derpy', 'Diana', 'Diaz', 'Finnegan', 'Foxy', 'Jerry', 'Marina', 'Paul', 'Scorpius']
Perks domain: ['Perks.ATimeForGiving', 'Perks.ArcaneCatalyst', 'Perks.AstralNegotiator', 'Perks.Benediction', 'Perks.BloomingBusiness', 'Perks.Bribe', 'Perks.ChivalrousCarnival', 'Perks.DarkerAuctions', 'Perks.DoubleMobsHP', 'Perks.DoubleTrouble', 'Perks.EZPZ', 'Perks.ExtraEvent', 'Perks.ExtraEventFishing_Festival', 'Perks.ExtraEventMining_Fiesta', 'Perks.ExtraEventSpooky_Festival', 'Perks.ExtraEventSweet_Tooth', 'Perks.FishingFestival', 'Perks.FishingXPBuff', 'Perks.GOATed', 'Perks.Jerrypocalypse', 'Perks.LongTermInvestment', 'Perks.LuckOfTheSea', 'Perks.Lucky', 'Perks.MagicXPBoost', 'Perks.Marauder', 'Perks.MiningFiesta', 'Perks.MiningXPBuff', 'Perks.MoarSkillz', 'Perks.MoltenForge', 'Perks.MythologicalRitual', 'Perks.Pathfinder', 'Perks.PeltPocalypse', 'Perks.Perkpocalypse', 'Perks.PestEradicator', 'Perks.PetXPBuff', 'Perks.Prospection', 'Perks.QuadTaxe

In [80]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import csv

# Fixed domains as provided
CANDIDATES_DOMAIN = [
    'Aatrox', 'Barry', 'Cole', 'Derpy', 'Diana', 'Diaz', 'Finnegan', 'Foxy',
    'Jerry', 'Marina', 'Paul', 'Scorpius'
]
PERKS_DOMAIN = [
    'Perks.ATimeForGiving', 'Perks.ArcaneCatalyst', 'Perks.AstralNegotiator', 'Perks.Benediction',
    'Perks.BloomingBusiness', 'Perks.Bribe', 'Perks.ChivalrousCarnival', 'Perks.DarkerAuctions',
    'Perks.DoubleMobsHP', 'Perks.DoubleTrouble', 'Perks.EZPZ', 'Perks.ExtraEvent',
    'Perks.ExtraEventFishing_Festival', 'Perks.ExtraEventMining_Fiesta', 'Perks.ExtraEventSpooky_Festival',
    'Perks.ExtraEventSweet_Tooth', 'Perks.FishingFestival', 'Perks.FishingXPBuff', 'Perks.GOATed',
    'Perks.Jerrypocalypse', 'Perks.LongTermInvestment', 'Perks.LuckOfTheSea', 'Perks.Lucky',
    'Perks.MagicXPBoost', 'Perks.Marauder', 'Perks.MiningFiesta', 'Perks.MiningXPBuff', 'Perks.MoarSkillz',
    'Perks.MoltenForge', 'Perks.MythologicalRitual', 'Perks.Pathfinder', 'Perks.PeltPocalypse',
    'Perks.Perkpocalypse', 'Perks.PestEradicator', 'Perks.PetXPBuff', 'Perks.Prospection',
    'Perks.QuadTaxes', 'Perks.SharingIsCaring', 'Perks.ShoppingSpree', 'Perks.SlashedPricing',
    'Perks.SlayerXPBuff', 'Perks.Statspocalypse', 'Perks.StockExchange', 'Perks.SweetBenevolence',
    'Perks.TurboMinions', 'Perks.VolumeTrading'
]
MAYOR_DOMAIN = [
    'Aatrox', 'Barry', 'Cole', 'Derpy', 'Diana', 'Diaz', 'Finnegan', 'Foxy',
    'Jerry', 'Marina', 'Paul', 'Scorpius'
]

def load_data(csv_filename):
    """
    Loads the data and returns X (features) and y (target).
    Features are binary indicators for each candidate and perk, using the provided domains.
    Target is the mayor.
    """
    rows = []
    with open(csv_filename, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            candidates = [c.strip() for c in row['candidates'].split(',')]
            perks = [p.strip() for p in row['perks'].split(',')]
            mayor = row['mayor'].strip()
            # Only include rows with non-empty mayor values
            if mayor:  # This filters out empty strings
                rows.append((candidates, perks, mayor))

    X = []
    y = []
    for candidates, perks, mayor in rows:
        # Binary vector for candidates (using fixed domain)
        candidate_vec = [1 if c in candidates else 0 for c in CANDIDATES_DOMAIN]
        # Binary vector for perks (using fixed domain)
        perk_vec = [1 if p in perks else 0 for p in PERKS_DOMAIN]
        X.append(candidate_vec + perk_vec)
        y.append(mayor)
    feature_names = [f"cand_{c}" for c in CANDIDATES_DOMAIN] + [f"perk_{p}" for p in PERKS_DOMAIN]
    return np.array(X), np.array(y), feature_names


# Load data
X_rf_cv, y_rf_cv, feature_names_rf_cv = load_data('elections.csv')
le_rf_cv = LabelEncoder()
y_rf_cv_enc = le_rf_cv.fit_transform(y_rf_cv)

# Define model (baseline)
rf_cv_model = RandomForestClassifier(n_estimators=100, random_state=42)

# 5-fold stratified CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf_cv_model, X_rf_cv, y_rf_cv_enc, cv=skf, scoring='accuracy', n_jobs=-1)
print(f"RandomForest 5-fold CV accuracy: mean={cv_scores.mean():.3f}, std={cv_scores.std():.3f}")

# Also fit on standard 80/20 split for comparison
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
    X_rf_cv, y_rf_cv_enc, test_size=0.2, train_size=0.8, random_state=42
)
clf_cv = RandomForestClassifier(n_estimators=100, random_state=42)
clf_cv.fit(X_train_cv, y_train_cv)
acc_cv_split = clf_cv.score(X_test_cv, y_test_cv)
print(f"Held-out 80/20 split accuracy (post-CV training): {acc_cv_split:.3f}")




RandomForest 5-fold CV accuracy: mean=0.628, std=0.068
Held-out 80/20 split accuracy (post-CV training): 0.642


In [61]:
import pandas as pd

# Build and display the training DataFrame
X_df, y_df, feature_names_df = load_data('elections.csv')
df_train = pd.DataFrame(X_df, columns=feature_names_df)
df_train['mayor'] = y_df

df_train.head(200)


Unnamed: 0,cand_Aatrox,cand_Barry,cand_Cole,cand_Derpy,cand_Diana,cand_Diaz,cand_Finnegan,cand_Foxy,cand_Jerry,cand_Marina,...,perk_Perks.SharingIsCaring,perk_Perks.ShoppingSpree,perk_Perks.SlashedPricing,perk_Perks.SlayerXPBuff,perk_Perks.Statspocalypse,perk_Perks.StockExchange,perk_Perks.SweetBenevolence,perk_Perks.TurboMinions,perk_Perks.VolumeTrading,mayor
0,1,0,0,0,0,1,0,1,0,1,...,0,0,1,0,0,0,1,0,1,Marina
1,0,0,1,0,1,1,0,1,0,0,...,0,0,0,0,0,0,1,0,1,Cole
2,1,1,0,0,1,1,0,0,0,1,...,0,0,1,0,0,0,0,0,1,Aatrox
3,0,1,0,0,1,1,0,1,0,1,...,0,0,0,0,0,0,1,0,1,Marina
4,0,1,1,0,1,1,0,1,0,0,...,0,0,0,0,0,0,1,0,1,Foxy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0,0,1,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,Finnegan
196,0,0,0,0,1,1,0,1,1,0,...,0,0,0,0,1,0,0,0,0,Jerry
197,1,0,1,0,1,1,0,1,0,0,...,1,1,0,1,0,0,0,0,0,Diana
198,0,0,0,0,0,1,1,1,0,1,...,0,1,0,0,0,0,0,0,0,Paul


In [64]:
# Display failed test cases from the model's 80/20 split
from sklearn.model_selection import train_test_split

# Recreate the same dataset and split deterministically
X_all, y_all, feature_names_all = load_data('elections.csv')
y_all_encoded = le.transform(y_all)
indices = np.arange(len(y_all_encoded))

X_train_, X_test_, y_train_, y_test_, idx_train, idx_test = train_test_split(
    X_all, y_all_encoded, indices, test_size=0.2, train_size=0.8, random_state=42
)

# Choose available model (prefer cleaned if present)
model_clf = clf_clean if 'clf_clean' in globals() else clf
model_le = le_clean if 'le_clean' in globals() else le

# Ensure model exists; if not, train it now with the same split params
if 'model_clf' not in globals() or model_clf is None:
    model_clf, model_le, _ = train_random_forest('elections.csv')

# Predict on the recreated test set
y_pred = model_clf.predict(X_test_)

# Identify failures
failed_mask = y_pred != y_test_
num_failed = int(failed_mask.sum())

# Build filtered source rows aligned with indices used by load_data
rows_filtered = []
with open('elections.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        mayor_val = row['mayor'].strip()
        if mayor_val:
            cand_list = [c.strip() for c in row['candidates'].split(',') if c.strip()]
            perk_list = [p.strip() for p in row['perks'].split(',') if p.strip()]
            rows_filtered.append({'candidates': cand_list, 'perks': perk_list, 'mayor': mayor_val})

failed_rows = []
if num_failed > 0:
    true_labels = model_le.inverse_transform(y_test_[failed_mask])
    pred_labels = model_le.inverse_transform(y_pred[failed_mask])
    original_idx = idx_test[failed_mask]
    for oi, t, p in zip(original_idx, true_labels, pred_labels):
        src = rows_filtered[int(oi)]
        failed_rows.append({
            'row_index': int(oi),
            'true': t,
            'pred': p,
            'candidates': ', '.join(src['candidates']),
            'perks': ', '.join(src['perks'])
        })

failed_df = pd.DataFrame(failed_rows)
print(f"Failed test cases: {num_failed}")
failed_df.head(50)


Failed test cases: 19


Unnamed: 0,row_index,true,pred,candidates,perks
0,113,Paul,Cole,"Cole, Diana, Foxy, Marina, Paul","Perks.MiningFiesta, Perks.PetXPBuff, Perks.Ext..."
1,178,Diaz,Diana,"Cole, Diana, Diaz, Foxy, Paul","Perks.Prospection, Perks.MythologicalRitual, P..."
2,9,Barry,Aatrox,"Aatrox, Barry, Diana, Diaz, Foxy","Perks.SlayerXPBuff, Perks.ArcaneCatalyst, Perk..."
3,249,Marina,Diaz,"Aatrox, Cole, Diaz, Finnegan, Marina","Perks.SlayerXPBuff, Perks.Prospection, Perks.S..."
4,67,Diana,Aatrox,"Aatrox, Cole, Diana, Diaz, Finnegan","Perks.SlashedPricing, Perks.Prospection, Perks..."
5,239,Diaz,Foxy,"Cole, Diana, Diaz, Finnegan, Foxy","Perks.Prospection, Perks.MiningXPBuff, Perks.M..."
6,177,Marina,Diana,"Aatrox, Diana, Diaz, Foxy, Marina","Perks.SlashedPricing, Perks.Lucky, Perks.Mytho..."
7,125,Aatrox,Cole,"Aatrox, Cole, Diaz, Foxy, Marina","Perks.SlashedPricing, Perks.Pathfinder, Perks...."
8,154,Aatrox,Marina,"Aatrox, Diana, Finnegan, Foxy, Marina","Perks.SlashedPricing, Perks.Pathfinder, Perks...."
9,233,Diaz,Cole,"Aatrox, Cole, Diaz, Foxy, Marina","Perks.SlayerXPBuff, Perks.MiningXPBuff, Perks...."


In [67]:
# Display Model class reliance to determine which features are being over relied on
import numpy as np
import pandas as pd

# Ensure we have test split and model artifacts
if 'X_all' not in globals() or 'y_all' not in globals():
    X_all, y_all, feature_names_all = load_data('elections.csv')
    y_all_encoded = le.transform(y_all)
    indices = np.arange(len(y_all_encoded))
    X_train_, X_test_, y_train_, y_test_, idx_train, idx_test = train_test_split(
        X_all, y_all_encoded, indices, test_size=0.2, train_size=0.8, random_state=42
    )

model_clf = clf_clean if 'clf_clean' in globals() else clf
model_le = le_clean if 'le_clean' in globals() else le

# Baseline predictions and per-class recall on test set
y_pred_base = model_clf.predict(X_test_)
classes = np.arange(len(model_le.classes_))

results = []
rng = np.random.default_rng(42)

for class_idx in classes:
    mask = (y_test_ == class_idx)
    support = int(mask.sum())
    if support == 0:
        continue
    baseline_recall = float((y_pred_base[mask] == class_idx).mean())
    # Permute each feature and measure recall drop for this class
    for feat_idx, feat_name in enumerate(feature_names_all):
        drops = []
        for _ in range(5):
            X_perm = X_test_.copy()
            perm = rng.permutation(X_perm.shape[0])
            X_perm[:, feat_idx] = X_perm[perm, feat_idx]
            y_pred_perm = model_clf.predict(X_perm)
            recall_perm = float((y_pred_perm[mask] == class_idx).mean())
            drops.append(baseline_recall - recall_perm)
        avg_drop = float(np.mean(drops))
        results.append({
            'class': model_le.classes_[class_idx],
            'feature': feat_name,
            'support': support,
            'baseline_recall': baseline_recall,
            'reliance': avg_drop
        })

reliance_df = pd.DataFrame(results)

print("Full feature reliance DataFrame (higher reliance = larger recall drop when shuffled):")
display(reliance_df)


Full feature reliance DataFrame (higher reliance = larger recall drop when shuffled):


Unnamed: 0,class,feature,support,baseline_recall,reliance
0,Aatrox,cand_Aatrox,12,0.666667,0.133333
1,Aatrox,cand_Barry,12,0.666667,-0.016667
2,Aatrox,cand_Cole,12,0.666667,-0.033333
3,Aatrox,cand_Derpy,12,0.666667,0.000000
4,Aatrox,cand_Diana,12,0.666667,0.016667
...,...,...,...,...,...
691,Scorpius,perk_Perks.Statspocalypse,1,1.000000,0.000000
692,Scorpius,perk_Perks.StockExchange,1,1.000000,0.000000
693,Scorpius,perk_Perks.SweetBenevolence,1,1.000000,0.000000
694,Scorpius,perk_Perks.TurboMinions,1,1.000000,0.000000


In [58]:
# Predict with mayor domain constrained to only the provided candidates
candidates_input = "Aatrox,Cole,Diaz,Finnegan,Marina"
perks_input = "Perks.SlashedPricing,Perks.Pathfinder,Perks.Prospection,Perks.MiningFiesta,Perks.LongTermInvestment,Perks.PestEradicator,Perks.GOATed,Perks.BloomingBusiness,Perks.FishingXPBuff,Perks.LuckOfTheSea"

# Parse input
candidates = [c.strip() for c in candidates_input.split(',') if c.strip()]
perks = [p.strip() for p in perks_input.split(',') if p.strip()]

# Build feature vector
candidate_vec = [1 if c in candidates else 0 for c in CANDIDATES_DOMAIN]
perk_vec = [1 if p in perks else 0 for p in PERKS_DOMAIN]
X_input = np.array([candidate_vec + perk_vec])

def predict_mayor_constrained(model_clf, model_le, X_input, allowed_candidates):
    classes = model_le.classes_
    allowed_set = set(allowed_candidates)
    proba = model_clf.predict_proba(X_input)[0]
    masked = np.full_like(proba, fill_value=-1.0, dtype=float)
    for i, label in enumerate(classes):
        if label in allowed_set:
            masked[i] = proba[i]
    if np.all(masked < 0):
        return None
    best_idx = int(np.argmax(masked))
    return classes[best_idx]

# Choose available model (prefer cleaned if present)
model_clf = clf_clean if 'clf_clean' in globals() else clf
model_le = le_clean if 'le_clean' in globals() else le

# If neither exists yet, train now
if 'model_clf' not in globals() or model_clf is None:
    model_clf, model_le, _ = train_random_forest('elections.csv')

# Constrained prediction
constrained_mayor = predict_mayor_constrained(model_clf, model_le, X_input, candidates)
print(f"Constrained predicted mayor: '{constrained_mayor}'")



Constrained predicted mayor: 'Cole'


In [None]:
# Retrain model with tuned regularization parameters
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Load data fresh
X, y, feature_names = load_data('elections.csv')
le_tuned = LabelEncoder()
y_encoded = le_tuned.fit_transform(y)

# Create train/test split
X_train_tuned, X_test_tuned, y_train_tuned, y_test_tuned = train_test_split(
    X, y_encoded, test_size=0.2, train_size=0.8, random_state=42
)

# Define parameter grid for regularization tuning
param_grid = {
    'n_estimators': [50, 75, 100, 150, 200],
    'max_depth': [3, 5, 6, 8, 10, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Create base classifier
rf_base = RandomForestClassifier(random_state=42)

# Grid search with 5-fold CV
print("Performing grid search for best regularization parameters...")
grid_search = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_search.fit(X_train_tuned, y_train_tuned)

# Get best parameters and model
best_params = grid_search.best_params_
best_score = grid_search.best_score_
clf_tuned = grid_search.best_estimator_

print(f"\nBest cross-validation score: {best_score:.3f}")
print(f"Best parameters: {best_params}")

# Evaluate on test set
y_pred_tuned = clf_tuned.predict(X_test_tuned)
test_accuracy = clf_tuned.score(X_test_tuned, y_test_tuned)
print(f"Test accuracy with tuned params: {test_accuracy:.3f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test_tuned, y_pred_tuned, target_names=le_tuned.classes_))

# Feature importance
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': clf_tuned.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 15 Most Important Features:")
print(feature_importance.head(15))


Performing grid search for best regularization parameters...
Fitting 5 folds for each of 3840 candidates, totalling 19200 fits





Best cross-validation score: 0.625
Best parameters: {'bootstrap': False, 'max_depth': 8, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Test accuracy with tuned params: 0.660

Classification Report:
              precision    recall  f1-score   support

      Aatrox       0.71      0.83      0.77        12
       Barry       0.00      0.00      0.00         1
        Cole       0.55      1.00      0.71         6
       Derpy       1.00      1.00      1.00         3
       Diana       0.38      0.60      0.46         5
        Diaz       1.00      0.33      0.50         3
    Finnegan       0.00      0.00      0.00         3
        Foxy       1.00      0.50      0.67         2
       Jerry       1.00      1.00      1.00         4
      Marina       0.75      0.38      0.50         8
        Paul       0.50      0.60      0.55         5
    Scorpius       1.00      1.00      1.00         1

    accuracy                           0.66       

4800 fits failed out of a total of 19200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2233 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Mash\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Mash\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
  File "c:\Users\Mash\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Mash\AppData\Local\Programs\Python\Python312\L

In [72]:
# Retrain model again but with different a parameter grid based on the previous results
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Load data fresh
X, y, feature_names = load_data('elections.csv')
le_tuned = LabelEncoder()
y_encoded = le_tuned.fit_transform(y)

# Create train/test split
X_train_tuned, X_test_tuned, y_train_tuned, y_test_tuned = train_test_split(
    X, y_encoded, test_size=0.2, train_size=0.8, random_state=42
)

# Define parameter grid for regularization tuning
param_grid = {
    'n_estimators': [50, 60, 75, 85, 100, 125],
    'max_depth': [5, 6, 7, 8, 9],
    'min_samples_split': [8, 10, 12, 14, 16, 18],
    'min_samples_leaf': [0.5, 1, 3, 4, 5],
    'max_features': [0.1, 0.25, 'sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Create base classifier
rf_base = RandomForestClassifier(random_state=42)

# Grid search with 5-fold CV
print("Performing grid search for best regularization parameters...")
grid_search = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_search.fit(X_train_tuned, y_train_tuned)

# Get best parameters and model
best_params = grid_search.best_params_
best_score = grid_search.best_score_
clf_tuned = grid_search.best_estimator_

print(f"\nBest cross-validation score: {best_score:.3f}")
print(f"Best parameters: {best_params}")

# Evaluate on test set
y_pred_tuned = clf_tuned.predict(X_test_tuned)
test_accuracy = clf_tuned.score(X_test_tuned, y_test_tuned)
print(f"Test accuracy with tuned params: {test_accuracy:.3f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test_tuned, y_pred_tuned, target_names=le_tuned.classes_))

# Feature importance
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': clf_tuned.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 15 Most Important Features:")
print(feature_importance.head(15))


Performing grid search for best regularization parameters...
Fitting 5 folds for each of 9000 candidates, totalling 45000 fits





Best cross-validation score: 0.640
Best parameters: {'bootstrap': True, 'max_depth': 9, 'max_features': 0.1, 'min_samples_leaf': 1, 'min_samples_split': 16, 'n_estimators': 125}
Test accuracy with tuned params: 0.604

Classification Report:
              precision    recall  f1-score   support

      Aatrox       0.71      0.83      0.77        12
       Barry       0.00      0.00      0.00         1
        Cole       0.50      1.00      0.67         6
       Derpy       1.00      1.00      1.00         3
       Diana       0.29      0.40      0.33         5
        Diaz       0.00      0.00      0.00         3
    Finnegan       0.00      0.00      0.00         3
        Foxy       0.00      0.00      0.00         2
       Jerry       1.00      1.00      1.00         4
      Marina       0.50      0.25      0.33         8
        Paul       0.50      0.80      0.62         5
    Scorpius       1.00      1.00      1.00         1

    accuracy                           0.60        53


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [73]:
# Train Lasso Regression with Cross-Validation
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Load data fresh
X_lasso, y_lasso, feature_names_lasso = load_data('elections.csv')
le_lasso = LabelEncoder()
y_encoded_lasso = le_lasso.fit_transform(y_lasso)

# Standardize features for Lasso (important for regularization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_lasso)

# Create train/test split
X_train_lasso, X_test_lasso, y_train_lasso, y_test_lasso = train_test_split(
    X_scaled, y_encoded_lasso, test_size=0.2, train_size=0.8, random_state=42
)

# LassoCV with cross-validation to find optimal alpha
print("Training Lasso Regression with cross-validation...")
lasso_cv = LassoCV(
    alphas=np.logspace(-4, 1, 50),  # Range of alpha values to test
    cv=5,  # 5-fold cross-validation
    random_state=42,
    max_iter=2000
)

# Fit the model
lasso_cv.fit(X_train_lasso, y_train_lasso)

print(f"Best alpha (regularization strength): {lasso_cv.alpha_:.6f}")
print(f"Cross-validation score: {lasso_cv.score(X_train_lasso, y_train_lasso):.3f}")

# Make predictions on test set
y_pred_lasso = lasso_cv.predict(X_test_lasso)

# Convert continuous predictions back to class labels
# Round to nearest integer and clip to valid range
y_pred_classes = np.round(y_pred_lasso).astype(int)
y_pred_classes = np.clip(y_pred_classes, 0, len(le_lasso.classes_) - 1)

# Calculate accuracy
test_accuracy_lasso = accuracy_score(y_test_lasso, y_pred_classes)
print(f"Test accuracy: {test_accuracy_lasso:.3f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test_lasso, y_pred_classes, target_names=le_lasso.classes_))

# Feature coefficients (importance)
feature_coefs = pd.DataFrame({
    'feature': feature_names_lasso,
    'coefficient': lasso_cv.coef_
}).sort_values('coefficient', key=abs, ascending=False)

print("\nTop 15 Most Important Features (by absolute coefficient):")
print(feature_coefs.head(15))

# Count of non-zero coefficients (features selected by Lasso)
non_zero_count = np.sum(lasso_cv.coef_ != 0)
print(f"\nNumber of features selected by Lasso: {non_zero_count} out of {len(feature_names_lasso)}")

# Show features with zero coefficients (removed by regularization)
zero_features = feature_coefs[feature_coefs['coefficient'] == 0]
print(f"Number of features removed by Lasso: {len(zero_features)}")


Training Lasso Regression with cross-validation...
Best alpha (regularization strength): 0.022230
Cross-validation score: 0.651
Test accuracy: 0.132

Classification Report:
              precision    recall  f1-score   support

      Aatrox       0.57      0.33      0.42        12
       Barry       0.00      0.00      0.00         1
        Cole       0.25      0.17      0.20         6
       Derpy       0.00      0.00      0.00         3
       Diana       0.00      0.00      0.00         5
        Diaz       0.14      0.33      0.20         3
    Finnegan       0.00      0.00      0.00         3
        Foxy       0.00      0.00      0.00         2
       Jerry       0.00      0.00      0.00         4
      Marina       0.00      0.00      0.00         8
        Paul       0.00      0.00      0.00         5
    Scorpius       1.00      1.00      1.00         1

    accuracy                           0.13        53
   macro avg       0.16      0.15      0.15        53
weighted avg   

In [77]:
# Train Support Vector Machine (SVM) with Cross-Validation
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Load data fresh
X_svm, y_svm, feature_names_svm = load_data('elections.csv')
le_svm = LabelEncoder()
y_encoded_svm = le_svm.fit_transform(y_svm)

# Standardize features for SVM (very important for SVM performance)
scaler_svm = StandardScaler()
X_scaled_svm = scaler_svm.fit_transform(X_svm)

# Create train/test split
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(
    X_scaled_svm, y_encoded_svm, test_size=0.2, train_size=0.8, random_state=42
)

print("Training Support Vector Machine with cross-validation...")

# Grid search for optimal SVM parameters
param_grid_svm = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel types
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],  # Kernel coefficient (for rbf, poly)
    'degree': [2, 3, 4]  # Polynomial degree (for poly kernel)
}

# Create base SVM classifier
svm_base = SVC(random_state=42, probability=True)  # probability=True for predict_proba

# Grid search with 5-fold CV
print("Performing grid search for best SVM parameters...")
svm_grid = GridSearchCV(
    estimator=svm_base,
    param_grid=param_grid_svm,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
svm_grid.fit(X_train_svm, y_train_svm)

# Get best parameters and model
best_params_svm = svm_grid.best_params_
best_score_svm = svm_grid.best_score_
svm_best = svm_grid.best_estimator_

print(f"\nBest parameters: {best_params_svm}")
print(f"Best cross-validation score: {best_score_svm:.3f}")

# Make predictions on test set
y_pred_svm = svm_best.predict(X_test_svm)

# Calculate accuracy
test_accuracy_svm = accuracy_score(y_test_svm, y_pred_svm)
print(f"Test accuracy: {test_accuracy_svm:.3f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test_svm, y_pred_svm, target_names=le_svm.classes_))

# Support vectors information
print(f"\nSVM Model Information:")
print(f"Number of support vectors: {svm_best.n_support_.sum()}")
print(f"Support vectors per class: {svm_best.n_support_}")

# Compare with Random Forest performance
print(f"\nComparison with Random Forest:")
print(f"Random Forest test accuracy: 0.642")
print(f"SVM test accuracy: {test_accuracy_svm:.3f}")
print(f"Improvement: {test_accuracy_svm - 0.642:+.3f}")

# Show some prediction examples
print(f"\nSample predictions:")
sample_indices = np.random.choice(len(y_test_svm), 10, replace=False)
for i, idx in enumerate(sample_indices):
    true_label = le_svm.inverse_transform([y_test_svm[idx]])[0]
    pred_label = le_svm.inverse_transform([y_pred_svm[idx]])[0]
    correct = "✓" if y_test_svm[idx] == y_pred_svm[idx] else "✗"
    print(f"  {i+1:2d}. True: {true_label:8s} | Pred: {pred_label:8s} {correct}")

# Show class probabilities for a few examples
print(f"\nClass probabilities for first 3 test samples:")
for i in range(min(3, len(y_test_svm))):
    true_label = le_svm.inverse_transform([y_test_svm[i]])[0]
    pred_label = le_svm.inverse_transform([y_pred_svm[i]])[0]
    probabilities = svm_best.predict_proba(X_test_svm[i:i+1])[0]
    
    print(f"\nSample {i+1}: True={true_label}, Pred={pred_label}")
    for j, prob in enumerate(probabilities):
        class_name = le_svm.inverse_transform([j])[0]
        print(f"  {class_name:8s}: {prob:.3f}")


Training Support Vector Machine with cross-validation...
Performing grid search for best SVM parameters...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits





Best parameters: {'C': 0.1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation score: 0.630
Test accuracy: 0.642

Classification Report:
              precision    recall  f1-score   support

      Aatrox       0.69      0.75      0.72        12
       Barry       1.00      1.00      1.00         1
        Cole       0.50      0.83      0.62         6
       Derpy       1.00      1.00      1.00         3
       Diana       0.44      0.80      0.57         5
        Diaz       1.00      0.33      0.50         3
    Finnegan       1.00      0.33      0.50         3
        Foxy       0.25      0.50      0.33         2
       Jerry       1.00      1.00      1.00         4
      Marina       0.67      0.25      0.36         8
        Paul       0.67      0.40      0.50         5
    Scorpius       1.00      1.00      1.00         1

    accuracy                           0.64        53
   macro avg       0.77      0.68      0.68        53
weighted avg       0.71     

In [81]:
# Failed test cases for current SVM using deterministic split (with proper scaling)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Recreate dataset
X_all_reg, y_all_reg, feature_names_all_reg = load_data('elections.csv')

# Use existing label encoder if available; else fit a new one
model_le_reg = le_svm if 'le_svm' in globals() else None
if model_le_reg is None:
    from sklearn.preprocessing import LabelEncoder
    model_le_reg = LabelEncoder()
    _ = model_le_reg.fit_transform(y_all_reg)

y_all_reg_enc = model_le_reg.transform(y_all_reg)
indices_reg = np.arange(len(y_all_reg_enc))

# Deterministic split
X_train_reg_raw, X_test_reg_raw, y_train_reg, y_test_reg, idx_train_reg, idx_test_reg = train_test_split(
    X_all_reg, y_all_reg_enc, indices_reg, test_size=0.2, train_size=0.8, random_state=42
)

# Use existing scaler if present; else fit new on training only
scaler_reg = scaler_svm if 'scaler_svm' in globals() else StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg_raw) if 'scaler_svm' not in globals() else scaler_reg.transform(X_train_reg_raw)
X_test_reg = scaler_reg.transform(X_test_reg_raw)

# Choose SVM model if available; else create and fit a reasonable default
if 'svm_best' in globals() and svm_best is not None:
    model_clf_reg = svm_best
else:
    model_clf_reg = SVC(C=1.0, kernel='linear', gamma='scale', probability=True, random_state=42)
    model_clf_reg.fit(X_train_reg, y_train_reg)

# Predict on test split
y_pred_reg = model_clf_reg.predict(X_test_reg)

# Identify failures
failed_mask_reg = y_pred_reg != y_test_reg
num_failed_reg = int(failed_mask_reg.sum())

# Build filtered source rows aligned with indices used by load_data
rows_filtered_reg = []
with open('elections.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        mayor_val = row['mayor'].strip()
        if mayor_val:
            cand_list = [c.strip() for c in row['candidates'].split(',') if c.strip()]
            perk_list = [p.strip() for p in row['perks'].split(',') if p.strip()]
            rows_filtered_reg.append({'candidates': cand_list, 'perks': perk_list, 'mayor': mayor_val})

failed_rows_reg = []
if num_failed_reg > 0:
    true_labels_reg = model_le_reg.inverse_transform(y_test_reg[failed_mask_reg])
    pred_labels_reg = model_le_reg.inverse_transform(y_pred_reg[failed_mask_reg])
    original_idx_reg = idx_test_reg[failed_mask_reg]
    for oi, t, p in zip(original_idx_reg, true_labels_reg, pred_labels_reg):
        src = rows_filtered_reg[int(oi)]
        failed_rows_reg.append({
            'row_index': int(oi),
            'true': t,
            'pred': p,
            'candidates': ', '.join(src['candidates']),
            'perks': ', '.join(src['perks'])
        })

failed_df_reg = pd.DataFrame(failed_rows_reg)
print(f"Failed test cases (current SVM): {num_failed_reg}")
failed_df_reg.head(50)


Failed test cases (current SVM): 19


Unnamed: 0,row_index,true,pred,candidates,perks
0,113,Paul,Cole,"Cole, Diana, Foxy, Marina, Paul","Perks.MiningFiesta, Perks.PetXPBuff, Perks.Ext..."
1,178,Diaz,Diana,"Cole, Diana, Diaz, Foxy, Paul","Perks.Prospection, Perks.MythologicalRitual, P..."
2,195,Finnegan,Diana,"Diana, Diaz, Finnegan, Marina, Paul","Perks.PetXPBuff, Perks.LongTermInvestment, Per..."
3,67,Diana,Aatrox,"Aatrox, Cole, Diana, Diaz, Finnegan","Perks.SlashedPricing, Perks.Prospection, Perks..."
4,30,Cole,Foxy,"Aatrox, Barry, Cole, Diaz, Foxy","Perks.SlayerXPBuff, Perks.AstralNegotiator, Pe..."
5,177,Marina,Diana,"Aatrox, Diana, Diaz, Foxy, Marina","Perks.SlashedPricing, Perks.Lucky, Perks.Mytho..."
6,19,Marina,Aatrox,"Aatrox, Barry, Foxy, Marina, Paul","Perks.Pathfinder, Perks.AstralNegotiator, Perk..."
7,139,Marina,Foxy,"Aatrox, Cole, Finnegan, Foxy, Marina","Perks.Pathfinder, Perks.Prospection, Perks.Min..."
8,125,Aatrox,Cole,"Aatrox, Cole, Diaz, Foxy, Marina","Perks.SlashedPricing, Perks.Pathfinder, Perks...."
9,233,Diaz,Marina,"Aatrox, Cole, Diaz, Foxy, Marina","Perks.SlayerXPBuff, Perks.MiningXPBuff, Perks...."
