Data upload and wrangling

In [None]:
import pandas as pd
import numpy as np
import os
from google.colab import files
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

# Define the Elo class for rating calculations
class Elo:
    """Minimal Elo rating helper.

    Attributes:
        k: Default K-factor, used by ``update_rating`` when the caller does
            not pass an explicit ``k_factor``.
    """

    def __init__(self, k=5):
        self.k = k

    def expected_score(self, rating_a, rating_b):
        """Return the expected score of the player rated ``rating_a``.

        Uses the logistic Elo curve with a 400-point scale, so equal ratings
        give 0.5.
        """
        return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

    def update_rating(self, current_rating, expected_score, actual_score, k_factor=None):
        """Return the rating after one result.

        Args:
            current_rating: Rating before the bout.
            expected_score: Expected score for this player (0..1).
            actual_score: Actual score for this player (1 win, 0 loss).
            k_factor: Step size; defaults to the instance's ``k`` so the
                value given to the constructor is actually honoured.
        """
        if k_factor is None:
            k_factor = self.k
        return current_rating + k_factor * (actual_score - expected_score)

# Clean column names (removes invisible string characters)
def clean_column_names(df):
    """Strip BOM / zero-width characters and surrounding whitespace from column names.

    The DataFrame is modified in place and also returned for convenience.
    Non-string column labels are left untouched.
    """
    def _clean(label):
        if not isinstance(label, str):
            return label
        # Remove the invisible characters first: '\ufeff' and '\u200b' are not
        # whitespace for str.strip(), so stripping before removing them would
        # leave whitespace that sat next to them (e.g. '\ufeff Name' -> ' Name').
        return label.replace('\ufeff', '').replace('\u200b', '').strip()

    df.columns = [_clean(col) for col in df.columns]
    return df

# Data Loading
def load_competition_data(tournament_folder, tournament_name):
    """Upload one tournament's CSV files through Colab and load them.

    Every uploaded file is moved (by its uploaded name, relative to the current
    working directory) into ``tournament_folder``; the four expected CSVs are
    then read from that folder.

    Args:
        tournament_folder: Existing directory that receives the uploaded files.
        tournament_name: Label used in progress and error messages.

    Returns:
        Tuple ``(elimination_df, ranking_df, pool_df, athletes_df)`` with
        cleaned column names. The elimination columns are renamed
        positionally to the eight expected names.

    Raises:
        ValueError: If the elimination CSV does not have exactly eight columns.
        Exception: Any read/parse error is printed and re-raised unchanged.
    """
    print(f"Uploading files for {tournament_name}: pool_results.csv, elimination_rounds.csv, final_ranking.csv, athletes.csv")
    uploaded = files.upload()
    for filename in uploaded:
        os.rename(filename, os.path.join(tournament_folder, filename))
    print(f"Files loaded in {tournament_folder}: {os.listdir(tournament_folder)}")

    def _read(csv_name):
        # 'utf-8-sig' transparently drops a leading UTF-8 byte-order mark.
        return pd.read_csv(os.path.join(tournament_folder, csv_name), encoding='utf-8-sig')

    try:
        pool_df = _read('pool_results.csv')
        elimination_df = _read('elimination_rounds.csv')
        ranking_df = _read('final_ranking.csv')
        athletes_df = _read('athletes.csv')

        # Snapshot the header exactly as read, so the "Raw" diagnostic below
        # really shows what was in the file rather than the renamed columns.
        raw_elim_columns = elimination_df.columns.tolist()

        # Clean column names
        elimination_df = clean_column_names(elimination_df)
        pool_df = clean_column_names(pool_df)
        ranking_df = clean_column_names(ranking_df)
        athletes_df = clean_column_names(athletes_df)

        # Force column names for elimination_df (positional rename, so only
        # the column count can be validated here).
        expected_elim_columns = [
            'Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score',
            'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner'
        ]
        if len(elimination_df.columns) != len(expected_elim_columns):
            raise ValueError(f"Expected {len(expected_elim_columns)} columns in elimination_df, got {len(elimination_df.columns)}: {elimination_df.columns.tolist()}")
        elimination_df.columns = expected_elim_columns

        print(f"Raw elimination_df columns: {raw_elim_columns}")
        print(f"Cleaned elimination_df columns: {elimination_df.columns.tolist()}")
        print(f"Sample elimination_df data (rows: {len(elimination_df)}):\n{elimination_df.head(2).to_string()}")

        return elimination_df, ranking_df, pool_df, athletes_df
    except Exception as e:
        print(f"Error loading data for {tournament_name}: {str(e)}")
        raise

# Data Cleaning and Feature Engineering
def _normalize_names(series):
    """Title-case and strip a name column while keeping missing values missing."""
    cleaned = series.astype(str).str.title().str.strip()
    # astype(str) turns NaN into the text 'nan'; restore real NaN so that a
    # later dropna() can still detect rows without a name.
    return cleaned.where(series.notna())


def clean_data(elimination_df, ranking_df, pool_df, athletes_df):
    """Validate, type-convert and enrich one tournament's raw tables.

    The inputs are not modified; cleaned copies are returned.

    Args:
        elimination_df: Bout table with the eight elimination columns.
        ranking_df: Final ranking table (Name, Rank, Points, Age, Nationality).
        pool_df: Pool results table (Name, V, M, Rk., TD, TR, Ind., Diff., Q).
        athletes_df: Entry list (Name, Rank, Points, Age, Nationality).

    Returns:
        Tuple ``(elimination_df, ranking_df, pool_df, athletes_df,
        features_df, elo_df, fencer_elos)``.

    Raises:
        ValueError: If any table lacks one of its required columns.
    """
    print("Entering clean_data function")
    print(f"Initial elimination_df columns (rows: {len(elimination_df)}): {elimination_df.columns.tolist()}")

    # Verify required columns
    required_elim_columns = ['Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score',
                            'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner']
    required_pool_columns = ['Name', 'V', 'M', 'Rk.', 'TD', 'TR', 'Ind.', 'Diff.', 'Q']
    required_ranking_columns = ['Name', 'Rank', 'Points', 'Age', 'Nationality']
    required_athletes_columns = ['Name', 'Rank', 'Points', 'Age', 'Nationality']

    for df, req_cols in [(elimination_df, required_elim_columns),
                         (pool_df, required_pool_columns),
                         (ranking_df, required_ranking_columns),
                         (athletes_df, required_athletes_columns)]:
        missing = [col for col in req_cols if col not in df.columns]
        if missing:
            raise ValueError(f"Missing columns in DataFrame: {missing}")

    # Work on copies so the caller's frames are left untouched.
    elimination_df = elimination_df[required_elim_columns].copy()
    ranking_df = ranking_df.copy()
    athletes_df = athletes_df.copy()
    pool_df = pool_df.copy()
    print(f"After selecting columns, elimination_df columns: {elimination_df.columns.tolist()}")

    # Defensive string cleaning (missing names stay NaN so dropna can see them)
    for name_col in ('Fencer A Name', 'Fencer B Name', 'Winner'):
        elimination_df[name_col] = _normalize_names(elimination_df[name_col])
    print(f"After string cleaning, elimination_df columns: {elimination_df.columns.tolist()}")

    # Process elimination_df: rows without both fencers, a winner and both
    # scores (e.g. byes) carry no bout information.
    print(f"Rows before dropna: {len(elimination_df)}")
    elimination_df = elimination_df.dropna(subset=['Fencer A Name', 'Fencer B Name', 'Winner', 'Fencer A Score', 'Fencer B Score'])
    print(f"After dropna, elimination_df columns (rows: {len(elimination_df)}): {elimination_df.columns.tolist()}")

    # Type conversions
    elimination_df['Fencer A Score'] = elimination_df['Fencer A Score'].astype(float)
    elimination_df['Fencer B Score'] = elimination_df['Fencer B Score'].astype(float)
    # Keep only the first number found in the round label; labels without a
    # number become 0.
    elimination_df['Round'] = (
        elimination_df['Round'].astype(str)
        .str.extract(r'(\d+)', expand=False)
        .astype(float)
        .fillna(0)
        .astype(int)
    )
    print(f"After all elimination_df processing, columns: {elimination_df.columns.tolist()}")

    # Process other DataFrames
    person_dtypes = {'Rank': int, 'Points': float, 'Age': int, 'Nationality': str}

    ranking_df['Name'] = ranking_df['Name'].astype(str).str.title().str.strip()
    ranking_df = ranking_df.astype(person_dtypes)

    athletes_df['Name'] = athletes_df['Name'].astype(str).str.title().str.strip()
    athletes_df = athletes_df.astype(person_dtypes)

    pool_df['Name'] = pool_df['Name'].astype(str).str.title().str.strip()
    pool_df = pool_df.astype({
        'Rk.': int, 'V': int, 'M': int, 'TD': int, 'TR': int,
        'Ind.': float, 'Diff.': int, 'Q': str,
    })

    # Rank 1 is the best fencer (the Elo seeding in this file likewise treats a
    # lower rank as stronger), so the "top 16" are the 16 SMALLEST rank values.
    top_16_athletes = athletes_df.nsmallest(16, 'Rank')['Name'].tolist()
    pool_qualifiers = pool_df.nsmallest(16, 'Rk.')['Name'].tolist()
    main_round_athletes = set(top_16_athletes) | set(pool_qualifiers)
    preliminary_athletes = [name for name in athletes_df['Name'].tolist() if name not in main_round_athletes]

    elo_df, fencer_elos = calculate_elo_ratings(elimination_df, ranking_df)
    features_df = engineer_features(elimination_df, pool_df, elo_df, ranking_df, preliminary_athletes)

    print("Exiting clean_data function")
    return elimination_df, ranking_df, pool_df, athletes_df, features_df, elo_df, fencer_elos

# Calculate Elo ratings
def calculate_elo_ratings(elimination_df, ranking_df):
    """Replay the elimination bouts in row order and return per-fencer ratings.

    Args:
        elimination_df: Bouts with 'Fencer A Name', 'Fencer B Name', 'Winner'.
        ranking_df: Table with 'Name' and 'Rank' used to seed the ratings.

    Returns:
        Tuple ``(elo_df, fencer_elos)``: a two-column DataFrame
        ('Name', 'Elo') and the same data as a ``{name: rating}`` dict.
    """
    assert 'Fencer A Name' in elimination_df.columns, f"Fencer A Name missing in elimination_df: {elimination_df.columns.tolist()}"
    print(f"calculate_elo_ratings: elimination_df columns: {elimination_df.columns.tolist()}")

    # Initialize Elo system
    elo = Elo(k=5)
    # Use initial ranks as a proxy for Elo (lower rank = higher Elo)
    # NOTE(review): the seeds are rank positions (1..N), whereas fencers
    # missing from ranking_df start at 1000 -- the two are on very different
    # scales; confirm whether the seeds should be offset to a common base.
    initial_elos = ranking_df.set_index('Name')['Rank'].rank(ascending=False).to_dict()
    fencer_elos = pd.Series(initial_elos, dtype=float).fillna(1000).to_dict()

    for _, row in elimination_df.iterrows():
        fencer_a = row.get('Fencer A Name')
        fencer_b = row.get('Fencer B Name')
        winner = row.get('Winner')
        if pd.isna(fencer_a) or pd.isna(fencer_b) or pd.isna(winner):
            continue

        if winner == fencer_a:
            result_a = 1
        elif winner == fencer_b:
            result_a = 0
        else:
            # The recorded winner matches neither fencer (typo / bad row).
            # Skip it instead of silently crediting fencer B with a win; the
            # head-to-head feature builder skips such rows as well.
            continue

        rating_a = fencer_elos.get(fencer_a, 1000)
        rating_b = fencer_elos.get(fencer_b, 1000)

        expected_a = elo.expected_score(rating_a, rating_b)

        fencer_elos[fencer_a] = elo.update_rating(rating_a, expected_a, result_a, elo.k)
        fencer_elos[fencer_b] = elo.update_rating(rating_b, 1 - expected_a, 1 - result_a, elo.k)

    elo_df = pd.DataFrame(list(fencer_elos.items()), columns=['Name', 'Elo'])
    print(f"elo_df columns: {elo_df.columns.tolist()}")
    return elo_df, fencer_elos

# Engineer features for individual fencers
def engineer_features(elimination_df, pool_df, elo_df, ranking_df, preliminary_athletes):
    """Build one feature row per fencer that appears in the pool results.

    Args:
        elimination_df: Bouts with 'Fencer A Name', 'Fencer B Name', 'Winner'.
        pool_df: Pool results with 'Name', 'V', 'M', 'Rk.'.
        elo_df: Ratings with 'Name', 'Elo'.
        ranking_df: Ranking with 'Name', 'Rank'.
        preliminary_athletes: Names flagged with ``IsPreliminary = 1``.

    Returns:
        DataFrame with columns Name, V, M, Rk., Win_Rate, Elim_Wins,
        Elim_Matches, Elo, Rank, Elim_Win_Rate, IsPreliminary.
    """
    assert 'Fencer A Name' in elimination_df.columns, f"Fencer A Name missing in elimination_df: {elimination_df.columns.tolist()}"
    print(f"engineer_features: elimination_df columns: {elimination_df.columns.tolist()}")

    pool_perf = pool_df.groupby('Name').agg({'V': 'sum', 'M': 'sum', 'Rk.': 'mean'}).reset_index()
    # Guard against fencers with zero pool matches (would give 0/0 = NaN).
    pool_perf['Win_Rate'] = pool_perf['V'] / pool_perf['M'].replace(0, 1)

    # Count wins for each fencer
    win_counts = elimination_df['Winner'].value_counts().rename_axis('Name').reset_index(name='Elim_Wins')

    # Count matches for each fencer
    matches_a = elimination_df['Fencer A Name'].value_counts()
    matches_b = elimination_df['Fencer B Name'].value_counts()
    total_matches = (matches_a.add(matches_b, fill_value=0)).rename_axis('Name').reset_index(name='Elim_Matches')

    # Combine features
    features = pool_perf.merge(win_counts, on='Name', how='left')
    features = features.merge(total_matches, on='Name', how='left')
    features = features.merge(elo_df, on='Name', how='left')
    features = features.merge(ranking_df[['Name', 'Rank']], on='Name', how='left')

    # Fill NA values BEFORE deriving the win rate: the left merges leave NaN
    # for fencers without wins or bouts, and NaN / n would stay NaN instead of
    # becoming a 0 win rate.
    features = features.fillna({'Elim_Wins': 0, 'Elim_Matches': 0, 'Elo': 1000, 'Rank': 999})

    features['Elim_Win_Rate'] = features['Elim_Wins'] / features['Elim_Matches'].replace(0, 1)
    features['IsPreliminary'] = features['Name'].isin(preliminary_athletes).astype(int)

    return features

# Create head-to-head features
def engineer_head_to_head_features(elimination_df, features_df):
    """Turn each usable elimination bout into one row of pairwise features.

    A bout is skipped when a name, the winner or a score is missing, when the
    winner matches neither fencer, or when either fencer has no row in
    ``features_df``.

    Args:
        elimination_df: Bouts with fencer names, scores and 'Winner'.
        features_df: Per-fencer features with 'Name', 'Elo', 'Win_Rate',
            'Elim_Win_Rate', 'Rank', 'Elim_Wins', 'IsPreliminary'.

    Returns:
        DataFrame with the difference features (A minus B), the two
        IsPreliminary flags, 'Result' (1 if A won, else 0) and 'Score_Diff'.
        The columns are present even when no bout is usable.
    """
    assert 'Fencer A Name' in elimination_df.columns, f"Fencer A Name missing in elimination_df: {elimination_df.columns.tolist()}"
    print(f"engineer_head_to_head_features: elimination_df columns: {elimination_df.columns.tolist()}")

    output_columns = [
        'Elo_Diff', 'Win_Rate_Diff', 'Elim_Win_Rate_Diff', 'Rank_Diff',
        'Elim_Wins_Diff', 'IsPreliminary_A', 'IsPreliminary_B', 'Result', 'Score_Diff',
    ]

    # Build the per-fencer lookup once instead of filtering the whole frame
    # twice per bout. Keeping the first row per name matches the previous
    # "first matching row" behaviour.
    lookup = features_df.drop_duplicates(subset='Name').set_index('Name')
    known = lookup.index

    match_features = []
    for _, row in elimination_df.iterrows():
        fencer_a = row.get('Fencer A Name')
        fencer_b = row.get('Fencer B Name')
        winner = row.get('Winner')
        score_diff = row.get('Fencer A Score') - row.get('Fencer B Score')
        if pd.isna(fencer_a) or pd.isna(fencer_b) or pd.isna(winner) or pd.isna(score_diff):
            continue
        if winner == fencer_a:
            result = 1
        elif winner == fencer_b:
            result = 0
        else:
            continue

        if fencer_a not in known or fencer_b not in known:
            continue

        match_features.append({
            'Elo_Diff': lookup.at[fencer_a, 'Elo'] - lookup.at[fencer_b, 'Elo'],
            'Win_Rate_Diff': lookup.at[fencer_a, 'Win_Rate'] - lookup.at[fencer_b, 'Win_Rate'],
            'Elim_Win_Rate_Diff': lookup.at[fencer_a, 'Elim_Win_Rate'] - lookup.at[fencer_b, 'Elim_Win_Rate'],
            'Rank_Diff': lookup.at[fencer_a, 'Rank'] - lookup.at[fencer_b, 'Rank'],
            'Elim_Wins_Diff': lookup.at[fencer_a, 'Elim_Wins'] - lookup.at[fencer_b, 'Elim_Wins'],
            'IsPreliminary_A': lookup.at[fencer_a, 'IsPreliminary'],
            'IsPreliminary_B': lookup.at[fencer_b, 'IsPreliminary'],
            'Result': result,
            'Score_Diff': score_diff,
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(match_features, columns=output_columns)

# Main execution
# Number of tournaments the user is asked to upload and combine.
NUM_TOURNAMENTS = 6

all_features = []
all_elimination_dfs = []
all_fencer_elos = {}
for i in range(1, NUM_TOURNAMENTS + 1):
    tournament_name = input(f"Enter name for Tournament {i}: ")
    tournament_folder = f"/content/{tournament_name}"
    os.makedirs(tournament_folder, exist_ok=True)

    # Load and clean data in one go
    elimination_df, ranking_df, pool_df, athletes_df = load_competition_data(tournament_folder, tournament_name)
    elimination_df, ranking_df, pool_df, athletes_df, features_df, elo_df, fencer_elos = clean_data(elimination_df, ranking_df, pool_df, athletes_df)

    all_features.append(features_df)
    all_elimination_dfs.append(elimination_df)
    # Accumulate ratings across tournaments; a fencer seen again later keeps
    # the rating from the most recently loaded tournament.
    all_fencer_elos.update(fencer_elos)

combined_features = pd.concat(all_features, ignore_index=True)
# NOTE(review): drop_duplicates keeps only the FIRST tournament's feature row
# for a fencer who appears in several tournaments; later rows are discarded.
# Confirm this is intended rather than aggregating across tournaments.
combined_features = combined_features.drop_duplicates(subset=['Name'])
combined_elimination = pd.concat(all_elimination_dfs, ignore_index=True)

# Expose ratings for every fencer loaded above. Without this, `fencer_elos`
# would only hold the last tournament's fencers and everyone else would fall
# back to a default rating at prediction time.
fencer_elos = all_fencer_elos






Enter name for Tournament 1: Grand_Prix_Seoul_2023_165
Uploading files for Grand_Prix_Seoul_2023_165: pool_results.csv, elimination_rounds.csv, final_ranking.csv, athletes.csv


Saving pool_results.csv to pool_results.csv
Saving elimination_rounds.csv to elimination_rounds.csv
Saving athletes.csv to athletes.csv
Saving final_ranking.csv to final_ranking.csv
Files loaded in /content/Grand_Prix_Seoul_2023_165: ['pool_results.csv', 'elimination_rounds.csv', 'athletes.csv', 'final_ranking.csv']
Raw elimination_df columns: ['Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score', 'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner']
Cleaned elimination_df columns: ['Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score', 'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner']
Sample elimination_df data (rows: 159):
             Round     Fencer A Name Fencer A Country  Fencer A Score     Fencer B Name Fencer B Country  Fencer B Score            Winner
0  Preliminary 128  REPETTI Giovanni              ITA              15               NaN              NaN             NaN  REPETTI Giovanni
1  Preliminary 128      JEONG Hangil        

Saving pool_results.csv to pool_results.csv
Saving elimination_rounds.csv to elimination_rounds.csv
Saving athletes.csv to athletes.csv
Saving final_ranking.csv to final_ranking.csv
Files loaded in /content/Grand_Prix_Seoul_2024_165: ['pool_results.csv', 'elimination_rounds.csv', 'athletes.csv', 'final_ranking.csv']
Raw elimination_df columns: ['Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score', 'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner']
Cleaned elimination_df columns: ['Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score', 'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner']
Sample elimination_df data (rows: 159):
             Round             Fencer A Name Fencer A Country  Fencer A Score      Fencer B Name Fencer B Country  Fencer B Score                    Winner
0  Preliminary 128  MATEEV Andri Nikolaysson              ISL              15                NaN              NaN             NaN  MATEEV Andri Nikolaysson
1  Prelimi

Saving pool_results.csv to pool_results.csv
Saving elimination_rounds.csv to elimination_rounds.csv
Saving athletes.csv to athletes.csv
Saving final_ranking.csv to final_ranking.csv
Files loaded in /content/Villa_de_Madrid_World_Cup_2023_474: ['pool_results.csv', 'elimination_rounds.csv', 'athletes.csv', 'final_ranking.csv']
Raw elimination_df columns: ['Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score', 'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner']
Cleaned elimination_df columns: ['Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score', 'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner']
Sample elimination_df data (rows: 287):
             Round Fencer A Name Fencer A Country  Fencer A Score      Fencer B Name Fencer B Country  Fencer B Score             Winner
0  Preliminary 256    TSAP Yuriy              UKR              15                NaN              NaN             NaN         TSAP Yuriy
1  Preliminary 256   KATO Hibiki       

Saving pool_results.csv to pool_results.csv
Saving elimination_rounds.csv to elimination_rounds.csv
Saving athletes.csv to athletes.csv
Saving final_ranking.csv to final_ranking.csv
Files loaded in /content/Villa_de_Madrid_World_Cup_2024_474: ['pool_results.csv', 'elimination_rounds.csv', 'athletes.csv', 'final_ranking.csv']
Raw elimination_df columns: ['Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score', 'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner']
Cleaned elimination_df columns: ['Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score', 'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner']
Sample elimination_df data (rows: 287):
             Round           Fencer A Name Fencer A Country  Fencer A Score   Fencer B Name Fencer B Country  Fencer B Score           Winner
0  Preliminary 256         THOMPSON Khalil              USA              15             NaN              NaN             NaN  THOMPSON Khalil
1  Preliminary 256  ZHAILYBAY

Saving pool_results.csv to pool_results.csv
Saving elimination_rounds.csv to elimination_rounds.csv
Saving athletes.csv to athletes.csv
Saving final_ranking.csv to final_ranking.csv
Files loaded in /content/World_Championships_Milan_2023_246: ['pool_results.csv', 'elimination_rounds.csv', 'athletes.csv', 'final_ranking.csv']
Raw elimination_df columns: ['Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score', 'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner']
Cleaned elimination_df columns: ['Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score', 'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner']
Sample elimination_df data (rows: 159):
             Round    Fencer A Name Fencer A Country  Fencer A Score Fencer B Name Fencer B Country  Fencer B Score           Winner
0  Preliminary 128  NUCCIO Riccardo              ITA              15           NaN              NaN             NaN  NUCCIO Riccardo
1  Preliminary 128  MAMUTOV Sherzod            

Saving pool_results.csv to pool_results.csv
Saving elimination_rounds.csv to elimination_rounds.csv
Saving athletes.csv to athletes.csv
Saving final_ranking.csv to final_ranking.csv
Files loaded in /content/World_Cup_Budapest_2023_160: ['pool_results.csv', 'elimination_rounds.csv', 'athletes.csv', 'final_ranking.csv']
Raw elimination_df columns: ['Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score', 'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner']
Cleaned elimination_df columns: ['Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score', 'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner']
Sample elimination_df data (rows: 159):
             Round Fencer A Name Fencer A Country  Fencer A Score      Fencer B Name Fencer B Country  Fencer B Score       Winner
0  Preliminary 128    KOKUBO Mao              JPN              15         ROSTOM Ali              KUW               4   KOKUBO Mao
1  Preliminary 128   PECH Miklos              HUN         

Creating feature dataset, scaling dataset and training/evaluating ML models

In [None]:
# Create head-to-head dataset
head_to_head_df = engineer_head_to_head_features(combined_elimination, combined_features)

# Train machine learning models for win probability
X = head_to_head_df[['Elo_Diff', 'Win_Rate_Diff', 'Elim_Win_Rate_Diff', 'Rank_Diff', 'Elim_Wins_Diff', 'IsPreliminary_A', 'IsPreliminary_B']]
y_win = head_to_head_df['Result']
y_score_diff = head_to_head_df['Score_Diff']

X_train, X_test, y_win_train, y_win_test, y_score_train, y_score_test = train_test_split(
    X, y_win, y_score_diff, test_size=0.2, random_state=42
)
# The scaler is fitted on the training split only and reused for the test
# split (and later for single-match predictions).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


def _tune_and_evaluate(estimator, param_grid, X_tr, y_tr, X_te, y_te, label):
    """Grid-search ``estimator`` with 5-fold CV, then report its test-set MSE.

    Prints the best parameters and the mean squared error prefixed with
    ``label`` and returns ``(grid_search, best_model, y_pred, mse)``.
    """
    grid_search = GridSearchCV(estimator, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_tr, y_tr)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_te)
    mse = mean_squared_error(y_te, y_pred)
    print(f"{label} Best Parameters: {grid_search.best_params_}")
    print(f"{label} Mean Squared Error: {mse:.3f}")
    return grid_search, best_model, y_pred, mse


rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10]
}
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3]
}

# RandomForestRegressor for win probability
rf_win_model = RandomForestRegressor(random_state=42)
rf_win_grid_search, rf_win_best_model, rf_win_y_pred, rf_win_mse = _tune_and_evaluate(
    rf_win_model, rf_param_grid, X_train_scaled, y_win_train, X_test_scaled, y_win_test,
    "RandomForest Win Probability",
)

# XGBoost for win probability
xgb_win_model = XGBRegressor(random_state=42)
xgb_win_grid_search, xgb_win_best_model, xgb_win_y_pred, xgb_win_mse = _tune_and_evaluate(
    xgb_win_model, xgb_param_grid, X_train_scaled, y_win_train, X_test_scaled, y_win_test,
    "XGBoost Win Probability",
)

# RandomForestRegressor for score difference
rf_score_model = RandomForestRegressor(random_state=42)
rf_score_grid_search, rf_score_best_model, rf_score_y_pred, rf_score_mse = _tune_and_evaluate(
    rf_score_model, rf_param_grid, X_train_scaled, y_score_train, X_test_scaled, y_score_test,
    "RandomForest Score Difference",
)

# XGBoost for score difference
xgb_score_model = XGBRegressor(random_state=42)
xgb_score_grid_search, xgb_score_best_model, xgb_score_y_pred, xgb_score_mse = _tune_and_evaluate(
    xgb_score_model, xgb_param_grid, X_train_scaled, y_score_train, X_test_scaled, y_score_test,
    "XGBoost Score Difference",
)

engineer_head_to_head_features: elimination_df columns: ['Round', 'Fencer A Name', 'Fencer A Country', 'Fencer A Score', 'Fencer B Name', 'Fencer B Country', 'Fencer B Score', 'Winner']
RandomForest Win Probability Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
RandomForest Win Probability Mean Squared Error: 0.182
XGBoost Win Probability Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
XGBoost Win Probability Mean Squared Error: 0.167
RandomForest Score Difference Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
RandomForest Score Difference Mean Squared Error: 30.947
XGBoost Score Difference Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
XGBoost Score Difference Mean Squared Error: 29.481


ML Model and Elo rating predictions (ensembling by averaging)

In [None]:
# Predict head-to-head win probability and scoreline
def predict_head_to_head(fencer_a_name, fencer_b_name, combined_features, fencer_elos, rf_win_best_model, xgb_win_best_model, rf_score_best_model, xgb_score_best_model, scaler):
    """Return a formatted text prediction for a bout between two fencers.

    The win probability is the mean of the two regressors' outputs and the Elo
    expected score, clipped to [0, 1]. The score difference is the mean of the
    two score regressors.

    Args:
        fencer_a_name, fencer_b_name: Names; title-cased and stripped before lookup.
        combined_features: Per-fencer feature table with a 'Name' column.
        fencer_elos: ``{name: rating}`` mapping.
        rf_win_best_model, xgb_win_best_model: Fitted win regressors.
        rf_score_best_model, xgb_score_best_model: Fitted score-difference regressors.
        scaler: Fitted scaler applied to the feature row before predicting.

    Returns:
        The multi-line report, or a "not found" message when either fencer
        has no row in ``combined_features``.
    """
    fencer_a_name = fencer_a_name.title().strip()
    fencer_b_name = fencer_b_name.title().strip()
    fencer_a_data = combined_features[combined_features['Name'] == fencer_a_name]
    fencer_b_data = combined_features[combined_features['Name'] == fencer_b_name]
    if fencer_a_data.empty or fencer_b_data.empty:
        return f"One or both fencers ({fencer_a_name}, {fencer_b_name}) not found in data."

    # Create input features
    input_data = pd.DataFrame([{
        'Elo_Diff': fencer_a_data['Elo'].iloc[0] - fencer_b_data['Elo'].iloc[0],
        'Win_Rate_Diff': fencer_a_data['Win_Rate'].iloc[0] - fencer_b_data['Win_Rate'].iloc[0],
        'Elim_Win_Rate_Diff': fencer_a_data['Elim_Win_Rate'].iloc[0] - fencer_b_data['Elim_Win_Rate'].iloc[0],
        'Rank_Diff': fencer_a_data['Rank'].iloc[0] - fencer_b_data['Rank'].iloc[0],
        'Elim_Wins_Diff': fencer_a_data['Elim_Wins'].iloc[0] - fencer_b_data['Elim_Wins'].iloc[0],
        'IsPreliminary_A': fencer_a_data['IsPreliminary'].iloc[0],
        'IsPreliminary_B': fencer_b_data['IsPreliminary'].iloc[0]
    }])
    input_scaled = scaler.transform(input_data)

    # Win probability predictions
    rf_win_pred = rf_win_best_model.predict(input_scaled)[0]
    xgb_win_pred = xgb_win_best_model.predict(input_scaled)[0]
    elo = Elo(k=5)
    # When the ratings dict lacks a fencer, fall back to the rating stored in
    # the feature table rather than a flat 1000, so both ratings being
    # compared come from the same scale.
    elo_a = fencer_elos.get(fencer_a_name, fencer_a_data['Elo'].iloc[0])
    elo_b = fencer_elos.get(fencer_b_name, fencer_b_data['Elo'].iloc[0])
    elo_pred = elo.expected_score(elo_a, elo_b)
    final_win_pred = (rf_win_pred + xgb_win_pred + elo_pred) / 3
    final_win_pred = np.clip(final_win_pred, 0, 1)

    # Score difference predictions (A's score minus B's score)
    rf_score_pred = rf_score_best_model.predict(input_scaled)[0]
    xgb_score_pred = xgb_score_best_model.predict(input_scaled)[0]
    final_score_diff = (rf_score_pred + xgb_score_pred) / 2

    # Derive scoreline with proper caps (winner = 15, loser between 0–14)
    a_wins = final_win_pred >= 0.5
    if a_wins:
        fencer_a_score = 15
        fencer_b_score = min(14, max(0, int(round(15 - final_score_diff))))
    else:
        fencer_b_score = 15
        fencer_a_score = min(14, max(0, int(round(15 + final_score_diff))))

    # Customising output for better visibility
    length = 20
    prob_a = final_win_pred
    prob_b = 1 - final_win_pred
    blocks_a = int(length * prob_a)
    blocks_b = int(length * prob_b)
    bar_a = "█" * blocks_a + "-" * (length - blocks_a)
    bar_b = "█" * blocks_b + "-" * (length - blocks_b)

    if a_wins:
        winner, loser = fencer_a_name, fencer_b_name
    else:
        winner, loser = fencer_b_name, fencer_a_name

    # General one-liner summary with winner included. The thresholds apply to
    # the predicted WINNER's probability; testing fencer A's probability would
    # describe a heavily favoured fencer B as facing "a tough challenge".
    winner_prob = max(prob_a, prob_b)
    if winner_prob > 0.7:
        summary = f"{winner} is strongly favored to win over {loser}."
    elif winner_prob > 0.55:
        summary = f"{winner} has a slight advantage over {loser}."
    else:
        summary = f"The match is expected to be closely contested, with {winner} slightly ahead."

    output = (
        "\n===== HEAD-TO-HEAD PREDICTION =====\n"
        f"Fencers: {fencer_a_name} vs {fencer_b_name}\n\n"
        "Win Probability:\n"
        f"{fencer_a_name}: {bar_a} {prob_a*100:.1f}%\n"
        f"{fencer_b_name}: {bar_b} {prob_b*100:.1f}%\n\n"
        f"Predicted Outcome: {winner}\n"
        f"Predicted Scoreline: {fencer_a_name} {fencer_a_score} : {fencer_b_score} {fencer_b_name}\n"
        f'Analysis: {summary}\n '
        "\n==================================\n"
    )

    return output



Usage

In [None]:

names = sorted(combined_features['Name'].unique())

def choose_fencer(prompt):
    """Interactively pick one fencer from the module-level ``names`` list.

    The user types part of a name (case-insensitive), 'all' to list every
    fencer, or 'exit' to cancel, then selects a match by its number. The
    prompt repeats until a valid selection or 'exit'.

    Args:
        prompt: Label shown at the start of the search prompt.

    Returns:
        The chosen name, or ``None`` if the user typed 'exit'.
    """
    while True:
        # Strip so stray spaces around 'exit' / 'all' / a name fragment do not
        # turn a command into a failed name search.
        search = input(f"\n{prompt} (type part of the name or 'all' to see full list, 'exit' to quit): ").strip().lower()

        if search == "exit":
            return None  # Signal to exit the main loop

        if search == "all":
            print("\nFull list of fencers:")
            for i, name in enumerate(names, start=1):
                print(f"{i}. {name}")
            continue

        matches = [n for n in names if search in n.lower()]

        if not matches:
            print("No matches found, try again.")
            continue

        # show matches with numbers
        for i, name in enumerate(matches, start=1):
            print(f"{i}. {name}")

        # ask user to pick one
        try:
            choice = int(input("Select by number: "))
        except ValueError:
            print("Please enter a valid number.")
            continue

        if 1 <= choice <= len(matches):
            return matches[choice - 1]
        print("Invalid choice, try again.")


# Main loop for multiple predictions
# Each pass asks for two fencers, prints the prediction report, and stops as
# soon as either selection is cancelled or the user answers anything but 'y'.
while True:
    fencer_a_name = choose_fencer("Fencer A")
    if fencer_a_name is not None:
        fencer_b_name = choose_fencer("Fencer B")

    # The `or` short-circuits, so fencer_b_name is only inspected when
    # Fencer A was actually chosen (and Fencer B was therefore asked for).
    if fencer_a_name is None or fencer_b_name is None:
        print("Exiting predictions.")
        break

    print(f"\nHead-to-head: {fencer_a_name} vs {fencer_b_name}\n")
    prediction_text = predict_head_to_head(
        fencer_a_name,
        fencer_b_name,
        combined_features,
        fencer_elos,
        rf_win_best_model,
        xgb_win_best_model,
        rf_score_best_model,
        xgb_score_best_model,
        scaler,
    )
    print(prediction_text)

    # Optionally, ask if user wants to predict another match
    cont = input("\nDo you want to predict another match? (y/n): ").lower()
    if cont == 'y':
        continue
    print("Exiting predictions.")
    break



Fencer A (type part of the name or 'all' to see full list, 'exit' to quit): all

Full list of fencers:
1. Abdazov Islambek
2. Abdelaal Mohamed
3. Abdulkareem Mohammed
4. Ailinca Rares
5. Al-Nasser Mahmoud
6. Alamr Mohammed
7. Albaqami Saad
8. Aleksandrov Ivaylo
9. Allen Samuel
10. Almansaf Abdullah
11. Almutairi Adel
12. Almutairi Ziyad
13. Alpaidze Mikhail
14. Alqudihi Ahmed
15. Alshamlan Yousef
16. Alvarez Duran Pablo
17. Alvarez Ricardo
18. Amer Mohamed
19. Anasiz Muhammed
20. Andrews Sean
21. Annic Edern
22. Archvadze Vakhtangi
23. Arfa Fares
24. Armijo Gabriel
25. Arpino Alberto
26. Aslan Tolga
27. Ayala Julian
28. Ayman Mostafa
29. Aymuratov Musa
30. Baaken Matyas
31. Bae Kyungrok
32. Bahamonde Manuel
33. Baher Arasbaran Farzad
34. Banyay Endre Marcell
35. Bazadze Beka
36. Ben Avram Lev
37. Benedict Nicolaus
38. Berre' Enrico
39. Bibi Eliott
40. Bohovin Bohdan
41. Bonah Luis
42. Bonah Raoul
43. Bonsanto Francesco
44. Bounchada Zacharia
45. Boureau Paco
46. Bravo Inaki
47. Bronis