In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the input directory, then print each one
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# EDA

# Simple starter code

In [None]:
# Read the M and W tournament seed tables, which differ only by their file-name prefix
m_seed, w_seed = (
    pd.read_csv(f'/kaggle/input/march-machine-learning-mania-2025/{league}NCAATourneySeeds.csv')
    for league in ('M', 'W')
)
# Stack both seed tables row-wise; any missing cell is filled with 0.05
# NOTE(review): 0.05 is not a valid value for Season/Seed/TeamID - confirm the fill is intended
seed_df = pd.concat((m_seed, w_seed)).fillna(0.05)
# Template holding one row per matchup to predict
submission_df = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/SampleSubmissionStage2.csv')

Team rankings are present in the files WNCAATourneySeeds.csv and MNCAATourneySeeds.csv.

- The "Season" column indicates the year
- The "Seed" column indicates the team's seed within its tournament region (W01 = seed 1 in region W); play-in teams carry an extra trailing letter (e.g. W16a)
- The "TeamID" column contains a unique identifier for every team

In [None]:
# Preview the first rows of the combined M + W seed table
seed_df.head()

The SampleSubmissionStage2.csv file contains an "ID" column with the format year_teamID1_teamID2.

In [None]:
# Preview the first rows of the submission template
submission_df.head()

## Extract game info and team rankings

In [None]:
def extract_game_info(id_str):
    """Split a submission ID into its numeric components.

    Args:
        id_str: Matchup identifier of the form ``"<year>_<teamID1>_<teamID2>"``.

    Returns:
        A ``(year, teamID1, teamID2)`` tuple of ints.

    Raises:
        IndexError: If the ID has fewer than three underscore-separated fields.
        ValueError: If one of the first three fields is not an integer.
    """
    fields = id_str.split('_')
    # Only the first three fields are used; anything after them is ignored
    return int(fields[0]), int(fields[1]), int(fields[2])

def extract_seed_value(seed_str):
    """Return the numeric seed encoded in a seed string such as ``"W01"``.

    The first character is skipped and the next two characters are parsed as
    the seed number. Limiting the slice to two characters keeps play-in seeds
    with a trailing letter (e.g. ``"W11a"``) from failing the integer
    conversion and being silently reported as 16.

    Args:
        seed_str: Seed identifier, e.g. ``"W01"`` or ``"X16b"``.

    Returns:
        The seed as an int, or 16 when the value cannot be parsed (malformed
        strings, or non-string values such as NaN).
    """
    try:
        return int(seed_str[1:3])
    # Fall back to the worst seed for unparsable or non-string values
    except (TypeError, ValueError):
        return 16

# Break every submission ID into season / team columns and decode the seed strings
game_info = submission_df['ID'].apply(extract_game_info).tolist()
submission_df[['Season', 'TeamID1', 'TeamID2']] = game_info
seed_df['SeedValue'] = seed_df['Seed'].apply(extract_seed_value)

# Attach the numeric seed of each team slot (1, then 2) with the same left join;
# a team without a seed row for that season keeps NaN in its SeedValue column
seed_lookup = seed_df[['Season', 'TeamID', 'SeedValue']]
for slot in ('1', '2'):
    submission_df = (
        pd.merge(submission_df, seed_lookup,
                 left_on=['Season', f'TeamID{slot}'], right_on=['Season', 'TeamID'],
                 how='left')
        .rename(columns={'SeedValue': f'SeedValue{slot}'})
        .drop(columns=['TeamID'])
    )
print(submission_df)

## Make your predictions

In [None]:
# Overall ranking of the team in the underlying system.
# Loaded from the "M"-prefixed Massey ordinals file; later cells use its
# Season, TeamID and OrdinalRank columns.
m_massey_ordinals_df = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/MMasseyOrdinals.csv')

In [None]:
# Preview the first rows of the full Massey ordinals table
m_massey_ordinals_df.head()

In [None]:
# Keep only the ranking rows that belong to the 2025 season
is_2025 = m_massey_ordinals_df['Season'] == 2025
m_massey_ordinals_2025_df = m_massey_ordinals_df[is_2025]

In [None]:
# Preview the first rows of the 2025-only rankings
m_massey_ordinals_2025_df.head()

In [None]:
# Average every 2025 OrdinalRank row per team into a one-column frame indexed by TeamID
mean_rank_by_team = m_massey_ordinals_2025_df.groupby("TeamID")["OrdinalRank"].mean()
m_mo_ave_df = mean_rank_by_team.to_frame()

In [None]:
# Preview the per-team average ordinal rank
m_mo_ave_df.head()

Method 1: Bradley Terry Model

Predicts the winning percentage based on the difference in team standings using a logistic function
* Statistically well-founded model
* Need to adjust parameter c (to fit the data set)
* Often used in sports forecasting

In [None]:
from scipy.special import expit

def bradley_terry_probability(team_a: int, team_b: int, ranks: pd.DataFrame, c=0.01) -> np.float64:
    """Estimate the probability that ``team_a`` beats ``team_b`` from their ranks.

    Args:
        team_a: Index label of the first team in ``ranks``.
        team_b: Index label of the second team in ``ranks``.
        ranks: Frame indexed by team with an ``"OrdinalRank"`` column
            (smaller rank means stronger team).
        c: Scale applied to the rank difference before the logistic function.

    Returns:
        ``expit(c * (rank_b - rank_a))``, so the better-ranked team gets a
        probability above 0.5.
    """
    rank_a = ranks.loc[team_a, "OrdinalRank"]
    rank_b = ranks.loc[team_b, "OrdinalRank"]
    # A positive gap means team_a holds the smaller (better) rank
    return expit(c * (rank_b - rank_a))

Method 2: Elo Rating

Applying the Elo rating system, we predict the winning percentage based on the difference in rankings
* Proven in many competitions, including chess
* Scale parameters need to be adjusted
* Can be adjusted more accurately with past competition results

In [None]:
def elo_win_probability(team_a: int, team_b: int, ranks: pd.DataFrame, scale=400):
    """Estimate the probability that ``team_a`` beats ``team_b`` with an Elo-style curve.

    Args:
        team_a: Index label of the first team in ``ranks``.
        team_b: Index label of the second team in ``ranks``.
        ranks: Frame indexed by team with an ``"OrdinalRank"`` column
            (smaller rank means stronger team).
        scale: Divisor of the rating gap inside the base-10 exponent.

    Returns:
        ``1 / (1 + 10 ** (-(rating_a - rating_b) / scale))`` for the
        ``"OrdinalRank"`` column.
    """
    # Flip the ranks so the best-ranked team carries the largest pseudo-rating
    pseudo_ratings = ranks.max() - ranks
    gap = pseudo_ratings.loc[team_a] - pseudo_ratings.loc[team_b]
    expected_score = 1.0 / (1.0 + 10 ** (-gap / scale))
    return expected_score["OrdinalRank"]

Method 3: Normalized Rank

A method that normalizes each team's ranking to the range [0,1] and converts the ratio of the two teams' normalized values into a win probability
* Simple and easy to implement
* Intuitive reflection of relative strength among teams

In [None]:
def normalized_rank_probability(team_a, team_b, ranks):
    """Estimate the probability that ``team_a`` beats ``team_b`` from normalised ranks.

    Ranks are inverted (smaller rank is stronger) and min-max scaled to
    ``[0, 1]``; the win probability is ``p_a / (p_a + p_b)``.

    Args:
        team_a: Index label of the first team in ``ranks``.
        team_b: Index label of the second team in ``ranks``.
        ranks: Frame indexed by team with an ``"OrdinalRank"`` column.

    Returns:
        The win probability of ``team_a``. When the teams cannot be told
        apart (all ranks are equal, or both teams normalise to 0) the result
        is 0.5 instead of the NaN that a 0/0 division would produce.

    Raises:
        KeyError: If either team is missing from ``ranks``.
    """
    ordinal = ranks["OrdinalRank"]
    # Look both teams up first so unknown teams still raise KeyError
    ordinal.loc[team_a]
    ordinal.loc[team_b]

    # Normalized by inverting the ranks (because smaller is stronger)
    reversed_ranks = ordinal.max() - ordinal
    spread = reversed_ranks.max() - reversed_ranks.min()
    if spread == 0:
        # Every team has the same rank: no information, call it a coin flip
        return np.float64(0.5)
    normalized = (reversed_ranks - reversed_ranks.min()) / spread

    # Calculate win rate from ratio of normalized values
    p_a = normalized.loc[team_a]
    p_b = normalized.loc[team_b]
    total = p_a + p_b
    if total == 0:
        # Both teams sit at the bottom of the scale (0 / 0): coin flip
        return np.float64(0.5)
    return p_a / total

In [None]:
# Fill "Pred" for every 2025 matchup whose two teams both have an average rank.
# Selecting those rows once with a mask replaces the former loop over every
# ordered team pair, which rescanned the whole submission frame per pair; the
# rows touched and the values written are the same.
ranked_teams = m_mo_ave_df.index
has_ranks = (
    submission_df["Season"].eq(2025)
    & submission_df["TeamID1"].isin(ranked_teams)
    & submission_df["TeamID2"].isin(ranked_teams)
)

# Rows without ranks for both teams keep the value already present in "Pred"
if has_ranks.any():
    # Swap in elo_win_probability or normalized_rank_probability below to try
    # the other methods; all three take the same (team_a, team_b, ranks) arguments
    submission_df.loc[has_ranks, "Pred"] = [
        bradley_terry_probability(target_team, opposing_team, m_mo_ave_df)
        for target_team, opposing_team in zip(
            submission_df.loc[has_ranks, "TeamID1"],
            submission_df.loc[has_ranks, "TeamID2"],
        )
    ]

In [None]:
# Display the submission frame with the updated predictions
submission_df

In [None]:
# Summary statistics of the second column of the submission frame
# (presumably "Pred" - the column is selected by position, verify the layout)
second_column = submission_df.iloc[:, 1]
stats = second_column.describe()
print(stats)

## Understand the metric
We don't know the outcomes of the 2025 games yet, so instead we use past tournament results as the ground truth: for each game, the "true value" is 1 if the team with the smaller TeamID (the one listed first in the ID) won and 0 otherwise. As a stand-in test, the 2025 predictions are relabeled as 2024 and matched against the 2024 games. Next, we'll calculate the average squared difference between the probabilities in our submission and that ground truth value. We'll call this the "Brier score". https://en.wikipedia.org/wiki/Brier_score

# Create label data and evaluate accuracy

In [None]:
# Past tournament results (M and W files) serve as the evaluation labels
m_tourney, w_tourney = (
    pd.read_csv(f'/kaggle/input/march-machine-learning-mania-2025/{league}NCAATourneyCompactResults.csv')
    for league in ('M', 'W')
)
# Stack both result tables; the original row labels of each file are kept
tourney_results = pd.concat((m_tourney, w_tourney))
print(tourney_results)

In [None]:
def create_all_tourney_combination(tourney_results):
    """Turn game results into submission-style IDs with a 0/1 outcome label.

    Args:
        tourney_results: Frame with ``Season``, ``WTeamID`` and ``LTeamID``
            columns, one row per game. It is not modified.

    Returns:
        A frame with two columns: ``ID`` in the format
        ``year_smallerId_biggerId`` and ``Pred``, which is 1 when the team
        with the smaller ID won the game and 0 otherwise.
    """
    team_pair = tourney_results[["WTeamID", "LTeamID"]]
    # assign() returns a new frame, so the caller's data stays untouched
    labelled = tourney_results.assign(
        SmallID=team_pair.min(axis=1),
        BigID=team_pair.max(axis=1),
    )

    season_text = labelled["Season"].astype(str)
    small_text = labelled["SmallID"].astype(str)
    big_text = labelled["BigID"].astype(str)
    labelled["ID"] = season_text + "_" + small_text + "_" + big_text

    # The label is 1 exactly when the smaller-ID team is the recorded winner
    smaller_id_won = labelled["SmallID"] == labelled["WTeamID"]
    labelled["Pred"] = smaller_id_won.astype(int)

    return labelled[["ID", "Pred"]]

In [None]:
# Build the label table (ID + 0/1 outcome) from the stacked tournament results
tourney_df = create_all_tourney_combination(tourney_results)
print(tourney_df)

In [None]:
from sklearn.metrics import brier_score_loss

def create_evaluation_data(tourney_df, prediction):
    """Pair every labelled game with its prediction.

    Args:
        tourney_df: Label frame with ``ID`` and ``Pred`` columns.
        prediction: Prediction frame with ``ID`` and ``Pred`` columns.

    Returns:
        The inner join of both frames on ``ID``, in which overlapping columns
        get the suffixes ``_true`` (labels) and ``_pred`` (predictions), or
        ``None`` when no ID appears in both frames.
    """
    # The default inner join keeps only the IDs present on both sides
    paired = tourney_df.merge(prediction, on="ID", suffixes=("_true", "_pred"))
    return None if paired.empty else paired

In [None]:
prediction_test_data = submission_df.copy()
# Create example test data by converting year from 2025 to 2024, because we don't have 2025 data yet.
prediction_test_data["ID"] = prediction_test_data["ID"].str.replace("2025_", "2024_", regex=False)
merged_df = create_evaluation_data(tourney_df, prediction_test_data)  # Extract matching values

# Calculate Brier Score
if merged_df is not None:
    y_true = merged_df["Pred_true"]  # Actual results (0 or 1)
    y_pred = merged_df["Pred_pred"]  # Predicted probabilities
    brier_score = brier_score_loss(y_true, y_pred)
else:
    # Nothing could be scored. Report NaN rather than 0, because a Brier score
    # of 0 would read as a perfect prediction.
    brier_score = float("nan")

print("Brier Score:", brier_score)

# Make submission

In [None]:
# Write only the two submission columns; the helper columns added earlier
# (Season, TeamID1, TeamID2, SeedValue1, SeedValue2) are left out of the file
submission_df[['ID', 'Pred']].to_csv('/kaggle/working/submission.csv', index=False)