In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/march-machine-learning-mania-2025/Conferences.csv
/kaggle/input/march-machine-learning-mania-2025/SeedBenchmarkStage1.csv
/kaggle/input/march-machine-learning-mania-2025/WNCAATourneyDetailedResults.csv
/kaggle/input/march-machine-learning-mania-2025/WRegularSeasonCompactResults.csv
/kaggle/input/march-machine-learning-mania-2025/MNCAATourneySeedRoundSlots.csv
/kaggle/input/march-machine-learning-mania-2025/MRegularSeasonDetailedResults.csv
/kaggle/input/march-machine-learning-mania-2025/MNCAATourneyCompactResults.csv
/kaggle/input/march-machine-learning-mania-2025/MGameCities.csv
/kaggle/input/march-machine-learning-mania-2025/WSecondaryTourneyCompactResults.csv
/kaggle/input/march-machine-learning-mania-2025/WGameCities.csv
/kaggle/input/march-machine-learning-mania-2025/MSeasons.csv
/kaggle/input/march-machine-learning-mania-2025/WNCAATourneySlots.csv
/kaggle/input/march-machine-learning-mania-2025/MSecondaryTourneyTeams.csv
/kaggle/input/march-machine-learning-mania-20

# EDA

# Simple starter code

In [2]:
# All competition files live in one read-only directory; name it once.
DATA_DIR = '/kaggle/input/march-machine-learning-mania-2025'

w_seed = pd.read_csv(f'{DATA_DIR}/WNCAATourneySeeds.csv')
m_seed = pd.read_csv(f'{DATA_DIR}/MNCAATourneySeeds.csv')

# Stack the men's and women's seeds into one table. `ignore_index=True` gives
# every row a unique label instead of repeating 0..N for each source file.
# No `.fillna(0.05)` here: 0.05 is a probability-style placeholder, not a valid
# Seed or TeamID, so any missing value is left as NaN where it stays visible.
seed_df = pd.concat([m_seed, w_seed], axis=0, ignore_index=True)

submission_df = pd.read_csv(f'{DATA_DIR}/SampleSubmissionStage2.csv')

Tournament seeds are stored in the files WNCAATourneySeeds.csv (women) and MNCAATourneySeeds.csv (men).

- The "Season" column indicates the year
- The "Seed" column gives the team's seed within its tournament region (W01 = seed 1 in region W); play-in teams carry a trailing letter (e.g. W16a)
- The "TeamID" column contains a unique identifier for every team

In [3]:
# Preview the first rows of the combined men's + women's seed table.
seed_df.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


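The sample submission file (SampleSubmissionStage2.csv) contains an "ID" column with the format year_teamID1_teamID2 and a "Pred" column holding the predicted probability that teamID1 (the team listed first) wins.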

In [4]:
# Preview the first rows of the sample submission (ID and placeholder Pred).
submission_df.head()

Unnamed: 0,ID,Pred
0,2025_1101_1102,0.5
1,2025_1101_1103,0.5
2,2025_1101_1104,0.5
3,2025_1101_1105,0.5
4,2025_1101_1106,0.5


## Extract game info and team rankings

In [5]:
def extract_game_info(id_str):
    """Split a submission ID into its integer components.

    Parameters
    ----------
    id_str : str
        Matchup identifier of the form ``"<year>_<teamID1>_<teamID2>"``.

    Returns
    -------
    tuple of int
        ``(year, teamID1, teamID2)``.

    Raises
    ------
    IndexError
        If the ID has fewer than three underscore-separated fields.
    ValueError
        If one of the first three fields is not an integer.
    """
    fields = id_str.split('_')
    return int(fields[0]), int(fields[1]), int(fields[2])

def extract_seed_value(seed_str):
    """Return the numeric seed encoded in a tournament seed string.

    Seed strings consist of a region letter followed by a two-digit seed
    (``'W01'``); play-in teams carry an extra trailing letter (``'X11b'``,
    ``'W16a'``).  Only the two characters after the region letter are parsed,
    so a play-in team keeps its real seed instead of failing the integer
    conversion and being reported as 16.

    Parameters
    ----------
    seed_str : str
        Seed identifier taken from the ``Seed`` column.

    Returns
    -------
    int
        The seed number, or 16 when the value is missing or malformed.
    """
    try:
        return int(seed_str[1:3])
    # Set seed to 16 for unselected teams and errors: malformed strings raise
    # ValueError, non-string inputs such as NaN raise TypeError.
    except (ValueError, TypeError):
        return 16

# Break each matchup ID into Season / TeamID1 / TeamID2 columns and turn the
# raw seed strings into numeric seed values.
game_keys = submission_df['ID'].apply(extract_game_info).tolist()
submission_df[['Season', 'TeamID1', 'TeamID2']] = game_keys
seed_df['SeedValue'] = seed_df['Seed'].apply(extract_seed_value)

# Left-join the seed value once per side of the matchup. Rows whose team has
# no seed for that season end up with NaN in the corresponding column.
seed_lookup = seed_df[['Season', 'TeamID', 'SeedValue']]
for side in ('1', '2'):
    submission_df = (
        submission_df
        .merge(seed_lookup,
               left_on=['Season', f'TeamID{side}'], right_on=['Season', 'TeamID'],
               how='left')
        .rename(columns={'SeedValue': f'SeedValue{side}'})
        .drop(columns=['TeamID'])
    )
print(submission_df)

                    ID  Pred  Season  TeamID1  TeamID2  SeedValue1  SeedValue2
0       2025_1101_1102   0.5    2025     1101     1102         NaN         NaN
1       2025_1101_1103   0.5    2025     1101     1103         NaN         NaN
2       2025_1101_1104   0.5    2025     1101     1104         NaN         NaN
3       2025_1101_1105   0.5    2025     1101     1105         NaN         NaN
4       2025_1101_1106   0.5    2025     1101     1106         NaN         NaN
...                ...   ...     ...      ...      ...         ...         ...
131402  2025_3477_3479   0.5    2025     3477     3479         NaN         NaN
131403  2025_3477_3480   0.5    2025     3477     3480         NaN         NaN
131404  2025_3478_3479   0.5    2025     3478     3479         NaN         NaN
131405  2025_3478_3480   0.5    2025     3478     3480         NaN         NaN
131406  2025_3479_3480   0.5    2025     3479     3480         NaN         NaN

[131407 rows x 7 columns]


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


## Make your predictions

In [6]:
# Load the men's Massey Ordinals table. Each row holds one ranking system's
# OrdinalRank (overall ranking of the team in that system) for a given
# Season, RankingDayNum and TeamID.
m_massey_ordinals_df = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/MMasseyOrdinals.csv')

In [7]:
# Preview the first rows of the full Massey Ordinals table.
m_massey_ordinals_df.head()

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
0,2003,35,SEL,1102,159
1,2003,35,SEL,1103,229
2,2003,35,SEL,1104,12
3,2003,35,SEL,1105,314
4,2003,35,SEL,1106,260


In [8]:
# Keep only the ranking rows that belong to the 2025 season.
is_2025 = m_massey_ordinals_df['Season'].eq(2025)
m_massey_ordinals_2025_df = m_massey_ordinals_df.loc[is_2025]

In [9]:
# Preview the first rows of the 2025-only subset.
m_massey_ordinals_2025_df.head()

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
5264557,2025,9,AP,1104,2
5264558,2025,9,AP,1112,9
5264559,2025,9,AP,1116,18
5264560,2025,9,AP,1120,5
5264561,2025,9,AP,1124,12


In [10]:
# Collapse every 2025 OrdinalRank recorded for a team (all systems, all
# ranking days) into a single mean value, indexed by TeamID.
m_mo_ave_df = (
    m_massey_ordinals_2025_df
    .groupby("TeamID")["OrdinalRank"]
    .mean()
    .to_frame()
)

In [11]:
# Preview the per-team mean OrdinalRank table (index: TeamID).
m_mo_ave_df.head()

Unnamed: 0_level_0,OrdinalRank
TeamID,Unnamed: 1_level_1
1101,231.161238
1102,300.382736
1103,125.822476
1104,6.832827
1105,348.554649


Method 1: Bradley Terry Model

Predicts the win probability by passing the difference between the two teams' average ordinal ranks through a logistic function
* Statistically well-founded model
* Need to adjust parameter c (to fit the data set)
* Often used in sports forecasting

In [12]:
from scipy.special import expit

def bradley_terry_probability(team_a: int, team_b: int, ranks: pd.DataFrame, c=0.01) -> np.float64:
    """Logistic win probability for ``team_a`` against ``team_b``.

    Parameters
    ----------
    team_a, team_b : int
        Index labels of the two teams in ``ranks``.
    ranks : pd.DataFrame
        Frame indexed by team with an ``"OrdinalRank"`` column
        (smaller rank = stronger team).
    c : float, default 0.01
        Slope of the logistic curve applied to the rank difference.

    Returns
    -------
    np.float64
        ``expit(c * (rank_b - rank_a))``; 0.5 when both ranks are equal.

    Raises
    ------
    KeyError
        If either team is not present in ``ranks``.
    """
    ordinal = ranks["OrdinalRank"]
    # A positive gap means team_b is ranked worse, i.e. team_a is favoured.
    gap = ordinal.loc[team_b] - ordinal.loc[team_a]
    return expit(c * gap)

Method 2: Elo Rating

Applying the Elo rating formula, we predict the win probability based on the difference in rankings
* Proven in many competitions, including chess
* Scale parameters need to be adjusted
* Can be adjusted more accurately with past competition results

In [13]:
def elo_win_probability(team_a: int, team_b: int, ranks: pd.DataFrame, scale=400):
    """Elo-style win probability for ``team_a`` against ``team_b``.

    OrdinalRank is treated like an inverted Elo rating (smaller is stronger):
    each team's rating is ``max_rank - rank``.  The ``max_rank`` term cancels
    in the rating difference, so only the two teams' ranks are looked up and
    the whole frame is no longer rebuilt on every call.

    Parameters
    ----------
    team_a, team_b : int
        Index labels of the two teams in ``ranks``.
    ranks : pd.DataFrame
        Frame indexed by team with an ``"OrdinalRank"`` column.
    scale : float, default 400
        Rating difference that corresponds to 10:1 odds.

    Returns
    -------
    float
        ``1 / (1 + 10 ** (-(rank_b - rank_a) / scale))``.

    Raises
    ------
    KeyError
        If either team is not present in ``ranks``.
    """
    ordinal = ranks["OrdinalRank"]
    # (max_rank - rank_a) - (max_rank - rank_b) == rank_b - rank_a
    rating_diff = ordinal.loc[team_b] - ordinal.loc[team_a]
    return 1.0 / (1.0 + 10 ** (-rating_diff / scale))

Method 3: Normalized Rank

A method that normalizes each team's ranking to the range [0,1] (best team = 1, worst team = 0) and uses the ratio of the two normalized values as the win probability
* Simple and easy to implement
* Intuitive reflection of relative strength among teams

In [14]:
def normalized_rank_probability(team_a, team_b, ranks):
    """Win probability for ``team_a`` from min-max normalised, inverted ranks.

    Each team's strength is ``(worst_rank - rank) / (worst_rank - best_rank)``,
    i.e. 1 for the best-ranked team and 0 for the worst-ranked one (smaller
    rank is stronger).  The probability is ``strength_a / (strength_a +
    strength_b)``.

    Parameters
    ----------
    team_a, team_b : int
        Index labels of the two teams in ``ranks``.
    ranks : pd.DataFrame
        Frame indexed by team with an ``"OrdinalRank"`` column.

    Returns
    -------
    float
        Probability in ``[0, 1]``.  Returns 0.5 when the two strengths cannot
        be told apart by the ratio: all ranks are identical, or both teams
        have the worst rank.  The unguarded formula yields NaN (0/0) there.

    Raises
    ------
    KeyError
        If either team is not present in ``ranks``.
    """
    ordinal = ranks["OrdinalRank"]
    # Look the teams up first so unknown teams always raise KeyError.
    rank_a = ordinal.loc[team_a]
    rank_b = ordinal.loc[team_b]

    worst_rank = ordinal.max()
    spread = worst_rank - ordinal.min()
    if spread == 0:
        # Every team has the same rank: no information, call it a coin flip.
        return 0.5

    strength_a = (worst_rank - rank_a) / spread
    strength_b = (worst_rank - rank_b) / spread
    total = strength_a + strength_b
    if total == 0:
        # Both teams sit at the worst rank (strength 0 each).
        return 0.5

    return strength_a / total

In [15]:
# Pick the matchup model here. elo_win_probability and
# normalized_rank_probability accept the same (team_a, team_b, ranks) arguments.
predict_fn = bradley_terry_probability

# Only 2025 matchups whose two teams both have an average rank can be scored;
# every other row keeps its existing Pred value.
has_ranks = (
    (submission_df["Season"] == 2025)
    & submission_df["TeamID1"].isin(m_mo_ave_df.index)
    & submission_df["TeamID2"].isin(m_mo_ave_df.index)
)

# One model call per real matchup row. This replaces a nested loop over every
# ordered team pair that rescanned the whole ID column on each iteration.
if has_ranks.any():
    submission_df.loc[has_ranks, "Pred"] = [
        predict_fn(team_1, team_2, m_mo_ave_df)
        for team_1, team_2 in zip(
            submission_df.loc[has_ranks, "TeamID1"],
            submission_df.loc[has_ranks, "TeamID2"],
        )
    ]

In [16]:
# Display the submission frame after Pred has been overwritten for the
# matchups that have rank data.
submission_df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,ID,Pred,Season,TeamID1,TeamID2,SeedValue1,SeedValue2
0,2025_1101_1102,0.666459,2025,1101,1102,,
1,2025_1101_1103,0.258575,2025,1101,1103,,
2,2025_1101_1104,0.095930,2025,1101,1104,,
3,2025_1101_1105,0.763855,2025,1101,1105,,
4,2025_1101_1106,0.627018,2025,1101,1106,,
...,...,...,...,...,...,...,...
131402,2025_3477_3479,0.500000,2025,3477,3479,,
131403,2025_3477_3480,0.500000,2025,3477,3480,,
131404,2025_3478_3479,0.500000,2025,3478,3479,,
131405,2025_3478_3480,0.500000,2025,3478,3480,,


In [17]:
# Summarise the distribution of the predictions. The column is selected by
# name rather than by position so the summary still targets "Pred" if the
# column order of submission_df ever changes.
stats = submission_df['Pred'].describe()
print(stats)

count    131407.000000
mean          0.496818
std           0.190784
min           0.027493
25%           0.489733
50%           0.500000
75%           0.500000
max           0.972532
Name: Pred, dtype: float64


## Understand the metric
We don't know the outcomes of the games, so instead let's assume that the team that was listed first won every single matchup. This is what we'll call our "true value". Next, we'll calculate the average squared difference between the probabilities in our submission and that ground truth value. We'll call this the "Brier score". https://en.wikipedia.org/wiki/Brier_score

In [18]:
from sklearn.metrics import brier_score_loss, mean_squared_error

# Build a stand-in "ground truth" frame: same rows as the submission, but with
# Pred set to 1 everywhere (i.e. the first-listed team wins every matchup).
solution_df = submission_df.assign(Pred=1)

# Brier score = mean squared difference between predictions and that truth.
y_pred = submission_df['Pred']
y_true = solution_df['Pred']
brier_score = brier_score_loss(y_true, y_pred)
print(f"Brier Score: {brier_score}")

Brier Score: 0.2895905670337449


# Make submission

In [19]:
# Write only the two columns of the sample-submission format. The helper
# columns added during feature extraction (Season, TeamID1, TeamID2,
# SeedValue1, SeedValue2) are not part of the submission file.
submission_df[['ID', 'Pred']].to_csv('/kaggle/working/submission.csv', index=False)