In [185]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/march-machine-learning-mania-2025/Conferences.csv
/kaggle/input/march-machine-learning-mania-2025/SeedBenchmarkStage1.csv
/kaggle/input/march-machine-learning-mania-2025/WNCAATourneyDetailedResults.csv
/kaggle/input/march-machine-learning-mania-2025/WRegularSeasonCompactResults.csv
/kaggle/input/march-machine-learning-mania-2025/MNCAATourneySeedRoundSlots.csv
/kaggle/input/march-machine-learning-mania-2025/MRegularSeasonDetailedResults.csv
/kaggle/input/march-machine-learning-mania-2025/MNCAATourneyCompactResults.csv
/kaggle/input/march-machine-learning-mania-2025/MGameCities.csv
/kaggle/input/march-machine-learning-mania-2025/WSecondaryTourneyCompactResults.csv
/kaggle/input/march-machine-learning-mania-2025/WGameCities.csv
/kaggle/input/march-machine-learning-mania-2025/MSeasons.csv
/kaggle/input/march-machine-learning-mania-2025/WNCAATourneySlots.csv
/kaggle/input/march-machine-learning-mania-2025/MSecondaryTourneyTeams.csv
/kaggle/input/march-machine-learning-mania-20

# EDA

# Simple starter code

In [186]:
w_seed = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/WNCAATourneySeeds.csv')
m_seed = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/MNCAATourneySeeds.csv')
seed_df = pd.concat([m_seed, w_seed], axis=0).fillna(0.05)
submission_df = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/SampleSubmissionStage2.csv')

Team rankings are present in the files WNCAATourneySeeds.csv and MNCAATourneySeeds.csv.

- The "Season" column indicates the year
- The "Seed" column indicates the ranking for a given conference (W01 = ranking 1 in conference W)
- The "TeamID" column contains a unique identifier for every team

In [187]:
seed_df.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


The sample_submission.csv file contains an "ID" column with the format year_teamID1_teamID2.

In [188]:
submission_df.head()

Unnamed: 0,ID,Pred
0,2025_1101_1102,0.5
1,2025_1101_1103,0.5
2,2025_1101_1104,0.5
3,2025_1101_1105,0.5
4,2025_1101_1106,0.5


## Extract game info and team rankings

In [189]:
def extract_game_info(id_str):
    # Extract year and team_ids
    parts = id_str.split('_')
    year = int(parts[0])
    teamID1 = int(parts[1])
    teamID2 = int(parts[2])
    return year, teamID1, teamID2

def extract_seed_value(seed_str):
    # Extract seed value
    try:
        return int(seed_str[1:])
    # Set seed to 16 for unselected teams and errors
    except ValueError:
        return 16

# Reformat the data
submission_df[['Season', 'TeamID1', 'TeamID2']] = submission_df['ID'].apply(extract_game_info).tolist()
seed_df['SeedValue'] = seed_df['Seed'].apply(extract_seed_value)

# Merge seed information for TeamID1
submission_df = pd.merge(submission_df, seed_df[['Season', 'TeamID', 'SeedValue']],
                         left_on=['Season', 'TeamID1'], right_on=['Season', 'TeamID'],
                         how='left')
submission_df = submission_df.rename(columns={'SeedValue': 'SeedValue1'}).drop(columns=['TeamID'])

# Merge seed information for TeamID2
submission_df = pd.merge(submission_df, seed_df[['Season', 'TeamID', 'SeedValue']],
                         left_on=['Season', 'TeamID2'], right_on=['Season', 'TeamID'],
                         how='left')
submission_df = submission_df.rename(columns={'SeedValue': 'SeedValue2'}).drop(columns=['TeamID'])
print(submission_df)

                    ID  Pred  Season  TeamID1  TeamID2  SeedValue1  SeedValue2
0       2025_1101_1102   0.5    2025     1101     1102         NaN         NaN
1       2025_1101_1103   0.5    2025     1101     1103         NaN         NaN
2       2025_1101_1104   0.5    2025     1101     1104         NaN         NaN
3       2025_1101_1105   0.5    2025     1101     1105         NaN         NaN
4       2025_1101_1106   0.5    2025     1101     1106         NaN         NaN
...                ...   ...     ...      ...      ...         ...         ...
131402  2025_3477_3479   0.5    2025     3477     3479         NaN         NaN
131403  2025_3477_3480   0.5    2025     3477     3480         NaN         NaN
131404  2025_3478_3479   0.5    2025     3478     3479         NaN         NaN
131405  2025_3478_3480   0.5    2025     3478     3480         NaN         NaN
131406  2025_3479_3480   0.5    2025     3479     3480         NaN         NaN

[131407 rows x 7 columns]


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


## Make your predictions

In [190]:
# Calculate seed difference
submission_df['SeedDiff'] = submission_df['SeedValue1'] - submission_df['SeedValue2']

# Update 'Pred' column
submission_df['Pred'] = 0.5 + (0.03 * submission_df['SeedDiff'])

# Drop unnecessary columns
submission_df = submission_df[['ID', 'Pred']].fillna(0.5)

# Preview your submission
submission_df.head()

Unnamed: 0,ID,Pred
0,2025_1101_1102,0.5
1,2025_1101_1103,0.5
2,2025_1101_1104,0.5
3,2025_1101_1105,0.5
4,2025_1101_1106,0.5


In [191]:
stats = submission_df.iloc[:, 1].describe()
print(stats)

count    131407.0
mean          0.5
std           0.0
min           0.5
25%           0.5
50%           0.5
75%           0.5
max           0.5
Name: Pred, dtype: float64


## Understand the metric
We don't know the outcomes of the games, so instead let's assume that the team that was listed first won every single matchup. This is what we'll call our "true value". Next, we'll calculate the average squared difference between the probabilities in our submission and that ground truth value. We'll call this the "Brier score". https://en.wikipedia.org/wiki/Brier_score

# Create label data and evaluate accuracy

In [192]:
# Use tourney results for evaluation
m_tourney = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/MNCAATourneyCompactResults.csv')
w_tourney = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/WNCAATourneyCompactResults.csv')
tourney_results = pd.concat([m_tourney, w_tourney])
print(tourney_results)

      Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT
0       1985     136     1116      63     1234      54    N      0
1       1985     136     1120      59     1345      58    N      0
2       1985     136     1207      68     1250      43    N      0
3       1985     136     1229      58     1425      55    N      0
4       1985     136     1242      49     1325      38    N      0
...      ...     ...      ...     ...      ...     ...  ...    ...
1645    2024     147     3163      80     3425      73    A      0
1646    2024     147     3234      94     3261      87    H      0
1647    2024     151     3234      71     3163      69    N      0
1648    2024     151     3376      78     3301      59    N      0
1649    2024     153     3376      87     3234      75    N      0

[4168 rows x 8 columns]


In [193]:
def create_all_tourney_combination(tourney_results):
    # Create rows where the winner is first
    df1 = tourney_results.copy()
    df1["ID"] = df1["Season"].astype(str) + "_" + df1["WTeamID"].astype(str) + "_" + df1["LTeamID"].astype(str)
    df1["Pred"] = 1  # WTeamID wins

    # Create rows where the loser is first
    df2 = tourney_results.copy()
    df2["ID"] = df2["Season"].astype(str) + "_" + df2["LTeamID"].astype(str) + "_" + df2["WTeamID"].astype(str)
    df2["Pred"] = 0  # LTeamID loses

    # Combine both versions
    tourney_df = pd.concat([df1, df2], ignore_index=True)[["ID", "Pred"]]

    return tourney_df

In [194]:
tourney_df = create_all_tourney_combination(tourney_results)
print(tourney_df)

                  ID  Pred
0     1985_1116_1234     1
1     1985_1120_1345     1
2     1985_1207_1250     1
3     1985_1229_1425     1
4     1985_1242_1325     1
...              ...   ...
8331  2024_3425_3163     0
8332  2024_3261_3234     0
8333  2024_3163_3234     0
8334  2024_3301_3376     0
8335  2024_3234_3376     0

[8336 rows x 2 columns]


In [195]:
from sklearn.metrics import brier_score_loss

def create_evaluation_data(tourney_df, prediction):
    # Merge to keep only matching IDs
    merged_df = tourney_df.merge(prediction, on="ID", suffixes=("_true", "_pred"))
    if merged_df.empty:
        # If there are no matchings between label data and prediction, return 0
        return None

    return merged_df

In [196]:
prediction_test_data = submission_df.copy()
# Create example test data by converting year from 2025 to 2024, because we don't have 2025 data yet.
prediction_test_data["ID"] = prediction_test_data["ID"].str.replace("2025_", "2024_", regex=False)
merged_df = create_evaluation_data(tourney_df, prediction_test_data)  # Extract matching values

# Calculate Brier Score
if not merged_df is None:
    y_true = merged_df["Pred_true"]  # Actual results (0 or 1)
    y_pred = merged_df["Pred_pred"]  # Predicted probabilities
    brier_score = brier_score_loss(y_true, y_pred)
else:
    brier_score = 0

print("Brier Score:", brier_score)

Brier Score: 0.25


# Make submission

In [197]:
submission_df.to_csv('/kaggle/working/submission.csv', index=False)