## Load libraries

In [157]:
import pandas as pd
import numpy as np
from sklearn.metrics import brier_score_loss
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from itertools import combinations
from collections import defaultdict

## Prepare data

### Load data

In [43]:
# Men's (M*) and women's (W*) result files are read into separate frames.
mregularseason = pd.read_csv("data/MRegularSeasonCompactResults.csv")
mtourneyseason = pd.read_csv("data/MNCAATourneyCompactResults.csv")

wregularseason = pd.read_csv("data/WRegularSeasonCompactResults.csv")
wtourneyseason = pd.read_csv("data/WNCAATourneyCompactResults.csv")

# Tag every row with its origin: 0 for regular-season files, 1 for tournament files.
mregularseason["isTourney"] = np.zeros(len(mregularseason), dtype=int)
wregularseason["isTourney"] = np.zeros(len(wregularseason), dtype=int)

mtourneyseason["isTourney"] = np.ones(len(mtourneyseason), dtype=int)
wtourneyseason["isTourney"] = np.ones(len(wtourneyseason), dtype=int)

### Auxiliary functions

In [198]:
def transform_loc(row):
    """Return the ``WLoc`` code of ``row`` with home and away swapped.

    ``'A'`` becomes ``'H'``, ``'H'`` becomes ``'A'``, and every other value
    is reported as ``'N'``.
    """
    location = row["WLoc"]
    if location == 'H':
        return 'A'
    if location == 'A':
        return 'H'
    return 'N'

def transform_data(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a results frame into a two-rows-per-game labelled frame.

    The columns ``DayNum``, ``WScore``, ``LScore``, ``NumOT`` and ``WLoc`` are
    dropped. Every remaining row is emitted twice: once unchanged with
    ``Result = 1`` and once with ``WTeamID``/``LTeamID`` exchanged and
    ``Result = 0``. The unchanged rows come first, followed by the swapped
    ones, under a fresh 0..n-1 index. The input frame is not modified.
    """
    dropped_columns = ["DayNum", "WScore", "LScore", "NumOT", "WLoc"]

    as_played = df.drop(columns=dropped_columns)
    as_played["Result"] = np.ones(len(as_played), dtype=int)

    # Same games with the two team columns exchanged; raw values are used so
    # the assignment is purely positional.
    mirrored = as_played.copy()
    mirrored["WTeamID"] = as_played["LTeamID"].to_numpy()
    mirrored["LTeamID"] = as_played["WTeamID"].to_numpy()
    mirrored["Result"] = np.zeros(len(mirrored), dtype=int)

    return pd.concat([as_played, mirrored], ignore_index=True)

def make_preds_for_submission(clf, list_of_teams, season=2025, is_tourney=1):
    """Build submission rows (``ID``, ``Pred``) for every pairing of the given teams.

    Each pairing is emitted once with the lower team ID first. ``Pred`` is the
    classifier's probability of class ``1``, i.e. of the label that
    ``transform_data`` assigns to rows whose ``WTeamID`` column holds the winner,
    so it reads as "probability that the first-listed team wins".

    Parameters
    ----------
    clf : fitted classifier
        Must expose ``predict_proba``; if it also exposes ``classes_`` that
        attribute is used to locate the column for label ``1``.
    list_of_teams : iterable
        Team IDs to pair up. Order and duplicates do not matter.
    season : int, default 2025
        Value of the ``Season`` feature and prefix of every ``ID``.
    is_tourney : int, default 1
        Value of the ``isTourney`` feature for every generated matchup.

    Returns
    -------
    pd.DataFrame
        Columns ``ID`` (``"<season>_<first team>_<second team>"``) and ``Pred``
        (rounded to 4 decimals). Empty, but with both columns present, when
        fewer than two distinct teams are supplied.

    Raises
    ------
    ValueError
        If ``clf.classes_`` exists and does not contain the label ``1``.
    """
    # Sorting (and de-duplicating) guarantees combinations() yields
    # (lower ID, higher ID) regardless of the order the caller passed in.
    team_ids = sorted(set(list_of_teams))
    matchups = list(combinations(team_ids, 2))
    if not matchups:
        return pd.DataFrame({"ID": pd.Series(dtype=str), "Pred": pd.Series(dtype=float)})

    # Same column names and order as the frame produced by transform_data
    # (minus "Result"), which is what the notebook fits the classifiers on.
    X_features = pd.DataFrame({
        "Season": [season] * len(matchups),
        "WTeamID": [first for first, _ in matchups],
        "LTeamID": [second for _, second in matchups],
        "isTourney": [is_tourney] * len(matchups),
    })

    # Take the probability of class 1 ("team in WTeamID wins"). Using the
    # row-wise max instead would report the confidence of whichever class is
    # more likely, which can never drop below 0.5.
    classes = list(getattr(clf, "classes_", (0, 1)))
    win_column = classes.index(1)
    pred_probs = np.asarray(clf.predict_proba(X_features))[:, win_column]

    ids = [f"{season}_{first}_{second}" for first, second in matchups]
    return pd.DataFrame({"ID": ids, "Pred": np.round(pred_probs, 4)})

In [201]:
# One frame per competition: regular-season and tournament games, each passed
# through transform_data and stacked under a fresh 0..n-1 index.
wprep = pd.concat(
    [transform_data(wregularseason), transform_data(wtourneyseason)],
    ignore_index=True,
)
mprep = pd.concat(
    [transform_data(mregularseason), transform_data(mtourneyseason)],
    ignore_index=True,
)

# Features are every column except the label.
X_men, y_men = mprep.drop(columns="Result"), mprep["Result"]
X_women, y_women = wprep.drop(columns="Result"), wprep["Result"]

# NOTE(review): transform_data emits each game twice (once per team order), so
# this row-level random split can put the two copies of one game on opposite
# sides of the train/test boundary.
X_trainm, X_testm, y_trainm, y_testm = train_test_split(
    X_men, y_men, test_size=0.1, random_state=42
)
X_trainw, X_testw, y_trainw, y_testw = train_test_split(
    X_women, y_women, test_size=0.1, random_state=42
)

### Train simple model

In [202]:
# Men's model: fit, then keep the class-1 probability for the held-out rows.
log_regm = LogisticRegression().fit(X_trainm, y_trainm)
pred_probsm = log_regm.predict_proba(X_testm)[:, 1]

# Women's model: same recipe on the women's split.
log_regw = LogisticRegression().fit(X_trainw, y_trainw)
pred_probsw = log_regw.predict_proba(X_testw)[:, 1]

# Brier score on the held-out 10%, shown to three decimals.
print(f"Brier score for mens data: {float(np.round(brier_score_loss(y_testm, pred_probsm), 3))}")
print(f"Brier score for womens data: {float(np.round(brier_score_loss(y_testw, pred_probsw), 3))}")

Brier score for mens data: 0.25
Brier score for womens data: 0.25


In [None]:
# Sort both team lists so each generated pairing lists the lower team ID first.
# WTeamID alone covers every team: transform_data mirrors each game, so the
# losing side also appears in that column.
teamlistm = sorted(mprep.WTeamID.unique())
teamlistw = sorted(wprep.WTeamID.unique())

mens_results = make_preds_for_submission(log_regm, teamlistm)
womens_results = make_preds_for_submission(log_regw, teamlistw)

final_result = pd.concat([mens_results, womens_results], axis=0).reset_index(drop=True)
# index=False keeps the file to exactly the ID and Pred columns; without it
# pandas writes the row index as an extra unnamed first column.
final_result.to_csv("submission_result.csv", index=False)

In [204]:
# ndarray.sort() sorts in place and returns None, so assigning its result would
# leave teamlistm as None; sorted() returns the ordered list instead.
teamlistm = sorted(mprep.WTeamID.unique())

In [205]:
# Show the value currently bound to teamlistm as this cell's output.
teamlistm

[np.int64(1101),
 np.int64(1102),
 np.int64(1103),
 np.int64(1104),
 np.int64(1105),
 np.int64(1106),
 np.int64(1107),
 np.int64(1108),
 np.int64(1109),
 np.int64(1110),
 np.int64(1111),
 np.int64(1112),
 np.int64(1113),
 np.int64(1114),
 np.int64(1115),
 np.int64(1116),
 np.int64(1117),
 np.int64(1118),
 np.int64(1119),
 np.int64(1120),
 np.int64(1121),
 np.int64(1122),
 np.int64(1123),
 np.int64(1124),
 np.int64(1125),
 np.int64(1126),
 np.int64(1127),
 np.int64(1128),
 np.int64(1129),
 np.int64(1130),
 np.int64(1131),
 np.int64(1132),
 np.int64(1133),
 np.int64(1134),
 np.int64(1135),
 np.int64(1136),
 np.int64(1137),
 np.int64(1138),
 np.int64(1139),
 np.int64(1140),
 np.int64(1141),
 np.int64(1142),
 np.int64(1143),
 np.int64(1144),
 np.int64(1145),
 np.int64(1146),
 np.int64(1147),
 np.int64(1148),
 np.int64(1149),
 np.int64(1150),
 np.int64(1151),
 np.int64(1152),
 np.int64(1153),
 np.int64(1154),
 np.int64(1155),
 np.int64(1156),
 np.int64(1157),
 np.int64(1158),
 np.int64(1159