## Load libraries

In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import brier_score_loss
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from collections import defaultdict

## Prepare data

### Load data

In [2]:
# Compact game results, one frame per (gender, competition) pair. Each frame
# gets an integer `isTourney` column right after loading: 0 for
# regular-season files, 1 for NCAA-tournament files.
wregularseason = pd.read_csv("data/WRegularSeasonCompactResults.csv").assign(
    isTourney=lambda games: np.zeros(len(games), dtype=int)
)
wtourneyseason = pd.read_csv("data/WNCAATourneyCompactResults.csv").assign(
    isTourney=lambda games: np.ones(len(games), dtype=int)
)

mregularseason = pd.read_csv("data/MRegularSeasonCompactResults.csv").assign(
    isTourney=lambda games: np.zeros(len(games), dtype=int)
)
mtourneyseason = pd.read_csv("data/MNCAATourneyCompactResults.csv").assign(
    isTourney=lambda games: np.ones(len(games), dtype=int)
)

### Auxiliary functions

In [51]:
def transform_loc(row):
    """Return the game location flipped to the other team's point of view.

    Reads ``row["WLoc"]`` and swaps 'A' and 'H'; any other value yields 'N'.
    """
    flipped = {'A': 'H', 'H': 'A'}
    return flipped.get(row["WLoc"], 'N')

def transform_data(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a results table into a symmetric, labelled training table.

    The columns ``DayNum``, ``WScore``, ``LScore``, ``NumOT`` and ``WLoc`` are
    dropped. Every remaining row is emitted twice: once unchanged with
    ``Result = 1``, and once with ``WTeamID`` and ``LTeamID`` exchanged and
    ``Result = 0``. The unchanged rows come first, the mirrored rows second,
    and the index is renumbered from 0. The input frame is not modified.
    """
    unused_columns = ["DayNum", "WScore", "LScore", "NumOT", "WLoc"]
    as_played = df.drop(columns=unused_columns).assign(Result=1)

    # The right-hand sides are read from `as_played` before assign() runs, so
    # the two ID columns are exchanged rather than one overwriting the other.
    mirrored = as_played.assign(
        WTeamID=as_played["LTeamID"],
        LTeamID=as_played["WTeamID"],
        Result=0,
    )

    return pd.concat([as_played, mirrored], axis=0, ignore_index=True)

def make_preds_for_submission(clf, filepath_sub, gender):
    """Score the matchups listed in a submission template.

    The template's ``ID`` column is split on ``_`` into the integer columns
    ``Season``, ``WTeamID`` and ``LTeamID`` (first and second listed team),
    and ``isTourney`` is set to 1 for every row.

    Parameters
    ----------
    clf
        Fitted classifier exposing ``predict_proba``. The second column of its
        output is used as the prediction, matching how this notebook evaluates
        its models (``predict_proba(...)[:, 1]``, i.e. P(Result == 1)).
    filepath_sub
        Path to a CSV with an ``ID`` column. Features are taken from the third
        column onward, so the file is assumed to have exactly two leading
        columns before the derived ones -- TODO confirm against the template.
    gender
        ``"W"`` keeps rows whose first team ID is below 3000; any other value
        keeps rows whose first team ID is above 3000.
        NOTE(review): this notebook passes ``"W"`` together with the men's
        model, so the flag name looks inverted relative to its use. Confirm
        the ID ranges before renaming; the behaviour is kept as-is here so
        existing calls continue to work.

    Returns
    -------
    pd.DataFrame
        Columns ``ID`` and ``Pred`` (probability rounded to 4 decimals) for
        the selected rows.
    """
    df = pd.read_csv(filepath_sub)
    df[['Season', 'WTeamID', 'LTeamID']] = df['ID'].str.split('_', expand=True).astype(int)
    df['isTourney'] = 1
    # .copy() makes the filtered frame independent, so adding "Pred" below
    # does not write into a slice of the original (SettingWithCopyWarning).
    if gender == "W":
        df = df.loc[df.WTeamID < 3000].copy()
    else:
        df = df.loc[df.WTeamID > 3000].copy()
    X_features = df.iloc[:, 2:]
    # Probability that the first listed team wins. Taking the row-wise max
    # would instead report the confidence of whichever class is more likely
    # and could never go below 0.5.
    pred_probs = clf.predict_proba(X_features)[:, 1]
    df["Pred"] = np.round(pred_probs, 4)
    return df[["ID", "Pred"]]

In [52]:
# Stack the transformed regular-season and tournament games per gender.
wprep = pd.concat(
    [transform_data(wregularseason), transform_data(wtourneyseason)],
    axis=0,
    ignore_index=True,
)
mprep = pd.concat(
    [transform_data(mregularseason), transform_data(mtourneyseason)],
    axis=0,
    ignore_index=True,
)

# Features are every column except the label.
X_men, y_men = mprep.drop(columns="Result"), mprep["Result"]
X_women, y_women = wprep.drop(columns="Result"), wprep["Result"]

# Hold out 10% of the rows for evaluation, with a fixed seed.
# NOTE(review): transform_data emits every game in both orientations, so this
# row-level split can place one orientation in train and its mirror in test.
X_trainm, X_testm, y_trainm, y_testm = train_test_split(
    X_men, y_men, test_size=0.1, random_state=42
)
X_trainw, X_testw, y_trainw, y_testw = train_test_split(
    X_women, y_women, test_size=0.1, random_state=42
)

### Train simple model

In [53]:
# One default-configured logistic regression per gender, fitted on its own
# training split.
log_regm = LogisticRegression().fit(X_trainm, y_trainm)
log_regw = LogisticRegression().fit(X_trainw, y_trainw)

# Probability of class 1 (Result == 1) on the held-out rows.
pred_probsm = log_regm.predict_proba(X_testm)[:, 1]
pred_probsw = log_regw.predict_proba(X_testw)[:, 1]

# Brier score is the mean squared error of the predicted probability; lower
# is better, and always predicting 0.5 scores exactly 0.25.
brier_m = np.round(brier_score_loss(y_testm, pred_probsm), 3).item()
brier_w = np.round(brier_score_loss(y_testw, pred_probsw), 3).item()

print(f"Brier score for mens data: {brier_m}")
print(f"Brier score for womens data: {brier_w}")

Brier score for mens data: 0.25
Brier score for womens data: 0.25


In [54]:
# Sorted unique team IDs seen in the first-team column of each prepared table.
teamlistm = sorted(mprep["WTeamID"].unique())
teamlistw = sorted(wprep["WTeamID"].unique())

# Both calls read the same template; the last argument picks which rows are
# scored. As make_preds_for_submission is written, "W" keeps first-team IDs
# below 3000 and any other value keeps IDs above 3000, which is why the men's
# model is paired with "W" here.
mens_results = make_preds_for_submission(
    log_regm, "data/SampleSubmissionStage2.csv", "W"
)
womens_results = make_preds_for_submission(
    log_regw, "data/SampleSubmissionStage2.csv", "M"
)

# Men's rows first, then women's, renumbered and written without the index.
final_result = pd.concat(
    [mens_results, womens_results], axis=0, ignore_index=True
)
final_result.to_csv("submission_result.csv", index=False)