In [118]:
#Step 1: Load and Preprocess the Data

In [119]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import statsmodels.formula.api as smf
# Load the dataset
data = pd.read_csv('NCAA_Tourney_2002_2024.csv')

# Exclude 2019 data
data = data[data['season'] != 2019]

data['team1_win'] = (data['team1_score'] > data['team2_score']).astype(int)
data['seed_diff'] = data['team1_seed'] - data['team2_seed']
data['exp_win1'] = (data['team1_adjoe']**11.5)/ ((data['team1_adjde']**11.5)+(data['team1_adjoe']**11.5))
data['exp_win2'] = (data['team2_adjoe']**11.5)/ ((data['team2_adjde']**11.5)+(data['team2_adjoe']**11.5))


In [120]:
# Converts expected win percentage into log odds

data['team1_log5'] = (data['exp_win1'] - (data['exp_win1']*data['exp_win2']))/ (data['exp_win1']+data['exp_win2']-(2*data['exp_win1']*data['exp_win2']))
data.dropna(subset=['team1_log5'], inplace=True)

In [121]:

# Define features and target
features = ['game_id','season','team1_win','exp_win1','exp_win2','team2_score', 'team1_score','team1_log5','team1_id','team2_id','seed_diff']

data_1 = data.sample(frac=.5, random_state=15)[features]
data_2 = data[~data.index.isin(data_1.index)][features].reset_index(drop=True)
data_1 = data_1.reset_index(drop=True)

In [132]:
data_2['team1_win'] = 0
# data_2['diff_dist'] = data_2['diff_dist']*-1
data_2['team1_log5'] = 1-data_2['team1_log5']
data_2['game_id'] = (data_2['game_id'].str.split('-', expand=True)[0] 
                           + '-' + data_2['game_id'].str.split('-', expand=True)[2] 
                           + '-' + data_2['game_id'].str.split('-', expand=True)[1]
                           ) 
data_2['seed_diff'] = data_2['seed_diff']*-1
data_2.columns = features

In [133]:
data_train = pd.concat([data_2, data_1]).reset_index(drop=True)

In [134]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_train, data_train['team1_win'], test_size=0.2, random_state=42)

In [135]:
#Step 2: Train the Logistic Regression Model

In [136]:
# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train[['seed_diff']], y=y_train)

train_df = pd.DataFrame({'team1_win': y_train, 'seed_diff': X_train['seed_diff']})

modelTrain = smf.logit("team1_win ~ seed_diff", data=train_df).fit()

# Predict probabilities on the test set
y_pred_proba = model.predict_proba(X_test[['seed_diff']])[:, 1]
# Calculate log loss
log_loss_value = log_loss(y_test, y_pred_proba, labels=[0,1])
print(f'Log Loss: {log_loss_value}')

Optimization terminated successfully.
         Current function value: 0.692252
         Iterations 3
Log Loss: 0.6882848047579899


In [137]:
all_data = pd.read_csv('NCAA_Tourney_2002_2024.csv').query("season==2019").reset_index(drop=True)

In [138]:
all_data['seed_diff'] = all_data['team1_seed'] - all_data['team2_seed']

In [149]:
all_data['prob'] = modelTrain.predict(all_data[['seed_diff']])
all_data['id'] = all_data['game_id'].str.replace('-','',regex=True)
all_data = all_data.sort_values(by='id')

In [150]:
all_data[['game_id','prob']]

Unnamed: 0,game_id,prob
3,2019-1113-1385,0.512257
44,2019-1120-1242,0.515437
63,2019-1120-1246,0.521794
6,2019-1120-1308,0.489982
58,2019-1120-1314,0.524970
...,...,...
66,2019-1438-1403,0.505894
42,2019-1439-1251,0.486801
31,2019-1439-1387,0.483621
23,2019-1449-1429,0.515437


In [141]:
all_data[['game_id','prob']].to_csv("Data Titans.csv", index=False)