In [49]:
#Step 1: Load and Preprocess the Data

In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import log_loss
import statsmodels.formula.api as smf
# Load the dataset
data = pd.read_csv('NCAA_Tourney_2002_2024.csv')

# Exclude 2019 data
data = data[data['season'] != 2019]

data['team1_win'] = (data['team1_score'] > data['team2_score']).astype(int)
data['seed_diff'] = data['team1_seed'] - data['team2_seed']
data['exp_win1'] = (data['team1_adjoe']**11.5)/ ((data['team1_adjde']**11.5)+(data['team1_adjoe']**11.5))
data['exp_win2'] = (data['team2_adjoe']**11.5)/ ((data['team2_adjde']**11.5)+(data['team2_adjoe']**11.5))


In [51]:
# Converts expected win percentage into log odds

data['team1_log5'] = (data['exp_win1'] - (data['exp_win1']*data['exp_win2']))/ (data['exp_win1']+data['exp_win2']-(2*data['exp_win1']*data['exp_win2']))
data.dropna(subset=['team1_log5'], inplace=True)

In [52]:

# Define features and target
features = ['game_id','season','team1_win','exp_win1','exp_win2','team2_score', 'team1_score','team1_log5','team1_id','team2_id','seed_diff']

data_1 = data.sample(frac=.5, random_state=15)[features]
data_2 = data[~data.index.isin(data_1.index)][features].reset_index(drop=True)
data_1 = data_1.reset_index(drop=True)

In [53]:
data_2['team1_win'] = 0
# data_2['diff_dist'] = data_2['diff_dist']*-1
data_2['team1_log5'] = 1-data_2['team1_log5']
data_2['game_id'] = (data_2['game_id'].str.split('-', expand=True)[0] 
                           + '-' + data_2['game_id'].str.split('-', expand=True)[2] 
                           + '-' + data_2['game_id'].str.split('-', expand=True)[1]
                           ) 
data_2['seed_diff'] = data_2['seed_diff']*-1
data_2.columns = features

In [54]:
data_train = pd.concat([data_2, data_1]).reset_index(drop=True)

In [55]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_train, data_train['team1_win'], test_size=0.2, random_state=42)

In [56]:
#Step 2: Train the Logistic Regression Model

In [57]:
# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train[['seed_diff']], y=y_train)
model2 = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True)
model2.fit(X=X_train[['seed_diff']], y=y_train)

train_df = pd.DataFrame({'team1_win': y_train, 'seed_diff': X_train['seed_diff']})

modelTrain = smf.logit("team1_win ~ seed_diff", data=train_df).fit()

# Predict probabilities on the test set
y_pred_proba = model.predict_proba(X_test[['seed_diff']])[:, 1]
y_pred_proba2 = model2.predict_proba(X_test[['seed_diff']])[:, 1]
# Calculate log loss
log_loss_value = log_loss(y_test, y_pred_proba, labels=[0,1])
log_loss_value2 = log_loss(y_test, y_pred_proba2, labels=[0,1])
print(f'Log Loss: {log_loss_value}')
print(f'Log Loss 2: {log_loss_value2}')

Optimization terminated successfully.
         Current function value: 0.569838
         Iterations 5
Log Loss: 0.5699102713091542
Log Loss 2: 0.5893439398813555


In [58]:
all_data = pd.read_csv('NCAA_Tourney_2025.csv').reset_index(drop=True)

In [59]:
# all_data = pd.read_csv('NCAA_Tourney_2002_2024.csv').query("season==2019").reset_index(drop=True)

In [60]:
all_data['seed_diff'] = all_data['team1_seed'] - all_data['team2_seed']

In [61]:
all_data['prob'] = modelTrain.predict(all_data[['seed_diff']])
all_data['id'] = all_data['game_id'].str.replace('-','',regex=True)
all_data = all_data.sort_values(by='id')

In [62]:
all_data[['game_id','prob']]

Unnamed: 0,game_id,prob
775,2025-1103-1106,0.623507
740,2025-1103-1110,0.623507
786,2025-1103-1116,0.396829
759,2025-1103-1120,0.141434
774,2025-1103-1136,0.586763
...,...,...
2051,2025-1471-1384,0.658882
2053,2025-1471-1385,0.183066
2054,2025-1471-1403,0.207206
2065,2025-1471-1423,0.586763


In [63]:
all_data[['game_id','prob']].to_csv("Data Titans.csv", index=False)