In [1]:
# %pip install pandas
# %pip install numpy
# %pip install seaborn
# %pip install scikit-learn
# %pip install tensorflow

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, make_scorer, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
from sklearn.linear_model import LassoCV, LogisticRegressionCV

import warnings
warnings.filterwarnings('ignore')

# -------------------------------
# 0. Load Data and Preprocessing (Existing Code)
# -------------------------------

path = os.getcwd()

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100)

bracket_training = pd.read_csv(f"{path}/bracket_training.csv", sep=",")
bracket_test = pd.read_csv(f'{path}/bracket_test.csv')
college_info = pd.read_csv(f'{path}/institutions.csv', sep=',', encoding='utf-8')
df_kenpom = pd.read_csv('Kenpom Data.csv')
distances_ew_df = pd.read_csv(f'{path}/SemifinalWinner_East_West.csv', sep='|')



In [2]:
# --- Step 1: Process KenPom Data
df_kenpom['Team_Name'] = df_kenpom['Team'].apply(lambda x: ' '.join(x.split()[:-1]))
mapping = {
    "Connecticut": "UConn", "Houston": "Houston", "Purdue": "Purdue", "Auburn": "Auburn",
    "Tennessee": "Tennessee", "Arizona": "Arizona", "Duke": "Duke", "Iowa St.": "Iowa St.",
    "North Carolina": "North Carolina", "Illinois": "Illinois", "Creighton": "Creighton",
    "Gonzaga": "Gonzaga", "Marquette": "Marquette", "Alabama": "Alabama", "Baylor": "Baylor",
    "Michigan St.": "Michigan St.", "Wisconsin": "Wisconsin", "BYU": "BYU", "Clemson": "Clemson",
    "Saint Mary's": "Saint Mary's", "San Diego St.": "San Diego St.", "Kentucky": "Kentucky",
    "Colorado": "Colorado", "Texas": "Texas", "Florida": "Florida", "Kansas": "Kansas",
    "New Mexico": "New Mexico", "Nebraska": "Nebraska", "Texas Tech": "Texas Tech",
    "Dayton": "Dayton", "Mississippi St.": "Mississippi St.", "Texas A&M": "Texas A&M",
    "Colorado St.": "Colorado St.", "Nevada": "Nevada", "Northwestern": "Northwestern",
    "Washington St.": "Washington St.", "TCU": "TCU", "Boise St.": "Boise St.",
    "N.C. State": "NC State", "Florida Atlantic": "FAU", "Utah St.": "Utah St.",
    "Grand Canyon": "Grand Canyon", "Drake": "Drake", "South Carolina": "South Carolina",
    "Oregon": "Oregon", "James Madison": "James Madison", "McNeese St.": "McNeese",
    "Virginia": "Virginia", "Samford": "Samford", "Duquesne": "Duquesne", "Yale": "Yale",
    "Charleston": "Charleston", "Vermont": "Vermont", "UAB": "UAB", "Morehead St.": "Morehead St.",
    "Akron": "Akron", "Oakland": "Oakland", "Western Kentucky": "Western Ky.",
    "South Dakota St.": "South Dakota St.", "Colgate": "Colgate", "Longwood": "Longwood",
    "Long Beach St.": "Long Beach St.", "Saint Peter's": "Saint Peter's", "Stetson": "Stetson",
    "Montana St.": "Montana St.", "Grambling St.": "Grambling St.", "Howard": "Howard", "Wagner": "Wagner"
}
df_kenpom['Team_Name'] = df_kenpom['Team_Name'].map(mapping)
df_kenpom['Seed_Rank'] = df_kenpom['Team'].str.extract(r'(\d+)$')
df_kenpom = df_kenpom.dropna(subset=['Seed_Rank'])
df_kenpom['Seed_Rank'] = df_kenpom['Seed_Rank'].astype(int)
df_ken_clean = df_kenpom.loc[:, ['Rk', 'Team_Name', 'Seed_Rank', 'NetRtg', 'Luck']]
df_ken_clean = df_ken_clean.set_index('Team_Name')

# --- Step 2: Merge with college_info and Join to Bracket Data
college_info_ken_df = college_info.join(df_ken_clean, how='left', on='InstitutionName')
college_info_ken_df['win_%'] = college_info_ken_df['RegularSeasonWins'] / (college_info_ken_df['RegularSeasonWins'] + college_info_ken_df['RegularSeasonLosses'])
college_info_ken_df = college_info_ken_df.set_index('InstitutionID')

train_df = bracket_training.join(
    college_info_ken_df.add_prefix("W_"), on="RegionWinner_West"
).join(
    college_info_ken_df.add_prefix("E_"), on="RegionWinner_East"
).join(
    college_info_ken_df.add_prefix('M_'), on="RegionWinner_Midwest"
).join(
    college_info_ken_df.add_prefix('S_'), on='RegionWinner_South'
)

test_df = bracket_test.join(
    college_info_ken_df.add_prefix("W_"), on="RegionWinner_West"
).join(
    college_info_ken_df.add_prefix("E_"), on="RegionWinner_East"
).join(
    college_info_ken_df.add_prefix('M_'), on="RegionWinner_Midwest"
).join(
    college_info_ken_df.add_prefix('S_'), on='RegionWinner_South'
)

classic1_df_train = train_df[[
    'CustomerID',
    'CustomerPostalCodeLatitude', 'CustomerPostalCodeLongitude', 'CustomerDMACode', 'CustomerDMADescription',
    'NCAACustomerRecordCreated', 'BracketEntryId', 'BracketEntryCreatedDate',
    'RegionWinner_East', 'RegionWinner_West', 'RegionWinner_South', 'RegionWinner_Midwest',
    'SemifinalWinner_East_West', 'SemifinalWinner_South_Midwest', 'NationalChampion',
    'E_InstitutionName', 'E_InstitutionDMACode', 'E_InstitutionLatitude', 'E_InstitutionLongitude',
    'E_InstitutionConference', 'E_InstitutionEnrollment_Male', 'E_InstitutionEnrollment_Female',
    'E_InstitutionEnrollment_Total', 'E_InstitutionNCAAMemberSinceDate', 'E_RegularSeasonWins',
    'E_RegularSeasonLosses', 'E_RegularSeasonAverageAttendance', 'E_RegularSeasonAverageScore',
    'E_Rk', 'E_Seed_Rank', 'E_NetRtg', 'E_Luck',
    'M_InstitutionName', 'M_InstitutionDMACode', 'M_InstitutionLatitude', 'M_InstitutionLongitude',
    'M_InstitutionConference', 'M_InstitutionEnrollment_Male', 'M_InstitutionEnrollment_Female',
    'M_InstitutionEnrollment_Total', 'M_InstitutionNCAAMemberSinceDate', 'M_RegularSeasonWins',
    'M_RegularSeasonLosses', 'M_RegularSeasonAverageAttendance', 'M_RegularSeasonAverageScore',
    'M_Rk', 'M_Seed_Rank', 'M_NetRtg', 'M_Luck',
    'S_InstitutionName', 'S_InstitutionDMACode', 'S_InstitutionLatitude', 'S_InstitutionLongitude',
    'S_InstitutionConference', 'S_InstitutionEnrollment_Male', 'S_InstitutionEnrollment_Female',
    'S_InstitutionEnrollment_Total', 'S_InstitutionNCAAMemberSinceDate', 'S_RegularSeasonWins',
    'S_RegularSeasonLosses', 'S_RegularSeasonAverageAttendance', 'S_RegularSeasonAverageScore',
    'S_Rk', 'S_Seed_Rank', 'S_NetRtg', 'S_Luck',
    'W_InstitutionName', 'W_InstitutionDMACode', 'W_InstitutionLatitude', 'W_InstitutionLongitude',
    'W_InstitutionConference', 'W_InstitutionEnrollment_Male', 'W_InstitutionEnrollment_Female',
    'W_InstitutionEnrollment_Total', 'W_InstitutionNCAAMemberSinceDate', 'W_RegularSeasonWins',
    'W_RegularSeasonLosses', 'W_RegularSeasonAverageAttendance', 'W_RegularSeasonAverageScore',
    'W_Rk', 'W_Seed_Rank', 'W_NetRtg', 'W_Luck'
]]
classic1_df_test = test_df[[
    'CustomerID',
    'CustomerPostalCodeLatitude', 'CustomerPostalCodeLongitude', 'CustomerDMACode', 'CustomerDMADescription',
    'NCAACustomerRecordCreated', 'BracketEntryId', 'BracketEntryCreatedDate',
    'RegionWinner_East', 'RegionWinner_West', 'RegionWinner_South', 'RegionWinner_Midwest',
    'E_InstitutionName', 'E_InstitutionDMACode', 'E_InstitutionLatitude', 'E_InstitutionLongitude',
    'E_InstitutionConference', 'E_InstitutionEnrollment_Male', 'E_InstitutionEnrollment_Female',
    'E_InstitutionEnrollment_Total', 'E_InstitutionNCAAMemberSinceDate', 'E_RegularSeasonWins',
    'E_RegularSeasonLosses', 'E_RegularSeasonAverageAttendance', 'E_RegularSeasonAverageScore',
    'E_Rk', 'E_Seed_Rank', 'E_NetRtg', 'E_Luck',
    'M_InstitutionName', 'M_InstitutionDMACode', 'M_InstitutionLatitude', 'M_InstitutionLongitude',
    'M_InstitutionConference', 'M_InstitutionEnrollment_Male', 'M_InstitutionEnrollment_Female',
    'M_InstitutionEnrollment_Total', 'M_InstitutionNCAAMemberSinceDate', 'M_RegularSeasonWins',
    'M_RegularSeasonLosses', 'M_RegularSeasonAverageAttendance', 'M_RegularSeasonAverageScore',
    'M_Rk', 'M_Seed_Rank', 'M_NetRtg', 'M_Luck',
    'S_InstitutionName', 'S_InstitutionDMACode', 'S_InstitutionLatitude', 'S_InstitutionLongitude',
    'S_InstitutionConference', 'S_InstitutionEnrollment_Male', 'S_InstitutionEnrollment_Female',
    'S_InstitutionEnrollment_Total', 'S_InstitutionNCAAMemberSinceDate', 'S_RegularSeasonWins',
    'S_RegularSeasonLosses', 'S_RegularSeasonAverageAttendance', 'S_RegularSeasonAverageScore',
    'S_Rk', 'S_Seed_Rank', 'S_NetRtg', 'S_Luck',
    'W_InstitutionName', 'W_InstitutionDMACode', 'W_InstitutionLatitude', 'W_InstitutionLongitude',
    'W_InstitutionConference', 'W_InstitutionEnrollment_Male', 'W_InstitutionEnrollment_Female',
    'W_InstitutionEnrollment_Total', 'W_InstitutionNCAAMemberSinceDate', 'W_RegularSeasonWins',
    'W_RegularSeasonLosses', 'W_RegularSeasonAverageAttendance', 'W_RegularSeasonAverageScore',
    'W_Rk', 'W_Seed_Rank', 'W_NetRtg', 'W_Luck'
]]
# Create separate imputers:
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')

# List the columns to impute:
num_cols = ['CustomerPostalCodeLatitude', 'CustomerPostalCodeLongitude']
cat_cols = ['CustomerDMACode', 'CustomerDMADescription']

# Impute in the training set:
classic1_df_train[num_cols] = num_imputer.fit_transform(classic1_df_train[num_cols])
classic1_df_train[cat_cols] = cat_imputer.fit_transform(classic1_df_train[cat_cols])

# And in the test set:
classic1_df_test[num_cols] = num_imputer.transform(classic1_df_test[num_cols])
classic1_df_test[cat_cols] = cat_imputer.transform(classic1_df_test[cat_cols])

# Create win percentages
classic1_df_train['m_win_%'] = classic1_df_train['M_RegularSeasonWins'] / (classic1_df_train['M_RegularSeasonWins'] + classic1_df_train['M_RegularSeasonLosses'])
classic1_df_train['s_win_%'] = classic1_df_train['S_RegularSeasonWins'] / (classic1_df_train['S_RegularSeasonWins'] + classic1_df_train['S_RegularSeasonLosses'])
classic1_df_train['e_win_%'] = classic1_df_train['E_RegularSeasonWins'] / (classic1_df_train['E_RegularSeasonWins'] + classic1_df_train['E_RegularSeasonLosses'])
classic1_df_train['w_win_%'] = classic1_df_train['W_RegularSeasonWins'] / (classic1_df_train['W_RegularSeasonWins'] + classic1_df_train['W_RegularSeasonLosses'])

classic1_df_test['m_win_%'] = classic1_df_test['M_RegularSeasonWins'] / (classic1_df_test['M_RegularSeasonWins'] + classic1_df_test['M_RegularSeasonLosses'])
classic1_df_test['s_win_%'] = classic1_df_test['S_RegularSeasonWins'] / (classic1_df_test['S_RegularSeasonWins'] + classic1_df_test['S_RegularSeasonLosses'])
classic1_df_test['e_win_%'] = classic1_df_test['E_RegularSeasonWins'] / (classic1_df_test['E_RegularSeasonWins'] + classic1_df_test['E_RegularSeasonLosses'])
classic1_df_test['w_win_%'] = classic1_df_test['W_RegularSeasonWins'] / (classic1_df_test['W_RegularSeasonWins'] + classic1_df_test['W_RegularSeasonLosses'])



In [3]:
# -------------------------------
# 3. Define Helper Functions for Modeling and Feature Engineering
# -------------------------------
# Custom sigmoid transformer (for ordinal features if needed)
def sigmoid_transform(X):
    try:
        return  1 - (1 / (1 + np.exp(-X)))
    except ZeroDivisionError:
        print('Bad Process!')
        
sigmoid_transformer = FunctionTransformer(sigmoid_transform, validate=False)

def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Earth's radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    delta_lat = lat2 - lat1
    delta_lon = lon2 - lon1
    a = np.sin(delta_lat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon / 2) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c
def multioutput_accuracy(y_true, y_pred):
    # y_true and y_pred should be numpy arrays of shape (n_samples, n_outputs)
    accuracies = []
    for i in range(y_true.shape[1]):
        accuracies.append(accuracy_score(y_true[:, i], y_pred[:, i]))
    return np.mean(accuracies)

custom_scorer = make_scorer(multioutput_accuracy)



In [None]:
# -------------------------------
# 4. Incorporate New Geographic & Popularity Features
# -------------------------------



df_train = classic1_df_train
df_test = classic1_df_test



#Apply haversine distance for each region winner column
for region, lat_data, lon_data in zip( #Zip function returns a list of tuples by column, ex.)[('E_', 'E_InstitutionLatitude', 'E_InstitutionLongtitude'), ...]
    ['E_','W_','M_', 'S_'],
    ['E_InstitutionLatitude', 'W_InstitutionLatitude', 'M_InstitutionLatitude', 'S_InstitutionLatitude'],
    ['E_InstitutionLongitude', 'W_InstitutionLongitude', 'M_InstitutionLongitude', 'S_InstitutionLongitude']
):
    df_train[f'{region}distance'] =  df_train.apply( #region_distance returns the distance in miles the customer is away from the team
        lambda row: haversine_distance(row['CustomerPostalCodeLatitude'],
                                       row['CustomerPostalCodeLongitude'],
                                       row[lat_data],
                                       row[lon_data])
    , axis = 1)

epsilon = 1e-5

for region in ['E_', 'W_', 'M_', 'S_']: # Math Behind this??
    df_train[f'{region}dist_score'] = 1 / (df_train[f'{region}distance'] + epsilon)
    df_train = df_train.drop(columns=[f'{region}distance'])

# Sum of East & West distance scores
df_train['total_dist_score_EW'] = df_train[['E_dist_score', 'W_dist_score']].sum(axis=1)

# Sum of Midwest & South distance scores
df_train['total_dist_score_MS'] = df_train[['M_dist_score', 'S_dist_score']].sum(axis=1)


# I applied a Normalization technique to remove scale from the data

# Normalize East & West probabilities
df_train['E_dist_prob'] = df_train['E_dist_score'] / df_train['total_dist_score_EW']
df_train['W_dist_prob'] = df_train['W_dist_score'] / df_train['total_dist_score_EW']

# Normalize Midwest & South probabilities
df_train['M_dist_prob'] = df_train['M_dist_score'] / df_train['total_dist_score_MS']
df_train['S_dist_prob'] = df_train['S_dist_score'] / df_train['total_dist_score_MS']

df_train = df_train.drop(columns=[
    'E_dist_score', 'W_dist_score', 'M_dist_score', 'S_dist_score',
    'total_dist_score_EW', 'total_dist_score_MS'
])

df_train

Unnamed: 0,CustomerID,CustomerPostalCodeLatitude,CustomerPostalCodeLongitude,CustomerDMACode,CustomerDMADescription,NCAACustomerRecordCreated,BracketEntryId,BracketEntryCreatedDate,RegionWinner_East,RegionWinner_West,RegionWinner_South,RegionWinner_Midwest,SemifinalWinner_East_West,SemifinalWinner_South_Midwest,NationalChampion,E_InstitutionName,E_InstitutionDMACode,E_InstitutionLatitude,E_InstitutionLongitude,E_InstitutionConference,E_InstitutionEnrollment_Male,E_InstitutionEnrollment_Female,E_InstitutionEnrollment_Total,E_InstitutionNCAAMemberSinceDate,E_RegularSeasonWins,E_RegularSeasonLosses,E_RegularSeasonAverageAttendance,E_RegularSeasonAverageScore,E_Rk,E_Seed_Rank,E_NetRtg,E_Luck,M_InstitutionName,M_InstitutionDMACode,M_InstitutionLatitude,M_InstitutionLongitude,M_InstitutionConference,M_InstitutionEnrollment_Male,M_InstitutionEnrollment_Female,M_InstitutionEnrollment_Total,M_InstitutionNCAAMemberSinceDate,M_RegularSeasonWins,M_RegularSeasonLosses,M_RegularSeasonAverageAttendance,M_RegularSeasonAverageScore,M_Rk,M_Seed_Rank,M_NetRtg,M_Luck,S_InstitutionName,S_InstitutionDMACode,S_InstitutionLatitude,S_InstitutionLongitude,S_InstitutionConference,S_InstitutionEnrollment_Male,S_InstitutionEnrollment_Female,S_InstitutionEnrollment_Total,S_InstitutionNCAAMemberSinceDate,S_RegularSeasonWins,S_RegularSeasonLosses,S_RegularSeasonAverageAttendance,S_RegularSeasonAverageScore,S_Rk,S_Seed_Rank,S_NetRtg,S_Luck,W_InstitutionName,W_InstitutionDMACode,W_InstitutionLatitude,W_InstitutionLongitude,W_InstitutionConference,W_InstitutionEnrollment_Male,W_InstitutionEnrollment_Female,W_InstitutionEnrollment_Total,W_InstitutionNCAAMemberSinceDate,W_RegularSeasonWins,W_RegularSeasonLosses,W_RegularSeasonAverageAttendance,W_RegularSeasonAverageScore,W_Rk,W_Seed_Rank,W_NetRtg,W_Luck,m_win_%,s_win_%,e_win_%,w_win_%,E_dist_prob,W_dist_prob,M_dist_prob,S_dist_prob
0,47028,32.5622,-86.0994,698.0,MONTGOMERY (SELMA),2021-12-25,1723503,2024-03-19 10:27:15 -0400,164,29,317,694,164,694,694,UConn,533,41.80910,-72.24995,Big East Conference,10645,11834,22479,09/01/1910,31,3,14017.88,81.47,1,1,36.43,0.037,Tennessee,557,35.95208,-83.92585,Southeastern Conference,15269,18536,33805,09/01/1909,24,8,16065.47,79.47,5,2,26.61,-0.026,James Madison,569,38.43631,-78.87048,Sun Belt Conference,8342,12004,20346,09/01/1969,31,3,4471.35,84.35,63,12,12.42,0.093,Arizona,789,32.23267,-110.95080,Pac-12 Conference,13090,17292,30382,09/01/1936,25,8,12118.18,87.94,6,2,26.55,-0.047,0.750000,0.911765,0.911765,0.757576,0.593122,0.406878,0.684155,0.315845
1,3511,42.8256,-86.0104,563.0,GRAND RAPIDS - KALMZOO - B. CRK,2021-04-02,963479,2024-03-18 10:16:39 -0400,164,51,334,328,164,334,164,UConn,533,41.80910,-72.24995,Big East Conference,10645,11834,22479,09/01/1910,31,3,14017.88,81.47,1,1,36.43,0.037,Kansas,616,38.95855,-95.24757,Big 12 Conference,8859,9845,18704,09/01/1908,22,10,13952.66,75.25,27,4,17.94,0.058,Kentucky,541,38.03891,-84.50475,Southeastern Conference,9596,13127,22723,09/01/1936,23,9,17427.94,89.44,23,3,19.29,-0.040,Baylor,625,31.54687,-97.12104,Big 12 Conference,6008,8877,14885,09/01/1922,23,10,9517.79,80.55,15,3,21.90,-0.016,0.687500,0.718750,0.911765,0.696970,0.583512,0.416488,0.381613,0.618387
2,58445,38.8808,-77.1129,511.0,"WASHINGTON, DC (HAGRSTWN)",2021-04-02,810038,2024-03-18 00:21:47 -0400,164,51,288,559,164,559,559,UConn,533,41.80910,-72.24995,Big East Conference,10645,11834,22479,09/01/1910,31,3,14017.88,81.47,1,1,36.43,0.037,Purdue,582,40.42821,-86.91444,Big Ten Conference,21670,16279,37949,09/01/1914,29,4,13329.06,83.39,3,1,30.62,0.048,Houston,618,29.72039,-95.34354,Big 12 Conference,18290,19653,37943,09/01/1949,30,4,9347.35,73.03,2,1,31.17,0.042,Baylor,625,31.54687,-97.12104,Big 12 Conference,6008,8877,14885,09/01/1922,23,10,9517.79,80.55,15,3,21.90,-0.016,0.878788,0.882353,0.911765,0.696970,0.790941,0.209059,0.695405,0.304595
3,28833,29.4969,-98.4032,641.0,SAN ANTONIO,2023-11-16,3384825,2024-03-21 10:28:56 -0400,311,610,490,559,311,490,490,Iowa St.,679,42.02621,-93.64851,Big 12 Conference,14070,11171,25241,09/01/1908,27,7,12059.44,75.56,8,2,26.47,0.002,Purdue,582,40.42821,-86.91444,Big Ten Conference,21670,16279,37949,09/01/1914,29,4,13329.06,83.39,3,1,30.62,0.048,NC State,560,35.78511,-78.67451,Atlantic Coast Conference,11649,11794,23443,09/01/1941,22,14,12147.11,76.36,45,11,15.90,0.014,Saint Mary's,807,37.84073,-122.10900,West Coast Conference,859,1090,1949,09/01/1961,26,7,3458.15,74.24,20,5,19.43,0.008,0.878788,0.611111,0.794118,0.787879,0.619592,0.380408,0.551714,0.448286
4,37899,42.8946,-78.8245,514.0,BUFFALO,2022-03-16,2828017,2024-03-20 20:14:52 -0400,37,457,387,169,457,387,457,Auburn,522,32.59938,-85.48826,Southeastern Conference,12695,12684,25379,09/01/1910,27,7,10182.91,83.32,4,4,27.99,-0.080,Creighton,652,41.26536,-95.94781,Big East Conference,1789,2501,4290,09/01/1923,23,9,13651.44,80.53,11,3,24.22,-0.018,Marquette,617,43.03903,-87.92796,Big East Conference,3328,4200,7528,09/01/1928,25,9,14084.65,78.29,13,2,23.02,0.035,North Carolina,560,35.91177,-79.05097,Atlantic Coast Conference,8068,12174,20242,09/01/1906,27,7,15767.47,81.47,9,1,26.19,-0.038,0.718750,0.735294,0.794118,0.794118,0.376739,0.623261,0.342400,0.657600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129997,70315,39.5414,-104.9218,751.0,DENVER,2023-05-01,3602437,2024-03-21 11:53:31 -0400,311,457,334,694,311,694,311,Iowa St.,679,42.02621,-93.64851,Big 12 Conference,14070,11171,25241,09/01/1908,27,7,12059.44,75.56,8,2,26.47,0.002,Tennessee,557,35.95208,-83.92585,Southeastern Conference,15269,18536,33805,09/01/1909,24,8,16065.47,79.47,5,2,26.61,-0.026,Kentucky,541,38.03891,-84.50475,Southeastern Conference,9596,13127,22723,09/01/1936,23,9,17427.94,89.44,23,3,19.29,-0.040,North Carolina,560,35.91177,-79.05097,Atlantic Coast Conference,8068,12174,20242,09/01/1906,27,7,15767.47,81.47,9,1,26.19,-0.038,0.750000,0.718750,0.794118,0.794118,0.699813,0.300187,0.484898,0.515102
129998,51130,29.6625,-95.7272,618.0,HOUSTON,2024-07-17,2260739,2024-03-19 23:06:28 -0400,311,8,288,694,8,288,288,Iowa St.,679,42.02621,-93.64851,Big 12 Conference,14070,11171,25241,09/01/1908,27,7,12059.44,75.56,8,2,26.47,0.002,Tennessee,557,35.95208,-83.92585,Southeastern Conference,15269,18536,33805,09/01/1909,24,8,16065.47,79.47,5,2,26.61,-0.026,Houston,618,29.72039,-95.34354,Big 12 Conference,18290,19653,37943,09/01/1949,30,4,9347.35,73.03,2,1,31.17,0.042,Alabama,630,33.21188,-87.54597,Southeastern Conference,14152,18306,32458,09/01/1936,21,11,10947.59,90.75,14,4,22.96,-0.001,0.750000,0.882353,0.794118,0.656250,0.385504,0.614496,0.028028,0.971972
129999,36902,39.5354,-119.8374,811.0,RENO,2021-02-20,1298238,2024-03-18 16:17:25 -0400,164,457,288,169,164,288,164,UConn,533,41.80910,-72.24995,Big East Conference,10645,11834,22479,09/01/1910,31,3,14017.88,81.47,1,1,36.43,0.037,Creighton,652,41.26536,-95.94781,Big East Conference,1789,2501,4290,09/01/1923,23,9,13651.44,80.53,11,3,24.22,-0.018,Houston,618,29.72039,-95.34354,Big 12 Conference,18290,19653,37943,09/01/1949,30,4,9347.35,73.03,2,1,31.17,0.042,North Carolina,560,35.91177,-79.05097,Atlantic Coast Conference,8068,12174,20242,09/01/1906,27,7,15767.47,81.47,9,1,26.19,-0.038,0.718750,0.882353,0.911765,0.794118,0.474127,0.525873,0.550592,0.449408
130000,57171,33.3538,-86.8254,630.0,BIRMINGHAM (ANN & TUSC),2019-06-01,681594,2024-03-17 21:14:21 -0400,164,8,334,559,164,559,559,UConn,533,41.80910,-72.24995,Big East Conference,10645,11834,22479,09/01/1910,31,3,14017.88,81.47,1,1,36.43,0.037,Purdue,582,40.42821,-86.91444,Big Ten Conference,21670,16279,37949,09/01/1914,29,4,13329.06,83.39,3,1,30.62,0.048,Kentucky,541,38.03891,-84.50475,Southeastern Conference,9596,13127,22723,09/01/1936,23,9,17427.94,89.44,23,3,19.29,-0.040,Alabama,630,33.21188,-87.54597,Southeastern Conference,14152,18306,32458,09/01/1936,21,11,10947.59,90.75,14,4,22.96,-0.001,0.878788,0.718750,0.911765,0.656250,0.041533,0.958467,0.416476,0.583524


In [5]:
# -------------------------------
# 5. Define Feature Columns and Multi-Output Targets for Joint Modeling
# -------------------------------

# Drop withheld features
features_to_withHold = ['CustomerID',
                         'CustomerPostalCodeLatitude', 
                         'CustomerPostalCodeLongitude',
                         'CustomerDMACode',
                         'CustomerDMADescription',
                        'NCAACustomerRecordCreated',
                        'BracketEntryId',
                        'BracketEntryCreatedDate',
    'E_InstitutionName', 'E_InstitutionDMACode', 'E_InstitutionLatitude', 'E_InstitutionLongitude','E_RegularSeasonWins', 'E_RegularSeasonLosses',
    'E_InstitutionNCAAMemberSinceDate', 'E_InstitutionEnrollment_Female','E_InstitutionEnrollment_Male','E_RegularSeasonAverageAttendance', "E_RegularSeasonAverageScore",
    'E_InstitutionConference',

    'W_InstitutionName', 'W_InstitutionDMACode', 'W_InstitutionLatitude', 'W_InstitutionLongitude','W_RegularSeasonWins', 'W_RegularSeasonLosses',
    'W_InstitutionEnrollment_Male', 'W_InstitutionEnrollment_Female', 'W_InstitutionNCAAMemberSinceDate', 'W_InstitutionEnrollment_Male',
    'W_RegularSeasonAverageAttendance', "W_RegularSeasonAverageScore", 'W_InstitutionConference',

    'M_InstitutionName',
 'M_InstitutionDMACode',
 'M_InstitutionLatitude',
 'M_InstitutionLongitude',
 'M_RegularSeasonWins', 'M_RegularSeasonLosses',
 'M_InstitutionNCAAMemberSinceDate',
 'M_InstitutionEnrollment_Female', 'M_InstitutionEnrollment_Male',
 'M_RegularSeasonAverageAttendance', "M_RegularSeasonAverageScore",
 'M_InstitutionConference',


 'S_InstitutionName',
 'S_InstitutionDMACode',
 'S_InstitutionLatitude',
 'S_InstitutionLongitude',
 'S_RegularSeasonWins', 'S_RegularSeasonLosses',
 'S_InstitutionEnrollment_Male',
 'S_InstitutionEnrollment_Female', 'S_InstitutionEnrollment_Male',
 'S_RegularSeasonAverageAttendance', "S_RegularSeasonAverageScore",
 'S_InstitutionNCAAMemberSinceDate',
 'S_InstitutionConference'
]

df_train = df_train.drop(columns=features_to_withHold)
df_test = df_test.drop(columns=features_to_withHold)


# Create lists of features based on column name prefixes.
features_east = [col for col in df_train.columns if col.startswith('E_') | col.startswith('e_')]
features_west = [col for col in df_train.columns if col.startswith('W_') | col.startswith('w_')]
features_midwest = [col for col in df_train.columns if col.startswith('M_') | col.startswith('m_')]
features_south = [col for col in df_train.columns if col.startswith('S_') | col.startswith('s_')]


# Define target column names.
from sklearn.feature_selection import RFE


target_EW = "SemifinalWinner_East_West"    # outcome for East-West semifinal
target_MS = "SemifinalWinner_South_Midwest"  # outcome for South-Midwest semifinal
target_NC = "NationalChampion"             # outcome for National Champion

df_train['target_EW_Binary'] = (df_train[target_EW] == df_train['RegionWinner_East']).astype(int)
# df_test['target_EW_Binary'] = (df_test[target_EW] == df_test['RegionWinner_East']).astype(int)

df_train['target_MS_Binary'] = (df_train[target_MS] == df_train['RegionWinner_Midwest']).astype(int)
# df_test['target_MS_Binary'] = (df_test[target_MS] == df_test['RegionWinner_Midwest']).astype(int)


# Remove target columns from the feature lists if present.
for target in [target_EW, target_MS, target_NC, 'target_EW_Binary', 'target_MS_Binary']:
    if target in features_east: features_east.remove(target)
    if target in features_west: features_west.remove(target)
    if target in features_midwest: features_midwest.remove(target)
    if target in features_south: features_south.remove(target)

# Combine feature lists.
features_ew = features_east + features_west
features_ms = features_south + features_midwest



In [6]:
# # # -------------------------------
# # # 6. Build a Shared Preprocessing Pipeline
# # # -------------------------------

ordinal_features_ew = [col for col in features_ew if col.endswith("_Rk") |  col.endswith('_Seed_Rank')]
numerical_features_ew = [col for col in features_ew if col not in ordinal_features_ew 
                      and col not in ['E_dist_prob', 'W_dist_prob']
                      and col not in ['RegionWinner_East','RegionWinner_South', 'RegionWinner_Midwest', 'RegionWinner_West']
                      and col not in ['target_EW_Binary', 'target_MS_Binary']]


ordinal_features_ms = [col for col in features_ms if col.endswith("_Rk") |  col.endswith('_Seed_Rank')]
numerical_features_ms = [col for col in features_ms if col not in ordinal_features_ms 
                      and col not in [ 'S_dist_prob', 'M_dist_prob']
                      and col not in ['RegionWinner_East','RegionWinner_South', 'RegionWinner_Midwest', 'RegionWinner_West']
                      and col not in ['target_EW_Binary', 'target_MS_Binary']]



y_ew = df_train['target_EW_Binary']
y_ms = df_train['target_MS_Binary']

y_ms

df_train = df_train.drop(columns= ['target_EW_Binary','target_MS_Binary' ])


X_train_ew, X_test_ew, y_train_ew, y_test_ew = train_test_split(df_train[features_ew], y_ew, test_size=.2, random_state=24)
X_train_ms, X_test_ms, y_train_ms, y_test_ms = train_test_split(df_train[features_ms], y_ms,  test_size=.2, random_state=24)

preprocessor_log_ew = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features_ew),
        ('ord', sigmoid_transformer, ordinal_features_ew)
    ],
    remainder='passthrough'
)

preprocessor_log_ms = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features_ms),
        ('ord', sigmoid_transformer, ordinal_features_ms)
    ],
    remainder='passthrough'
)



df_train[features_ew]
processed_data = pd.DataFrame(preprocessor_log_ew.fit_transform(df_train[features_ew]))
processed_data


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,-0.308014,0.719220,0.596521,0.780202,0.844432,0.516828,-0.613907,0.079665,0.268941,0.268941,2.472623e-03,0.119203,0.593122,0.406878
1,-0.308014,0.719220,0.596521,0.780202,-1.111487,-0.757863,0.859260,-1.104767,0.268941,0.268941,3.059022e-07,0.047426,0.583512,0.416488
2,-0.308014,0.719220,0.596521,0.780202,-1.111487,-0.757863,0.859260,-1.104767,0.268941,0.268941,3.059022e-07,0.047426,0.790941,0.209059
3,0.361692,-0.822227,-0.295481,-0.909279,-2.744176,-1.434958,1.999776,0.671880,0.000335,0.119203,2.061154e-09,0.006693,0.619592,0.380408
4,0.395153,-0.586986,-2.385313,-0.909279,-0.435365,0.418142,-0.186213,0.793807,0.017986,0.017986,1.233946e-04,0.268941,0.376739,0.623261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129997,0.361692,-0.822227,-0.295481,-0.909279,-0.435365,0.418142,-0.186213,0.793807,0.000335,0.119203,1.233946e-04,0.268941,0.699813,0.300187
129998,0.361692,-0.822227,-0.295481,-0.909279,1.106450,-0.467289,1.572082,-1.900557,0.000335,0.119203,8.315280e-07,0.017986,0.385504,0.614496
129999,-0.308014,0.719220,0.596521,0.780202,-0.435365,0.418142,-0.186213,0.793807,0.268941,0.268941,1.233946e-04,0.268941,0.474127,0.525873
130000,-0.308014,0.719220,0.596521,0.780202,1.106450,-0.467289,1.572082,-1.900557,0.268941,0.268941,8.315280e-07,0.017986,0.041533,0.958467


In [7]:
# -------------------------------
# 7. Build the Joint Multi-Task Model
# -------------------------------
model_log_reg_ew = Pipeline(steps=[
    ('preprocessor', preprocessor_log_ew),
    ('selector', RFE(LogisticRegressionCV(cv=5, max_iter=2000), n_features_to_select=10)),
    ('classifier', LogisticRegressionCV(cv=5, max_iter=2000))
])

model_log_reg_ms = Pipeline(steps=[
    ('preprocessor', preprocessor_log_ms),
    ('selector', RFE(LogisticRegressionCV(cv=5, max_iter=2000), n_features_to_select=10)),
    ('classifier', LogisticRegressionCV(cv=5, max_iter=2000))
])


classic1_df_train

Unnamed: 0,CustomerID,CustomerPostalCodeLatitude,CustomerPostalCodeLongitude,CustomerDMACode,CustomerDMADescription,NCAACustomerRecordCreated,BracketEntryId,BracketEntryCreatedDate,RegionWinner_East,RegionWinner_West,RegionWinner_South,RegionWinner_Midwest,SemifinalWinner_East_West,SemifinalWinner_South_Midwest,NationalChampion,E_InstitutionName,E_InstitutionDMACode,E_InstitutionLatitude,E_InstitutionLongitude,E_InstitutionConference,E_InstitutionEnrollment_Male,E_InstitutionEnrollment_Female,E_InstitutionEnrollment_Total,E_InstitutionNCAAMemberSinceDate,E_RegularSeasonWins,E_RegularSeasonLosses,E_RegularSeasonAverageAttendance,E_RegularSeasonAverageScore,E_Rk,E_Seed_Rank,E_NetRtg,E_Luck,M_InstitutionName,M_InstitutionDMACode,M_InstitutionLatitude,M_InstitutionLongitude,M_InstitutionConference,M_InstitutionEnrollment_Male,M_InstitutionEnrollment_Female,M_InstitutionEnrollment_Total,M_InstitutionNCAAMemberSinceDate,M_RegularSeasonWins,M_RegularSeasonLosses,M_RegularSeasonAverageAttendance,M_RegularSeasonAverageScore,M_Rk,M_Seed_Rank,M_NetRtg,M_Luck,S_InstitutionName,S_InstitutionDMACode,S_InstitutionLatitude,S_InstitutionLongitude,S_InstitutionConference,S_InstitutionEnrollment_Male,S_InstitutionEnrollment_Female,S_InstitutionEnrollment_Total,S_InstitutionNCAAMemberSinceDate,S_RegularSeasonWins,S_RegularSeasonLosses,S_RegularSeasonAverageAttendance,S_RegularSeasonAverageScore,S_Rk,S_Seed_Rank,S_NetRtg,S_Luck,W_InstitutionName,W_InstitutionDMACode,W_InstitutionLatitude,W_InstitutionLongitude,W_InstitutionConference,W_InstitutionEnrollment_Male,W_InstitutionEnrollment_Female,W_InstitutionEnrollment_Total,W_InstitutionNCAAMemberSinceDate,W_RegularSeasonWins,W_RegularSeasonLosses,W_RegularSeasonAverageAttendance,W_RegularSeasonAverageScore,W_Rk,W_Seed_Rank,W_NetRtg,W_Luck,m_win_%,s_win_%,e_win_%,w_win_%,E_distance,W_distance,M_distance,S_distance,E_dist_score
0,47028,32.5622,-86.0994,698.0,MONTGOMERY (SELMA),2021-12-25,1723503,2024-03-19 10:27:15 -0400,164,29,317,694,164,694,694,UConn,533,41.80910,-72.24995,Big East Conference,10645,11834,22479,09/01/1910,31,3,14017.88,81.47,1,1,36.43,0.037,Tennessee,557,35.95208,-83.92585,Southeastern Conference,15269,18536,33805,09/01/1909,24,8,16065.47,79.47,5,2,26.61,-0.026,James Madison,569,38.43631,-78.87048,Sun Belt Conference,8342,12004,20346,09/01/1969,31,3,4471.35,84.35,63,12,12.42,0.093,Arizona,789,32.23267,-110.95080,Pac-12 Conference,13090,17292,30382,09/01/1936,25,8,12118.18,87.94,6,2,26.55,-0.047,0.750000,0.911765,0.911765,0.757576,1597.133829,2328.202760,426.563483,923.981672,0.000626
1,3511,42.8256,-86.0104,563.0,GRAND RAPIDS - KALMZOO - B. CRK,2021-04-02,963479,2024-03-18 10:16:39 -0400,164,51,334,328,164,334,164,UConn,533,41.80910,-72.24995,Big East Conference,10645,11834,22479,09/01/1910,31,3,14017.88,81.47,1,1,36.43,0.037,Kansas,616,38.95855,-95.24757,Big 12 Conference,8859,9845,18704,09/01/1908,22,10,13952.66,75.25,27,4,17.94,0.058,Kentucky,541,38.03891,-84.50475,Southeastern Conference,9596,13127,22723,09/01/1936,23,9,17427.94,89.44,23,3,19.29,-0.040,Baylor,625,31.54687,-97.12104,Big 12 Conference,6008,8877,14885,09/01/1922,23,10,9517.79,80.55,15,3,21.90,-0.016,0.687500,0.718750,0.911765,0.696970,1135.742592,1591.208951,886.829991,547.271060,0.000880
2,58445,38.8808,-77.1129,511.0,"WASHINGTON, DC (HAGRSTWN)",2021-04-02,810038,2024-03-18 00:21:47 -0400,164,51,288,559,164,559,559,UConn,533,41.80910,-72.24995,Big East Conference,10645,11834,22479,09/01/1910,31,3,14017.88,81.47,1,1,36.43,0.037,Purdue,582,40.42821,-86.91444,Big Ten Conference,21670,16279,37949,09/01/1914,29,4,13329.06,83.39,3,1,30.62,0.048,Houston,618,29.72039,-95.34354,Big 12 Conference,18290,19653,37943,09/01/1949,30,4,9347.35,73.03,2,1,31.17,0.042,Baylor,625,31.54687,-97.12104,Big 12 Conference,6008,8877,14885,09/01/1922,23,10,9517.79,80.55,15,3,21.90,-0.016,0.878788,0.882353,0.911765,0.696970,525.083169,1986.571818,856.081303,1954.472608,0.001904
3,28833,29.4969,-98.4032,641.0,SAN ANTONIO,2023-11-16,3384825,2024-03-21 10:28:56 -0400,311,610,490,559,311,490,490,Iowa St.,679,42.02621,-93.64851,Big 12 Conference,14070,11171,25241,09/01/1908,27,7,12059.44,75.56,8,2,26.47,0.002,Purdue,582,40.42821,-86.91444,Big Ten Conference,21670,16279,37949,09/01/1914,29,4,13329.06,83.39,3,1,30.62,0.048,NC State,560,35.78511,-78.67451,Atlantic Coast Conference,11649,11794,23443,09/01/1941,22,14,12147.11,76.36,45,11,15.90,0.014,Saint Mary's,807,37.84073,-122.10900,West Coast Conference,859,1090,1949,09/01/1961,26,7,3458.15,74.24,20,5,19.43,0.008,0.878788,0.611111,0.794118,0.787879,1457.097050,2373.254391,1601.282477,1970.726050,0.000686
4,37899,42.8946,-78.8245,514.0,BUFFALO,2022-03-16,2828017,2024-03-20 20:14:52 -0400,37,457,387,169,457,387,457,Auburn,522,32.59938,-85.48826,Southeastern Conference,12695,12684,25379,09/01/1910,27,7,10182.91,83.32,4,4,27.99,-0.080,Creighton,652,41.26536,-95.94781,Big East Conference,1789,2501,4290,09/01/1923,23,9,13651.44,80.53,11,3,24.22,-0.018,Marquette,617,43.03903,-87.92796,Big East Conference,3328,4200,7528,09/01/1928,25,9,14084.65,78.29,13,2,23.02,0.035,North Carolina,560,35.91177,-79.05097,Atlantic Coast Conference,8068,12174,20242,09/01/1906,27,7,15767.47,81.47,9,1,26.19,-0.038,0.718750,0.735294,0.794118,0.794118,1284.937265,776.698135,1422.235231,740.529855,0.000778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129997,70315,39.5414,-104.9218,751.0,DENVER,2023-05-01,3602437,2024-03-21 11:53:31 -0400,311,457,334,694,311,694,311,Iowa St.,679,42.02621,-93.64851,Big 12 Conference,14070,11171,25241,09/01/1908,27,7,12059.44,75.56,8,2,26.47,0.002,Tennessee,557,35.95208,-83.92585,Southeastern Conference,15269,18536,33805,09/01/1909,24,8,16065.47,79.47,5,2,26.61,-0.026,Kentucky,541,38.03891,-84.50475,Southeastern Conference,9596,13127,22723,09/01/1936,23,9,17427.94,89.44,23,3,19.29,-0.040,North Carolina,560,35.91177,-79.05097,Atlantic Coast Conference,8068,12174,20242,09/01/1906,27,7,15767.47,81.47,9,1,26.19,-0.038,0.750000,0.718750,0.794118,0.794118,987.687636,2302.555934,1884.069336,1773.590211,0.001012
129998,51130,29.6625,-95.7272,618.0,HOUSTON,2024-07-17,2260739,2024-03-19 23:06:28 -0400,311,8,288,694,8,288,288,Iowa St.,679,42.02621,-93.64851,Big 12 Conference,14070,11171,25241,09/01/1908,27,7,12059.44,75.56,8,2,26.47,0.002,Tennessee,557,35.95208,-83.92585,Southeastern Conference,15269,18536,33805,09/01/1909,24,8,16065.47,79.47,5,2,26.61,-0.026,Houston,618,29.72039,-95.34354,Big 12 Conference,18290,19653,37943,09/01/1949,30,4,9347.35,73.03,2,1,31.17,0.042,Alabama,630,33.21188,-87.54597,Southeastern Conference,14152,18306,32458,09/01/1936,21,11,10947.59,90.75,14,4,22.96,-0.001,0.750000,0.882353,0.794118,0.656250,1387.363952,870.361899,1304.449227,37.614750,0.000721
129999,36902,39.5354,-119.8374,811.0,RENO,2021-02-20,1298238,2024-03-18 16:17:25 -0400,164,457,288,169,164,288,164,UConn,533,41.80910,-72.24995,Big East Conference,10645,11834,22479,09/01/1910,31,3,14017.88,81.47,1,1,36.43,0.037,Creighton,652,41.26536,-95.94781,Big East Conference,1789,2501,4290,09/01/1923,23,9,13651.44,80.53,11,3,24.22,-0.018,Houston,618,29.72039,-95.34354,Big 12 Conference,18290,19653,37943,09/01/1949,30,4,9347.35,73.03,2,1,31.17,0.042,North Carolina,560,35.91177,-79.05097,Atlantic Coast Conference,8068,12174,20242,09/01/1906,27,7,15767.47,81.47,9,1,26.19,-0.038,0.718750,0.882353,0.911765,0.794118,3969.724948,3579.108216,2025.637440,2481.708080,0.000252
130000,57171,33.3538,-86.8254,630.0,BIRMINGHAM (ANN & TUSC),2019-06-01,681594,2024-03-17 21:14:21 -0400,164,8,334,559,164,559,559,UConn,533,41.80910,-72.24995,Big East Conference,10645,11834,22479,09/01/1910,31,3,14017.88,81.47,1,1,36.43,0.037,Purdue,582,40.42821,-86.91444,Big Ten Conference,21670,16279,37949,09/01/1914,29,4,13329.06,83.39,3,1,30.62,0.048,Kentucky,541,38.03891,-84.50475,Southeastern Conference,9596,13127,22723,09/01/1936,23,9,17427.94,89.44,23,3,19.29,-0.040,Alabama,630,33.21188,-87.54597,Southeastern Conference,14152,18306,32458,09/01/1936,21,11,10947.59,90.75,14,4,22.96,-0.001,0.878788,0.718750,0.911765,0.656250,1588.073704,68.814880,786.678218,561.473181,0.000630


In [8]:
model_log_reg_ew.fit(X_train_ew, y_train_ew)
y_pred_ew = model_log_reg_ew.predict(X_test_ew)
print( f'Accuracy for EW Model: {accuracy_score(y_true=y_test_ew, y_pred=y_pred_ew)}')
model_log_reg_ms.fit(X_train_ms, y_train_ms)
y_pred_ms = model_log_reg_ms.predict(X_test_ms)
print( f'Accuracy for MS Model: {accuracy_score(y_true=y_test_ms, y_pred=y_pred_ms)}')



Accuracy for EW Model: 0.6748971193415638
Accuracy for MS Model: 0.6282835275566324


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

################################################
#Retrieve Data from winning Team from Semifinal Predictions
###############################################

# Define mappings for the EW group: (suffix, true_prefix, false_prefix)
ew_cols = [
    ("InstitutionEnrollment_Total", "E_", "W_"),
    ("Rk", "E_", "W_"),
    ("Seed_Rank", "E_", "W_"),
    ("NetRtg", "E_", "W_"),
    ("Luck", "E_", "W_"),
    ("win_%", "e_", "w_"),
    ('dist_prob', 'E_','W_')
]

# Define mappings for the MS group: (suffix, true_prefix, false_prefix)
ms_cols = [
     ("InstitutionEnrollment_Total", "M_", "S_"),
    ("Rk", "M_", "S_"),
    ("Seed_Rank", "M_", "S_"),
    ("NetRtg", "M_", "S_"),
    ("Luck", "M_", "S_"),
    ("win_%", "m_", "s_"),
    ('dist_prob', 'M_','S_')
]

drop_list = []

# Create EW columns in the training dataset
for suffix, true_prefix, false_prefix in ew_cols:
    new_col = "EW_" + suffix
    df_train[new_col] = df_train[true_prefix + suffix].where(
        df_train["SemifinalWinner_East_West"] == df_train["RegionWinner_East"],
        df_train[false_prefix + suffix]
    )
    if f'true_prefix + suffix'.startswith('E_'):
        drop_list.append(f'{false_prefix + suffix}')
    else:
        drop_list.append(f'{true_prefix + suffix}')


# Create MS columns in the training dataset
for suffix, true_prefix, false_prefix in ms_cols:
    new_col = "MS_" + suffix
    df_train[new_col] = df_train[true_prefix + suffix].where(
        df_train["SemifinalWinner_South_Midwest"] == df_train["RegionWinner_Midwest"],
        df_train[false_prefix + suffix]
    )
    if f'true_prefix + suffix'.startswith('M_'):
        drop_list.append(f'{false_prefix + suffix}')
    else:
        drop_list.append(f'{true_prefix + suffix}')


print(drop_list)

#Drop Columns from Training set that are not
df_train_xgb = df_train.drop(columns=[col for col in df_train.columns if not (col.startswith('EW_') or col.startswith('MS_'))])



Cross-Validation Accuracy: 0.9164 ± 0.1672
[[7398    0    0 ...    0    0    0]
 [   0 7399    0 ...    0    0    0]
 [   0    0 7399 ...    0    0    0]
 ...
 [   0    0    0 ... 7398    0    0]
 [   0    0    0 ...    0 7399    0]
 [   0    0    0 ...    0    0 7399]]


In [None]:


# from imblearn.combine import SMOTETomek



# smote_tomek = SMOTETomek(random_state=42)

#-----
from imblearn.over_sampling import SMOTE

y_xgb = df_train['NationalChampion']

smote = SMOTE(random_state=42)
df_train_xgb, y_xgb = smote.fit_resample(df_train_xgb, y_xgb)


# Convert back to DataFrame
df_train_xgb = pd.DataFrame(df_train_xgb, columns=df_train_xgb.columns)
df_train_xgb['target'] = y_xgb
#------

# y_xgb = df_train['NationalChampion']
# df_train_xgb, y = smote_tomek.fit_resample(df_train_xgb, y_xgb)


# df_train_xgb = pd.DataFrame(df_train_xgb, columns=df_train_xgb.columns)
# df_train_xgb['NationalChampion'] = y


# # 🎯 1️⃣ Create Binary Target Variable
# y_xgb = (df_train['NationalChampion'] == df_train['SemifinalWinner_East_West']).astype(int)

# ---- Encode Labels Properly ----
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_xgb)  # Convert to [0, 1, ..., num_classes-1]
# y_test_encoded = label_encoder.transform(y_test)  # Apply same encoding to test labels

# 🎯 2️⃣ Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(df_train_xgb, y_train_encoded, test_size=0.2, random_state=42, stratify=y_train_encoded)

# 🎯 3️⃣ Define XGBoost Model
xgb_clf = XGBClassifier(
    objective='multi:softmax',  # Binary classification
    eval_metric='logloss',        # Log loss for binary problems
    use_label_encoder=False,
    max_depth=5,
    n_estimators=25,
    subsample=0.8,
    colsample_bytree=0.8
)

# 🎯 4️⃣ Set Up Cross-Validation (5-Fold Stratified)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform Cross-Validation
cv_scores = cross_val_score(xgb_clf, X_train, y_train, cv=cv, scoring='accuracy')

# 🎯 5️⃣ Print Cross-Validation Results
print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
# After getting predictions from your model:
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
# # 🎯 6️⃣ Train Final Model on Full Training Data
# xgb_clf.fit(X_train, y_train)

# # 🎯 7️⃣ Predict on Test Set
# y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]  # Get probabilities for class 1 (EW wins)
# y_pred_binary = (y_pred_proba >= 0.5).astype(int)   # Convert to binary predictions

# # 🎯 8️⃣ Store Predictions in df_test
# df_test['Predicted_Champion_Group'] = y_pred_binary  # 1 = EW wins, 0 = MW/S wins

# # Print a few prediction results
# print(df_test[['Predicted_Champion_Group']].head())


# y_xgb = (df_train['NationalChampion'] == df_train['SemifinalWinner_East_West']).astype(int)

# # Train/Test Split
# X_train, X_test, y_train, y_test = train_test_split(df_train_xgb, y_xgb, test_size=0.2, random_state=42)

# # Encode labels so they start from 0
# label_encoder = LabelEncoder()
# y_train_encoded = label_encoder.fit_transform(y_train)
# y_test_encoded = label_encoder.transform(y_test)

# # Train XGB Model
# xgb_clf = XGBClassifier(
#     objective='multi:softmax',
#     num_class=len(label_encoder.classes_),  # Ensure num_class matches the actual number of labels
#     eval_metric='mlogloss',
#     use_label_encoder=False,
#     max_depth=3,
#     n_estimators=50,
#     subsample=0.8,
#     colsample_bytree=0.8
# )
# xgb_clf.fit(X_train, y_train_encoded)

# # Predict and convert predictions back to original team IDs
# y_pred = xgb_clf.predict(X_test)
# y_pred_original = label_encoder.inverse_transform(y_pred).where(df_test['NationalChampion'] == df_train['SemifinalWinner_East_West'])  # Converts back to original labels
# # Print Accuracy
# from sklearn.metrics import accuracy_score
# print("Test Accuracy:", accuracy_score(y_test_encoded, y_pred))






#Results

'''
Cross-Validation Accuracy: 0.9164 ± 0.1672
[[7398    0    0 ...    0    0    0]
 [   0 7399    0 ...    0    0    0]
 [   0    0 7399 ...    0    0    0]
 ...
 [   0    0    0 ... 7398    0    0]
 [   0    0    0 ...    0 7399    0]
 [   0    0    0 ...    0    0 7399]]
'''


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# # df_train

# df_test['Pred_SemifinalWinner_East_West'] = predictions_df['SemifinalWinner_East_West']
# df_test['Pred_SemifinalWinner_South_Midwest'] = predictions_df['SemifinalWinner_South_Midwest']



# Define mappings for the EW group: (suffix, true_prefix, false_prefix)
ew_cols = [
    ("InstitutionEnrollment_Total", "E_", "W_"),
    ("Rk", "E_", "W_"),
    ("Seed_Rank", "E_", "W_"),
    ("NetRtg", "E_", "W_"),
    ("Luck", "E_", "W_"),
    ("win_%", "e_", "w_"),
    ('dist_prob', 'E_','W_')
]

# Define mappings for the MS group: (suffix, true_prefix, false_prefix)
ms_cols = [
     ("InstitutionEnrollment_Total", "M_", "S_"),
    ("Rk", "M_", "S_"),
    ("Seed_Rank", "M_", "S_"),
    ("NetRtg", "M_", "S_"),
    ("Luck", "M_", "S_"),
    ("win_%", "m_", "s_"),
    ('dist_prob', 'M_','S_')
]
drop_list = []

# Create EW columns in the training dataset
for suffix, true_prefix, false_prefix in ew_cols:
    new_col = "EW_" + suffix
    df_train[new_col] = df_train[true_prefix + suffix].where(
        df_train["SemifinalWinner_East_West"] == df_train["RegionWinner_East"],
        df_train[false_prefix + suffix]
    )
    if f'true_prefix + suffix'.startswith('E_'):
        drop_list.append(f'{false_prefix + suffix}')
    else:
        drop_list.append(f'{true_prefix + suffix}')


# Create MS columns in the training dataset
for suffix, true_prefix, false_prefix in ms_cols:
    new_col = "MS_" + suffix
    df_train[new_col] = df_train[true_prefix + suffix].where(
        df_train["SemifinalWinner_South_Midwest"] == df_train["RegionWinner_Midwest"],
        df_train[false_prefix + suffix]
    )
    if f'true_prefix + suffix'.startswith('M_'):
        drop_list.append(f'{false_prefix + suffix}')
    else:
        drop_list.append(f'{true_prefix + suffix}')

# # For the test dataset
# # Assume df_test is already loaded and classic1_df_test is available for source values.
# nat_champ_df_test = df_test

# # Create EW columns in the test dataset
# for suffix, true_prefix, false_prefix in ew_cols:
#     new_col = "EW_" + suffix
#     df_test[new_col] = df_test[true_prefix + suffix].where(
#         df_test["Pred_SemifinalWinner_East_West"] == df_test["RegionWinner_East"],
#         df_test[false_prefix + suffix]
#     )

# # Create MS columns in the test dataset
# for suffix, true_prefix, false_prefix in ms_cols:
#     new_col = "MS_" + suffix
#     df_test[new_col] = df_test[true_prefix + suffix].where(
#         df_test["Pred_SemifinalWinner_South_Midwest"] == df_test["RegionWinner_Midwest"],
#         df_test[false_prefix + suffix]
#     )

df_train_xgb = df_train.drop(columns=[col for col in df_train.columns if not (col.startswith('EW_') or col.startswith('MS_'))])
# df_test_xgb = df_test.drop(columns=[col for col in df_test.columns if not (col.startswith('EW_') or col.startswith('MS_'))])

# from imblearn.combine import SMOTETomek



# smote_tomek = SMOTETomek(random_state=42)

#-----
from imblearn.over_sampling import SMOTE

y_xgb = df_train['NationalChampion']

smote = SMOTE(random_state=42)
df_train_xgb, y_xgb = smote.fit_resample(df_train_xgb, y_xgb)


# Convert back to DataFrame
df_train_xgb = pd.DataFrame(df_train_xgb, columns=df_train_xgb.columns)
df_train_xgb['target'] = y_xgb
#------

# y_xgb = df_train['NationalChampion']
# df_train_xgb, y = smote_tomek.fit_resample(df_train_xgb, y_xgb)


# df_train_xgb = pd.DataFrame(df_train_xgb, columns=df_train_xgb.columns)
# df_train_xgb['NationalChampion'] = y


# # 🎯 1️⃣ Create Binary Target Variable
# y_xgb = (df_train['NationalChampion'] == df_train['SemifinalWinner_East_West']).astype(int)

# ---- Encode Labels Properly ----
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_xgb)  # Convert to [0, 1, ..., num_classes-1]
# y_test_encoded = label_encoder.transform(y_test)  # Apply same encoding to test labels

# 🎯 2️⃣ Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(df_train_xgb, y_train_encoded, test_size=0.2, random_state=42, stratify=y_train_encoded)

# 🎯 3️⃣ Define XGBoost Model
xgb_clf = XGBClassifier(
    objective='multi:softmax',  # Binary classification
    eval_metric='logloss',        # Log loss for binary problems
    use_label_encoder=False,
    max_depth=5,
    n_estimators=25,
    subsample=0.8,
    colsample_bytree=0.8
)

# 🎯 4️⃣ Set Up Cross-Validation (5-Fold Stratified)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform Cross-Validation
cv_scores = cross_val_score(xgb_clf, X_train, y_train, cv=cv, scoring='accuracy')

# 🎯 5️⃣ Print Cross-Validation Results
print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
# After getting predictions from your model:
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
# # 🎯 6️⃣ Train Final Model on Full Training Data
# xgb_clf.fit(X_train, y_train)

# # 🎯 7️⃣ Predict on Test Set
# y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]  # Get probabilities for class 1 (EW wins)
# y_pred_binary = (y_pred_proba >= 0.5).astype(int)   # Convert to binary predictions

# # 🎯 8️⃣ Store Predictions in df_test
# df_test['Predicted_Champion_Group'] = y_pred_binary  # 1 = EW wins, 0 = MW/S wins

# # Print a few prediction results
# print(df_test[['Predicted_Champion_Group']].head())


# y_xgb = (df_train['NationalChampion'] == df_train['SemifinalWinner_East_West']).astype(int)

# # Train/Test Split
# X_train, X_test, y_train, y_test = train_test_split(df_train_xgb, y_xgb, test_size=0.2, random_state=42)

# # Encode labels so they start from 0
# label_encoder = LabelEncoder()
# y_train_encoded = label_encoder.fit_transform(y_train)
# y_test_encoded = label_encoder.transform(y_test)

# # Train XGB Model
# xgb_clf = XGBClassifier(
#     objective='multi:softmax',
#     num_class=len(label_encoder.classes_),  # Ensure num_class matches the actual number of labels
#     eval_metric='mlogloss',
#     use_label_encoder=False,
#     max_depth=3,
#     n_estimators=50,
#     subsample=0.8,
#     colsample_bytree=0.8
# )
# xgb_clf.fit(X_train, y_train_encoded)

# # Predict and convert predictions back to original team IDs
# y_pred = xgb_clf.predict(X_test)
# y_pred_original = label_encoder.inverse_transform(y_pred).where(df_test['NationalChampion'] == df_train['SemifinalWinner_East_West'])  # Converts back to original labels
# # Print Accuracy
# from sklearn.metrics import accuracy_score
# print("Test Accuracy:", accuracy_score(y_test_encoded, y_pred))
