In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold,cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import log_loss
from xgboost import XGBClassifier

In [2]:
# Basic data
team = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage1/MTeams.csv")
seasons = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage1/MSeasons.csv")
seeds = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage1/MNCAATourneySeeds.csv")
season_results = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage1/MRegularSeasonCompactResults.csv")
tour_results = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage1/MNCAATourneyCompactResults.csv")

In [3]:
train_row = season_results.query('Season < 2016')
test_row = season_results.query('Season == 2016',).reset_index()

In [4]:
test_row

Unnamed: 0,index,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,139920,2016,11,1104,77,1244,64,H,0
1,139921,2016,11,1105,68,1408,67,A,1
2,139922,2016,11,1112,79,1334,61,H,0
3,139923,2016,11,1115,58,1370,56,A,0
4,139924,2016,11,1116,86,1380,68,H,0
...,...,...,...,...,...,...,...,...,...
5364,145284,2016,132,1114,70,1419,50,N,0
5365,145285,2016,132,1163,72,1272,58,N,0
5366,145286,2016,132,1246,82,1401,77,N,1
5367,145287,2016,132,1277,66,1345,62,N,0


In [5]:
enc = LabelBinarizer()
enc.fit(train_row.WLoc.values)
d1 = pd.DataFrame(enc.transform(train_row.WLoc.values),columns=enc.classes_)
d2 = pd.DataFrame(enc.transform(test_row.WLoc.values),columns=enc.classes_)

In [6]:
train = pd.concat([train_row,d1],axis=1)
test = pd.concat([test_row,d2],axis=1)

In [7]:
train.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,A,H,N
0,1985,20,1228,81,1328,64,N,0,0,0,1
1,1985,25,1106,77,1354,70,H,0,0,1,0
2,1985,25,1112,63,1223,56,H,0,0,1,0
3,1985,25,1165,70,1432,54,H,0,0,1,0
4,1985,25,1192,86,1447,74,H,0,0,1,0


In [8]:
#%%time
#id_train = [str(row['Season'])+"_"+ str(min(row["WTeamID"],row["LTeamID"])) + "_"+ str(max(row["WTeamID"],row["LTeamID"])) for _,row in train.iterrows() ]
#team_a = [min(row["WTeamID"],row["LTeamID"]) for _,row in train.iterrows() ]
#team_b = [max(row["WTeamID"],row["LTeamID"]) for _,row in train.iterrows() ]
#win = [1 if row["WTeamID"] > row["LTeamID"] else 0 for _,row in train.iterrows()]

In [9]:
%%time
id_train = [] 
team_a = []
team_b = []
win = []
for _,row in train.iterrows():
    id_train.append(str(row['Season'])+"_"+ str(min(row["WTeamID"],row["LTeamID"])) + "_"+ str(max(row["WTeamID"],row["LTeamID"])))
    team_a.append(min(row["WTeamID"],row["LTeamID"]) )
    team_b.append(max(row["WTeamID"],row["LTeamID"]) )
    if row["WTeamID"] > row["LTeamID"]:
        win.append(1)
    else:
        win.append(0)

CPU times: user 14.8 s, sys: 52.7 ms, total: 14.9 s
Wall time: 14.9 s


In [10]:
df = pd.DataFrame({"Id":id_train,"TeamA":team_a,"TeamB":team_b,"Result":win})
df["DayNum"] = train["DayNum"]

df["WLoc"] = train["WLoc"]
df["Win_gap"] = train["WScore"] - train["LScore"]
df["N"] = train["N"]
df["H"] = train["H"]
df["A"] = train["A"]
df.head()

Unnamed: 0,Id,TeamA,TeamB,Result,DayNum,WLoc,Win_gap,N,H,A
0,1985_1228_1328,1228,1328,0,20,N,17,1,0,0
1,1985_1106_1354,1106,1354,0,25,H,7,0,1,0
2,1985_1112_1223,1112,1223,0,25,H,7,0,1,0
3,1985_1165_1432,1165,1432,0,25,H,16,0,1,0
4,1985_1192_1447,1192,1447,0,25,H,12,0,1,0


In [11]:
x_train = df[["TeamA","TeamB","DayNum","Win_gap","N","A","H"]]
y_train = df["Result"]

In [12]:
id_test = [] 
team_a = []
team_b = []
win_test = []
for _,row in test.iterrows():
    id_test.append(str(row['Season'])+"_"+ str(min(row["WTeamID"],row["LTeamID"])) + "_"+ str(max(row["WTeamID"],row["LTeamID"])))
    team_a.append(min(row["WTeamID"],row["LTeamID"]) )
    team_b.append(max(row["WTeamID"],row["LTeamID"]) )
    if row["WTeamID"] > row["LTeamID"]:
        win_test.append(1)
    else:
        win_test.append(0)

In [13]:
test.head()

Unnamed: 0,index,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,A,H,N
0,139920,2016,11,1104,77,1244,64,H,0,0,1,0
1,139921,2016,11,1105,68,1408,67,A,1,1,0,0
2,139922,2016,11,1112,79,1334,61,H,0,0,1,0
3,139923,2016,11,1115,58,1370,56,A,0,1,0,0
4,139924,2016,11,1116,86,1380,68,H,0,0,1,0


In [14]:
df_test = pd.DataFrame({"Id":id_test,"TeamA":team_a,"TeamB":team_b,"Result":win_test},index=test.index)
df_test["DayNum"] = test["DayNum"]
df_test["WLoc"] = test["WLoc"]
df_test["Win_gap"] = test["WScore"] - test["LScore"]
df_test["N"] = test["N"]
df_test["H"] = test["H"]
df_test["A"] = test["A"]
df_test.head()

Unnamed: 0,Id,TeamA,TeamB,Result,DayNum,WLoc,Win_gap,N,H,A
0,2016_1104_1244,1104,1244,0,11,H,13,0,1,0
1,2016_1105_1408,1105,1408,0,11,A,1,0,0,1
2,2016_1112_1334,1112,1334,0,11,H,18,0,1,0
3,2016_1115_1370,1115,1370,0,11,A,2,0,0,1
4,2016_1116_1380,1116,1380,0,11,H,18,0,1,0


In [15]:
x_test = df_test[["TeamA","TeamB","DayNum","Win_gap","N","H","A"]]
y_test = df_test[["Result"]]

In [16]:
model = XGBClassifier(use_label_encoder=False,eval_metric="logloss")

In [17]:
score = cross_val_score(model,x_train,y_train)
print(score.mean())

0.605817610062893


In [18]:
model.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='logloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [19]:
y_pred = model.predict_proba(x_test)

In [20]:
log_loss(y_test,y_pred)

0.6319887746502915