In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [2]:
team = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage1/MTeams.csv")
seasons = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage1/MSeasons.csv")
seeds = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage1/MNCAATourneySeeds.csv")
season_results = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage1/MRegularSeasonCompactResults.csv")
tour_results = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage1/MNCAATourneyCompactResults.csv")

In [3]:
# Makng train test split
season_train = season_results.query("Season < 2016")
season_test = season_results.query("Season >= 2016")
print(len(season_train))

139920


In [4]:
def generate_team_scores(frame):
    df = team.copy()
    df = df.set_index("TeamID")
    df["MeanWinScore"] = frame.groupby("WTeamID").mean()["WScore"].astype(int)
    df["MeanLossScore"] = frame.groupby("LTeamID").mean()["LScore"].astype(int)
    df["WinLossDiff"] = df["MeanWinScore"] - df["MeanLossScore"]


    df["Wins"] = frame["WTeamID"].value_counts() # add feature of the number of total wins
    return df
train_team_scores = generate_team_scores(season_train)

In [5]:
# For test score the values should be the entire history 
test_team_scores = team.copy()
test_team_scores = test_team_scores.set_index("TeamID")
test_team_scores["MeanWinScore"] = season_results.groupby("WTeamID").mean()["WScore"].astype(int)
test_team_scores["MeanLossScore"] = season_results.groupby("LTeamID").mean()["LScore"].astype(int)
test_team_scores["WinLossDiff"] = test_team_scores["MeanWinScore"] - test_team_scores["MeanLossScore"]


test_team_scores["Wins"] = season_results["WTeamID"].value_counts() # add feature of the number of total wins
test_team_scores = test_team_scores.query("LastD1Season >= 2016")

In [6]:
## Making training data set

In [7]:
def Add_features(season_res):
    df = pd.DataFrame()
    df["Season"] = season_res["Season"]
    df["TeamA"] = season_res[["WTeamID","LTeamID"]].apply(min,axis=1)
    df["TeamB"] = season_res[["WTeamID","LTeamID"]].apply(max,axis=1)
    df["ID"] = df.apply(lambda f: str(f["Season"])+"_"+ str(f["TeamA"]) + "_"+ str(f["TeamB"]),axis=1)
    df["Win"] = np.ones(len(df),dtype=int)
    df.loc[season_res["LTeamID"] > season_res["WTeamID"],"Win"] = 0
    return df

In [8]:
train = Add_features(season_train)
train.head()

Unnamed: 0,Season,TeamA,TeamB,ID,Win
0,1985,1228,1328,1985_1228_1328,0
1,1985,1106,1354,1985_1106_1354,0
2,1985,1112,1223,1985_1112_1223,0
3,1985,1165,1432,1985_1165_1432,0
4,1985,1192,1447,1985_1192_1447,0


In [9]:
train["TeamAWins"] = train_team_scores.loc[train["TeamA"].values]["Wins"].values
train["TeamBWins"] = train_team_scores.loc[train["TeamB"].values]["Wins"].values

train["TeamAMeanLossScore"] = train_team_scores.loc[train["TeamA"].values]["MeanLossScore"].values
train["TeamBMeanLossScore"] = train_team_scores.loc[train["TeamB"].values]["MeanLossScore"].values


train["TeamAMeanWinScore"] = train_team_scores.loc[train["TeamA"].values]["MeanWinScore"].values
train["TeamBMeanWinScore"] = train_team_scores.loc[train["TeamB"].values]["MeanWinScore"].values

train["TeamAWinLossDiff"] = train_team_scores.loc[train["TeamA"].values]["WinLossDiff"].values
train["TeamBWinLossDiff"] = train_team_scores.loc[train["TeamB"].values]["WinLossDiff"].values

In [10]:
colums = ["Season","TeamA","TeamB","TeamAWins","TeamBWins",
                "TeamAMeanLossScore","TeamBMeanLossScore",
                "TeamAMeanWinScore","TeamBMeanWinScore",
                "TeamAWinLossDiff","TeamBWinLossDiff"]

In [11]:
x_train = train[colums]
y_train = train[["Win"]]

In [12]:
x_train

Unnamed: 0,Season,TeamA,TeamB,TeamAWins,TeamBWins,TeamAMeanLossScore,TeamBMeanLossScore,TeamAMeanWinScore,TeamBMeanWinScore,TeamAWinLossDiff,TeamBWinLossDiff
0,1985,1228,1328,645.0,648.0,63.0,68.0,77.0,84.0,14.0,16.0
1,1985,1106,1354,382.0,420.0,66.0,62.0,76.0,74.0,10.0,12.0
2,1985,1112,1223,721.0,107.0,69.0,64.0,82.0,75.0,13.0,11.0
3,1985,1165,1432,348.0,28.0,60.0,61.0,73.0,71.0,13.0,10.0
4,1985,1192,1447,408.0,404.0,64.0,66.0,76.0,77.0,12.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...
139915,2015,1204,1209,405.0,387.0,64.0,67.0,76.0,76.0,12.0,9.0
139916,2015,1116,1246,609.0,739.0,67.0,68.0,84.0,80.0,17.0,12.0
139917,2015,1163,1374,655.0,441.0,64.0,63.0,79.0,74.0,15.0,11.0
139918,2015,1173,1433,528.0,588.0,65.0,66.0,75.0,75.0,10.0,9.0


In [13]:
scaler = StandardScaler()
scale_columns = ["TeamAWins","TeamBWins",
               "TeamAMeanLossScore","TeamBMeanLossScore",
                "TeamAMeanWinScore","TeamBMeanWinScore",
               "TeamAWinLossDiff","TeamBWinLossDiff"]
scaler.fit(x_train[scale_columns])
val = scaler.transform(x_train[scale_columns])

In [14]:
x_train_scaled = x_train.copy()
x_train_scaled[scale_columns] = val

In [15]:
from sklearn.model_selection import GridSearchCV
#param = {"n_estimators":[100,300,500],
#         "subsample":[0.1,0.5,1,1.5,2],
#        "learning_rate":[0.01,0.05,0.1]}
#model = XGBClassifier(use_label_encoder=False,eval_metric="logloss")
#clf = GridSearchCV(model,param)
#clf.fit(x_train,y_train)
#sorted(clf.cv_results_.keys())

In [16]:
model = XGBClassifier(use_label_encoder=False,eval_metric="logloss",n_estimators = 350, subsample=0.99, learning_rate=0.01, 
                    colsample_bytree=1, max_depth = None, random_state=27)

In [17]:
# The score of the unscaled data
score = cross_val_score(model,x_train,y_train,cv=10)
print(score.mean())

0.6414451114922815


In [18]:
# Score of scaled data
score = cross_val_score(model,x_train_scaled,y_train,cv=10)
print(score.mean())

0.6414451114922815


In [19]:
# Test dataset
test = Add_features(season_test)
test["TeamAWins"] = test_team_scores.loc[test["TeamA"].values]["Wins"].values
test["TeamBWins"] = test_team_scores.loc[test["TeamB"].values]["Wins"].values

test["TeamAMeanLossScore"] = test_team_scores.loc[test["TeamA"].values]["MeanLossScore"].values
test["TeamBMeanLossScore"] = test_team_scores.loc[test["TeamB"].values]["MeanLossScore"].values


test["TeamAMeanWinScore"] = test_team_scores.loc[test["TeamA"].values]["MeanWinScore"].values
test["TeamBMeanWinScore"] = test_team_scores.loc[test["TeamB"].values]["MeanWinScore"].values

test["TeamAWinLossDiff"] = test_team_scores.loc[test["TeamA"].values]["WinLossDiff"].values
test["TeamBWinLossDiff"] = test_team_scores.loc[test["TeamB"].values]["WinLossDiff"].values

In [20]:
x_test = test[colums]
y_test = test[["Win"]]

In [21]:
x_test_scaled = x_test.copy()
scaled_test = scaler.transform(x_test_scaled[scale_columns])
x_test_scaled[scale_columns] = scaled_test

In [22]:
x_test_scaled

Unnamed: 0,Season,TeamA,TeamB,TeamAWins,TeamBWins,TeamAMeanLossScore,TeamBMeanLossScore,TeamAMeanWinScore,TeamBMeanWinScore,TeamAWinLossDiff,TeamBWinLossDiff
139920,2016,1104,1244,2.158788,-2.464442,-0.049932,-1.195430,0.028803,0.109997,0.128546,2.114138
139921,2016,1105,1408,-1.596857,0.089844,-1.644028,-0.418442,-0.633925,0.109997,1.340023,0.871924
139922,2016,1112,1334,3.327375,0.743990,1.942687,-0.418442,2.016986,-0.899960,0.734284,-0.991396
139923,2016,1115,1370,-1.736500,-2.012770,-2.839599,0.358546,-1.959381,-0.226656,0.734284,-0.991396
139924,2016,1116,1380,2.386626,0.494792,1.145639,0.747040,2.679714,2.466562,3.157238,3.356351
...,...,...,...,...,...,...,...,...,...,...,...
174466,2022,1242,1400,3.849196,2.511743,1.544163,1.524029,2.016986,1.119953,1.340023,-0.370289
174467,2022,1126,1411,-0.501767,0.619391,-0.846980,0.358546,-0.965289,0.783301,-0.477192,0.871924
174468,2022,1422,1441,-0.016693,-0.431916,-0.049932,0.747040,-0.302561,0.446649,-0.477192,-0.370289
174469,2022,1181,1438,3.915343,2.137945,3.536782,-0.418442,2.679714,-0.563308,-0.477192,-0.370289


In [23]:
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.6243524065873636

In [24]:
model.fit(x_train_scaled,y_train)
model.score(x_test_scaled,y_test)

0.624497120199126

In [25]:
# Making test data set
submission = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage1/MSampleSubmissionStage1.csv")
submission.head()

Unnamed: 0,ID,Pred
0,2016_1112_1114,0.5
1,2016_1112_1122,0.5
2,2016_1112_1124,0.5
3,2016_1112_1138,0.5
4,2016_1112_1139,0.5


In [26]:
submission["Season"] = submission.apply(lambda f: int(f["ID"].split("_")[0]),axis=1)
submission["TeamA"] = submission.apply(lambda f: int(f["ID"].split("_")[1]),axis=1)
submission["TeamB"] = submission.apply(lambda f: int(f["ID"].split("_")[2]),axis=1)

In [27]:
submission.head()

Unnamed: 0,ID,Pred,Season,TeamA,TeamB
0,2016_1112_1114,0.5,2016,1112,1114
1,2016_1112_1122,0.5,2016,1112,1122
2,2016_1112_1124,0.5,2016,1112,1124
3,2016_1112_1138,0.5,2016,1112,1138
4,2016_1112_1139,0.5,2016,1112,1139


In [28]:
submission["TeamAWins"] = test_team_scores.loc[submission["TeamA"].values]["Wins"].values
submission["TeamBWins"] = test_team_scores.loc[submission["TeamB"].values]["Wins"].values

submission["TeamAMeanLossScore"] = test_team_scores.loc[submission["TeamA"].values]["MeanLossScore"].values
submission["TeamBMeanLossScore"] = test_team_scores.loc[submission["TeamB"].values]["MeanLossScore"].values


submission["TeamAMeanWinScore"] = test_team_scores.loc[submission["TeamA"].values]["MeanWinScore"].values
submission["TeamBMeanWinScore"] = test_team_scores.loc[submission["TeamB"].values]["MeanWinScore"].values

submission["TeamAWinLossDiff"] = test_team_scores.loc[submission["TeamA"].values]["WinLossDiff"].values
submission["TeamBWinLossDiff"] = test_team_scores.loc[submission["TeamB"].values]["WinLossDiff"].values


In [29]:
submission

Unnamed: 0,ID,Pred,Season,TeamA,TeamB,TeamAWins,TeamBWins,TeamAMeanLossScore,TeamBMeanLossScore,TeamAMeanWinScore,TeamBMeanWinScore,TeamAWinLossDiff,TeamBWinLossDiff
0,2016_1112_1114,0.5,2016,1112,1114,878,562,69,64,82,76,13,12
1,2016_1112_1122,0.5,2016,1112,1122,878,518,69,68,82,78,13,10
2,2016_1112_1124,0.5,2016,1112,1124,878,585,69,65,82,78,13,13
3,2016_1112_1138,0.5,2016,1112,1138,878,443,69,66,82,77,13,11
4,2016_1112_1139,0.5,2016,1112,1139,878,682,69,63,82,73,13,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11385,2021_1452_1457,0.5,2021,1452,1457,693,506,66,63,78,75,12,12
11386,2021_1452_1458,0.5,2021,1452,1458,693,714,66,61,78,72,12,11
11387,2021_1455_1457,0.5,2021,1455,1457,662,506,61,63,75,75,14,12
11388,2021_1455_1458,0.5,2021,1455,1458,662,714,61,61,75,72,14,11


In [30]:
x_submission = submission[colums]
x_submission_scaled = x_submission.copy()

In [31]:
x_submission

Unnamed: 0,Season,TeamA,TeamB,TeamAWins,TeamBWins,TeamAMeanLossScore,TeamBMeanLossScore,TeamAMeanWinScore,TeamBMeanWinScore,TeamAWinLossDiff,TeamBWinLossDiff
0,2016,1112,1114,878,562,69,64,82,76,13,12
1,2016,1112,1122,878,518,69,68,82,78,13,10
2,2016,1112,1124,878,585,69,65,82,78,13,13
3,2016,1112,1138,878,443,69,66,82,77,13,11
4,2016,1112,1139,878,682,69,63,82,73,13,10
...,...,...,...,...,...,...,...,...,...,...,...
11385,2021,1452,1457,693,506,66,63,78,75,12,12
11386,2021,1452,1458,693,714,66,61,78,72,12,11
11387,2021,1455,1457,662,506,61,63,75,75,14,12
11388,2021,1455,1458,662,714,61,61,75,72,14,11


In [32]:
val_sumbmission = scaler.transform(x_submission_scaled[scale_columns])
x_submission_scaled[scale_columns] = val_sumbmission

In [33]:
model.fit(x_train_scaled,y_train)
y_pred = model.predict_proba(x_submission_scaled)

In [34]:
my_sub = pd.DataFrame({"ID":submission["ID"],"Pred":y_pred[:,1]})
my_sub.to_csv("1-submission",index=False)