In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [2]:
team = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage2/MTeams.csv")
seasons = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage2/MSeasons.csv")
season_results = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage2/MRegularSeasonCompactResults.csv")

In [3]:
# Makng train test split
season_train = season_results.query("Season < 2022")
season_test = season_results.query("Season >= 2022")
print(len(season_train))

170735


In [4]:
sub = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage2/MSampleSubmissionStage2.csv")
sub.head()

Unnamed: 0,ID,Pred
0,2022_1103_1104,0.5
1,2022_1103_1112,0.5
2,2022_1103_1116,0.5
3,2022_1103_1120,0.5
4,2022_1103_1124,0.5


In [5]:
def generate_team_scores(frame):
    df = team.copy()
    df = df.set_index("TeamID")
    df["MeanWinScore"] = frame.groupby("WTeamID").mean()["WScore"].astype(int)
    df["MeanLossScore"] = frame.groupby("LTeamID").mean()["LScore"].astype(int)
    df["WinLossDiff"] = df["MeanWinScore"] - df["MeanLossScore"]


    df["Wins"] = frame["WTeamID"].value_counts() # add feature of the number of total wins
    return df
train_team_scores = generate_team_scores(season_train)

In [6]:
# For test score the values should be the entire history 
test_team_scores = team.copy()
test_team_scores = test_team_scores.set_index("TeamID")
test_team_scores["MeanWinScore"] = season_results.groupby("WTeamID").mean()["WScore"].astype(int)
test_team_scores["MeanLossScore"] = season_results.groupby("LTeamID").mean()["LScore"].astype(int)
test_team_scores["WinLossDiff"] = test_team_scores["MeanWinScore"] - test_team_scores["MeanLossScore"]


test_team_scores["Wins"] = season_results["WTeamID"].value_counts() # add feature of the number of total wins
test_team_scores = test_team_scores.query("LastD1Season >= 2022")

In [7]:
def Add_features(season_res):
    df = pd.DataFrame()
    df["Season"] = season_res["Season"]
    df["TeamA"] = season_res[["WTeamID","LTeamID"]].apply(min,axis=1)
    df["TeamB"] = season_res[["WTeamID","LTeamID"]].apply(max,axis=1)
    df["ID"] = df.apply(lambda f: str(f["Season"])+"_"+ str(f["TeamA"]) + "_"+ str(f["TeamB"]),axis=1)
    df["Win"] = np.ones(len(df),dtype=int)
    df.loc[season_res["LTeamID"] > season_res["WTeamID"],"Win"] = 0
    return df

In [8]:
train = Add_features(season_train)
train.head()

Unnamed: 0,Season,TeamA,TeamB,ID,Win
0,1985,1228,1328,1985_1228_1328,0
1,1985,1106,1354,1985_1106_1354,0
2,1985,1112,1223,1985_1112_1223,0
3,1985,1165,1432,1985_1165_1432,0
4,1985,1192,1447,1985_1192_1447,0


In [9]:
train["TeamAWins"] = train_team_scores.loc[train["TeamA"].values]["Wins"].values
train["TeamBWins"] = train_team_scores.loc[train["TeamB"].values]["Wins"].values

train["TeamAMeanLossScore"] = train_team_scores.loc[train["TeamA"].values]["MeanLossScore"].values
train["TeamBMeanLossScore"] = train_team_scores.loc[train["TeamB"].values]["MeanLossScore"].values


train["TeamAMeanWinScore"] = train_team_scores.loc[train["TeamA"].values]["MeanWinScore"].values
train["TeamBMeanWinScore"] = train_team_scores.loc[train["TeamB"].values]["MeanWinScore"].values

train["TeamAWinLossDiff"] = train_team_scores.loc[train["TeamA"].values]["WinLossDiff"].values
train["TeamBWinLossDiff"] = train_team_scores.loc[train["TeamB"].values]["WinLossDiff"].values

In [10]:
colums = ["Season","TeamA","TeamB","TeamAWins","TeamBWins",
                "TeamAMeanLossScore","TeamBMeanLossScore",
                "TeamAMeanWinScore","TeamBMeanWinScore",
                "TeamAWinLossDiff","TeamBWinLossDiff"]

x_train = train[colums]
y_train = train[["Win"]]



In [11]:
scaler = StandardScaler()
scale_columns = ["TeamAWins","TeamBWins",
               "TeamAMeanLossScore","TeamBMeanLossScore",
                "TeamAMeanWinScore","TeamBMeanWinScore",
               "TeamAWinLossDiff","TeamBWinLossDiff"]
scaler.fit(x_train[scale_columns])
val = scaler.transform(x_train[scale_columns])

In [12]:


x_train_scaled = x_train.copy()
x_train_scaled[scale_columns] = val



In [13]:


model = XGBClassifier(use_label_encoder=False,eval_metric="logloss",n_estimators = 350, subsample=0.99, learning_rate=0.01, 
                    colsample_bytree=1, max_depth = None, random_state=27)



In [14]:
test = Add_features(season_test)
test["TeamAWins"] = test_team_scores.loc[test["TeamA"].values]["Wins"].values
test["TeamBWins"] = test_team_scores.loc[test["TeamB"].values]["Wins"].values

test["TeamAMeanLossScore"] = test_team_scores.loc[test["TeamA"].values]["MeanLossScore"].values
test["TeamBMeanLossScore"] = test_team_scores.loc[test["TeamB"].values]["MeanLossScore"].values


test["TeamAMeanWinScore"] = test_team_scores.loc[test["TeamA"].values]["MeanWinScore"].values
test["TeamBMeanWinScore"] = test_team_scores.loc[test["TeamB"].values]["MeanWinScore"].values

test["TeamAWinLossDiff"] = test_team_scores.loc[test["TeamA"].values]["WinLossDiff"].values
test["TeamBWinLossDiff"] = test_team_scores.loc[test["TeamB"].values]["WinLossDiff"].values

In [15]:
x_test = test[colums]
y_test = test[["Win"]]

In [16]:
x_test_scaled = x_test.copy()
scaled_test = scaler.transform(x_test_scaled[scale_columns])
x_test_scaled[scale_columns] = scaled_test

In [17]:


# Making test data set
submission = pd.read_csv("/kaggle/input/mens-march-mania-2022/MDataFiles_Stage2/MSampleSubmissionStage2.csv")
submission.head()



Unnamed: 0,ID,Pred
0,2022_1103_1104,0.5
1,2022_1103_1112,0.5
2,2022_1103_1116,0.5
3,2022_1103_1120,0.5
4,2022_1103_1124,0.5


In [18]:
submission["Season"] = submission.apply(lambda f: int(f["ID"].split("_")[0]),axis=1)
submission["TeamA"] = submission.apply(lambda f: int(f["ID"].split("_")[1]),axis=1)
submission["TeamB"] = submission.apply(lambda f: int(f["ID"].split("_")[2]),axis=1)

In [19]:
submission["TeamAWins"] = test_team_scores.loc[submission["TeamA"].values]["Wins"].values
submission["TeamBWins"] = test_team_scores.loc[submission["TeamB"].values]["Wins"].values

submission["TeamAMeanLossScore"] = test_team_scores.loc[submission["TeamA"].values]["MeanLossScore"].values
submission["TeamBMeanLossScore"] = test_team_scores.loc[submission["TeamB"].values]["MeanLossScore"].values


submission["TeamAMeanWinScore"] = test_team_scores.loc[submission["TeamA"].values]["MeanWinScore"].values
submission["TeamBMeanWinScore"] = test_team_scores.loc[submission["TeamB"].values]["MeanWinScore"].values

submission["TeamAWinLossDiff"] = test_team_scores.loc[submission["TeamA"].values]["WinLossDiff"].values
submission["TeamBWinLossDiff"] = test_team_scores.loc[submission["TeamB"].values]["WinLossDiff"].values

In [20]:
x_submission = submission[colums]
x_submission_scaled = x_submission.copy()

In [21]:
val_sumbmission = scaler.transform(x_submission_scaled[scale_columns])
x_submission_scaled[scale_columns] = val_sumbmission


In [22]:
model.fit(x_train_scaled,y_train)
y_pred = model.predict_proba(x_submission_scaled)


In [23]:
my_sub = pd.DataFrame({"ID":submission["ID"],"Pred":y_pred[:,1]})
my_sub.to_csv("1-submission",index=False)

In [24]:
my_sub.head(20)

Unnamed: 0,ID,Pred
0,2022_1103_1104,0.559983
1,2022_1103_1112,0.639147
2,2022_1103_1116,0.556753
3,2022_1103_1120,0.522861
4,2022_1103_1124,0.52405
5,2022_1103_1129,0.522462
6,2022_1103_1136,0.117103
7,2022_1103_1151,0.526313
8,2022_1103_1159,0.295357
9,2022_1103_1161,0.519905
