## Trying with just the historical data from that season

In [2]:
import pandas as pd
import numpy as np

# Read the two detailed-results CSV files from ./data into DataFrames.
# Both frames are concatenated row-wise in the next cell.
tournament_data = pd.read_csv("./data/WNCAATourneyDetailedResults.csv")
season_data = pd.read_csv("./data/WRegularSeasonDetailedResults.csv")
# Last expression of the cell: displays (n_rows, n_columns) of season_data.
season_data.shape

(81308, 34)

In [3]:
# Stack regular-season and tournament games, then order them chronologically
# (by Season, then DayNum) with a fresh 0..n-1 index.
combined_data = pd.concat([season_data, tournament_data], axis=0)
sorted_data = combined_data.sort_values(by=["Season", "DayNum"]).reset_index(drop=True)

# Order-independent key for a matchup: the season plus the two team IDs with
# the smaller ID first. The key parts live in a scratch frame so no temporary
# columns have to be added to (and later dropped from) sorted_data.
cols = ["Season", "first_id", "second_id"]
team_ids = sorted_data[["WTeamID", "LTeamID"]]
key_parts = pd.DataFrame(
    {
        "Season": sorted_data["Season"],
        "first_id": team_ids.min(axis=1),
        "second_id": team_ids.max(axis=1),
    }
)[cols]

# prob is 1 when the team with the smaller ID is the winner, otherwise 0.
sorted_data["prob"] = (key_parts["first_id"] == sorted_data["WTeamID"]).astype("int64")

# Vectorised "<Season>_<first_id>_<second_id>" string; this replaces a
# row-by-row apply(axis=1) and produces the same values.
key_text = key_parts.astype(str)
sorted_data["game_id"] = key_text[cols[0]].str.cat(
    [key_text[c] for c in cols[1:]], sep="_"
)

# WLoc is the location from the winner's side; mirror it for the loser
# (H <-> A, N stays N).
sorted_data["LLoc"] = sorted_data["WLoc"].replace({"H": "A", "A": "H", "N": "N"})
sorted_data.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,prob,game_id,LLoc
0,2010,11,3103,63,3237,49,H,0,23,54,...,11,27,11,23,7,6,19,1,2010_3103_3237,A
1,2010,11,3104,73,3399,68,N,0,26,62,...,14,26,7,20,4,2,27,1,2010_3104_3399,N
2,2010,11,3110,71,3224,59,A,0,29,62,...,17,23,8,15,6,0,15,1,2010_3110_3224,H
3,2010,11,3111,63,3267,58,A,0,27,52,...,22,22,15,11,14,5,14,1,2010_3111_3267,H
4,2010,11,3119,74,3447,70,H,1,30,74,...,21,32,12,14,4,2,14,1,2010_3119_3447,A


In [4]:
# Partition the game-level columns by prefix: "W..." describes the winner,
# "L..." describes the loser, everything else is shared by both sides.
neutral_cols = [name for name in sorted_data.columns if not name.startswith(("W", "L"))]
winning_cols = [name for name in sorted_data.columns if name.startswith("W")]
losing_cols = [name for name in sorted_data.columns if name.startswith("L")]

# One frame per side, with the one-letter prefix stripped so both frames use
# the same column names (TeamID, Score, Loc, ...), plus a result label.
df_w = (
    sorted_data[neutral_cols + winning_cols]
    .rename(columns={name: name[1:] for name in winning_cols})
    .assign(result="Win")
)
df_l = (
    sorted_data[neutral_cols + losing_cols]
    .rename(columns={name: name[1:] for name in losing_cols})
    .assign(result="Loss")
)

# Stack both sides (one row per team per game) and order each team's games
# chronologically within a season.
df = pd.concat([df_w, df_l], ignore_index=True).sort_values(
    by=["Season", "TeamID", "DayNum"], ignore_index=True
)

# Sanity check: count of missing values per column.
display(df.isna().sum())

# Keep an untouched snapshot so later cells can start again from this state.
saved_df = df.copy()
df.head()

Season     0
DayNum     0
NumOT      0
prob       0
game_id    0
TeamID     0
Score      0
Loc        0
FGM        0
FGA        0
FGM3       0
FGA3       0
FTM        0
FTA        0
OR         0
DR         0
Ast        0
TO         0
Stl        0
Blk        0
PF         0
result     0
dtype: int64

Unnamed: 0,Season,DayNum,NumOT,prob,game_id,TeamID,Score,Loc,FGM,FGA,...,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,result
0,2010,11,0,0,2010_3102_3394,3102,46,A,15,47,...,11,20,12,21,8,25,10,0,19,Loss
1,2010,12,0,0,2010_3102_3399,3102,49,N,14,51,...,20,29,17,26,5,15,1,0,16,Loss
2,2010,18,0,0,2010_3102_3339,3102,65,H,26,59,...,5,7,11,23,13,12,6,0,14,Loss
3,2010,23,0,0,2010_3102_3119,3102,42,A,16,60,...,7,10,19,24,6,17,7,1,14,Loss
4,2010,25,0,0,2010_3102_3392,3102,60,A,20,53,...,20,29,20,21,5,31,8,0,31,Loss


In [None]:
# Start from the saved snapshot so this cell can be re-run safely, and make
# sure each team's games are in chronological order within a season.
df = (
    saved_df.copy()
    .sort_values(by=["Season", "TeamID", "DayNum"])
    .reset_index(drop=True)
)
stats = ["Score", "FGM", "FGA", "FGM3", "FGA3", "FTM", "FTA", "OR", "DR", "Ast", "TO", "Stl", "Blk", "PF"]
cum_stats = [f"cum_{s}" for s in stats]

# cum_<stat> is the team's season total *before* the current game.
# The running total including the current game, minus the current game's own
# value, gives exactly that: it is 0 on each (Season, TeamID) group's first row
# and never picks up values from a neighbouring group. (A plain .shift() on the
# cumsum would move values across group boundaries.)
running_totals = df.groupby(["Season", "TeamID"])[stats].cumsum()
for stat in stats:
    df[f"cum_{stat}"] = running_totals[stat] - df[stat]

Unnamed: 0,Season,DayNum,NumOT,prob,game_id,TeamID,Score,Loc,FGM,FGA,...,cum_FGA3,cum_FTM,cum_FTA,cum_OR,cum_DR,cum_Ast,cum_TO,cum_Stl,cum_Blk,cum_PF
8182,2010,11,0,0,2010_3102_3394,3394,65,H,25,64,...,0,0,0,0,0,0,0,0,0,0
8183,2010,12,1,0,2010_3104_3394,3394,96,H,29,79,...,18,13,18,21,24,13,18,10,1,16
8184,2010,19,0,1,2010_3394_3411,3394,71,A,28,56,...,33,43,60,44,53,26,38,19,4,39
8185,2010,25,0,1,2010_3208_3394,3394,58,N,20,56,...,42,57,92,68,89,32,58,22,7,53
8186,2010,26,0,0,2010_3238_3394,3394,57,A,18,48,...,66,67,113,80,117,46,77,26,11,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162121,2025,103,1,1,2025_3230_3394,3394,55,H,21,61,...,293,210,355,200,492,244,349,198,64,369
162122,2025,108,0,0,2025_3311_3394,3394,59,A,21,49,...,312,219,368,210,516,254,367,208,67,390
162123,2025,110,0,1,2025_3270_3394,3394,60,A,20,66,...,330,231,388,220,540,270,384,221,68,405
162124,2025,115,0,0,2025_3309_3394,3394,90,H,30,59,...,346,250,412,235,558,278,396,232,68,425


## Adding in opposing team information?