In [92]:
import pandas as pd
import tensorflow as tf

In [93]:
df = pd.read_csv("nba_games_with_player_data.csv", index_col=0)
#sort by date so the previous values always already happen
df = df.sort_values("date")
#resetting the column number
df = df.reset_index(drop=True)


In [94]:
#deleting/adding columns
df=df.drop(["index_opp", "+/-"], axis=1)
df["won_opp"] = ~df["won"]
df=df.copy()
#filling up the NaN values
extra = ["won_opp", "home", "home_opp", "total", "total_opp", "season", 'date', 'team', 'team_opp', 'won']
away_players_columns = [col for col in df.columns if col.endswith('_opp') and col not in extra]
home_players_columns = [col for col in df.columns if '_opp' not in col and col not in extra]

#fill missing rows with their previous performance and if no performance, then fill with 0
def fill_missing(team):
    filled = team.loc[:,home_players_columns].ffill()
    filled = filled.fillna(0)
    return filled
df_home_fill=df.groupby(["team"], group_keys=False).apply(fill_missing)

def fill_missing_opp(team):
    filled = team.loc[:,away_players_columns].ffill()
    filled = filled.fillna(0)
    return filled
df_away_fill=df.groupby(["team_opp"], group_keys=False).apply(fill_missing_opp)

In [95]:
#reset df
df=pd.concat([df_home_fill, df_away_fill, df[extra]], axis=1)
df = df.copy()

In [96]:
team_cols = df.columns[~df.columns.str.contains('_')]
team_opp_columns = [f'{col}_opp' for col in team_cols]

In [97]:
extra = ["date_opp", "season_opp"]

In [98]:
team_df_cols = list(team_cols) + list(team_opp_columns)

In [99]:
team_df_cols = [col for col in team_df_cols if col not in extra]

In [100]:
team_df_cols

['fg',
 'fga',
 'fg%',
 '3p',
 '3pa',
 '3p%',
 'ft',
 'fta',
 'ft%',
 'orb',
 'drb',
 'trb',
 'ast',
 'stl',
 'blk',
 'tov',
 'pf',
 'pts',
 'ts%',
 'efg%',
 '3par',
 'ftr',
 'orb%',
 'drb%',
 'trb%',
 'ast%',
 'stl%',
 'blk%',
 'tov%',
 'usg%',
 'ortg',
 'drtg',
 'home',
 'total',
 'season',
 'date',
 'team',
 'won',
 'fg_opp',
 'fga_opp',
 'fg%_opp',
 '3p_opp',
 '3pa_opp',
 '3p%_opp',
 'ft_opp',
 'fta_opp',
 'ft%_opp',
 'orb_opp',
 'drb_opp',
 'trb_opp',
 'ast_opp',
 'stl_opp',
 'blk_opp',
 'tov_opp',
 'pf_opp',
 'pts_opp',
 'ts%_opp',
 'efg%_opp',
 '3par_opp',
 'ftr_opp',
 'orb%_opp',
 'drb%_opp',
 'trb%_opp',
 'ast%_opp',
 'stl%_opp',
 'blk%_opp',
 'tov%_opp',
 'usg%_opp',
 'ortg_opp',
 'drtg_opp',
 'home_opp',
 'total_opp',
 'team_opp',
 'won_opp']

In [101]:
print("+/-" in df.columns)

False


In [102]:
team_df = df[team_df_cols]

In [103]:
team_df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,stl%_opp,blk%_opp,tov%_opp,usg%_opp,ortg_opp,drtg_opp,home_opp,total_opp,team_opp,won_opp
0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,23.0,...,9.4,6.0,14.5,100.0,98.6,111.2,0,94,ATL,False
1,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,7.0,...,5.2,5.5,12.3,100.0,111.2,98.6,1,106,DET,True
2,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,11.0,...,6.0,15.4,11.8,100.0,97.5,95.5,0,97,CHI,True
3,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,7.0,...,5.0,10.3,9.0,100.0,95.5,97.5,1,95,CLE,False
4,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,8.0,...,8.0,10.8,15.9,100.0,110.9,94.9,0,111,GSW,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20407,34.0,92.0,0.370,11.0,35.0,0.314,15.0,19.0,0.789,10.0,...,3.3,8.8,12.4,100.0,120.4,103.8,1,109,DEN,True
20408,39.0,79.0,0.494,14.0,28.0,0.500,16.0,21.0,0.762,5.0,...,2.3,5.9,13.9,100.0,107.2,121.9,0,95,MIA,False
20409,35.0,78.0,0.449,8.0,25.0,0.320,17.0,20.0,0.850,8.0,...,12.4,13.2,6.4,100.0,121.9,107.2,1,108,DEN,True
20410,33.0,96.0,0.344,9.0,35.0,0.257,14.0,16.0,0.875,11.0,...,6.2,11.5,12.9,100.0,97.8,92.6,0,94,DEN,True


In [104]:
df = df.copy()
team_df = team_df.copy()

In [105]:
# creating new columns net rtng, Assist turnover, Rebound Deferential & b2b
team_df["net_rtg"] = team_df["ortg"] - team_df["drtg"]
team_df["net_rtg_opp"] = team_df["ortg_opp"] - team_df["drtg_opp"]

team_df["ast_tov"] = team_df["ast"] / team_df["tov"]
team_df["ast_tov_opp"] = team_df["ast_opp"] / team_df["tov_opp"]

team_df["rb_diff"] = team_df["trb"] - team_df["trb_opp"]
team_df["rb_diff_opp"] = team_df["trb_opp"] - team_df["trb"]

In [106]:
team_df['prev_game_date'] = team_df.groupby('team')['date'].shift(1)
team_df['prev_game_date_opp'] = team_df.groupby('team_opp')['date'].shift(1)
timeDelta = pd.to_datetime(team_df["date"], format="%Y-%m-%d") - pd.to_datetime(team_df["prev_game_date"], format="%Y-%m-%d")
timeDelta_opp = pd.to_datetime(team_df["date"], format="%Y-%m-%d") - pd.to_datetime(team_df['prev_game_date_opp'], format="%Y-%m-%d")
team_df["b2b"] = timeDelta.dt.days == 1
team_df["b2b_opp"] = timeDelta_opp.dt.days == 1

team_df[["b2b", "b2b_opp"]] = team_df[["b2b", "b2b_opp"]].astype(int, errors="ignore")

In [107]:
team_df = team_df[team_df["home"] == 1]

In [108]:
team_df.reset_index(drop=True)

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,net_rtg,net_rtg_opp,ast_tov,ast_tov_opp,rb_diff,rb_diff_opp,prev_game_date,prev_game_date_opp,b2b,b2b_opp
0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,23.0,...,12.6,-12.6,1.533333,1.466667,19.0,-19.0,,,0,0
1,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,11.0,...,-2.0,2.0,2.600000,1.000000,3.0,-3.0,,,0,0
2,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,8.0,...,-16.0,16.0,1.166667,1.450000,-23.0,23.0,,,0,0
3,45.0,93.0,0.484,5.0,15.0,0.333,11.0,13.0,0.846,7.0,...,-5.9,5.9,1.692308,1.105263,-9.0,9.0,,,0,0
4,33.0,84.0,0.393,7.0,28.0,0.250,15.0,25.0,0.600,15.0,...,1.1,-1.1,1.000000,1.428571,-7.0,7.0,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10201,39.0,96.0,0.406,13.0,39.0,0.333,2.0,2.0,1.000,11.0,...,-12.1,12.1,3.250000,2.900000,-2.0,2.0,2023-05-29,2023-05-22,0,0
10202,38.0,78.0,0.487,17.0,35.0,0.486,18.0,20.0,0.900,8.0,...,3.5,-3.5,2.545455,1.769231,-7.0,7.0,2023-06-01,2023-06-01,0,0
10203,41.0,80.0,0.513,5.0,18.0,0.278,22.0,27.0,0.815,13.0,...,16.6,-16.6,2.153846,5.000000,25.0,-25.0,2023-06-04,2023-06-04,0,0
10204,39.0,79.0,0.494,14.0,28.0,0.500,16.0,21.0,0.762,5.0,...,14.7,-14.7,4.333333,1.642857,-3.0,3.0,2023-06-07,2023-06-07,0,0


In [109]:
team_df.to_csv("team_features.csv")

In [110]:
player_cols = [col for col in df.columns if col not in team_df_cols]

In [111]:
cols_to_add = ["team", "team_opp","date", "season", "won", "home"]

In [112]:
player_cols = player_cols + cols_to_add

In [113]:
player_df = df[player_cols]

In [114]:
player_df = player_df[player_df["home"] == 1]

In [115]:
player_df.to_csv("player_features.csv")

# Rolling the team averages

### import team features csv from here

In [116]:
team_df = pd.read_csv("team_features.csv")

In [117]:
team_df

Unnamed: 0.1,Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,net_rtg,net_rtg_opp,ast_tov,ast_tov_opp,rb_diff,rb_diff_opp,prev_game_date,prev_game_date_opp,b2b,b2b_opp
0,0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,...,12.6,-12.6,1.533333,1.466667,19.0,-19.0,,,0,0
1,2,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,-2.0,2.0,2.600000,1.000000,3.0,-3.0,,,0,0
2,4,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,-16.0,16.0,1.166667,1.450000,-23.0,23.0,,,0,0
3,6,45.0,93.0,0.484,5.0,15.0,0.333,11.0,13.0,0.846,...,-5.9,5.9,1.692308,1.105263,-9.0,9.0,,,0,0
4,8,33.0,84.0,0.393,7.0,28.0,0.250,15.0,25.0,0.600,...,1.1,-1.1,1.000000,1.428571,-7.0,7.0,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10201,20402,39.0,96.0,0.406,13.0,39.0,0.333,2.0,2.0,1.000,...,-12.1,12.1,3.250000,2.900000,-2.0,2.0,2023-05-29,2023-05-22,0,0
10202,20404,38.0,78.0,0.487,17.0,35.0,0.486,18.0,20.0,0.900,...,3.5,-3.5,2.545455,1.769231,-7.0,7.0,2023-06-01,2023-06-01,0,0
10203,20406,41.0,80.0,0.513,5.0,18.0,0.278,22.0,27.0,0.815,...,16.6,-16.6,2.153846,5.000000,25.0,-25.0,2023-06-04,2023-06-04,0,0
10204,20408,39.0,79.0,0.494,14.0,28.0,0.500,16.0,21.0,0.762,...,14.7,-14.7,4.333333,1.642857,-3.0,3.0,2023-06-07,2023-06-07,0,0


In [118]:
team_df = team_df.drop(["prev_game_date", "prev_game_date_opp", "Unnamed: 0"], axis=1)

In [119]:
home_team_cols = team_df.columns[~team_df.columns.str.contains('_opp')]

In [122]:
home_team_cols

Index(['fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb',
       'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'ts%', 'efg%',
       '3par', 'ftr', 'orb%', 'drb%', 'trb%', 'ast%', 'stl%', 'blk%', 'tov%',
       'usg%', 'ortg', 'drtg', 'home', 'total', 'season', 'date', 'team',
       'won', 'net_rtg', 'ast_tov', 'rb_diff', 'b2b'],
      dtype='object')

In [123]:
# select rolling columns for home  (done)
# create a df of rolling columns for home
no_roll_columns = ["date", "home", "b2b", "team", "season"]
home_team_cols = [col for col in home_team_cols if col not in no_roll_columns]
df_home_rolling = team_df[home_team_cols]

# append those rolling cols to team_df
# select rolling columns for away
# create a df of rolling columns for away
# append those rolling cols to team_df
# collectively selecting rolling cols


['fg',
 'fga',
 'fg%',
 '3p',
 '3pa',
 '3p%',
 'ft',
 'fta',
 'ft%',
 'orb',
 'drb',
 'trb',
 'ast',
 'stl',
 'blk',
 'tov',
 'pf',
 'pts',
 'ts%',
 'efg%',
 '3par',
 'ftr',
 'orb%',
 'drb%',
 'trb%',
 'ast%',
 'stl%',
 'blk%',
 'tov%',
 'usg%',
 'ortg',
 'drtg',
 'total',
 'won',
 'net_rtg',
 'ast_tov',
 'rb_diff']

In [124]:
window_size = 82 
df_home_rolling = team_df[home_team_cols + ["team", "season"]]
def find_team_averages(team):
    rolling = team[home_team_cols].shift(1).rolling(window_size, min_periods=1).mean()
    return rolling
    #using rolling average on team
df_home_rolling = df_home_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)
df_home_rolling = df_home_rolling.add_suffix(f'_{window_size}')

In [125]:
df_home_rolling = df_home_rolling.reset_index(drop=True)

In [126]:
df_home_rolling

Unnamed: 0,fg_82,fga_82,fg%_82,3p_82,3pa_82,3p%_82,ft_82,fta_82,ft%_82,orb_82,...,blk%_82,tov%_82,usg%_82,ortg_82,drtg_82,total_82,won_82,net_rtg_82,ast_tov_82,rb_diff_82
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10201,39.450980,86.137255,0.458451,12.333333,35.960784,0.343373,17.352941,21.627451,0.802275,9.411765,...,6.168627,11.831373,100.0,112.615686,113.941176,108.588235,0.450980,-1.325490,2.045027,-2.921569
10202,39.442308,86.326923,0.457442,12.346154,36.019231,0.343173,17.057692,21.250000,0.806077,9.442308,...,6.198077,11.750000,100.0,112.417308,113.950000,108.288462,0.442308,-1.532692,2.068199,-2.903846
10203,42.812500,86.354167,0.497292,11.270833,30.729167,0.364063,16.083333,20.625000,0.776625,9.833333,...,7.814583,12.539583,100.0,116.368750,118.420833,112.979167,0.479167,-2.052083,2.275350,0.145833
10204,42.775510,86.224490,0.497612,11.142857,30.469388,0.362306,16.204082,20.755102,0.777408,9.897959,...,7.834694,12.536735,100.0,116.451020,118.122449,112.897959,0.489796,-1.671429,2.272871,0.653061


In [127]:
#adding columns to make a wide dataframe
team_df=pd.concat([team_df, df_home_rolling], axis=1)

In [129]:
away_team_cols = [f'{col}_opp' for col in home_team_cols]

In [130]:
away_team_cols

['fg_opp',
 'fga_opp',
 'fg%_opp',
 '3p_opp',
 '3pa_opp',
 '3p%_opp',
 'ft_opp',
 'fta_opp',
 'ft%_opp',
 'orb_opp',
 'drb_opp',
 'trb_opp',
 'ast_opp',
 'stl_opp',
 'blk_opp',
 'tov_opp',
 'pf_opp',
 'pts_opp',
 'ts%_opp',
 'efg%_opp',
 '3par_opp',
 'ftr_opp',
 'orb%_opp',
 'drb%_opp',
 'trb%_opp',
 'ast%_opp',
 'stl%_opp',
 'blk%_opp',
 'tov%_opp',
 'usg%_opp',
 'ortg_opp',
 'drtg_opp',
 'total_opp',
 'won_opp',
 'net_rtg_opp',
 'ast_tov_opp',
 'rb_diff_opp']

In [131]:
window_size = 82 
df_away_rolling = team_df[away_team_cols + ["team_opp", "season"]]
def find_team_averages_away(team):
    rolling = team[away_team_cols].shift(1).rolling(window_size, min_periods=1).mean()
    return rolling
    #using rolling average on team
df_away_rolling = df_away_rolling.groupby(["team_opp", "season"], group_keys=False).apply(find_team_averages_away)
df_away_rolling = df_away_rolling.add_suffix(f'_{window_size}')

In [132]:
df_away_rolling

Unnamed: 0,fg_opp_82,fga_opp_82,fg%_opp_82,3p_opp_82,3pa_opp_82,3p%_opp_82,ft_opp_82,fta_opp_82,ft%_opp_82,orb_opp_82,...,blk%_opp_82,tov%_opp_82,usg%_opp_82,ortg_opp_82,drtg_opp_82,total_opp_82,won_opp_82,net_rtg_opp_82,ast_tov_opp_82,rb_diff_opp_82
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10201,44.224490,87.040816,0.508857,12.448980,31.734694,0.390367,17.836735,23.938776,0.747735,10.469388,...,8.332653,11.600000,100.0,121.420408,111.138776,118.734694,0.857143,10.281633,2.367678,5.306122
10202,44.140000,86.880000,0.508800,12.360000,31.640000,0.388480,17.800000,23.860000,0.748780,10.380000,...,8.306000,11.572000,100.0,121.280000,110.962000,118.440000,0.860000,10.318000,2.378324,5.240000
10203,39.392157,84.784314,0.465529,11.921569,33.078431,0.362706,20.196078,23.764706,0.844961,9.725490,...,6.574510,11.358824,100.0,116.194118,114.013725,110.901961,0.666667,2.180392,2.135988,-0.156863
10204,39.288462,84.923077,0.463692,11.903846,33.115385,0.361769,20.096154,23.673077,0.843885,9.730769,...,6.540385,11.213462,100.0,115.955769,114.136538,110.576923,0.653846,1.819231,2.191065,-0.634615


In [133]:
team_df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,blk%_82,tov%_82,usg%_82,ortg_82,drtg_82,total_82,won_82,net_rtg_82,ast_tov_82,rb_diff_82
0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,23.0,...,,,,,,,,,,
1,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,11.0,...,,,,,,,,,,
2,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,8.0,...,,,,,,,,,,
3,45.0,93.0,0.484,5.0,15.0,0.333,11.0,13.0,0.846,7.0,...,,,,,,,,,,
4,33.0,84.0,0.393,7.0,28.0,0.250,15.0,25.0,0.600,15.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10201,39.0,96.0,0.406,13.0,39.0,0.333,2.0,2.0,1.000,11.0,...,6.168627,11.831373,100.0,112.615686,113.941176,108.588235,0.450980,-1.325490,2.045027,-2.921569
10202,38.0,78.0,0.487,17.0,35.0,0.486,18.0,20.0,0.900,8.0,...,6.198077,11.750000,100.0,112.417308,113.950000,108.288462,0.442308,-1.532692,2.068199,-2.903846
10203,41.0,80.0,0.513,5.0,18.0,0.278,22.0,27.0,0.815,13.0,...,7.814583,12.539583,100.0,116.368750,118.420833,112.979167,0.479167,-2.052083,2.275350,0.145833
10204,39.0,79.0,0.494,14.0,28.0,0.500,16.0,21.0,0.762,5.0,...,7.834694,12.536735,100.0,116.451020,118.122449,112.897959,0.489796,-1.671429,2.272871,0.653061


In [135]:
team_df=pd.concat([team_df, df_away_rolling], axis=1)

In [136]:
team_df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,blk%_opp_82,tov%_opp_82,usg%_opp_82,ortg_opp_82,drtg_opp_82,total_opp_82,won_opp_82,net_rtg_opp_82,ast_tov_opp_82,rb_diff_opp_82
0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,23.0,...,,,,,,,,,,
1,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,11.0,...,,,,,,,,,,
2,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,8.0,...,,,,,,,,,,
3,45.0,93.0,0.484,5.0,15.0,0.333,11.0,13.0,0.846,7.0,...,,,,,,,,,,
4,33.0,84.0,0.393,7.0,28.0,0.250,15.0,25.0,0.600,15.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10201,39.0,96.0,0.406,13.0,39.0,0.333,2.0,2.0,1.000,11.0,...,8.332653,11.600000,100.0,121.420408,111.138776,118.734694,0.857143,10.281633,2.367678,5.306122
10202,38.0,78.0,0.487,17.0,35.0,0.486,18.0,20.0,0.900,8.0,...,8.306000,11.572000,100.0,121.280000,110.962000,118.440000,0.860000,10.318000,2.378324,5.240000
10203,41.0,80.0,0.513,5.0,18.0,0.278,22.0,27.0,0.815,13.0,...,6.574510,11.358824,100.0,116.194118,114.013725,110.901961,0.666667,2.180392,2.135988,-0.156863
10204,39.0,79.0,0.494,14.0,28.0,0.500,16.0,21.0,0.762,5.0,...,6.540385,11.213462,100.0,115.955769,114.136538,110.576923,0.653846,1.819231,2.191065,-0.634615


In [137]:
features_to_train = list(df_home_rolling.columns) + list(df_away_rolling.columns)

In [138]:
features_to_train

['fg_82',
 'fga_82',
 'fg%_82',
 '3p_82',
 '3pa_82',
 '3p%_82',
 'ft_82',
 'fta_82',
 'ft%_82',
 'orb_82',
 'drb_82',
 'trb_82',
 'ast_82',
 'stl_82',
 'blk_82',
 'tov_82',
 'pf_82',
 'pts_82',
 'ts%_82',
 'efg%_82',
 '3par_82',
 'ftr_82',
 'orb%_82',
 'drb%_82',
 'trb%_82',
 'ast%_82',
 'stl%_82',
 'blk%_82',
 'tov%_82',
 'usg%_82',
 'ortg_82',
 'drtg_82',
 'total_82',
 'won_82',
 'net_rtg_82',
 'ast_tov_82',
 'rb_diff_82',
 'fg_opp_82',
 'fga_opp_82',
 'fg%_opp_82',
 '3p_opp_82',
 '3pa_opp_82',
 '3p%_opp_82',
 'ft_opp_82',
 'fta_opp_82',
 'ft%_opp_82',
 'orb_opp_82',
 'drb_opp_82',
 'trb_opp_82',
 'ast_opp_82',
 'stl_opp_82',
 'blk_opp_82',
 'tov_opp_82',
 'pf_opp_82',
 'pts_opp_82',
 'ts%_opp_82',
 'efg%_opp_82',
 '3par_opp_82',
 'ftr_opp_82',
 'orb%_opp_82',
 'drb%_opp_82',
 'trb%_opp_82',
 'ast%_opp_82',
 'stl%_opp_82',
 'blk%_opp_82',
 'tov%_opp_82',
 'usg%_opp_82',
 'ortg_opp_82',
 'drtg_opp_82',
 'total_opp_82',
 'won_opp_82',
 'net_rtg_opp_82',
 'ast_tov_opp_82',
 'rb_diff_opp

In [139]:
team_df=team_df.dropna()

In [140]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
team_df[features_to_train] = scaler.fit_transform(team_df[features_to_train])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df[features_to_train] = scaler.fit_transform(team_df[features_to_train])


In [141]:
features_to_train=features_to_train + ["b2b", "b2b_opp", "home", "home_opp"]

In [121]:
list(team_df.columns)

['fg',
 'fga',
 'fg%',
 '3p',
 '3pa',
 '3p%',
 'ft',
 'fta',
 'ft%',
 'orb',
 'drb',
 'trb',
 'ast',
 'stl',
 'blk',
 'tov',
 'pf',
 'pts',
 'ts%',
 'efg%',
 '3par',
 'ftr',
 'orb%',
 'drb%',
 'trb%',
 'ast%',
 'stl%',
 'blk%',
 'tov%',
 'usg%',
 'ortg',
 'drtg',
 'team',
 'total',
 'home',
 'season',
 'date',
 'won',
 'fg_opp',
 'fga_opp',
 'fg%_opp',
 '3p_opp',
 '3pa_opp',
 '3p%_opp',
 'ft_opp',
 'fta_opp',
 'ft%_opp',
 'orb_opp',
 'drb_opp',
 'trb_opp',
 'ast_opp',
 'stl_opp',
 'blk_opp',
 'tov_opp',
 'pf_opp',
 'pts_opp',
 'ts%_opp',
 'efg%_opp',
 '3par_opp',
 'ftr_opp',
 'orb%_opp',
 'drb%_opp',
 'trb%_opp',
 'ast%_opp',
 'stl%_opp',
 'blk%_opp',
 'tov%_opp',
 'usg%_opp',
 'ortg_opp',
 'drtg_opp',
 'team_opp',
 'total_opp',
 'home_opp',
 'won_opp',
 'net_rtg',
 'net_rtg_opp',
 'ast_tov',
 'ast_tov_opp',
 'rb_diff',
 'rb_diff_opp',
 'b2b',
 'b2b_opp',
 'fg_82',
 'fga_82',
 'fg%_82',
 '3p_82',
 '3pa_82',
 '3p%_82',
 'ft_82',
 'fta_82',
 'ft%_82',
 'orb_82',
 'drb_82',
 'trb_82',
 'a

In [142]:
team_df_processed=team_df[features_to_train + ["team", "team_opp", "date", "season", "won"]]

In [143]:
team_df_processed

Unnamed: 0,fg_82,fga_82,fg%_82,3p_82,3pa_82,3p%_82,ft_82,fta_82,ft%_82,orb_82,...,rb_diff_opp_82,b2b,b2b_opp,home,home_opp,team,team_opp,date,season,won
25,0.266667,0.326531,0.290520,0.260870,0.355556,0.334000,0.344828,0.432432,0.200000,0.619048,...,0.367347,0,0,1,0,WAS,MIL,2015-10-30,2016,True
27,0.266667,0.326531,0.290520,0.217391,0.266667,0.334000,0.586207,0.540541,0.518000,0.428571,...,0.020408,0,1,1,0,CHO,ATL,2015-10-30,2016,False
31,0.566667,0.204082,0.733945,0.565217,0.355556,0.834000,0.413793,0.297297,0.700000,0.142857,...,0.510204,0,0,1,0,CHI,DET,2015-10-30,2016,False
35,0.200000,0.183673,0.321101,0.173913,0.088889,0.460000,0.103448,0.162162,0.066000,0.000000,...,0.122449,1,0,1,0,BRK,MEM,2015-10-31,2016,False
37,0.350000,0.275510,0.428135,0.152174,0.011111,0.539000,0.431034,0.378378,0.514000,0.309524,...,0.326531,1,0,1,0,UTA,IND,2015-10-31,2016,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10201,0.481699,0.370148,0.490676,0.492754,0.532462,0.520745,0.425963,0.341282,0.604549,0.352941,...,0.516451,0,0,1,0,MIA,DEN,2023-06-01,2023,False
10202,0.481410,0.374019,0.487591,0.493311,0.533761,0.520346,0.415782,0.331081,0.612154,0.354396,...,0.515102,0,0,1,0,MIA,DEN,2023-06-04,2023,True
10203,0.593750,0.374575,0.609455,0.446558,0.416204,0.562125,0.382184,0.314189,0.553250,0.373016,...,0.404962,0,0,1,0,DEN,MIA,2023-06-07,2023,True
10204,0.592517,0.371928,0.610435,0.440994,0.410431,0.558612,0.386348,0.317705,0.554816,0.376093,...,0.395212,0,0,1,0,DEN,MIA,2023-06-09,2023,True


In [144]:
team_df_processed.to_csv("team_df_processed.csv")

In [152]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
X = team_df_processed[features_to_train]
y = team_df_processed["won"]
features = X.columns
rf = GradientBoostingClassifier()
rf.fit(X, y)
importances = rf.feature_importances_

In [153]:
# Create a list of tuples where each tuple contains the feature and its importance
feature_importance_tuples = list(zip(features, importances))

# Sort the list of tuples based on the importance scores in descending order
sorted_feature_importance = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
sorted_feature_importance

[('net_rtg_opp_82', 0.2185614262213244),
 ('net_rtg_82', 0.13594456769050114),
 ('won_82', 0.05341947719197078),
 ('won_opp_82', 0.04806460400138376),
 ('ast%_opp_82', 0.02277912427775786),
 ('ts%_82', 0.022057073238688715),
 ('3par_82', 0.02160557662581929),
 ('stl%_opp_82', 0.019969967556169373),
 ('drb_opp_82', 0.019694544576860474),
 ('3pa_opp_82', 0.019319656011863435),
 ('ortg_opp_82', 0.017109764008153408),
 ('b2b', 0.015391015643290895),
 ('fg%_82', 0.014349787378074122),
 ('ft%_82', 0.013666850508441537),
 ('blk_opp_82', 0.01200005734327334),
 ('3pa_82', 0.01193649330979495),
 ('orb%_82', 0.011052879143524673),
 ('ast_tov_82', 0.010940987859296742),
 ('tov%_opp_82', 0.010872385273003705),
 ('ast%_82', 0.010558588822690573),
 ('drb%_opp_82', 0.010488691973624707),
 ('stl%_82', 0.01039475137267607),
 ('ast_tov_opp_82', 0.009624203548313193),
 ('blk%_opp_82', 0.00934808223691359),
 ('pf_82', 0.009146130982262054),
 ('fg%_opp_82', 0.00892576942209806),
 ('drb%_82', 0.0089122609253

In [154]:
top_features = [feature[0] for feature in sorted_feature_importance[:30] if feature[1] > 0]
top_features

['net_rtg_opp_82',
 'net_rtg_82',
 'won_82',
 'won_opp_82',
 'ast%_opp_82',
 'ts%_82',
 '3par_82',
 'stl%_opp_82',
 'drb_opp_82',
 '3pa_opp_82',
 'ortg_opp_82',
 'b2b',
 'fg%_82',
 'ft%_82',
 'blk_opp_82',
 '3pa_82',
 'orb%_82',
 'ast_tov_82',
 'tov%_opp_82',
 'ast%_82',
 'drb%_opp_82',
 'stl%_82',
 'ast_tov_opp_82',
 'blk%_opp_82',
 'pf_82',
 'fg%_opp_82',
 'drb%_82',
 'blk%_82',
 'trb%_82',
 'pf_opp_82']

In [155]:
list1 = ['net_rtg_opp_82',
 'net_rtg_82',
 'won_82',
 'won_opp_82',
 'ts%_82',
 'ast%_opp_82',
 'drb_opp_82',
 '3par_82',
 'ft%_82',
 '3pa_82',
 'stl%_opp_82',
 'ortg_opp_82',
 'b2b',
 '3pa_opp_82',
 'fg%_82',
 'ast_tov_82',
 '3par_opp_82',
 'blk_opp_82',
 'orb%_82',
 'stl%_82',
 'trb_opp_82',
 'tov_opp_82',
 'drb%_82',
 'stl_opp_82',
 'tov%_opp_82',
 'blk%_opp_82',
 'drb%_opp_82',
 'drtg_82',
 'rb_diff_opp_82',
 'pf_opp_82']

list2 = ['net_rtg_opp_82',
 'net_rtg_82',
 'won_82',
 'won_opp_82',
 'ast%_opp_82',
 'ts%_82',
 '3par_82',
 'stl%_opp_82',
 'drb_opp_82',
 '3pa_opp_82',
 'ortg_opp_82',
 'b2b',
 'fg%_82',
 'ft%_82',
 'blk_opp_82',
 '3pa_82',
 'orb%_82',
 'ast_tov_82',
 'tov%_opp_82',
 'ast%_82',
 'drb%_opp_82',
 'stl%_82',
 'ast_tov_opp_82',
 'blk%_opp_82',
 'pf_82',
 'fg%_opp_82',
 'drb%_82',
 'blk%_82',
 'trb%_82',
 'pf_opp_82']

difference = set(list1) ^ set(list2)
print(list(difference))


['trb_opp_82', 'pf_82', 'ast%_82', '3par_opp_82', 'blk%_82', 'drtg_82', 'ast_tov_opp_82', 'trb%_82', 'tov_opp_82', 'rb_diff_opp_82', 'stl_opp_82', 'fg%_opp_82']


In [134]:
team_df_processed=team_df[features_to_train + ["team", "team_opp", "date", "season", "won",]]

In [135]:
team_df_processed.to_csv("team_df_processed.csv")

In [156]:
team_df_top_features_processed = team_df[top_features + ["team", "team_opp", "date", "season", "won",]]

In [157]:
team_df_top_features_processed.to_csv("team_df_top_features_processed.csv")