In [16]:
import pandas as pd
from nbapy import game, shot_chart, player, scoreboard, team
import datetime

In [19]:
def datascrape(start):
    start = datetime.datetime.strptime(start, "%d-%m-%Y")
    #end = datetime.datetime.strptime("21-07-2014", "%d-%m-%Y")
    end = datetime.datetime.today()
    dates = [start + datetime.timedelta(days=x) for x in range(0, (end-start).days)]
    df = pd.DataFrame({"date": dates})
    df["date"] = df["date"].astype(str)
    df[['Year', 'Month', "Day"]] = df["date"].str.split('-', 2, expand=True)
    df["Month"] = df["Month"].str.lstrip('0')
    df["Day"] = df["Day"].str.lstrip('0')
    
    #to limit to five
    #df = df.head()
    dates = []
    for i, row in df.iterrows():
        day = {}
        day["year"] = row["Year"]
        day["month"] = row["Month"]
        day["day"] = row["Day"]
        dates.append(day)
    
    init = pd.DataFrame({
    "GAME_DATE_EST_x": [],
    "GAME_ID": [],
    "TEAM_ABBREVIATION_x": [],
    "TEAM_ABBREVIATION_y": [],
    "PTS_x": [],
    "PTS_y": []
    })
    
    for date in dates:
        scores = scoreboard.Scoreboard(month=int(date["month"]), day=int(date["day"]), year=int(date["year"]), league_id='00', offset=0)
        df = scores.line_score()
        if df.empty:
            pass
        else:
            df_1 = df.drop_duplicates(subset=['GAME_ID'], keep='first')
            df_2 = df.drop_duplicates(subset=['GAME_ID'], keep='last')
            merged = df_1.merge(df_2, on=["GAME_ID"])                                                                              #FG_PCT	FT_PCT	FG3_PCT	AST	REB	TOV
            merge = merged[["GAME_DATE_EST_x", "GAME_ID", "TEAM_ABBREVIATION_x", "TEAM_ID_x", "TEAM_ID_y", "TEAM_ABBREVIATION_y", "PTS_x", "PTS_y", "FG_PCT_x", "FG_PCT_y", "FT_PCT_x", "FT_PCT_y", "FG3_PCT_x", "FG3_PCT_y", "AST_x", "AST_y", "REB_x", "REB_y", "TOV_x", "TOV_y"]]
            init = pd.concat([init, merge], sort=True)
    
    return init

In [20]:
df = datascrape("15-12-2020")

In [21]:
#Rename columns
df = df.rename(columns={
            "GAME_DATE_EST_x": "date",
            "GAME_ID": "game_id",
            "TEAM_ABBREVIATION_x": "away",
            "TEAM_ABBREVIATION_y": "home",
            "TEAM_ID_x": "away_id",
            "TEAM_ID_y": "home_id",
            "PTS_x": "away_pts",
            "PTS_y": "home_pts",
            "FG_PCT_x": "away_fg",
            "FG_PCT_y": "home_fg",
            "FT_PCT_x": "away_ft",
            "FT_PCT_y": "home_ft",
            "FG3_PCT_x": "away_fg3",
            "FG3_PCT_y": "home_fg3",
            "AST_x": "away_ast",
            "AST_y": "home_ast",
            "REB_x": "away_reb",
            "REB_y": "home_reb",
            "TOV_x": "away_tov",
            "TOV_y": "home_tov",
            })

df = df[["game_id", "date", "away", "home","away_id", "home_id","away_pts","home_pts", "away_fg","home_fg", "away_ft",
         "home_ft", "away_fg3", "home_fg3","away_ast", "home_ast", "away_reb", "home_reb", "away_tov","home_tov"]]


In [22]:
#Add catagorical for win
df.loc[df["home_pts"] > df["away_pts"], "home_win"] = 1
df.loc[df["home_pts"] < df["away_pts"], "home_win"] = 0

In [23]:
#Remove time
df["date"] = df["date"].str.replace("T00:00:00", "")

In [24]:
df = df.reset_index(drop=True).dropna()

In [25]:
df = df[["game_id", "date", "away", "home","away_id", "home_id", "home_win"]]

df.head()

Unnamed: 0,game_id,date,away,home,away_id,home_id,home_win
0,12000025,2020-12-15,BOS,PHI,1610613000.0,1610613000.0,1.0
1,12000026,2020-12-15,SAS,HOU,1610613000.0,1610613000.0,1.0
2,12000027,2020-12-15,GSW,SAC,1610613000.0,1610613000.0,1.0
3,12000028,2020-12-16,CLE,NYK,1610613000.0,1610613000.0,1.0
4,12000029,2020-12-16,CHI,OKC,1610613000.0,1610613000.0,0.0


In [26]:
def home_log():
    
    from nba_api.stats.static import teams
    nba_teams = teams.get_teams()
    team_id = []
    for t in nba_teams:
        team_id.append(t["id"])
    
    init = pd.DataFrame({
        'GAME_ID': [],
        'TEAM_ID': [],
        'fg_pct_3_h': [],
        'fg3_pct_3_h': [], 
        'ft_pct_3_h': [],
        'oreb_3_h': [], 
        'dreb_3_h': [],
        'ast_3_h': [],
        'tov_3_h': [],
        'stl_3_h': [], 
        'blk_3_h': [],
        'pts_3_h': [],
        'fg_pct_7_h': [],
        'fg3_pct_7_h': [],
        'ft_pct_7_h': [],
        'oreb_7_h': [],
        'dreb_7_h': [],
        'ast_7_h': [],
        'tov_7_h': [],
        'stl_7_h': [],
        'blk_7_h': [],
        'pts_7_h': []
        })
    
    for t in team_id:
        game_log = team.GameLogs(t, location="Home").logs()
        df = game_log.sort_values(by=['GAME_ID'], ascending=True)
        
        
        game_num = 3
        for i in range(0, len(df)-game_num):
            col = 11
            df.loc[df.index[i+game_num], 'fg_pct_3_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 14
            df.loc[df.index[i+game_num], 'fg3_pct_3_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 17
            df.loc[df.index[i+game_num], 'ft_pct_3_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 18
            df.loc[df.index[i+game_num], 'oreb_3_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 19
            df.loc[df.index[i+game_num], 'dreb_3_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 21
            df.loc[df.index[i+game_num], 'ast_3_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 22
            df.loc[df.index[i+game_num], 'tov_3_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 23
            df.loc[df.index[i+game_num], 'stl_3_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 24
            df.loc[df.index[i+game_num], 'blk_3_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 28
            df.loc[df.index[i+game_num], 'pts_3_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)

        game_num = 7
        for i in range(0, len(df)-game_num):
            col = 11
            df.loc[df.index[i+game_num], 'fg_pct_7_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 14
            df.loc[df.index[i+game_num], 'fg3_pct_7_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 17
            df.loc[df.index[i+game_num], 'ft_pct_7_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 18
            df.loc[df.index[i+game_num], 'oreb_7_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 19
            df.loc[df.index[i+game_num], 'dreb_7_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 21
            df.loc[df.index[i+game_num], 'ast_7_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 22
            df.loc[df.index[i+game_num], 'tov_7_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 23
            df.loc[df.index[i+game_num], 'stl_7_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 24
            df.loc[df.index[i+game_num], 'blk_7_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 28
            df.loc[df.index[i+game_num], 'pts_7_h'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
        
        df = df[['GAME_ID','TEAM_ID','fg_pct_3_h','fg3_pct_3_h', 'ft_pct_3_h', 'oreb_3_h', 'dreb_3_h', 'ast_3_h',
                 'tov_3_h', 'stl_3_h', 'blk_3_h', 'pts_3_h', 'fg_pct_7_h', 'fg3_pct_7_h','ft_pct_7_h', 'oreb_7_h', 
                 'dreb_7_h', 'ast_7_h', 'tov_7_h', 'stl_7_h','blk_7_h', 'pts_7_h']]
        
        init = pd.concat([init, df])
        
        
    return init

In [27]:
home_logs = home_log()

In [28]:
home_logs.drop_duplicates()

Unnamed: 0,GAME_ID,TEAM_ID,fg_pct_3_h,fg3_pct_3_h,ft_pct_3_h,oreb_3_h,dreb_3_h,ast_3_h,tov_3_h,stl_3_h,...,fg_pct_7_h,fg3_pct_7_h,ft_pct_7_h,oreb_7_h,dreb_7_h,ast_7_h,tov_7_h,stl_7_h,blk_7_h,pts_7_h
18,0022000041,1.610613e+09,,,,,,,,,...,,,,,,,,,,
17,0022000083,1.610613e+09,,,,,,,,,...,,,,,,,,,,
16,0022000095,1.610613e+09,,,,,,,,,...,,,,,,,,,,
15,0022000110,1.610613e+09,0.453333,0.381000,0.728000,11.000000,33.333333,25.333333,16.000000,6.000000,...,,,,,,,,,,
14,0022000153,1.610613e+09,0.415667,0.280667,0.714000,12.333333,36.000000,23.666667,16.000000,6.333333,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0022000414,1.610613e+09,0.480000,0.379333,0.915667,12.333333,36.333333,24.666667,14.000000,6.000000,...,0.469000,0.395286,0.856429,10.571429,33.000000,28.142857,14.000000,5.857143,4.000000,115.857143
3,0022000459,1.610613e+09,0.484000,0.392333,0.883333,11.666667,40.000000,23.666667,17.333333,5.000000,...,0.478000,0.396857,0.831000,10.714286,34.285714,27.428571,14.857143,6.285714,4.428571,116.428571
2,0022000568,1.610613e+09,0.485333,0.404667,0.872000,9.333333,40.000000,22.000000,20.666667,5.666667,...,0.476143,0.406571,0.862714,10.000000,35.285714,25.714286,16.000000,6.285714,4.000000,115.571429
1,0022000588,1.610613e+09,0.455667,0.395333,0.858000,9.333333,36.666667,24.000000,17.000000,7.333333,...,0.465000,0.386000,0.878143,10.714286,35.142857,24.857143,16.000000,6.428571,3.714286,112.571429


In [29]:
def away_log():
    
    from nba_api.stats.static import teams
    nba_teams = teams.get_teams()
    team_id = []
    for t in nba_teams:
        team_id.append(t["id"])
    
    init = pd.DataFrame({
        'GAME_ID': [],
        'TEAM_ID': [],
        'fg_pct_3_a': [],
        'fg3_pct_3_a': [], 
        'ft_pct_3_a': [],
        'oreb_3_a': [], 
        'dreb_3_a': [],
        'ast_3_a': [],
        'tov_3_a': [],
        'stl_3_a': [], 
        'blk_3_a': [],
        'pts_3_a': [],
        'fg_pct_7_a': [],
        'fg3_pct_7_a': [],
        'ft_pct_7_a': [],
        'oreb_7_a': [],
        'dreb_7_a': [],
        'ast_7_a': [],
        'tov_7_a': [],
        'stl_7_a': [],
        'blk_7_a': [],
        'pts_7_a': []
        })
    
    for t in team_id:
        game_log = team.GameLogs(t, location="Road").logs()
        df = game_log.sort_values(by=['GAME_ID'], ascending=True)
        
        game_num = 3
        for i in range(0, len(df)-game_num):
            col = 11
            df.loc[df.index[i+game_num], 'fg_pct_3_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 14
            df.loc[df.index[i+game_num], 'fg3_pct_3_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 17
            df.loc[df.index[i+game_num], 'ft_pct_3_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 18
            df.loc[df.index[i+game_num], 'oreb_3_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 19
            df.loc[df.index[i+game_num], 'dreb_3_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 21
            df.loc[df.index[i+game_num], 'ast_3_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 22
            df.loc[df.index[i+game_num], 'tov_3_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 23
            df.loc[df.index[i+game_num], 'stl_3_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 24
            df.loc[df.index[i+game_num], 'blk_3_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)
            col = 28
            df.loc[df.index[i+game_num], 'pts_3_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col])/game_num)

        game_num = 7
        for i in range(0, len(df)-game_num):
            col = 11
            df.loc[df.index[i+game_num], 'fg_pct_7_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 14
            df.loc[df.index[i+game_num], 'fg3_pct_7_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 17
            df.loc[df.index[i+game_num], 'ft_pct_7_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 18
            df.loc[df.index[i+game_num], 'oreb_7_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 19
            df.loc[df.index[i+game_num], 'dreb_7_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 21
            df.loc[df.index[i+game_num], 'ast_7_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 22
            df.loc[df.index[i+game_num], 'tov_7_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 23
            df.loc[df.index[i+game_num], 'stl_7_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 24
            df.loc[df.index[i+game_num], 'blk_7_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
            col = 28
            df.loc[df.index[i+game_num], 'pts_7_a'] = ((df.iloc[i,col] + df.iloc[i+1,col] + df.iloc[i+2, col]
                                              + df.iloc[i+3, col] + df.iloc[i+4, col]
                                              + df.iloc[i+5, col] + df.iloc[i+6, col])/game_num)
        
        df = df[['GAME_ID','TEAM_ID','fg_pct_3_a','fg3_pct_3_a', 'ft_pct_3_a', 'oreb_3_a', 'dreb_3_a', 'ast_3_a',
                 'tov_3_a', 'stl_3_a', 'blk_3_a', 'pts_3_a', 'fg_pct_7_a', 'fg3_pct_7_a','ft_pct_7_a', 'oreb_7_a', 
                 'dreb_7_a', 'ast_7_a', 'tov_7_a', 'stl_7_a','blk_7_a', 'pts_7_a']]
        
        init = pd.concat([init, df])
        
        
    return init

In [30]:
away_logs = away_log()

In [31]:
away_logs

Unnamed: 0,GAME_ID,TEAM_ID,fg_pct_3_a,fg3_pct_3_a,ft_pct_3_a,oreb_3_a,dreb_3_a,ast_3_a,tov_3_a,stl_3_a,...,fg_pct_7_a,fg3_pct_7_a,ft_pct_7_a,oreb_7_a,dreb_7_a,ast_7_a,tov_7_a,stl_7_a,blk_7_a,pts_7_a
20,0022000015,1.610613e+09,,,,,,,,,...,,,,,,,,,,
19,0022000021,1.610613e+09,,,,,,,,,...,,,,,,,,,,
18,0022000057,1.610613e+09,,,,,,,,,...,,,,,,,,,,
17,0022000072,1.610613e+09,0.480000,0.384000,0.841667,12.666667,38.666667,24.666667,16.666667,5.000000,...,,,,,,,,,,
16,0022000134,1.610613e+09,0.454667,0.387333,0.851333,13.333333,38.666667,24.666667,12.666667,5.666667,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0022000496,1.610613e+09,0.487667,0.433667,0.829667,9.666667,28.333333,24.000000,15.666667,10.000000,...,0.471143,0.403571,0.752143,10.714286,30.714286,25.714286,15.428571,8.857143,4.428571,112.714286
3,0022000511,1.610613e+09,0.499667,0.383333,0.856333,8.333333,30.333333,21.666667,16.333333,10.666667,...,0.472000,0.393857,0.777286,10.000000,31.714286,24.571429,14.571429,8.428571,4.428571,115.000000
2,0022000528,1.610613e+09,0.470333,0.360667,0.808000,10.000000,29.666667,23.000000,17.000000,10.666667,...,0.475429,0.393571,0.767143,10.142857,30.142857,24.714286,15.428571,9.142857,3.857143,116.142857
1,0022000535,1.610613e+09,0.497333,0.381000,0.859000,9.666667,30.666667,27.000000,17.000000,7.333333,...,0.484714,0.390429,0.824286,9.857143,29.571429,25.000000,16.000000,8.142857,4.000000,119.000000


In [32]:
merged = home_logs.merge(away_logs, on="GAME_ID")

In [33]:
merged

Unnamed: 0,GAME_ID,TEAM_ID_x,fg_pct_3_h,fg3_pct_3_h,ft_pct_3_h,oreb_3_h,dreb_3_h,ast_3_h,tov_3_h,stl_3_h,...,fg_pct_7_a,fg3_pct_7_a,ft_pct_7_a,oreb_7_a,dreb_7_a,ast_7_a,tov_7_a,stl_7_a,blk_7_a,pts_7_a
0,0022000041,1.610613e+09,,,,,,,,,...,,,,,,,,,,
1,0022000083,1.610613e+09,,,,,,,,,...,,,,,,,,,,
2,0022000095,1.610613e+09,,,,,,,,,...,,,,,,,,,,
3,0022000110,1.610613e+09,0.453333,0.381000,0.728000,11.000000,33.333333,25.333333,16.000000,6.000000,...,,,,,,,,,,
4,0022000153,1.610613e+09,0.415667,0.280667,0.714000,12.333333,36.000000,23.666667,16.000000,6.333333,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580,0022000414,1.610613e+09,0.480000,0.379333,0.915667,12.333333,36.333333,24.666667,14.000000,6.000000,...,0.444286,0.320286,0.802286,9.571429,36.142857,24.000000,9.571429,6.428571,4.857143,112.142857
581,0022000459,1.610613e+09,0.484000,0.392333,0.883333,11.666667,40.000000,23.666667,17.333333,5.000000,...,0.474571,0.404571,0.736571,7.285714,37.285714,29.571429,13.428571,7.285714,4.142857,116.285714
582,0022000568,1.610613e+09,0.485333,0.404667,0.872000,9.333333,40.000000,22.000000,20.666667,5.666667,...,0.464714,0.343429,0.750714,9.000000,33.714286,26.000000,14.285714,8.857143,4.857143,107.000000
583,0022000588,1.610613e+09,0.455667,0.395333,0.858000,9.333333,36.666667,24.000000,17.000000,7.333333,...,0.476714,0.404000,0.828000,6.857143,31.714286,26.714286,13.428571,6.428571,5.000000,113.714286


In [34]:
merged = merged.drop_duplicates()
merged = merged.dropna()

In [35]:
final = df.merge(merged, left_on="game_id", right_on="GAME_ID", how="inner")
final

Unnamed: 0,game_id,date,away,home,away_id,home_id,home_win,GAME_ID,TEAM_ID_x,fg_pct_3_h,...,fg_pct_7_a,fg3_pct_7_a,ft_pct_7_a,oreb_7_a,dreb_7_a,ast_7_a,tov_7_a,stl_7_a,blk_7_a,pts_7_a
0,0022000200,2021-01-17,UTA,DEN,1.610613e+09,1.610613e+09,0.0,0022000200,1.610613e+09,0.472667,...,0.472429,0.404857,0.665857,10.285714,38.857143,23.428571,15.857143,5.857143,7.142857,111.428571
1,0022000207,2021-01-18,SAS,POR,1.610613e+09,1.610613e+09,0.0,0022000207,1.610613e+09,0.404667,...,0.444000,0.374571,0.834000,8.142857,37.285714,24.857143,9.285714,6.571429,5.285714,112.142857
2,0022000208,2021-01-18,PHX,MEM,1.610613e+09,1.610613e+09,1.0,0022000208,1.610613e+09,0.442333,...,0.467143,0.343000,0.796857,7.428571,34.571429,26.714286,11.714286,5.571429,4.714286,109.714286
3,0022000217,2021-01-20,DAL,IND,1.610613e+09,1.610613e+09,0.0,0022000217,1.610613e+09,0.451667,...,0.450000,0.335714,0.736714,10.142857,37.428571,23.428571,12.857143,7.285714,5.428571,110.714286
4,0022000223,2021-01-20,SAS,GSW,1.610613e+09,1.610613e+09,1.0,0022000223,1.610613e+09,0.426333,...,0.448000,0.382429,0.835857,8.714286,36.714286,25.571429,9.571429,6.285714,5.000000,111.285714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,0022000611,2021-03-16,NYK,PHI,1.610613e+09,1.610613e+09,1.0,0022000611,1.610613e+09,0.551667,...,0.436857,0.362000,0.773714,10.571429,34.428571,19.571429,13.000000,7.571429,4.285714,104.571429
346,0022000612,2021-03-16,OKC,CHI,1.610613e+09,1.610613e+09,1.0,0022000612,1.610613e+09,0.431333,...,0.428000,0.328143,0.744000,7.857143,36.142857,22.571429,14.571429,7.428571,3.714286,101.142857
347,0022000613,2021-03-16,ATL,HOU,1.610613e+09,1.610613e+09,0.0,0022000613,1.610613e+09,0.403667,...,0.441000,0.357000,0.826286,12.857143,33.857143,22.571429,14.857143,8.857143,5.857143,108.285714
348,0022000614,2021-03-16,NOP,POR,1.610613e+09,1.610613e+09,1.0,0022000614,1.610613e+09,0.451000,...,0.511000,0.387286,0.769143,12.428571,32.285714,28.000000,13.000000,6.857143,3.142857,122.142857


In [56]:
# Assign X (data) and y (target)
X = final[['fg_pct_3_h', 'fg3_pct_3_h', 'ft_pct_3_h',
       'oreb_3_h', 'dreb_3_h', 'ast_3_h', 'tov_3_h', 'stl_3_h', 'blk_3_h',
       'pts_3_h', 'fg_pct_7_h', 'fg3_pct_7_h', 'ft_pct_7_h', 'oreb_7_h',
       'dreb_7_h', 'ast_7_h', 'tov_7_h', 'stl_7_h', 'blk_7_h', 'pts_7_h',
        'fg_pct_3_a', 'fg3_pct_3_a', 'ft_pct_3_a', 'oreb_3_a',
       'dreb_3_a', 'ast_3_a', 'tov_3_a', 'stl_3_a', 'blk_3_a', 'pts_3_a',
       'fg_pct_7_a', 'fg3_pct_7_a', 'ft_pct_7_a', 'oreb_7_a', 'dreb_7_a',
       'ast_7_a', 'tov_7_a', 'stl_7_a', 'blk_7_a', 'pts_7_a']]
y = final["home_win"]
feature_names = X.columns
print(X.shape, y.shape)

(350, 40) (350,)


In [57]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=85)

In [58]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.48863636363636365

In [59]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.5340909090909091

In [60]:
predictions = rf.predict(X_test)
predictions

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0.,
       1., 0., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0.,
       1., 0., 1.])

In [61]:
y_test

61     1.0
324    0.0
117    0.0
346    1.0
348    1.0
      ... 
216    0.0
92     1.0
34     0.0
347    0.0
275    0.0
Name: home_win, Length: 88, dtype: float64

In [62]:
score_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [63]:
score_df.loc[score_df["Prediction"] == score_df["Actual"], "Correct"] = 1

In [64]:
print(score_df["Correct"].sum()/len(score_df))

0.5340909090909091


In [65]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.04249087078688201, 'ft_pct_7_a'),
 (0.03317365984188134, 'ft_pct_3_h'),
 (0.030063742101767273, 'fg3_pct_7_a'),
 (0.029459718583993263, 'blk_3_h'),
 (0.029094341187187855, 'dreb_7_h'),
 (0.02855704381224598, 'fg3_pct_3_h'),
 (0.028490885766336137, 'fg3_pct_3_a'),
 (0.028338535997167888, 'oreb_7_h'),
 (0.028278330097093497, 'ft_pct_3_a'),
 (0.028210224668657385, 'pts_7_h'),
 (0.02780961176616008, 'fg_pct_3_h'),
 (0.026861638558293866, 'fg3_pct_7_h'),
 (0.026850675111196768, 'fg_pct_7_a'),
 (0.026622999068940275, 'ast_3_h'),
 (0.026166958028014612, 'ast_7_h'),
 (0.026067473124300165, 'pts_3_a'),
 (0.02585194006316492, 'pts_7_a'),
 (0.025842759596706514, 'pts_3_h'),
 (0.025619780917794287, 'stl_7_a'),
 (0.025090300948124353, 'dreb_3_a'),
 (0.024403139022477434, 'fg_pct_7_h'),
 (0.024266919279064766, 'tov_7_h'),
 (0.024084213674701996, 'ast_7_a'),
 (0.02348311865009499, 'tov_3_h'),
 (0.023367156649275734, 'blk_7_h'),
 (0.02316157764542323, 'dreb_7_a'),
 (0.023094421757785522, 'oreb_3_h