In [3]:
import pandas as pd
from pathlib import Path
import time
from tqdm.notebook import tqdm
import numpy as np


In [89]:
args = {}
args['data_path'] = './Data'
args['data_filename'] = 'full_data_2014to2021.pkl'
args['summarize_n_games']= 5
args['columns_to_summarize']=['Team', 'Pace', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'ORtg']
args['home_columns_to_summarize']=[column + '_'+'home' for column in args['columns_to_summarize']]
args['visitor_columns_to_summarize']=[column + '_'+'visitor' for column in args['columns_to_summarize']]
args['']

In [13]:
data_path = Path(args['data_path'])
assert data_path.exists()

# Read Data

In [15]:
dataset_df = pd.read_pickle(data_path/args['data_filename'])
dataset_df.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id,Team_home,Pace_home,eFG%_home,...,ORB%_home,FT/FGA_home,ORtg_home,Team_visitor,Pace_visitor,eFG%_visitor,TOV%_visitor,ORB%_visitor,FT/FGA_visitor,ORtg_visitor
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...,201310290IND,IND,95.0,0.528,...,27.8,0.31,102.1,ORL,95.0,0.435,14.9,27.7,0.065,91.6
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...,201310290MIA,MIA,97.1,0.59,...,14.3,0.306,110.2,CHI,97.1,0.464,16.2,23.9,0.217,97.9
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...,201310290LAL,LAL,100.4,0.527,...,37.5,0.194,115.6,LAC,100.4,0.542,14.7,22.7,0.157,102.6
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...,201310300CLE,CLE,92.3,0.446,...,36.4,0.274,106.2,BRK,92.3,0.457,13.9,22.0,0.232,101.8
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...,201310300TOR,TOR,91.5,0.471,...,42.2,0.14,101.6,BOS,91.5,0.508,21.8,19.4,0.303,95.1


In [16]:
# convert string columns to numeric columns
for column in ['VISITOR_PTS','HOME_PTS']:
    dataset_df[column] = pd.to_numeric(dataset_df[column])

In [17]:
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8743 entries, 0 to 8742
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   DATE            8743 non-null   datetime64[ns]
 1   VISITOR         8743 non-null   object        
 2   VISITOR_PTS     8743 non-null   float64       
 3   HOME            8743 non-null   object        
 4   HOME_PTS        8743 non-null   float64       
 5   boxscores_url   8743 non-null   object        
 6   game_id         8743 non-null   object        
 7   Team_home       8743 non-null   object        
 8   Pace_home       8743 non-null   float64       
 9   eFG%_home       8743 non-null   float64       
 10  TOV%_home       8743 non-null   float64       
 11  ORB%_home       8743 non-null   float64       
 12  FT/FGA_home     8743 non-null   float64       
 13  ORtg_home       8743 non-null   float64       
 14  Team_visitor    8743 non-null   object        
 15  Pace

In [29]:
dataset_df.columns

Index(['DATE', 'VISITOR', 'VISITOR_PTS', 'HOME', 'HOME_PTS', 'boxscores_url',
       'game_id', 'Team_home', 'Pace_home', 'eFG%_home', 'TOV%_home',
       'ORB%_home', 'FT/FGA_home', 'ORtg_home', 'Team_visitor', 'Pace_visitor',
       'eFG%_visitor', 'TOV%_visitor', 'ORB%_visitor', 'FT/FGA_visitor',
       'ORtg_visitor'],
      dtype='object')

# Models

## Team based - simple average of past xx games


In [159]:
### Data Processing

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id,Pace_home,eFG%_home,TOV%_home,...,FT/FGA_home,ORtg_home,Team_home,Pace_visitor,eFG%_visitor,TOV%_visitor,ORB%_visitor,FT/FGA_visitor,ORtg_visitor,Team_visitor
0,2013-11-07,LAL,99.0,HOU,98.0,https://www.basketball-reference.com/boxscores...,201311070HOU,96.70,0.5558,16.98,...,0.3586,113.12,HOU,100.26,0.4738,12.98,20.76,0.1930,100.50,LAL
1,2013-11-07,LAC,97.0,MIA,102.0,https://www.basketball-reference.com/boxscores...,201311070MIA,92.82,0.5738,15.48,...,0.2518,113.12,MIA,98.74,0.5342,12.76,27.82,0.2716,114.66,LAC
2,2013-11-08,TOR,84.0,IND,91.0,https://www.basketball-reference.com/boxscores...,201311080IND,93.20,0.5100,16.92,...,0.2526,102.48,IND,89.96,0.4732,13.72,32.76,0.2260,104.62,TOR
3,2013-11-08,BOS,91.0,ORL,89.0,https://www.basketball-reference.com/boxscores...,201311080ORL,96.76,0.5240,14.96,...,0.1628,104.68,ORL,91.54,0.4862,18.10,24.24,0.2266,97.76,BOS
4,2013-11-08,CLE,79.0,PHI,94.0,https://www.basketball-reference.com/boxscores...,201311080PHI,102.72,0.5140,16.50,...,0.1884,102.06,PHI,94.72,0.4448,13.94,25.66,0.1854,96.16,CLE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8656,2021-02-02,BOS,111.0,GSW,107.0,https://www.basketball-reference.com/boxscores...,202102020GSW,101.74,0.5562,12.96,...,0.1550,112.50,GSW,98.22,0.5504,11.38,24.42,0.1682,115.90,BOS
8657,2021-02-02,LAC,120.0,BRK,124.0,https://www.basketball-reference.com/boxscores...,202102020BRK,100.88,0.6076,10.54,...,0.2194,126.28,BRK,96.04,0.5378,9.60,21.96,0.1972,117.08,LAC
8658,2021-02-02,POR,132.0,WAS,121.0,https://www.basketball-reference.com/boxscores...,202102020WAS,102.34,0.4752,11.16,...,0.2206,105.74,WAS,97.04,0.5358,9.56,23.78,0.1576,117.48,POR
8659,2021-02-02,MEM,116.0,IND,134.0,https://www.basketball-reference.com/boxscores...,202102020IND,98.14,0.5526,13.78,...,0.2270,114.60,IND,99.72,0.5676,9.98,24.40,0.1314,119.22,MEM


In [None]:
# get most recent x games data as training data
# test on 2021 data
    # sort by dates
    # iterate rows
    # get current index
    # from current index, go backwards index until found previous X games
        # if out of index before fetching X games, stop and continue (give up predicting this game)
    # if HOME or VISITOR equals to current row HOME, then get that row
    # save into a dictionary, key is game_id
    # transform as data frame, and join with the full data
# average
# 2 teams, concatenate to the same row
# predict

In [146]:
def strip_col_suffix(df):
    """
    strip the '_home' or '_visitor' suffix from the stats columns, using args
    """
    rename_dict = {old_col:new_col for old_col,new_col in zip(args['home_columns_to_summarize']+args['visitor_columns_to_summarize'],args['columns_to_summarize']*2)}
    return df.rename(columns=rename_dict)

def get_previous_games(curr_row_index,team_name,dataset_df:pd.DataFrame,previous_n=args['summarize_n_games'])->list:
    """
    return a list of pandas dataframes of previous of games, regardless of home or away.
    """
    previous_games = []
    # start iterating previous games
    for index in range(curr_row_index-1,-1,-1):
        try:
            # if home team matches
            if (dataset_df.loc[index]['HOME']==team_name):
                previous_game=dataset_df.loc[[index]][args['home_columns_to_summarize']]
                previous_games.append(strip_col_suffix(previous_game))
            # if visitor team matches
            if (dataset_df.loc[index]['VISITOR']==team_name):
                previous_game=dataset_df.loc[[index]][args['visitor_columns_to_summarize']]
                previous_games.append(strip_col_suffix(previous_game))
            # stop when we have enough games
            if len(previous_games)==previous_n:
                break
        except Exception as exc:
            print(exc)
    if len(previous_games)<args['summarize_n_games']:
        raise ValueError(f"Less than {args['summarize_n_games']} previous games.")
    # return a list of dataframes
    return previous_games

def summarize_previous_games(curr_row_index,team_name,original_df,previous_n=args['summarize_n_games'],summarize_method=np.mean)->pd.DataFrame:
    """
    calculate previous games' stats using `method`
    `previous_n`: number of previous games
    `summarize_method`: method to summarize prevous games, default to `np.mean`
    """

    previous_games = get_previous_games(curr_row_index,team_name,original_df,previous_n)
    # concat return dataframes into one dataframe
    previous_games = pd.concat(previous_games)
    return summarize_method(previous_games)

In [147]:
#TODO: use dataclass
dataset_df = dataset_df.sort_values(by='DATE').reset_index(drop=True)
games_summaries=[]
bad_index=[]
for index,row in tqdm(dataset_df.iterrows(),total=dataset_df.shape[0]):
    try:
        # home team summary
        summarized_home = summarize_previous_games(index, row['HOME'],dataset_df, args['summarize_n_games']).to_dict()
        summarized_home['Team'] = row['HOME']
        summarized_home['game_id'] = row['game_id']
        games_summaries.append(summarized_home)

        # visitor team summary
        summarized_visitor = summarize_previous_games(index, row['VISITOR'],dataset_df, args['summarize_n_games']).to_dict()
        summarized_visitor['Team'] = row['VISITOR']
        summarized_visitor['game_id'] = row['game_id']
        games_summaries.append(summarized_visitor)
    except Exception as exc:
        print(f"bad index {index}, error: {exc}")
        print('-'*100)
        bad_index.append(index)

    

HBox(children=(FloatProgress(value=0.0, max=8743.0), HTML(value='')))

bad index 0, error: Less than 5 previous games.
----------------------------------------------------------------------------------------------------
bad index 1, error: Less than 5 previous games.
----------------------------------------------------------------------------------------------------
bad index 2, error: Less than 5 previous games.
----------------------------------------------------------------------------------------------------
bad index 3, error: Less than 5 previous games.
----------------------------------------------------------------------------------------------------
bad index 4, error: Less than 5 previous games.
----------------------------------------------------------------------------------------------------
bad index 5, error: Less than 5 previous games.
----------------------------------------------------------------------------------------------------
bad index 6, error: Less than 5 previous games.
----------------------------------------------------------

In [150]:
len(games_summaries)

Unnamed: 0,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg,Team,game_id
0,96.70,0.5558,16.98,32.24,0.3586,113.12,HOU,201311070HOU
1,100.26,0.4738,12.98,20.76,0.1930,100.50,LAL,201311070HOU
2,92.82,0.5738,15.48,19.00,0.2518,113.12,MIA,201311070MIA
3,98.74,0.5342,12.76,27.82,0.2716,114.66,LAC,201311070MIA
4,93.20,0.5100,16.92,23.78,0.2526,102.48,IND,201311080IND
...,...,...,...,...,...,...,...,...
17321,97.04,0.5358,9.56,23.78,0.1576,117.48,POR,202102020WAS
17322,98.14,0.5526,13.78,24.12,0.2270,114.60,IND,202102020IND
17323,99.72,0.5676,9.98,24.40,0.1314,119.22,MEM,202102020IND
17324,95.96,0.5460,10.72,29.58,0.2560,122.54,UTA,202102020UTA


games_summaries_df = pd.DataFrame(games_summaries)
games_summaries_df

In [153]:
dataset_df.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id,Team_home,Pace_home,eFG%_home,...,ORB%_home,FT/FGA_home,ORtg_home,Team_visitor,Pace_visitor,eFG%_visitor,TOV%_visitor,ORB%_visitor,FT/FGA_visitor,ORtg_visitor
0,2013-10-29,ORL,87.0,IND,97.0,https://www.basketball-reference.com/boxscores...,201310290IND,IND,95.0,0.528,...,27.8,0.31,102.1,ORL,95.0,0.435,14.9,27.7,0.065,91.6
1,2013-10-29,CHI,95.0,MIA,107.0,https://www.basketball-reference.com/boxscores...,201310290MIA,MIA,97.1,0.59,...,14.3,0.306,110.2,CHI,97.1,0.464,16.2,23.9,0.217,97.9
2,2013-10-29,LAC,103.0,LAL,116.0,https://www.basketball-reference.com/boxscores...,201310290LAL,LAL,100.4,0.527,...,37.5,0.194,115.6,LAC,100.4,0.542,14.7,22.7,0.157,102.6
3,2013-10-30,BRK,94.0,CLE,98.0,https://www.basketball-reference.com/boxscores...,201310300CLE,CLE,92.3,0.446,...,36.4,0.274,106.2,BRK,92.3,0.457,13.9,22.0,0.232,101.8
4,2013-10-30,BOS,87.0,TOR,93.0,https://www.basketball-reference.com/boxscores...,201310300TOR,TOR,91.5,0.471,...,42.2,0.14,101.6,BOS,91.5,0.508,21.8,19.4,0.303,95.1


In [154]:
dataset_df.columns

Index(['DATE', 'VISITOR', 'VISITOR_PTS', 'HOME', 'HOME_PTS', 'boxscores_url',
       'game_id', 'Team_home', 'Pace_home', 'eFG%_home', 'TOV%_home',
       'ORB%_home', 'FT/FGA_home', 'ORtg_home', 'Team_visitor', 'Pace_visitor',
       'eFG%_visitor', 'TOV%_visitor', 'ORB%_visitor', 'FT/FGA_visitor',
       'ORtg_visitor'],
      dtype='object')

temp_home = pd.merge(dataset_df[['DATE','VISITOR', 'VISITOR_PTS', 'HOME', 'HOME_PTS', 'boxscores_url','game_id']],games_summaries_df,left_on=['game_id','HOME'],right_on=['game_id','Team'])
processed_dataset=pd.merge(temp_home,games_summaries_df,left_on=['game_id','VISITOR'],right_on=['game_id','Team'],suffixes=('_home','_visitor'))
processed_dataset

In [158]:
processed_dataset.columns

Index(['VISITOR', 'VISITOR_PTS', 'HOME', 'HOME_PTS', 'boxscores_url',
       'game_id', 'Pace_home', 'eFG%_home', 'TOV%_home', 'ORB%_home',
       'FT/FGA_home', 'ORtg_home', 'Team_home', 'Pace_visitor', 'eFG%_visitor',
       'TOV%_visitor', 'ORB%_visitor', 'FT/FGA_visitor', 'ORtg_visitor',
       'Team_visitor'],
      dtype='object')

In [None]:
Model - team based - similar opponents as training data

In [None]:
## Model - Player based

In [None]:
# players? -> include minutes played
# injured players?
# use news to guess how many minutes he will play