# NBA Outcome Predictor


In [18]:
import sys
from pathlib import Path

# Make the project root importable so project-local packages can be imported below.
# The root is taken to be the parent of the current working directory, so the
# notebook has to be launched from a direct sub-folder of the project.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:  # guard: re-running the cell must not stack duplicates
    sys.path.insert(0, str(project_root))

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

We will load the processed data to perform our exploratory analysis.

In [20]:
# Project-local loader (importable because the project root was put on sys.path earlier).
from src.data.preprocess import load_and_preprocess_data

# Relative path: it is resolved against the notebook's working directory.
data_path = '../src/data/data.csv'
# The loader's result is unpacked into three values; the two *_cols names suggest
# lists of column names split by kind — TODO confirm in src/data/preprocess.py.
nba_df, numerical_cols, categorical_cols = load_and_preprocess_data(data_path)

# Preview the first rows of the loaded frame.
nba_df.head()

Unnamed: 0,Gtm,Date,At,Opp,Tm,Opp.1,OT,ORtg,DRtg,Pace,FTr,3PAr,TS%,TRB%,AST%,STL%,BLK%,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,ORB%.1,FT/FGA.1,Win,Team,Season
0,1,2014-10-28,@,SAS,100,101,,115.5,116.6,86.6,0.244,0.269,0.579,53.5,44.7,10.4,7.1,0.538,10.4,23.7,0.205,0.629,21.4,27.3,0.186,0,DAL,2015
1,1,2014-10-28,@,NOP,84,101,,88.2,106.1,95.2,0.25,0.131,0.45,52.5,53.1,5.3,10.7,0.405,16.2,30.8,0.19,0.426,7.3,39.4,0.149,0,ORL,2015
2,1,2014-10-28,@,LAL,108,90,,116.9,97.4,92.4,0.685,0.397,0.568,43.4,71.0,7.6,4.3,0.507,12.8,35.9,0.466,0.373,11.9,25.0,0.392,1,HOU,2015
3,1,2014-10-28,,ORL,101,84,,106.1,88.2,95.2,0.307,0.168,0.441,47.5,48.8,10.5,23.3,0.426,7.3,39.4,0.149,0.405,16.2,30.8,0.19,1,NOP,2015
4,1,2014-10-28,,HOU,90,108,,97.4,116.9,92.4,0.494,0.127,0.468,56.6,57.1,7.6,6.8,0.373,11.9,25.0,0.392,0.507,12.8,35.9,0.466,0,LAL,2015


## Data Overview

Let's check the shape and basic statistics of the dataset.

In [21]:
# Give the duplicated box-score columns explicit names: the un-suffixed stats become
# OFF_* and their '.1' twins become DEF_* ('Opp.1' holds the opponent's points).
nba_df = nba_df.rename(columns={
    'Opp.1': 'Opp_Points',
    'eFG%': 'OFF_eFG%', 'eFG%.1': 'DEF_eFG%',
    'TOV%': 'OFF_TOV%', 'TOV%.1': 'DEF_TOV%',
    'ORB%': 'OFF_ORB%', 'ORB%.1': 'DEF_ORB%',
    'FT/FGA': 'OFF_FT/FGA', 'FT/FGA.1': 'DEF_FT/FGA',
})

# away=1, home=0.
# Matching both the raw '@' marker and an already-encoded 1 keeps this cell idempotent:
# a plain `1 if x == '@' else 0` mapping turns every row into 0 (home) on a second run.
nba_df['At'] = nba_df['At'].isin(['@', 1]).astype('int64')

nba_df.dtypes

Gtm             int64
Date           object
At              int64
Opp            object
Tm              int64
Opp_Points      int64
OT             object
ORtg          float64
DRtg          float64
Pace          float64
FTr           float64
3PAr          float64
TS%           float64
TRB%          float64
AST%          float64
STL%          float64
BLK%          float64
OFF_eFG%      float64
OFF_TOV%      float64
OFF_ORB%      float64
OFF_FT/FGA    float64
DEF_eFG%      float64
DEF_TOV%      float64
DEF_ORB%      float64
DEF_FT/FGA    float64
Win             int64
Team           object
Season          int64
dtype: object

In [22]:
# Re-display the first rows to check the renamed columns and the 0/1 'At' flag.
nba_df.head()

Unnamed: 0,Gtm,Date,At,Opp,Tm,Opp_Points,OT,ORtg,DRtg,Pace,FTr,3PAr,TS%,TRB%,AST%,STL%,BLK%,OFF_eFG%,OFF_TOV%,OFF_ORB%,OFF_FT/FGA,DEF_eFG%,DEF_TOV%,DEF_ORB%,DEF_FT/FGA,Win,Team,Season
0,1,2014-10-28,1,SAS,100,101,,115.5,116.6,86.6,0.244,0.269,0.579,53.5,44.7,10.4,7.1,0.538,10.4,23.7,0.205,0.629,21.4,27.3,0.186,0,DAL,2015
1,1,2014-10-28,1,NOP,84,101,,88.2,106.1,95.2,0.25,0.131,0.45,52.5,53.1,5.3,10.7,0.405,16.2,30.8,0.19,0.426,7.3,39.4,0.149,0,ORL,2015
2,1,2014-10-28,1,LAL,108,90,,116.9,97.4,92.4,0.685,0.397,0.568,43.4,71.0,7.6,4.3,0.507,12.8,35.9,0.466,0.373,11.9,25.0,0.392,1,HOU,2015
3,1,2014-10-28,0,ORL,101,84,,106.1,88.2,95.2,0.307,0.168,0.441,47.5,48.8,10.5,23.3,0.426,7.3,39.4,0.149,0.405,16.2,30.8,0.19,1,NOP,2015
4,1,2014-10-28,0,HOU,90,108,,97.4,116.9,92.4,0.494,0.127,0.468,56.6,57.1,7.6,6.8,0.373,11.9,25.0,0.392,0.507,12.8,35.9,0.466,0,LAL,2015


In [23]:
# Report how many missing values each column holds.
null_counts = nba_df.isnull().sum()
for col_name, n_missing in null_counts.items():
    print(f'{col_name}: {n_missing}')


Gtm: 0
Date: 0
At: 0
Opp: 0
Tm: 0
Opp_Points: 0
OT: 26692
ORtg: 0
DRtg: 0
Pace: 0
FTr: 0
3PAr: 0
TS%: 0
TRB%: 0
AST%: 0
STL%: 0
BLK%: 0
OFF_eFG%: 0
OFF_TOV%: 0
OFF_ORB%: 0
OFF_FT/FGA: 0
DEF_eFG%: 0
DEF_TOV%: 0
DEF_ORB%: 0
DEF_FT/FGA: 0
Win: 0
Team: 0
Season: 0


We see that only the `OT` field has nulls, so let's fill them with zeros.

In [24]:
def change_OT(row):
    """Convert the raw 'OT' marker of one game row into a number of overtime periods.

    Mapping applied to ``row['OT']``:
      * missing (NaN / None)       -> 0
      * text with leading digits   -> those digits as an int (e.g. '2OT' -> 2)
      * any other non-blank text   -> 1 (e.g. 'OT')
      * blank text                 -> 0
      * already numeric            -> int(value), so re-running the cell is harmless

    Parameters
    ----------
    row : pandas.Series
        One row of the games frame; it must contain an 'OT' entry.

    Returns
    -------
    pandas.Series
        A copy of ``row`` whose 'OT' entry is the integer overtime count.
    """
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    row = row.copy()
    marker = row['OT']

    if pd.isnull(marker):
        periods = 0
    elif isinstance(marker, str):
        text = marker.strip()
        # Everything stripped from the left by lstrip() is the leading digit run.
        leading_digits = text[:len(text) - len(text.lstrip('0123456789'))]
        if leading_digits:
            periods = int(leading_digits)
        else:
            periods = 1 if text else 0
    else:
        # Already converted on a previous run: keep the count instead of
        # failing on `marker[0]`.
        periods = int(marker)

    row['OT'] = periods
    return row

In [25]:
# Normalise the 'OT' column one row at a time with change_OT.
nba_df = nba_df.apply(change_OT, axis=1)

# Tally the missing values per column again to confirm 'OT' no longer has any.
for col_name, n_missing in nba_df.isnull().sum().items():
    print(f'{col_name}: {n_missing}')

nba_df.head()

Gtm: 0
Date: 0
At: 0
Opp: 0
Tm: 0
Opp_Points: 0
OT: 0
ORtg: 0
DRtg: 0
Pace: 0
FTr: 0
3PAr: 0
TS%: 0
TRB%: 0
AST%: 0
STL%: 0
BLK%: 0
OFF_eFG%: 0
OFF_TOV%: 0
OFF_ORB%: 0
OFF_FT/FGA: 0
DEF_eFG%: 0
DEF_TOV%: 0
DEF_ORB%: 0
DEF_FT/FGA: 0
Win: 0
Team: 0
Season: 0


Unnamed: 0,Gtm,Date,At,Opp,Tm,Opp_Points,OT,ORtg,DRtg,Pace,FTr,3PAr,TS%,TRB%,AST%,STL%,BLK%,OFF_eFG%,OFF_TOV%,OFF_ORB%,OFF_FT/FGA,DEF_eFG%,DEF_TOV%,DEF_ORB%,DEF_FT/FGA,Win,Team,Season
0,1,2014-10-28,1,SAS,100,101,0,115.5,116.6,86.6,0.244,0.269,0.579,53.5,44.7,10.4,7.1,0.538,10.4,23.7,0.205,0.629,21.4,27.3,0.186,0,DAL,2015
1,1,2014-10-28,1,NOP,84,101,0,88.2,106.1,95.2,0.25,0.131,0.45,52.5,53.1,5.3,10.7,0.405,16.2,30.8,0.19,0.426,7.3,39.4,0.149,0,ORL,2015
2,1,2014-10-28,1,LAL,108,90,0,116.9,97.4,92.4,0.685,0.397,0.568,43.4,71.0,7.6,4.3,0.507,12.8,35.9,0.466,0.373,11.9,25.0,0.392,1,HOU,2015
3,1,2014-10-28,0,ORL,101,84,0,106.1,88.2,95.2,0.307,0.168,0.441,47.5,48.8,10.5,23.3,0.426,7.3,39.4,0.149,0.405,16.2,30.8,0.19,1,NOP,2015
4,1,2014-10-28,0,HOU,90,108,0,97.4,116.9,92.4,0.494,0.127,0.468,56.6,57.1,7.6,6.8,0.373,11.9,25.0,0.392,0.507,12.8,35.9,0.466,0,LAL,2015


In [26]:
# Rebuild the numeric / categorical column-name lists from the frame's current dtypes.
numerical_cols = nba_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = nba_df.select_dtypes(include=['object']).columns.tolist()

print(numerical_cols)
print(categorical_cols)

['Gtm', 'At', 'Tm', 'Opp_Points', 'OT', 'ORtg', 'DRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'OFF_eFG%', 'OFF_TOV%', 'OFF_ORB%', 'OFF_FT/FGA', 'DEF_eFG%', 'DEF_TOV%', 'DEF_ORB%', 'DEF_FT/FGA', 'Win', 'Season']
['Date', 'Opp', 'Team']


In [27]:
# Remove the two final-score columns (team points and opponent points) —
# presumably because together they give away the 'Win' label; TODO confirm intent.
nba_df = nba_df.drop(['Tm', 'Opp_Points'], axis=1)

# Refresh the numeric / categorical column-name lists after the drop.
numerical_cols = nba_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = nba_df.select_dtypes(include=['object']).columns.tolist()

print(numerical_cols)
print(categorical_cols)

['Gtm', 'At', 'OT', 'ORtg', 'DRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'OFF_eFG%', 'OFF_TOV%', 'OFF_ORB%', 'OFF_FT/FGA', 'DEF_eFG%', 'DEF_TOV%', 'DEF_ORB%', 'DEF_FT/FGA', 'Win', 'Season']
['Date', 'Opp', 'Team']


In [29]:
# Parse 'Date' from text into datetimes so that dates can be compared and subtracted.
nba_df = nba_df.assign(Date=pd.to_datetime(nba_df['Date']))

nba_df.head()

Unnamed: 0,Gtm,Date,At,Opp,OT,ORtg,DRtg,Pace,FTr,3PAr,TS%,TRB%,AST%,STL%,BLK%,OFF_eFG%,OFF_TOV%,OFF_ORB%,OFF_FT/FGA,DEF_eFG%,DEF_TOV%,DEF_ORB%,DEF_FT/FGA,Win,Team,Season
0,1,2014-10-28,1,SAS,0,115.5,116.6,86.6,0.244,0.269,0.579,53.5,44.7,10.4,7.1,0.538,10.4,23.7,0.205,0.629,21.4,27.3,0.186,0,DAL,2015
1,1,2014-10-28,1,NOP,0,88.2,106.1,95.2,0.25,0.131,0.45,52.5,53.1,5.3,10.7,0.405,16.2,30.8,0.19,0.426,7.3,39.4,0.149,0,ORL,2015
2,1,2014-10-28,1,LAL,0,116.9,97.4,92.4,0.685,0.397,0.568,43.4,71.0,7.6,4.3,0.507,12.8,35.9,0.466,0.373,11.9,25.0,0.392,1,HOU,2015
3,1,2014-10-28,0,ORL,0,106.1,88.2,95.2,0.307,0.168,0.441,47.5,48.8,10.5,23.3,0.426,7.3,39.4,0.149,0.405,16.2,30.8,0.19,1,NOP,2015
4,1,2014-10-28,0,HOU,0,97.4,116.9,92.4,0.494,0.127,0.468,56.6,57.1,7.6,6.8,0.373,11.9,25.0,0.392,0.507,12.8,35.9,0.466,0,LAL,2015


Encode the categorical features. Tree-based classifiers gain little from this, but the encoded columns are useful for trying other kinds of models.

In [30]:
# Integer-encode the 'Team' and 'Opp' columns.
from sklearn.preprocessing import LabelEncoder

# A single encoder is fitted on every code found in either column, so a team
# receives the same integer whether it appears as 'Team' or as 'Opp'.
all_teams = pd.concat([nba_df['Team'], nba_df['Opp']]).unique()
le = LabelEncoder().fit(all_teams)

nba_df = nba_df.assign(
    Team_encoded=le.transform(nba_df['Team']),
    Opp_encoded=le.transform(nba_df['Opp']),
)

nba_df.head()



Unnamed: 0,Gtm,Date,At,Opp,OT,ORtg,DRtg,Pace,FTr,3PAr,TS%,TRB%,AST%,STL%,BLK%,OFF_eFG%,OFF_TOV%,OFF_ORB%,OFF_FT/FGA,DEF_eFG%,DEF_TOV%,DEF_ORB%,DEF_FT/FGA,Win,Team,Season,Team_encoded,Opp_encoded
0,1,2014-10-28,1,SAS,0,115.5,116.6,86.6,0.244,0.269,0.579,53.5,44.7,10.4,7.1,0.538,10.4,23.7,0.205,0.629,21.4,27.3,0.186,0,DAL,2015,6,26
1,1,2014-10-28,1,NOP,0,88.2,106.1,95.2,0.25,0.131,0.45,52.5,53.1,5.3,10.7,0.405,16.2,30.8,0.19,0.426,7.3,39.4,0.149,0,ORL,2015,21,18
2,1,2014-10-28,1,LAL,0,116.9,97.4,92.4,0.685,0.397,0.568,43.4,71.0,7.6,4.3,0.507,12.8,35.9,0.466,0.373,11.9,25.0,0.392,1,HOU,2015,10,13
3,1,2014-10-28,0,ORL,0,106.1,88.2,95.2,0.307,0.168,0.441,47.5,48.8,10.5,23.3,0.426,7.3,39.4,0.149,0.405,16.2,30.8,0.19,1,NOP,2015,18,21
4,1,2014-10-28,0,HOU,0,97.4,116.9,92.4,0.494,0.127,0.468,56.6,57.1,7.6,6.8,0.373,11.9,25.0,0.392,0.507,12.8,35.9,0.466,0,LAL,2015,13,10


In [31]:
# Drop 'Gtm': per the original note, the ordering it carries is already present in the data.
nba_df = nba_df.drop('Gtm', axis=1)

nba_df.head()


Unnamed: 0,Date,At,Opp,OT,ORtg,DRtg,Pace,FTr,3PAr,TS%,TRB%,AST%,STL%,BLK%,OFF_eFG%,OFF_TOV%,OFF_ORB%,OFF_FT/FGA,DEF_eFG%,DEF_TOV%,DEF_ORB%,DEF_FT/FGA,Win,Team,Season,Team_encoded,Opp_encoded
0,2014-10-28,1,SAS,0,115.5,116.6,86.6,0.244,0.269,0.579,53.5,44.7,10.4,7.1,0.538,10.4,23.7,0.205,0.629,21.4,27.3,0.186,0,DAL,2015,6,26
1,2014-10-28,1,NOP,0,88.2,106.1,95.2,0.25,0.131,0.45,52.5,53.1,5.3,10.7,0.405,16.2,30.8,0.19,0.426,7.3,39.4,0.149,0,ORL,2015,21,18
2,2014-10-28,1,LAL,0,116.9,97.4,92.4,0.685,0.397,0.568,43.4,71.0,7.6,4.3,0.507,12.8,35.9,0.466,0.373,11.9,25.0,0.392,1,HOU,2015,10,13
3,2014-10-28,0,ORL,0,106.1,88.2,95.2,0.307,0.168,0.441,47.5,48.8,10.5,23.3,0.426,7.3,39.4,0.149,0.405,16.2,30.8,0.19,1,NOP,2015,18,21
4,2014-10-28,0,HOU,0,97.4,116.9,92.4,0.494,0.127,0.468,56.6,57.1,7.6,6.8,0.373,11.9,25.0,0.392,0.507,12.8,35.9,0.466,0,LAL,2015,13,10


# Compute Rolling Averages
We will compute the following statistics (new features to add) for both Team and Opponent

- home_win_pt
- away_win_pt
- rolling_window_averages for the following features:
    features_to_roll = [
        'OT' (number of OT in last matches),
        'ORtg', 'DRtg', 'Pace',
        'OFF_eFG%', 'DEF_eFG%',
        'OFF_TOV%', 'DEF_TOV%',
        'OFF_ORB%', 'DEF_ORB%',
        'OFF_FT/FGA', 'DEF_FT/FGA',
        'TS%', 'TRB%', 'AST%', 'STL%', 'BLK%',
        'RestDays', 'WinStreak', 'LoseStreak', 'Win'
    ]

In [194]:
#features added: ['RestDays', 'home_TEAM_win_pt_at_HOME', 'away_TEAM_win_pt_AWAY', 'WinStreak', 'LoseStreak']
def _prior_win_pct(games):
    """Share of wins in each team's earlier games of the same season within `games`.

    `games` must be in chronological order inside every (Team, Season) group.
    The first game of a group has no history and gets 0.
    """
    wins = games.groupby(['Team', 'Season'])['Win']
    wins_before = wins.cumsum() - games['Win']   # wins strictly before the current game
    games_before = wins.cumcount()               # number of earlier games in the group
    # 0 / 0 on a group's first game yields NaN, which becomes the 0 placeholder.
    return (wins_before / games_before).fillna(0)


def _streak_before_game(games, outcome):
    """Length of the run of consecutive `outcome` results a team carries into each game.

    `games` must be sorted by Team, Season, Date. The run restarts at 0 for every new
    (Team, Season) and the current game's own result is never counted.
    Returns a list aligned with the rows of `games`.
    """
    streaks = []
    run = 0
    previous_key = None
    previous_result = None
    for team, season, result in zip(games['Team'], games['Season'], games['Win']):
        key = (team, season)
        if key == previous_key and previous_result == outcome:
            run += 1
        else:
            run = 0
        streaks.append(run)
        previous_key, previous_result = key, result
    return streaks


def compute_adding_features(df, first_game_rest_days=3):
    """Add schedule and form features that only use games played before each row's game.

    Columns added:
      * RestDays                  - days since the team's previous game of the season
                                    (`first_game_rest_days` for the season opener)
      * home_TEAM_win_pt_at_HOME  - the home side's win share in its earlier home games
      * away_TEAM_win_pt_AWAY     - the away side's win share in its earlier away games
      * WinStreak / LoseStreak    - consecutive wins / losses of `Team` entering the game

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'Date' (datetime), 'Team', 'Opp', 'Win' (0/1),
        'At' (1 = `Team` plays away, 0 = home) and 'Season'.
    first_game_rest_days : int, default 3
        Value used for RestDays when a team has no earlier game in the season.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) sorted by Team, Season, Date.
    """
    df = df.copy()
    keys = ['Team', 'Season']
    game_cols = ['Date', 'Team', 'Win', 'At', 'Season']

    # One row per (team, game): the rows as given, plus the same games seen from the
    # opponent's side ('Win' and 'At' are flipped because both refer to `Team`).
    team_games = df[game_cols]
    opp_games = (
        df[['Date', 'Opp', 'Win', 'At', 'Season']]
        .rename(columns={'Opp': 'Team'})
        .assign(Win=lambda g: 1 - g['Win'], At=lambda g: 1 - g['At'])
    )
    all_games = (
        pd.concat([team_games, opp_games])
        .drop_duplicates(subset=['Team', 'Season', 'Date'])
        .sort_values(['Team', 'Season', 'Date'])
        .reset_index(drop=True)
    )

    #!RESTDAYS
    # diff() is NaT on a team's first game of the season -> replaced by the default.
    all_games['RestDays'] = (
        all_games.groupby(keys)['Date'].diff().dt.days.fillna(first_game_rest_days)
    )
    df = pd.merge(df, all_games[['Team', 'Date', 'RestDays']], on=['Team', 'Date'], how='left')

    #!HOME AND AWAY WIN PT
    # all_games is already chronological per team-season, so the subsets are too.
    home_games = all_games[all_games['At'] == 0].copy()
    away_games = all_games[all_games['At'] == 1].copy()
    home_games['home_TEAM_win_pt_at_HOME'] = _prior_win_pct(home_games)
    away_games['away_TEAM_win_pt_AWAY'] = _prior_win_pct(away_games)

    home_pct = home_games[['Team', 'Date', 'home_TEAM_win_pt_at_HOME']]
    away_pct = away_games[['Team', 'Date', 'away_TEAM_win_pt_AWAY']]
    as_opp = {'Team': 'Opp'}

    # Rows where `Team` is at home: home share comes from Team, away share from Opp.
    home_rows = (
        df[df['At'] == 0]
        .merge(home_pct, on=['Team', 'Date'], how='left')
        .merge(away_pct.rename(columns=as_opp), on=['Opp', 'Date'], how='left')
    )
    # Rows where `Team` is away: away share comes from Team, home share from Opp.
    away_rows = (
        df[df['At'] == 1]
        .merge(away_pct, on=['Team', 'Date'], how='left')
        .merge(home_pct.rename(columns=as_opp), on=['Opp', 'Date'], how='left')
    )
    df = (
        pd.concat([home_rows, away_rows], axis=0)
        .sort_values(['Team', 'Season', 'Date'])
        .reset_index(drop=True)
        .drop_duplicates(subset=['Team', 'Season', 'Date'])
    )

    #!WIN AND LOSE STREAK
    all_games['WinStreak'] = _streak_before_game(all_games, outcome=1)
    all_games['LoseStreak'] = _streak_before_game(all_games, outcome=0)

    final_df = pd.merge(
        df,
        all_games[['Team', 'Date', 'WinStreak', 'LoseStreak']],
        on=['Team', 'Date'],
        how='left',
    )

    return final_df

In [168]:
# Columns handed to compute_rolling_averages.
# 'Team' and 'Season' are used there as the groupby keys and are left out of the
# '_L10' renaming; every other entry is a per-game value to be averaged.
features_to_roll = [
    # number of OT in last matches
    'OT',
    # ratings and pace
    'ORtg',
    'DRtg',
    'Pace',
    # OFF_* / DEF_* pairs
    'OFF_eFG%',
    'DEF_eFG%',
    'OFF_TOV%',
    'DEF_TOV%',
    'OFF_ORB%',
    'DEF_ORB%',
    'OFF_FT/FGA',
    'DEF_FT/FGA',
    # remaining percentage stats
    'TS%',
    'TRB%',
    'AST%',
    'STL%',
    'BLK%',
    # grouping key, result, grouping key
    'Team',
    'Win',
    'Season',
]

In [169]:
def ComputeRollingAverages(row, window=10):
    """Mean of the previous `window` rows for every column, excluding the current row.

    Despite the parameter name, `row` is the block of rows belonging to one group.
    The first `window` rows of the block come out as NaN: rolling() needs `window`
    observations and shift(1) then moves every mean one row down, so a game never
    contributes to its own feature.
    """
    return row.rolling(window).mean().shift(1)


def compute_rolling_averages(df, features=None, window=10):
    """Append per-(Team, Season) rolling means of past games as '<col>_L<window>' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Team', 'Season' and the statistic columns. Rows have to be in
        chronological order inside each (Team, Season) and the index must be unique,
        because the rolled values are joined back on the index.
    features : list of str, optional
        Columns to average. Defaults to the notebook-level `features_to_roll`.
        'Team' and 'Season' entries are ignored: they are grouping keys, not statistics.
    window : int, default 10
        Number of previous games averaged.

    Returns
    -------
    pandas.DataFrame
        `df` plus the rolled columns, without the rows that contain any NaN
        (in particular each group's first `window` games).
    """
    group_cols = ['Team', 'Season']
    if features is None:
        features = features_to_roll
    # Roll the statistics only: averaging the string 'Team' column fails, and a rolled
    # 'Season' would come back as a second, un-renamed 'Season' column.
    stat_cols = [col for col in features if col not in group_cols]

    # group_keys=False keeps the original row index so the result lines up with df.
    rolled = (
        df.groupby(group_cols, group_keys=False)[stat_cols]
        .apply(ComputeRollingAverages, window=window)
        .rename(columns={col: f'{col}_L{window}' for col in stat_cols})
    )

    final_df = pd.concat([df, rolled], axis=1)
    final_df = final_df.dropna()
    return final_df


# Collect Opponent Statistics
For every row, we add the same exact statistics, but for the opponent too

In [170]:
# Row count of the cleaned frame, to compare with the engineered frame's length printed further down.
print(len(nba_df))

28248


In [195]:
# Add the engineered columns, then order the rows chronologically per team and season.
new_df = compute_adding_features(nba_df)
new_df = new_df.sort_values(by=['Team', 'Season', 'Date'])
new_df.head()


Unnamed: 0,Date,At,Opp,OT,ORtg,DRtg,Pace,FTr,3PAr,TS%,TRB%,AST%,STL%,BLK%,OFF_eFG%,OFF_TOV%,OFF_ORB%,OFF_FT/FGA,DEF_eFG%,DEF_TOV%,DEF_ORB%,DEF_FT/FGA,Win,Team,Season,Team_encoded,Opp_encoded,RestDays,home_TEAM_win_pt_at_HOME,away_TEAM_win_pt_AWAY,WinStreak,LoseStreak
0,2014-10-29,1,TOR,0,107.4,114.8,95.0,0.213,0.275,0.583,53.3,65.0,6.3,12.5,0.581,17.8,23.8,0.113,0.456,8.7,33.3,0.3,0,ATL,2015,0,27,3.0,0.0,0.0,0,0
1,2014-11-01,0,IND,0,109.4,98.7,93.2,0.478,0.29,0.611,54.3,74.3,10.7,12.2,0.558,13.5,8.3,0.362,0.457,16.6,24.4,0.222,1,ATL,2015,0,11,3.0,0.0,0.0,0,1
2,2014-11-05,1,SAS,0,95.8,97.9,96.0,0.12,0.272,0.475,57.5,68.4,14.6,9.6,0.457,12.6,20.4,0.087,0.486,19.7,28.9,0.391,0,ATL,2015,0,26,4.0,1.0,0.0,1,0
3,2014-11-07,1,CHO,2,103.7,106.3,95.0,0.28,0.355,0.57,57.3,65.1,7.0,3.9,0.532,15.4,14.9,0.215,0.526,16.2,26.2,0.206,0,ATL,2015,0,4,2.0,0.666667,0.0,0,1
4,2014-11-08,0,NYK,0,115.2,107.3,89.4,0.444,0.272,0.532,51.8,54.5,11.2,7.9,0.463,8.5,27.9,0.346,0.524,14.4,31.0,0.095,1,ATL,2015,0,19,1.0,1.0,0.333333,0,2


In [196]:
# Debug view of the win-percentage columns, one frame per team.
# NOTE: groups are (Team, Season) but the dict key is the team alone, so every later
# season overwrites the earlier one and only the last season iterated per team remains.
team_dict = {
    team_name: season_games
    for (team_name, _season), season_games in new_df.groupby(['Team', 'Season'])[
        ['Win', 'Season', 'At', 'home_TEAM_win_pt_at_HOME', 'away_TEAM_win_pt_AWAY']
    ]
}
print(len(new_df))
print(team_dict['LAL'].iloc[:50])

28248
       Win  Season  At  home_TEAM_win_pt_at_HOME  away_TEAM_win_pt_AWAY
13216    1    2025   0                  0.000000               0.000000
13217    1    2025   0                  1.000000               1.000000
13218    1    2025   0                  1.000000               0.000000
13219    0    2025   1                  1.000000               0.000000
13220    0    2025   1                  1.000000               0.000000
13221    1    2025   1                  0.333333               0.000000
13222    0    2025   1                  0.000000               0.333333
13223    0    2025   1                  0.500000               0.250000
13224    1    2025   0                  1.000000               0.250000
13225    1    2025   0                  1.000000               0.000000
13226    1    2025   0                  1.000000               0.600000
13227    1    2025   1                  0.714286               0.200000
13228    1    2025   1                  0.428571          

In [197]:
# Check that the engineered frame has no missing values in any column.
for col, n_missing in new_df.isna().sum().items():
    print(f'{col}: {n_missing}')

Date: 0
At: 0
Opp: 0
OT: 0
ORtg: 0
DRtg: 0
Pace: 0
FTr: 0
3PAr: 0
TS%: 0
TRB%: 0
AST%: 0
STL%: 0
BLK%: 0
OFF_eFG%: 0
OFF_TOV%: 0
OFF_ORB%: 0
OFF_FT/FGA: 0
DEF_eFG%: 0
DEF_TOV%: 0
DEF_ORB%: 0
DEF_FT/FGA: 0
Win: 0
Team: 0
Season: 0
Team_encoded: 0
Opp_encoded: 0
RestDays: 0
home_TEAM_win_pt_at_HOME: 0
away_TEAM_win_pt_AWAY: 0
WinStreak: 0
LoseStreak: 0


In [174]:
#compute rolling averages (for OT too!)


In [175]:
#we can probably drop Date, FTr and 3PAr (check with correlation_matrix or SequentialFeatureSelector or FeatureImportance)