In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from nba_api.stats.static import teams

In [2]:
df = pd.read_csv("NBA_Stats.csv")
df_extra = pd.read_csv("EXTRA_NBA_Stats.csv")

In [3]:
df_extra_home = df_extra[df_extra["MATCHUP"].str.contains("vs.")]
df_extra_away = df_extra[df_extra["MATCHUP"].str.contains("@")]

In [4]:
df_extra_home = df_extra_home[["GAME_ID", "FG_PCT", "REB", "TOV"]]
df_extra_home = df_extra_home.rename(
    columns={
        "GAME_ID": "Game_ID",
        "FG_PCT": "FG_PCT_Home",
        "REB": "REB_Home",
        "TOV": "TOV_Home",
    }
)

df_extra_away = df_extra_away[["GAME_ID", "FG_PCT", "REB", "TOV"]]
df_extra_away = df_extra_away.rename(
    columns={
        "GAME_ID": "Game_ID",
        "FG_PCT": "FG_PCT_Away",
        "REB": "REB_Away",
        "TOV": "TOV_Away",
    }
)

In [5]:
teams = teams.get_teams()

In [6]:
Capacity = {
    "Atlanta Hawks": 16600,
    "Boston Celtics": 18624,
    "Brooklyn Nets": 17732,
    "Charlotte Hornets": 19077,
    "Chicago Bulls": 20917,
    "Cleveland Cavaliers": 19432,
    "Dallas Mavericks": 19200,
    "Denver Nuggets": 19520,
    "Detroit Pistons": 20332,
    "Golden State Warriors": 18064,
    "Houston Rockets": 18055,
    "Indiana Pacers": 17923,
    "Los Angeles Clippers": 19060,
    "Los Angeles Lakers": 19060,
    "Memphis Grizzlies": 18119,
    "Miami Heat": 19600,
    "Milwaukee Bucks": 17341,
    "Minnesota Timberwolves": 18798,
    "New Orleans Pelicans": 16867,
    "New York Knicks": 19812,
    "Oklahoma City Thunder": 18203,
    "Orlando Magic": 18846,
    "Philadelphia 76ers": 20478,
    "Phoenix Suns": 18422,
    "Portland Trail Blazers": 19393,
    "Sacramento Kings": 17608,
    "San Antonio Spurs": 18418,
    "Toronto Raptors": 19800,
    "Utah Jazz": 18306,
    "Washington Wizards": 20356,
}

In [7]:
res = 0
for val in Capacity.values():
    res += val

# using len() to get total keys for mean computation
res = res / len(Capacity)

print(res)

18798.766666666666


In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,Game_ID,Home_ID,Away_ID,Attendance,Points_Home,Points_Away,Fouls_Home,Fouls_Away,Referee_1,Referee_2,Referee_3,Off_Rating_Home,Off_Rating_Away
0,0,21900002,1610612746,1610612747,19068.0,112,102,25,24,200834,201640,203592,115.5,105.2
1,1,21900001,1610612761,1610612740,20787.0,130,122,24,34,200833,201639,2529,113.0,108.9
2,2,21900008,1610612755,1610612738,20422.0,107,93,34,29,201245,2882,1151,103.9,90.3
3,3,21900010,1610612759,1610612752,18354.0,120,111,18,32,202053,1628954,2714,110.1,100.9
4,4,21900012,1610612756,1610612758,18055.0,124,95,25,24,202058,1628951,1194,112.7,88.0


In [9]:
te = {}
for i in teams:
    te[i["id"]] = i["full_name"]

In [10]:
df["Home_Team"] = df["Home_ID"].apply(lambda x: te[x])
df["Away_Team"] = df["Away_ID"].apply(lambda x: te[x])
df = df.drop("Unnamed: 0", axis=1)
df = df.fillna(0)

In [11]:
df = df.merge(df_extra_home, on="Game_ID", how="left")
df = df.merge(df_extra_away, on="Game_ID", how="left")

In [12]:
df["Home_Win"] = df.apply(
    lambda x: 1 if x["Points_Home"] > x["Points_Away"] else 0, axis=1
)
df["Points_diff"] = df.apply(lambda x: x["Points_Home"] - x["Points_Away"], axis=1)
df["Points_total"] = df.apply(lambda x: x["Points_Home"] + x["Points_Away"], axis=1)
df["Fouls_diff"] = df.apply(lambda x: x["Fouls_Home"] - x["Fouls_Away"], axis=1)
df["Fouls_total"] = df.apply(lambda x: x["Fouls_Home"] + x["Fouls_Away"], axis=1)
df["Off_Rating_diff"] = df.apply(
    lambda x: x["Off_Rating_Home"] - x["Off_Rating_Away"], axis=1
)
df["Off_Rating_total"] = df.apply(
    lambda x: x["Off_Rating_Home"] + x["Off_Rating_Away"], axis=1
)
df["Referee_1"] = df["Referee_1"].astype(str)
df["Referee_2"] = df["Referee_2"].astype(str)
df["Referee_3"] = df["Referee_3"].astype(str)
df["Closed_Doors"] = df["Attendance"].apply(lambda x: 1 if x == 0 else 0)
df["Max_Seats"] = df["Home_Team"].apply(lambda x: Capacity[x])
df["Capacity"] = df.apply(lambda x: x["Attendance"] / x["Max_Seats"], axis=1)
df["Capacity"] = df["Capacity"].apply(lambda x: 1 if x > 1 else x)
df["Cap x Off"] = df.apply(lambda x: x["Capacity"] * x["Off_Rating_diff"], axis=1)

In [13]:
df.head()

Unnamed: 0,Game_ID,Home_ID,Away_ID,Attendance,Points_Home,Points_Away,Fouls_Home,Fouls_Away,Referee_1,Referee_2,...,Points_diff,Points_total,Fouls_diff,Fouls_total,Off_Rating_diff,Off_Rating_total,Closed_Doors,Max_Seats,Capacity,Cap x Off
0,21900002,1610612746,1610612747,19068.0,112,102,25,24,200834,201640,...,10,214,1,49,10.3,220.7,0,19060,1.0,10.3
1,21900001,1610612761,1610612740,20787.0,130,122,24,34,200833,201639,...,8,252,-10,58,4.1,221.9,0,19800,1.0,4.1
2,21900008,1610612755,1610612738,20422.0,107,93,34,29,201245,2882,...,14,200,5,63,13.6,194.2,0,20478,0.997265,13.562809
3,21900010,1610612759,1610612752,18354.0,120,111,18,32,202053,1628954,...,9,231,-14,50,9.2,211.0,0,18418,0.996525,9.168031
4,21900012,1610612756,1610612758,18055.0,124,95,25,24,202058,1628951,...,29,219,1,49,24.7,200.7,0,18422,0.980078,24.207931


In [14]:
df_nosd = df[df["Attendance"] != 0]
df_sd = df[df["Attendance"] == 0]

In [15]:
df_nosd["Capacity"].values.mean()

0.9415117886394069

In [16]:
x_catagorical = ["Home_Team", "Away_Team", "Referee_1", "Referee_2", "Referee_3"]

dum = pd.get_dummies(df[x_catagorical])

In [17]:
y_vars = [
    "Home_Win",
    "Points_diff",
    "Points_total",
    "Fouls_Home",
    "Fouls_Away",
    "Fouls_diff",
    "Off_Rating_diff",
]
X1 = pd.concat([df[["Attendance", "Closed_Doors"]], dum], axis=1)

In [18]:
X2 = pd.concat([df["Capacity"], dum], axis=1)

In [19]:
y_3 = [
    "Home_Win",
    "Fouls_Home",
    "Fouls_Away",
    "Fouls_diff",
]

In [20]:
X3 = pd.concat(
    [
        df[["Capacity", "Off_Rating_diff", "Cap x Off"]],
        dum,
    ],
    axis=1,
)

In [21]:
for i in y_vars:
    mod = sm.OLS(df["Points_diff"], X1)
    res = mod.fit()
    print(res.summary())

                            OLS Regression Results                            
Dep. Variable:            Points_diff   R-squared:                       0.386
Model:                            OLS   Adj. R-squared:                  0.169
Method:                 Least Squares   F-statistic:                     1.778
Date:                Tue, 07 Dec 2021   Prob (F-statistic):           6.42e-10
Time:                        16:42:38   Log-Likelihood:                -4046.2
No. Observations:                1059   AIC:                             8646.
Df Residuals:                     782   BIC:                         1.002e+04
Df Model:                         276                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Attendan

In [22]:
for i in y_vars:
    mod = sm.OLS(df[i], X2)
    res = mod.fit()
    print(res.summary())

                            OLS Regression Results                            
Dep. Variable:               Home_Win   R-squared:                       0.365
Model:                            OLS   Adj. R-squared:                  0.142
Method:                 Least Squares   F-statistic:                     1.638
Date:                Tue, 07 Dec 2021   Prob (F-statistic):           1.15e-07
Time:                        16:42:41   Log-Likelihood:                -522.35
No. Observations:                1059   AIC:                             1597.
Df Residuals:                     783   BIC:                             2967.
Df Model:                         275                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Capacity

In [23]:
for i in y_3:
    mod = sm.OLS(df[i], X3)
    res = mod.fit()
    print(res.summary())

                            OLS Regression Results                            
Dep. Variable:               Home_Win   R-squared:                       0.734
Model:                            OLS   Adj. R-squared:                  0.640
Method:                 Least Squares   F-statistic:                     7.778
Date:                Tue, 07 Dec 2021   Prob (F-statistic):          1.39e-113
Time:                        16:42:43   Log-Likelihood:                -61.890
No. Observations:                1059   AIC:                             679.8
Df Residuals:                     781   BIC:                             2060.
Df Model:                         277                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Capacity