Imports

In [44]:
import os

from cmdstanpy import CmdStanModel
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

os.chdir('/app')

from src.web_scraping.scraper import scrap_games

Input parameters for scraping

In [45]:
years = list(range(2020, 2022))

Scraping

In [15]:
scrap_games(years)

HTTPError: 429 Client Error: Too Many Requests for url: https://www.basketball-reference.com/leagues/NBA_2016_games.html

Read data

In [46]:
path = 'src/data/'
path_games = os.path.join(path, 'games/')
seasons_games = {}

for year in years:
    seasons_games[year] = pd.read_csv(os.path.join(path_games, f'{year-1}_{year}_season.csv'))

Edit dataframe

In [49]:
# get unique team names
teams = {}
for year in years:
    teams[year] = list(set(seasons_games[year]['home_team']).union(set(seasons_games[year]['away_team'])))

    # map team to id
    team_to_id = {team: i + 1 for i, team in enumerate(teams[year])}

    seasons_games[year]['home_team_id'] = seasons_games[year]['home_team'].map(team_to_id)
    seasons_games[year]['away_team_id'] = seasons_games[year]['away_team'].map(team_to_id)

    # Add difference in points between home team and away team
    for year in years:
        seasons_games[year]['difference'] = seasons_games[year]['home_team_score'] - seasons_games[year]['away_team_score']

Show data (first 5 rows)

In [50]:
for year in years:
    print(seasons_games[year].head())

                  start_time             away_team  away_team_score  \
0  2019-10-23 00:00:00+00:00  NEW ORLEANS PELICANS              122   
1  2019-10-23 02:30:00+00:00    LOS ANGELES LAKERS              102   
2  2019-10-23 23:00:00+00:00         CHICAGO BULLS              125   
3  2019-10-23 23:00:00+00:00       DETROIT PISTONS              119   
4  2019-10-23 23:00:00+00:00   CLEVELAND CAVALIERS               85   

              home_team  home_team_score  home_team_id  away_team_id  \
0       TORONTO RAPTORS              130            28            23   
1  LOS ANGELES CLIPPERS              112            17             3   
2     CHARLOTTE HORNETS              126            29            24   
3        INDIANA PACERS              110            10             2   
4         ORLANDO MAGIC               94             6            14   

   difference  
0           8  
1          10  
2           1  
3          -9  
4           9  
                  start_time              aw

Prepare input data

In [51]:
year = 2020
inp_data = {
    'teams_number': len(teams[year]),
    'games_number': len(seasons_games[year]),
    'home_team': seasons_games[year].home_team_id.to_numpy(),
    'away_team': seasons_games[year].away_team_id.to_numpy(),
    'home_score': seasons_games[year].home_team_score.to_numpy(),
    'away_score': seasons_games[year].away_team_score.to_numpy(),
}

Modelling

Model 1 - posterior

In [53]:
with open('src/model/model_1_posterior.stan', 'r') as f:
    print(f.read())

data {
  int teams_number;
  int games_number;
  array[games_number] int home_team;
  array[games_number] int away_team;
  array[games_number] int<lower=0> home_score;
  array[games_number] int<lower=0> away_score;
}
parameters {
  real mu_home_att;
  real mu_away_att;
  real mu_home_def;
  real mu_away_def;
  real<lower=0> sigma2_att;
  real<lower=0> sigma2_def;
  real<lower=0> phi_home;
  real<lower=0> phi_away;

  vector[teams_number-1] home_att_raw;
  vector[teams_number-1] away_att_raw;
  vector[teams_number-1] home_def_raw;
  vector[teams_number-1] away_def_raw;
}
transformed parameters {
  vector[games_number] log_mu_home;
  vector[games_number] log_mu_away;
  vector[teams_number] home_att;
  vector[teams_number] away_att;
  vector[teams_number] home_def;
  vector[teams_number] away_def;

  // need to make sum(att)=sum(def)=0
  for (k in 1:(teams_number-1)) {
    home_att[k] = home_att_raw[k];
    away_att[k] = away_att_raw[k];
    home_def[k] = home_def_raw[k];
    away_def[k] 

In [55]:
model_1_prior = CmdStanModel(stan_file='src/model/model_1_posterior.stan')
R = 1000
sim=model_1_prior.sample(data=inp_data,
                     iter_sampling=R,
                     iter_warmup=1,
                     chains=1,
                     fixed_param=False,
                     refresh=R)

INFO:cmdstanpy:found newer exe file, not recompiling
INFO:cmdstanpy:CmdStan start processing
chain 1 |[34m██████████[0m| 00:01 Sampling completed                       

                                                                                


INFO:cmdstanpy:CmdStan done processing.





In [56]:
mod_1_post = sim.draws_pd()
mod_1_post.head()

Unnamed: 0,lp__,accept_stat__,stepsize__,treedepth__,n_leapfrog__,divergent__,energy__,mu_home_att,mu_away_att,mu_home_def,...,away_def[21],away_def[22],away_def[23],away_def[24],away_def[25],away_def[26],away_def[27],away_def[28],away_def[29],away_def[30]
0,-28161.2,0.0,5.79578,0.0,1.0,1.0,28228.0,-1.34692,1.09495,0.82671,...,1.01295,-0.945685,-0.736062,-0.542337,-0.501546,0.292315,-1.33988,-0.288685,-0.231245,10.2751
1,-28161.2,0.0,5.79578,0.0,1.0,1.0,28244.8,-1.34692,1.09495,0.82671,...,1.01295,-0.945685,-0.736062,-0.542337,-0.501546,0.292315,-1.33988,-0.288685,-0.231245,10.2751
2,-28161.2,0.0,5.79578,0.0,1.0,1.0,28231.6,-1.34692,1.09495,0.82671,...,1.01295,-0.945685,-0.736062,-0.542337,-0.501546,0.292315,-1.33988,-0.288685,-0.231245,10.2751
3,-28161.2,0.0,5.79578,0.0,1.0,1.0,28221.4,-1.34692,1.09495,0.82671,...,1.01295,-0.945685,-0.736062,-0.542337,-0.501546,0.292315,-1.33988,-0.288685,-0.231245,10.2751
4,-28161.2,0.0,5.79578,0.0,1.0,1.0,28243.0,-1.34692,1.09495,0.82671,...,1.01295,-0.945685,-0.736062,-0.542337,-0.501546,0.292315,-1.33988,-0.288685,-0.231245,10.2751
