In [1]:
from bs4 import BeautifulSoup
import requests
import datetime
import pandas as pd
import numpy as np
import json
import lightgbm as lgb
import pickle

# query game date

In [3]:
def get_web_page(url, timeout=10):
    """Download a page and return its body decoded as UTF-8.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 10 so a stalled connection cannot hang the notebook
        indefinitely.

    Returns
    -------
    str or None
        The response text, or ``None`` when the request fails or the server
        answers with anything other than HTTP 200.
    """
    try:
        resp = requests.get(url=url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status code: print a message and hand back None.
        print('Request failed:', url, exc)
        return None
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    # Force UTF-8 decoding instead of relying on the encoding requests guesses.
    resp.encoding = 'utf-8'
    return resp.text

In [4]:
# Fetch the February 2019 schedule page from basketball-reference.
# NOTE(review): the month is hard-coded in the URL while a later cell filters
# the schedule on today's date, so the page has to be updated by hand whenever
# the month changes.
doc = get_web_page(
    url="https://www.basketball-reference.com/leagues/NBA_2019_games-february.html"
)

In [5]:
# Load the team mapping from disk. The context manager closes the file handle
# as soon as the JSON has been parsed instead of leaving it open.
with open("/Users/chienan/Pycon/github/SportLottery/nbastat/teamId_mpt") as mapping_file:
    teams = json.load(mapping_file)
# Invert the mapping so it can be looked up by what used to be the values
# (the team names used for the lookups further down).
teams = {value: key for key, value in teams.items()}

In [6]:
# Parse the downloaded HTML with the lxml parser.
soup = BeautifulSoup(markup=doc, features="lxml")

In [7]:
# Locate the schedule table by its exact class attribute string.
# `data` is None when no table carries that class attribute.
data = soup.find(
    name='table',
    attrs={'class': 'suppress_glossary sortable stats_table'},
)

In [8]:
# Despite its name, `row` holds the whole <tbody> element of the table.
row = data.find(name='tbody')

In [9]:
# Collect the date, home-team and visitor-team cells, identified by their
# data-stat attribute; the three lists are combined column-wise further down.
data_date = row.find_all(name='th', attrs={"data-stat": "date_game"})
data_row_home = row.find_all(name='td', attrs={"data-stat": "home_team_name"})
data_row_visitor = row.find_all(name='td', attrs={"data-stat": "visitor_team_name"})

In [9]:
# Replace each list of tags with an (n, 1) column array holding the tag text.
data_date = np.array([cell.text for cell in data_date])[:, np.newaxis]
data_row_home = np.array([cell.text for cell in data_row_home])[:, np.newaxis]
data_row_visitor = np.array([cell.text for cell in data_row_visitor])[:, np.newaxis]

In [10]:
# Put the three column arrays side by side: one schedule row per game.
df = pd.DataFrame(
    np.concatenate((data_date, data_row_home, data_row_visitor), axis=1),
    columns=["GAME_DATE", "HOME_TEAM", "AWAY_TEAM"],
)

In [11]:
# Convert the scraped date text (weekday, month, day, year — per the format
# string below) into datetime.date objects.
df["GAME_DATE"] = df["GAME_DATE"].map(
    lambda text: datetime.datetime.strptime(text, "%a, %b %d, %Y").date()
)

In [12]:
# Translate team names into ids through the inverted `teams` mapping.
# A name missing from the mapping raises KeyError.
df["TEAM_ID_HOME"] = df["HOME_TEAM"].map(teams.__getitem__)
df["TEAM_ID_AWAY"] = df["AWAY_TEAM"].map(teams.__getitem__)

In [13]:
# Tomorrow's date according to the local clock (displayed only, not stored).
datetime.date.today() + datetime.timedelta(days=1)

datetime.date(2019, 2, 5)

In [14]:
# Keep only the games dated today (local clock) and renumber the rows from 0.
# Note this overwrites `df`, so re-running the cell filters the filtered frame.
df = df.loc[df.GAME_DATE.eq(datetime.date.today())].reset_index(drop=True)

In [15]:
# Display the schedule rows that survived the date filter.
df

Unnamed: 0,GAME_DATE,HOME_TEAM,AWAY_TEAM,TEAM_ID_HOME,TEAM_ID_AWAY
0,2019-02-04,Brooklyn Nets,Milwaukee Bucks,1610612751,1610612749
1,2019-02-04,Detroit Pistons,Denver Nuggets,1610612765,1610612743
2,2019-02-04,New Orleans Pelicans,Indiana Pacers,1610612740,1610612754
3,2019-02-04,Phoenix Suns,Houston Rockets,1610612756,1610612745
4,2019-02-04,Sacramento Kings,San Antonio Spurs,1610612758,1610612759
5,2019-02-04,Washington Wizards,Atlanta Hawks,1610612764,1610612737


# create query data

In [16]:
# Load the aggregated feature table.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
team = pd.read_pickle("/Users/chienan/Pycon/github/SportLottery/input/FEATURE_AGG_2012TO2019.pkl")
# Home side: every column whose name contains "_H", in reverse alphabetical
# order, behind the two key columns.
home = team.loc[
    :,
    ["GAME_ID", "GAME_DATE"]
    + sorted({col for col in team.columns if "_H" in col}, reverse=True),
]
# Away side: columns containing "_A" but not "_H", ordered the same way.
away = team[
    ["GAME_ID", "GAME_DATE"]
    + sorted(
        {col for col in team.columns if "_A" in col and "_H" not in col},
        reverse=True,
    )
]

In [17]:
# Remove the TEAM_ID_HOME / TEAM_ID_AWAY columns that the substring match
# above pulled in.
home = home.drop(columns=["TEAM_ID_HOME"])
away = away.drop(columns=["TEAM_ID_AWAY"])

In [18]:
# Strip the last two characters (the "_H" / "_A" marker) from every column
# from TEAM_ID_H / TEAM_ID_A onwards, so both frames share the same names.
# The assignment only succeeds if that slice covers every column after the
# two key columns.
home.columns = ["GAME_ID", "GAME_DATE"] + [
    name[:-2] for name in home.loc[:, "TEAM_ID_H":].columns
]
away.columns = ["GAME_ID", "GAME_DATE"] + [
    name[:-2] for name in away.loc[:, "TEAM_ID_A":].columns
]

In [19]:
# Stack the home rows on top of the away rows with a fresh 0..n-1 index.
team = pd.concat([home, away], axis=0, ignore_index=True)

In [20]:
# Order the rows by GAME_DATE so the last row found for a team is its most
# recent one, then renumber the index.
team = team.sort_values(by="GAME_DATE", ascending=True).reset_index(drop=True)

# query data

In [21]:
# Display the schedule frame again before the team features are joined on.
df

Unnamed: 0,GAME_DATE,HOME_TEAM,AWAY_TEAM,TEAM_ID_HOME,TEAM_ID_AWAY
0,2019-02-04,Brooklyn Nets,Milwaukee Bucks,1610612751,1610612749
1,2019-02-04,Detroit Pistons,Denver Nuggets,1610612765,1610612743
2,2019-02-04,New Orleans Pelicans,Indiana Pacers,1610612740,1610612754
3,2019-02-04,Phoenix Suns,Houston Rockets,1610612756,1610612745
4,2019-02-04,Sacramento Kings,San Antonio Spurs,1610612758,1610612759
5,2019-02-04,Washington Wizards,Atlanta Hawks,1610612764,1610612737


In [22]:
# Cast TEAM_ID to str so it can be compared with the ids stored in df.
# NOTE(review): assumes the ids in df are strings too — confirm.
team["TEAM_ID"] = team["TEAM_ID"].astype(str)

In [23]:
def _latest_team_features(team_stats, team_ids, id_column, suffix):
    """Return each team's last feature row, indexed by team id.

    Parameters
    ----------
    team_stats : pandas.DataFrame
        Long-format feature table. The last row matching a team id is used,
        so the frame should already be in chronological order.
    team_ids : iterable
        Team ids to look up, compared against ``team_stats.TEAM_ID``.
    id_column : str
        Name given to the team-id column, which becomes the result's index.
    suffix : str
        Marker appended to every feature column as ``"<name>_<suffix>"``.

    Returns
    -------
    pandas.DataFrame
        One row per requested team id, in the order given; an empty frame
        when ``team_ids`` is empty.

    Raises
    ------
    IndexError
        If a requested team id has no row in ``team_stats``.
    """
    # Everything from LAST_9_GAME_WL to the last column is a feature column.
    feature_names = team_stats.loc[:, "LAST_9_GAME_WL":].columns
    renamed = [id_column] + ["%s_%s" % (name, suffix) for name in feature_names]

    latest_rows = []
    for t_id in team_ids:
        matches = team_stats.index[team_stats.TEAM_ID == t_id]
        if len(matches) == 0:
            raise IndexError("no feature rows found for TEAM_ID %r" % (t_id,))
        # Select the last matching row as a one-row frame and drop the keys.
        latest = team_stats.loc[[matches[-1]]].drop(["GAME_ID", "GAME_DATE"], axis=1)
        latest.columns = renamed
        latest_rows.append(latest.set_index(id_column))

    if not latest_rows:
        return pd.DataFrame()
    # A single concat avoids re-copying a growing frame on every iteration and
    # keeps the index name, so reset_index() below restores the id column
    # without depending on how pandas resolves mismatched index names.
    return pd.concat(latest_rows, axis=0)


# Attach the home team's latest features, aligned on TEAM_ID_HOME.
df = df.set_index("TEAM_ID_HOME")
team_h = _latest_team_features(team, df.index, "TEAM_ID_HOME", "H")
df = pd.concat([df, team_h], axis=1).reset_index()

# Attach the away team's latest features, aligned on TEAM_ID_AWAY.
df = df.set_index("TEAM_ID_AWAY")
team_v = _latest_team_features(team, df.index, "TEAM_ID_AWAY", "A")
df = pd.concat([df, team_v], axis=1).reset_index()

In [24]:
# Re-read the same pickle that was loaded earlier, overwriting the reshaped
# `team` frame built in the cells above.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
team = pd.read_pickle("/Users/chienan/Pycon/github/SportLottery/input/FEATURE_AGG_2012TO2019.pkl")

In [25]:
# Keep only the columns of `team` that also exist in df.
# NOTE(review): `team` is not referenced again in the cells that follow.
team = team.loc[:, team.columns.isin(df.columns)]

In [26]:
# Load the pickled column selection used to build the model input.
# The context manager closes the file handle once unpickling is done.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open("/Users/chienan/Pycon/github/SportLottery/model/classfication_feature-45_columns.pkl", "rb") as columns_file:
    column = pickle.load(columns_file)

In [27]:
# Select the model's input columns from df; a name in `column` that df lacks
# raises KeyError.
x_input = df.loc[:, column]

In [28]:
# Load the trained LightGBM booster from its saved model file.
model = lgb.Booster(model_file="/Users/chienan/Pycon/github/SportLottery/model/classfication_model_feature-45.txt")

In [29]:
# Score every row of x_input with the loaded booster.
pred = model.predict(data=x_input)

In [30]:
# Display the model output, one value per row of x_input.
# Presumably the home side's win probability, given that it is later paired
# with the home odds and (1 - pred) with the away odds — TODO confirm.
pred

array([0.39044531, 0.37726632, 0.60518065, 0.41341926, 0.47838407,
       0.68980421])

In [31]:
# Display the schedule with the home (_H) and away (_A) feature columns joined.
df

Unnamed: 0,TEAM_ID_AWAY,TEAM_ID_HOME,GAME_DATE,HOME_TEAM,AWAY_TEAM,LAST_9_GAME_WL_H,LAST_9_GAME_TOV_H,LAST_9_GAME_STL_H,LAST_9_GAME_REB_H,LAST_9_GAME_PTS_H,...,LAST_10_GAME_FTA_A,LAST_10_GAME_FG_PCT_A,LAST_10_GAME_FGM_A,LAST_10_GAME_FGA_A,LAST_10_GAME_FG3_PCT_A,LAST_10_GAME_FG3M_A,LAST_10_GAME_FG3A_A,LAST_10_GAME_DREB_A,LAST_10_GAME_BLK_A,LAST_10_GAME_AST_A
0,1610612749,1610612751,2019-02-04,Brooklyn Nets,Milwaukee Bucks,7.0,14.555556,6.777778,50.111111,117.444444,...,20.5,0.4897,43.2,88.3,0.3629,12.5,34.3,39.6,6.1,25.1
1,1610612743,1610612765,2019-02-04,Detroit Pistons,Denver Nuggets,4.0,11.555556,6.555556,44.333333,99.666667,...,20.1,0.5118,45.4,88.9,0.3746,12.0,31.7,33.1,3.4,27.6
2,1610612754,1610612740,2019-02-04,New Orleans Pelicans,Indiana Pacers,3.0,12.222222,6.0,47.111111,113.555556,...,20.0,0.4541,40.9,90.0,0.3684,9.8,26.4,34.2,4.4,27.0
3,1610612745,1610612756,2019-02-04,Phoenix Suns,Houston Rockets,0.0,15.0,7.555556,37.666667,105.888889,...,28.2,0.4353,39.4,90.8,0.3075,15.5,49.6,29.6,5.2,18.4
4,1610612759,1610612758,2019-02-04,Sacramento Kings,San Antonio Spurs,5.0,13.666667,8.0,44.888889,106.444444,...,20.0,0.4908,43.3,88.6,0.42,11.6,27.6,34.4,5.1,25.0
5,1610612737,1610612764,2019-02-04,Washington Wizards,Atlanta Hawks,5.0,12.555556,8.888889,41.444444,111.666667,...,22.9,0.4775,42.7,89.8,0.356,13.0,37.0,32.0,4.4,26.3


In [32]:
# Home side: hand-entered multipliers per game index (presumably betting odds
# — TODO confirm). Games without an entry stay at 0.
home_exp = np.zeros(pred.shape)
home_exp[[0, 1, 3, 4, 5]] = [3.3, 2.6, 3.75, 1.95, 1.25]
# Multiplier times the model output for each game.
pred * home_exp

array([1.28846954, 0.98089244, 0.        , 1.55032224, 0.93284894,
       0.86225526])

In [34]:
# Display the model output again for reference.
pred

array([0.39044531, 0.37726632, 0.60518065, 0.41341926, 0.47838407,
       0.68980421])

In [33]:
# Away side: hand-entered multipliers per game index (presumably betting odds
# — TODO confirm). Games without an entry stay at 0.
away_exp = np.zeros(pred.shape)
away_exp[[0, 1, 3, 4, 5]] = [1.18, 1.3, 1.12, 1.55, 2.85]
# Multiplier times the complement of the model output for each game.
(1 - pred) * away_exp

array([0.71927453, 0.80955378, 0.        , 0.65697043, 0.80850469,
       0.884058  ])

In [40]:
# Product of two model outputs and a hand-entered multiplier (presumably the
# payout of a two-game combination bet — TODO confirm).
# NOTE(review): the `pred` array displayed earlier in this notebook has six
# entries (indices 0-5), so pred[6] and pred[7] would raise IndexError against
# it; the saved output presumably came from a different kernel state.
pred[7]*pred[6]*2.08

0.13019182316102715

In [41]:
# Product of two model outputs and a hand-entered multiplier (presumably the
# payout of a two-game combination bet — TODO confirm).
# NOTE(review): the `pred` array displayed earlier in this notebook has six
# entries (indices 0-5), so pred[6] would raise IndexError against it; the
# saved output presumably came from a different kernel state.
pred[1]*pred[6]*3.19

0.34620549843183906

In [42]:
# Product of two model outputs and a hand-entered multiplier (presumably the
# payout of a two-game combination bet — TODO confirm).
# NOTE(review): the `pred` array displayed earlier in this notebook has six
# entries (indices 0-5), so pred[7] would raise IndexError against it; the
# saved output presumably came from a different kernel state.
pred[1]*pred[7]*3.92

0.5895458327208245