In [1]:
#setup

import pandas as pd
import pyodbc
import numpy as np
from datetime import datetime,timedelta
from bs4 import BeautifulSoup as bs4
import requests
import lxml
import re
import time
import sqlite3
import string

from data import team_codes


In [2]:
from request import QueryWeb

In [93]:
class QueryWeb:
    """Small client that fetches pages from basketball-reference.com.

    Every public method builds the URL for one page type and returns the
    ``requests.Response`` produced by :meth:`get_html`. Parsing the HTML and
    checking ``status_code`` are left to the caller.
    """

    def __init__(self):
        self._base_url = 'https://www.basketball-reference.com'

    @staticmethod
    def get_html(url, delay=3, rate_limit_wait=3600):
        """Fetch ``url`` once, pacing requests to respect the site's rate limit.

        Args:
            url: Absolute URL to request. Redirects are not followed.
            delay: Seconds to sleep after the request. The default of 3
                keeps the client at roughly 20 hits per minute.
            rate_limit_wait: Seconds to wait before retrying when the server
                answers 429 (Too Many Requests). Defaults to one hour.

        Returns:
            The ``requests.Response`` for ``url``. Non-2xx responses are
            returned rather than raised, so callers must inspect
            ``status_code``. If the single retry after a 429 is also
            rejected, that second 429 response is returned.

        Raises:
            requests.exceptions.RequestException: When the request cannot be
                completed at all (DNS failure, refused connection, ...).
                No response object exists in that case, so the error is
                propagated instead of being swallowed.
        """
        response = requests.get(url=url, allow_redirects=False)

        if response.status_code == 429:
            # Too many requests: back off, then retry once so the caller
            # does not get the stale 429 response after the long wait.
            time.sleep(rate_limit_wait)
            response = requests.get(url=url, allow_redirects=False)

        # Pace every call, successful or not; failed hits count as hits too.
        time.sleep(delay)
        return response

    def team_roster(self, team_cd, season_end_year):
        """Fetch the team page for ``team_cd`` in the season ending ``season_end_year``."""
        url = f'{self._base_url}/teams/{team_cd}/{season_end_year}.html'
        return self.get_html(url)

    def regular_season_schedule(self, season_end_year):
        """Fetch the league games page for the season ending ``season_end_year``."""
        url = f'{self._base_url}/leagues/NBA_{season_end_year}_games.html'
        return self.get_html(url)

    def playoff_schedule(self, season_end_year):
        """Fetch the playoff games page for the season ending ``season_end_year``."""
        url = f'{self._base_url}/playoffs/NBA_{season_end_year}_games.html'
        return self.get_html(url)

    def player_career(self, last_name_abbrev, player_cd):
        """Fetch the player page at ``/players/<last_name_abbrev>/<player_cd>.html``."""
        url = f'{self._base_url}/players/{last_name_abbrev}/{player_cd}.html'
        return self.get_html(url)

    def player_gamelog(self, last_name_abbrev, player_cd, season_end_year):
        """Fetch one player's gamelog page for the season ending ``season_end_year``."""
        url = f'{self._base_url}/players/{last_name_abbrev}/{player_cd}/gamelog/{season_end_year}'
        return self.get_html(url)

In [96]:
# Smoke test: request one team page (team code 'TOR', season ending 2019)
# through the QueryWeb client and keep the raw response for inspection.
instance = QueryWeb()
response = instance.team_roster('TOR','2019')

In [97]:
# Display the HTTP status code of the roster request made above.
response.status_code

200

## Data required for modeling

1. Player data by game 
2. Live Injury
    a. Player depth chart
3. Team Roster

Last 5 years 

### Order of operations
1. Grab list of active players
    a. Review player list alphabetically
    b. get player name and player code 
    c. store player name and player code in db
2. For each player name and player code, navigate to player page
3. On player page, locate earliest season of gamelogs to view (and list)
4. On each player gamelog page (per season), pull all rows in tables for regular and playoff season
5. Store data in db


Basketball Reference's rate limit is 20 requests per minute.


### 1. Grab active player list
1. Find players whose names are bolded (wrapped in `<strong>` tags)
2. Collect them into a list

In [60]:
# URL pieces for the alphabetical player index: <base_url>/<pg_cat>/<letter>/
base_url = 'https://www.basketball-reference.com'
pg_cat = 'players'
lst_nm_cd = 'a'  # index letter to fetch
# Fetch a single index page (letter 'a') as a trial request.
pg = requests.get(f'{base_url}/{pg_cat}/{lst_nm_cd}/')


In [61]:
# Raises requests.HTTPError if the trial request returned a 4xx/5xx status;
# does nothing (and displays nothing) on success.
pg.raise_for_status()

## Get Current Active from Alphabetical List

In [54]:
# Walk the a-z player index pages and collect every player whose link is
# wrapped in <strong>; this notebook treats those bolded names as the
# currently active players. Relies on `base_url` and `pg_cat` being defined.
lst_nm_cd_list = list(string.ascii_lowercase)
p_dict = {}  # running row number -> [player_name, profile_url, player_code]
count = 1
for letter in lst_nm_cd_list:
    pg = requests.get(f'{base_url}/{pg_cat}/{letter}/')
    time.sleep(3.5)  # stay under the site's rate limit, even on failures
    if not pg.ok:
        # Report the failed letter instead of silently parsing an error page.
        print(f'skipping letter {letter!r}: HTTP {pg.status_code}')
        continue
    soup = bs4(pg.content, 'html.parser')
    for tag in soup.find_all('strong'):
        link = tag.find('a')
        if link is None or not link.has_attr('href'):
            # Bold text that is not a player link (no <a href=...>) is ignored.
            continue
        player = link.text
        player_url_tag = link['href']
        # Take the file name without its extension rather than a fixed
        # [11:20] slice, which only works for 9-character player codes and
        # picks up the trailing '.' for shorter ones.
        code = player_url_tag.rsplit('/', 1)[-1].split('.', 1)[0]
        p_dict[count] = [player, player_url_tag, code]
        count += 1

active_players = pd.DataFrame.from_dict(
    p_dict,
    orient='index',
    columns=['player_name', 'player_profile_url', 'player_code'],
)
active_players.to_csv(r'active_players.csv')

### Get Team Roster per season for last 5 years
- Aggregate list of players to get full list of players to review past game data

### 4. Get Gamelog data per player

In [6]:

# Parameters identifying one gamelog page: player code, page type and the
# season (end year). The player code matches the one used in the gamelog
# URL requested in this notebook ('adamsst01'); the previous value
# 'adam_sst01' contained a stray underscore.
player_code = 'adamsst01'
page_type = 'gamelog'
season = '2014'



In [3]:
# Download one gamelog page (player adamsst01, season 2014) and parse it
# with BeautifulSoup's built-in 'html.parser'.
data = requests.get('https://www.basketball-reference.com/players/a/adamsst01/gamelog/2014#all_game_log_summary')
soup = bs4(data.content, 'html.parser')
# prettify() returns the parsed document as an indented string; as the last
# expression of the cell it is displayed in full.
soup.prettify()



In [4]:
# Show the first <tr> element found in the parsed document.
soup.tr

<tr class="">
<td class="left"><tr><td> 0-9</td><td>15</td></tr>
<tr><td>10-19</td><td>50</td></tr>
<tr><td>20-29</td><td>15</td></tr>
<tr><td>30-39</td><td>1</td></tr>
</td>
</tr>

In [59]:
# Look up the element(s) whose id is 'pgl_basic.1'; the recorded output is
# the gamelog table row for game 1 of the season.
soup.find_all(id='pgl_basic.1')

[<tr id="pgl_basic.1"><th class="right" csk="1" data-stat="ranker" scope="row">1</th><td class="right endpoint tooltip" data-endpoint="/players/pgl_cum_stats.cgi?player=adamsst01&amp;year=2014&amp;date_game=2013-10-30&amp;is_playoff_game=N" data-stat="game_season"><strong>1</strong></td><td class="left" data-stat="date_game"><a href="/boxscores/201310300UTA.html">2013-10-30</a></td><td class="right" data-stat="age">20-102</td><td class="left" data-stat="team_id"><a href="/teams/OKC/2014.html">OKC</a></td><td class="center" data-stat="game_location">@</td><td class="left" data-stat="opp_id"><a href="/teams/UTA/2014.html">UTA</a></td><td class="center" csk="3" data-stat="game_result">W (+3)</td><td class="right iz" data-stat="gs">0</td><td class="right" csk="1107" data-stat="mp">18:27</td><td class="right" data-stat="fg">1</td><td class="right" data-stat="fga">1</td><td class="right" data-stat="fg_pct">1.000</td><td class="right iz" data-stat="fg3">0</td><td class="right iz" data-stat="f

In [11]:
# Keep the first element with id 'pgl_basic.81' (a <tr> gamelog row per the
# recorded output) in `txt` for the cells below, then display it.
txt = soup.find_all(id='pgl_basic.81')[0]
txt
# Leftover experiment (re-parsing the row); kept disabled:
# bs4(txt,'html.parser')

<tr id="pgl_basic.81"><th class="right" csk="82" data-stat="ranker" scope="row">82</th><td class="right endpoint tooltip" data-endpoint="/players/pgl_cum_stats.cgi?player=adamsst01&amp;year=2014&amp;date_game=2014-04-16&amp;is_playoff_game=N" data-stat="game_season"><strong>81</strong></td><td class="left" data-stat="date_game"><a href="/boxscores/201404160OKC.html">2014-04-16</a></td><td class="right" data-stat="age">20-270</td><td class="left" data-stat="team_id"><a href="/teams/OKC/2014.html">OKC</a></td><td class="center iz" data-stat="game_location"></td><td class="left" data-stat="opp_id"><a href="/teams/DET/2014.html">DET</a></td><td class="center" csk="1" data-stat="game_result">W (+1)</td><td class="right iz" data-stat="gs">0</td><td class="right" csk="642" data-stat="mp">10:42</td><td class="right iz" data-stat="fg">0</td><td class="right" data-stat="fga">1</td><td class="right iz" data-stat="fg_pct">.000</td><td class="right iz" data-stat="fg3">0</td><td class="right iz" dat

In [20]:
# The saved cell had lost its iterable (`for i in :` is a syntax error).
# NOTE(review): restored as the children of the row's first <th>, which
# reproduces the recorded output "82" - confirm this was the intent.
for i in txt.th:
    print(i)

82


In [38]:
# With no filter, find_all() returns every tag nested inside the row, so the
# list contains the <th>/<td> cells as well as the <strong>/<a> tags inside them.
txt.find_all()

[<th class="right" csk="82" data-stat="ranker" scope="row">82</th>,
 <td class="right endpoint tooltip" data-endpoint="/players/pgl_cum_stats.cgi?player=adamsst01&amp;year=2014&amp;date_game=2014-04-16&amp;is_playoff_game=N" data-stat="game_season"><strong>81</strong></td>,
 <strong>81</strong>,
 <td class="left" data-stat="date_game"><a href="/boxscores/201404160OKC.html">2014-04-16</a></td>,
 <a href="/boxscores/201404160OKC.html">2014-04-16</a>,
 <td class="right" data-stat="age">20-270</td>,
 <td class="left" data-stat="team_id"><a href="/teams/OKC/2014.html">OKC</a></td>,
 <a href="/teams/OKC/2014.html">OKC</a>,
 <td class="center iz" data-stat="game_location"></td>,
 <td class="left" data-stat="opp_id"><a href="/teams/DET/2014.html">DET</a></td>,
 <a href="/teams/DET/2014.html">DET</a>,
 <td class="center" csk="1" data-stat="game_result">W (+1)</td>,
 <td class="right iz" data-stat="gs">0</td>,
 <td class="right" csk="642" data-stat="mp">10:42</td>,
 <td class="right iz" data-sta

In [52]:
# Read the 'data-stat' attribute of the fourth nested tag; tags are indexed
# like dicts for attribute access.
txt.find_all()[3]['data-stat']

'date_game'

In [53]:

# Print "<data-stat> : <text>" for every tag nested in the row. Tags without
# a 'data-stat' attribute (the <strong>/<a> wrappers inside cells) print
# 'fail', exactly as before; checking the attribute explicitly replaces the
# bare `except:` so unrelated errors are no longer hidden.
for i in txt.find_all():
    stat = i.get('data-stat')
    if stat is None:
        print('fail')
    else:
        # .string is None for empty cells, so those print as "None".
        print(f'{stat} : {i.string}')

ranker : 82
game_season : 81
fail
date_game : 2014-04-16
fail
age : 20-270
team_id : OKC
fail
game_location : None
opp_id : DET
fail
game_result : W (+1)
gs : 0
mp : 10:42
fg : 0
fga : 1
fg_pct : .000
fg3 : 0
fg3a : 0
fg3_pct : None
ft : 1
fta : 2
ft_pct : .500
orb : 2
drb : 0
trb : 2
ast : 0
stl : 0
blk : 0
tov : 0
pf : 0
pts : 1
game_score : 1.3
plus_minus : -7
