In [None]:
import logging
from argparse import ArgumentParser
from tqdm import tqdm
from bs4 import BeautifulSoup
import requests
import datetime
import sys
sys.path.append('.')
from cbastats.Scraper import Scraper
from basketball_reference_scraper.constants import TEAM_TO_TEAM_ABBR
from basketball_reference_scraper.box_scores import get_box_scores
from basketball_reference_scraper.seasons import get_schedule, get_standings
from pathlib import Path
import pandas as pd
from cbastats.DBHelper import MongoDBHelper
import os
from requests.api import head

In [1]:
# Shared notebook settings; populated from environment variables further down.
config = dict()

In [8]:
# Names of the environment variables that hold the MongoDB connection settings.
needed_envs = ['MONGODB_PWD', 'MONGODB_USERNAME', 'MONGODB_ENDPOINT']
envs = os.environ

# Validate everything up front and report all missing variables in a single
# error, so the user does not have to fix them one failed run at a time.
missing_envs = [needed_env for needed_env in needed_envs if needed_env not in envs]
if missing_envs:
    raise Exception(
        f"Missing environment variable(s): {missing_envs}.\n"
        f"Please check if these environment variables are present: {needed_envs}"
    )

# All required variables are present: copy them into the shared `config` dict.
for needed_env in needed_envs:
    config[needed_env] = envs[needed_env]

In [9]:
# Connect to MongoDB with the credentials gathered in `config`.
# create_connection receives username, password and endpoint, in that order.
mongodbio = MongoDBHelper()
client = mongodbio.create_connection(
    *(config[key] for key in ('MONGODB_USERNAME', 'MONGODB_PWD', 'MONGODB_ENDPOINT'))
)

# Database and collection handles used by the cells below.
nba_db = client['nbaStats']
coll_nbaGames, coll_nbaGamesStaging = nba_db['nbaGames'], nba_db['nbaGamesStaging']

existing database ['cbaStats', 'nbaStats', 'admin', 'local']


In [19]:
# Preview one game document that does not have a `four_factors` field yet.
# The field name must match the one written by the update loop ('four_factors');
# a misspelled key never exists, so `$exists: False` would match every document.
coll_nbaGames.find_one(filter={'four_factors': {"$exists": False}})

{'_id': ObjectId('604eacc9cb4bc300ee4c65af'),
 'DATE': datetime.datetime(2020, 12, 22, 0, 0),
 'VISITOR': 'GSW',
 'VISITOR_PTS': 99.0,
 'HOME': 'BRK',
 'HOME_PTS': 125.0,
 'season': '2020-2021',
 'boxscores_url': 'https://www.basketball-reference.com/boxscores/202012220BRK.html',
 'game_id': '202012220BRK'}

In [36]:
# Work list: games with no `four_factors` field, requesting only the
# box-score URL and game id for each one.
ff_tasks = mongodbio.select_records(
    coll_nbaGames,
    filter={'four_factors': {"$exists": False}},
    field={'boxscores_url': 1, 'game_id': 1},
)

In [37]:
# Number of games returned by the query for documents without `four_factors`.
len(ff_tasks)

560

In [23]:
# Inspect the first task to see which fields the query returned.
ff_tasks[0]

{'_id': ObjectId('604eacc9cb4bc300ee4c65af'),
 'boxscores_url': 'https://www.basketball-reference.com/boxscores/202012220BRK.html',
 'game_id': '202012220BRK'}

In [20]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [29]:
def get_four_factors(url, game_id, session=None, timeout=5):
    """Download a box-score page and return its "four factors" table.

    Args:
        url: Address of the box-score page to fetch.
        game_id: Identifier stored in a ``game_id`` column on every returned row.
        session: Optional ``requests.Session`` to reuse. When omitted, a
            temporary session is created and closed before returning.
        timeout: Timeout in seconds passed to ``session.get`` (default 5).

    Returns:
        A list of dicts, one per row of the table whose HTML id is
        ``four_factors``. The first column is named ``Team`` and each row
        carries the given ``game_id``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page has no table with id ``four_factors``.
    """
    # Local import keeps this cell self-contained without touching the import cell.
    from io import StringIO

    # Only close the session if it was created here; a caller-supplied session
    # stays open so it can be reused across requests.
    owns_session = not session
    if owns_session:
        session = requests.Session()
    try:
        response = session.get(url, timeout=timeout)
        # Fail with the real HTTP error instead of a confusing parse error
        # when the server returns an error page.
        response.raise_for_status()
        html = response.content.decode()
    finally:
        if owns_session:
            session.close()

    # Strip the HTML comment delimiters so that any table wrapped in a comment
    # becomes visible to the parser.
    stat_html = html.replace('<!--', "").replace('-->', "")
    soup = BeautifulSoup(stat_html, 'html.parser')

    table_tag = soup.find('table', attrs={"id": "four_factors"})
    if table_tag is None:
        raise IndexError(f"No table with id 'four_factors' found at {url}")

    # Wrap the markup in StringIO: passing literal HTML strings to read_html
    # is deprecated in recent pandas releases.
    table = pd.read_html(StringIO(str(table_tag)))[0]
    # Drop the outer level of the two-level column header.
    table = table.droplevel(0, axis=1)
    table = table.rename({'Unnamed: 0_level_1': 'Team'}, axis=1)
    table['game_id'] = game_id
    return table.to_dict('records')

def requests_retry_session(
    retries=5,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """Return a requests session whose HTTP and HTTPS adapters retry requests.

    Args:
        retries: Value used for the total, read and connect retry limits.
        backoff_factor: Backoff factor handed to the ``Retry`` policy.
        status_forcelist: HTTP status codes handed to the ``Retry`` policy.
        session: Existing session to configure; a new ``requests.Session``
            is created when none is given.

    Returns:
        The configured session (the same object that was passed in, if any).
    """
    if not session:
        session = requests.Session()

    retry_adapter = HTTPAdapter(
        max_retries=Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
    )
    # The same adapter instance serves both URL schemes.
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retry_adapter)
    return session

In [35]:
# Fetch the four-factors table for every pending game and store it on the
# matching document. One retrying session is shared across all requests.
session = requests_retry_session()
for task in tqdm(ff_tasks):
    four_factors = get_four_factors(
        task['boxscores_url'],
        task['game_id'],
        session,
    )
    coll_nbaGames.find_one_and_update(
        filter={"_id": task['_id']},
        update={'$set': {'four_factors': four_factors}},
    )

100%|██████████| 10/10 [00:07<00:00,  1.37it/s]


In [None]:
# TODO: unfinished scratch cell, disabled so the notebook survives "Restart & Run All".
# The original call was a SyntaxError: the "game_id" filter had no value and no
# update document was supplied. Fill in both before re-enabling:
# coll_nbaGames.find_one_and_update(filter={"game_id": <game_id>}, update=<update document>)