# Imports and Paths

In [176]:
import urllib
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup, Comment
import os
import pandas as pd
from plotnine import *
import plotnine
import time
import json
from datetime import datetime as dt
from datetime import timedelta as td
import numpy as np

plotnine.options.figure_size = (12, 8)
from pandas.api.types import is_string_dtype
pd.options.display.max_columns=150
pd.options.display.max_rows=100

In [2]:
PATH = '/data/msnow/football/'

# Functions

In [3]:
def game_urls(yr,wk):
    """Return the box-score URLs linked from one weekly schedule page.

    Parameters
    ----------
    yr : value interpolated into the page path as the season year (e.g. 2017).
    wk : value interpolated into the page path as the week number (e.g. 1).

    Returns
    -------
    list of str
        One absolute URL per ``td.gamelink`` cell found inside the
        ``div.game_summaries`` element, in page order.

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched.
    AttributeError
        If the page has no ``div.game_summaries`` element.
    """
    site_root = 'https://www.pro-football-reference.com/'
    week_url = f'{site_root}years/{yr}/week_{wk}.htm'
    # Parse inside the context manager so the HTTP response is always closed.
    with urllib.request.urlopen(week_url) as pg:
        soup = BeautifulSoup(pg, 'html.parser')
    summ = soup.find('div',{'class':'game_summaries'})
    links = summ.find_all('td',{'class':'gamelink'})
    # urljoin resolves the href against the site root, so a root-relative
    # href ("/boxscores/...") no longer produces a doubled slash.
    return [urllib.parse.urljoin(site_root, game.a['href']) for game in links]

In [256]:
def scorebox(soup):
    """Extract team, coach, score, record, kickoff and stadium data from the scorebox.

    Parameters
    ----------
    soup : parsed box-score page exposing the BeautifulSoup ``find`` API.

    Returns
    -------
    dict
        For each of the prefixes ``home`` and ``away``: ``_team_pg``,
        ``_team_id``, ``_team_name``, ``_team_score``, ``_team_coach_pg``,
        ``_team_coach_id``, ``_team_coach_name``, ``_team_wins``,
        ``_team_losses`` and ``_team_ties``; plus ``datetime``, ``game_id``
        and, when a "Stadium" entry is present, ``stadium_pg``,
        ``stadium_name`` and ``stadium_id``.

    Notes
    -----
    The first top-level div of the scorebox is labelled ``home`` and the
    second ``away``.
    NOTE(review): confirm this matches the page layout being scraped.
    """
    scrbox_dict = {}
    scrbox_div = soup.find('div', {'class':'scorebox'})
    scrbox = scrbox_div.find_all('div', recursive=False)
    team_str = ['home', 'away']
    for idx, team in enumerate(team_str):
        team_name = scrbox[idx].find('a',{'itemprop':'name'})
        coach_name = scrbox[idx].find('div', {'class':'datapoint'}).find('a')
        score = scrbox[idx].find('div', {'class':'score'})
        # The element right after the score holds the record as "W-L" or "W-L-T".
        record = score.find_next_sibling().string.split('-')
        scrbox_dict[team + '_team_pg'] = team_name['href']
        scrbox_dict[team + '_team_id'] = team_name['href'].split('/')[-2]
        scrbox_dict[team + '_team_name'] = team_name.string
        scrbox_dict[team + '_team_score'] = int(score.string)
        scrbox_dict[team + '_team_coach_pg'] = coach_name['href']
        scrbox_dict[team + '_team_coach_id'] = coach_name['href'].split('/')[-1].split('.')[0]
        scrbox_dict[team + '_team_coach_name'] = coach_name.string
        scrbox_dict[team + '_team_wins'] = int(record[0])
        scrbox_dict[team + '_team_losses'] = int(record[1])
        if len(record) == 3:
            scrbox_dict[team + '_team_ties'] = int(record[2])
        else:
            scrbox_dict[team + '_team_ties'] = 0

    # Remove this game's own result from each record, leaving the record
    # each team had going into the game.
    if scrbox_dict['home_team_score'] > scrbox_dict['away_team_score']:
        scrbox_dict['home_team_wins'] = scrbox_dict['home_team_wins'] - 1
        scrbox_dict['away_team_losses'] = scrbox_dict['away_team_losses'] - 1
    elif scrbox_dict['home_team_score'] < scrbox_dict['away_team_score']:
        scrbox_dict['away_team_wins'] = scrbox_dict['away_team_wins'] - 1
        scrbox_dict['home_team_losses'] = scrbox_dict['home_team_losses'] - 1
    else:
        scrbox_dict['away_team_ties'] = scrbox_dict['away_team_ties'] - 1
        scrbox_dict['home_team_ties'] = scrbox_dict['home_team_ties'] - 1

    scrbox_meta = scrbox[2].find_all('div')
    # First meta div is the date; the second div's trailing text is
    # ": <time>", so drop the leading colon before concatenating.
    game_datetime = scrbox_meta[0].string + scrbox_meta[1].contents[1][1:]
    # %I (12-hour clock) is required for %p to take effect; with %H the
    # am/pm marker is ignored and "8:30pm" would parse as 08:30.
    scrbox_dict['datetime'] = dt.strptime(game_datetime, '%A %b %d, %Y %I:%M%p')
    for meta in scrbox_meta:
        if meta.contents[0].string =='Stadium':
            scrbox_dict['stadium_pg'] = meta.a['href']
            scrbox_dict['stadium_name'] = meta.a.string
            scrbox_dict['stadium_id'] = meta.a['href'].split('/')[-1].split('.')[0]
    # game_id is the kickoff date as YYYYMMDD, a literal "0", then the home team id.
    scrbox_dict['game_id'] = scrbox_dict['datetime'].strftime('%Y%m%d') + '0' + scrbox_dict['home_team_id']
    return scrbox_dict

In [351]:
def team_lookup(team_string, scrbox_dict):
    """Resolve a team fragment to ``(team_name, team_id, 'home' | 'away')``.

    ``team_string`` matches a side when it is a substring of that side's
    ``*_team_name`` or ``*_team_id`` entry in ``scrbox_dict``. The home side
    is checked first.

    Raises
    ------
    KeyError
        With message 'Team Not Found' when neither side matches.
    """
    for side in ('home', 'away'):
        name_key = side + '_team_name'
        id_key = side + '_team_id'
        if team_string in scrbox_dict[name_key] or team_string in scrbox_dict[id_key]:
            return scrbox_dict[name_key], scrbox_dict[id_key], side
    raise KeyError('Team Not Found')

In [677]:
def scoring(soup, scrbox_dict):
    """Parse the ``all_scoring`` table into one dict per scoring play.

    Parameters
    ----------
    soup : parsed box-score page exposing the BeautifulSoup ``find`` API.
    scrbox_dict : dict returned by ``scorebox``; used for team resolution
        (via ``team_lookup``) and for ``game_id``.

    Returns
    -------
    list of dict
        Each dict carries ``quarter``, the four clock fields (NaN when the
        time cell is empty), the scoring team, both running scores, the
        parsed description fields (``-999`` where a field does not apply)
        and ``game_id``.

    Raises
    ------
    KeyError
        'Play Not Found' when the description is not a field goal, pass or
        rush; 'Team Not Found' (from ``team_lookup``) when the team cell
        matches neither side.
    """
    scoring_list = []
    scoring_div = soup.find('div', {'id':'all_scoring'})
    rows = scoring_div.find_all('tr')
    # A blank quarter cell inherits the most recent non-blank quarter.
    quarter = 1
    for row in rows[1:]:
        tmp_dict = {}
        for cell in row.contents:
            cell_str = cell.string
            if cell['data-stat'] =='quarter' and cell_str is not None:
                quarter = int(cell_str)
            tmp_dict['quarter'] = quarter
            if cell['data-stat'] =='time':
                if cell_str is not None:
                    # "MM:SS" remaining in the quarter; quarters are 15 minutes.
                    str_split = cell_str.split(':')
                    sec_left = int(str_split[0])*60 + int(str_split[1])
                    sec_left_game = sec_left + (4-tmp_dict['quarter'])*15*60
                    tmp_dict['sec_left_in_quarter'] = sec_left
                    tmp_dict['sec_into_quarter'] = 15*60 - sec_left
                    tmp_dict['sec_left_in_game'] = sec_left_game
                    tmp_dict['sec_into_game'] = 4*15*60 - sec_left_game
                else:
                    # np.nan is the canonical spelling; the np.NAN alias
                    # was removed in NumPy 2.0.
                    tmp_dict['sec_left_in_quarter'] = np.nan
                    tmp_dict['sec_into_quarter'] = np.nan
                    tmp_dict['sec_left_in_game'] = np.nan
                    tmp_dict['sec_into_game'] = np.nan
            elif cell['data-stat'] =='team':
                tm_name, tm_id, tm_loc = team_lookup(cell_str, scrbox_dict)
                tmp_dict['scoring_team_name'] = tm_name
                tmp_dict['scoring_team_id'] = tm_id
                tmp_dict['scoring_team_loc'] = tm_loc
            elif cell['data-stat'] =='vis_team_score':
                tmp_dict['away_team_score'] = int(cell_str)
            elif cell['data-stat'] =='home_team_score':
                tmp_dict['home_team_score'] = int(cell_str)
            elif cell['data-stat'] =='description':
                # cont alternates player links and plain text; cont[1] is
                # the text after the first player and names the play type.
                cont = cell.contents
                if cont[1].find('field') > -1:
                    tmp_dict['score_type'] = 'field_goal'
                    tmp_dict['kicker_id'] = cont[0]['href'].split('/')[-1].split('.')[0]
                    tmp_dict['kicker_name'] = cont[0].string
                    tmp_dict['kick_success'] = 1
                    tmp_dict['pass_from_id'] = -999
                    tmp_dict['pass_from_name'] = -999
                    tmp_dict['rec_by_id'] = -999
                    tmp_dict['rec_by_name'] = -999
                    tmp_dict['rush_by_id'] = -999
                    tmp_dict['rush_by_name'] = -999
                    tmp_dict['yards'] = int(cont[1].split(' ')[1])
                elif cont[1].find('pass') > -1:
                    tmp_dict['score_type'] = 'pass'
                    tmp_dict['pass_from_id'] = cont[0]['href'].split('/')[-1].split('.')[0]
                    tmp_dict['pass_from_name'] = cont[0].string
                    tmp_dict['rec_by_id'] = cont[2]['href'].split('/')[-1].split('.')[0]
                    tmp_dict['rec_by_name'] = cont[2].string
                    tmp_dict['rush_by_id'] = -999
                    tmp_dict['rush_by_name'] = -999
                    tmp_dict['yards'] = int(cont[1].split(' ')[1])
                    # The last link is read as the kicker and the trailing
                    # text as the kick result.
                    tmp_dict['kicker_id'] = cont[-2]['href'].split('/')[-1].split('.')[0]
                    tmp_dict['kicker_name'] = cont[-2].string
                    tmp_dict['kick_success'] = 1 if cont[-1].find('failed') == -1 else 0
                elif cont[1].find('rush') > -1:
                    tmp_dict['score_type'] = 'rush'
                    tmp_dict['pass_from_id'] = -999
                    tmp_dict['pass_from_name'] = -999
                    tmp_dict['rec_by_id'] = -999
                    tmp_dict['rec_by_name'] = -999
                    tmp_dict['rush_by_id'] = cont[0]['href'].split('/')[-1].split('.')[0]
                    tmp_dict['rush_by_name'] = cont[0].string
                    tmp_dict['yards'] = int(cont[1].split(' ')[1])
                    tmp_dict['kicker_id'] = cont[-2]['href'].split('/')[-1].split('.')[0]
                    tmp_dict['kicker_name'] = cont[-2].string
                    tmp_dict['kick_success'] = 1 if cont[-1].find('failed') == -1 else 0
                else:
                    raise KeyError('Play Not Found')
                tmp_dict['play_desc'] = ' '.join(x.string.strip() for x in cont)
            tmp_dict['game_id'] = scrbox_dict['game_id']
        scoring_list.append(tmp_dict)
    return scoring_list

In [393]:
def game_info(soup, scrbox_dict):
    """Collect the label/value pairs of the ``all_game_info`` table.

    The table markup sits inside an HTML comment, so the comment nodes of
    the wrapper div are re-parsed with lxml. The first row is skipped, and
    the 'Vegas Line' and 'Over/Under' rows are dropped.

    Returns a dict keyed by row label, plus ``game_id`` taken from
    ``scrbox_dict``.
    """
    wrapper = soup.find('div', {'id':'all_game_info'})
    hidden = wrapper.find_all(string=lambda node: isinstance(node, Comment))
    # NOTE(review): str() of the result list feeds the list repr to the
    # parser; kept as-is because downstream indexing was written against it.
    parsed = BeautifulSoup(str(hidden), 'lxml')
    excluded = ['Vegas Line', 'Over/Under']
    info = {}
    for tr in parsed.find_all('tr')[1:]:
        label = tr.contents[0].contents[0]
        value = tr.contents[1].contents[0]
        if label in excluded:
            continue
        info[label] = value
    info['game_id'] = scrbox_dict['game_id']
    return info

In [394]:
def officials(soup, scrbox_dict):
    """List the officials from the commented-out ``all_officials`` table.

    Returns one dict per row after the first, with ``ref_title``,
    ``ref_pg``, ``ref_id`` (file stem of the link), ``ref_name`` and
    ``game_id``.
    """
    wrapper = soup.find('div', {'id':'all_officials'})
    hidden = wrapper.find_all(string=lambda node: isinstance(node, Comment))
    parsed = BeautifulSoup(str(hidden), 'lxml')
    crew = []
    for tr in parsed.find_all('tr')[1:]:
        title_cell = tr.contents[0]
        name_cell = tr.contents[1]
        title = title_cell.string
        page = name_cell.a['href']
        crew.append({
            'ref_title': title,
            'ref_pg': page,
            'ref_id': page.split('/')[-1].split('.')[0],
            'ref_name': name_cell.string,
            'game_id': scrbox_dict['game_id'],
        })
    return crew

In [395]:
def game_summ(soup, scrbox_dict):
    """Flatten the commented-out ``all_team_stats`` table into one dict.

    Every row after the first contributes ``home_<label>`` (from the
    ``home_stat`` cell) and ``away_<label>`` (from the ``vis_stat`` cell).
    Team names, team ids and ``game_id`` are copied from ``scrbox_dict``.
    """
    wrapper = soup.find('div', {'id':'all_team_stats'})
    hidden = wrapper.find_all(string=lambda node: isinstance(node, Comment))
    parsed = BeautifulSoup(str(hidden), 'lxml')
    summary = {}
    for tr in parsed.find_all('tr')[1:]:
        label = tr.contents[0].string
        for prefix, stat_attr in (('home_', 'home_stat'), ('away_', 'vis_stat')):
            summary[prefix + label] = tr.find('td',{'data-stat':stat_attr}).string
    for key in ('home_team_name', 'home_team_id', 'away_team_name', 'away_team_id', 'game_id'):
        summary[key] = scrbox_dict[key]
    return summary

In [575]:
def stats_table(soup, scrbox_dict, div_id):
    """
    Parse a commented-out per-player stats table into a list of dicts.

    Works for the following tables and associated divs:
    Passing, Rushing and Receiving - all_player_offense
    Defense - all_player_defense
    Kick/Punt Returns - all_returns
    Kicking and Punting - all_kicking

    The first two rows are skipped, as is any row whose first child is not
    a tag. Per cell: an empty cell becomes 0, the 'player' cell yields
    player_id / player_name, 'team' and 'pos' are lower-cased, and anything
    else is converted to float after removing '%'.
    """
    wrapper = soup.find('div', {'id':div_id})
    hidden = wrapper.find_all(string=lambda node: isinstance(node, Comment))
    parsed = BeautifulSoup(str(hidden), 'lxml')
    records = []
    for tr in parsed.find_all('tr')[2:]:
        cells = tr.contents
        if cells[0].name is None:
            # First child is a bare string rather than a cell: not a data row.
            continue
        record = {}
        for cell in cells:
            key = cell['data-stat']
            text = cell.string
            if text is None:
                record[key] = 0
            elif key == 'player':
                record['player_id'] = cell['data-append-csv']
                record['player_name'] = text
            elif key in ('team', 'pos'):
                record[key] = text.lower()
            else:
                record[key] = float(text.replace('%',''))
        record['game_id'] = scrbox_dict['game_id']
        records.append(record)
    return records

In [569]:
def starters(soup, scrbox_dict, loc):
    """
    Return the starters table for one side as a list of dicts.

    loc is either home or away; 'away' maps to the page's 'vis' div id,
    anything else to 'home'. Each dict has player_id, player_name, pos
    (lower-cased), game_id, team_name and team_id.
    """
    side_tag = 'home' if loc != 'away' else 'vis'
    wrapper = soup.find('div', {'id':f'all_{side_tag}_starters'})
    hidden = wrapper.find_all(string=lambda node: isinstance(node, Comment))
    parsed = BeautifulSoup(str(hidden), 'lxml')
    lineup = []
    for tr in parsed.find_all('tr')[1:]:
        player_cell = tr.contents[0]
        pos_cell = tr.contents[1]
        lineup.append({
            'player_id': player_cell['data-append-csv'],
            'player_name': player_cell.string,
            'pos': pos_cell.string.lower(),
            'game_id': scrbox_dict['game_id'],
            'team_name': scrbox_dict[loc + '_team_name'],
            'team_id': scrbox_dict[loc + '_team_id'],
        })
    return lineup

In [661]:
def drives(soup, scrbox_dict, loc):
    """
    Parse one team's drive chart into a list of dicts (one per drive).

    loc is either home or away; 'away' maps to the page's 'vis' div id.

    Each dict holds game_id, team_name, team_id, drive_num, quarter, the
    four clock fields derived from the drive start time, start_yrd,
    start_side, yds_to_td, total_plays, one '<Kind>_plays' count per entry
    of the play-count tooltip, drive_sec, net_yds and end_event.

    Raises KeyError('Column Not Found') on an unrecognised data-stat.
    """
    loc_html = 'vis' if loc == 'away' else 'home'
    drive_list = []
    drive_div = soup.find('div', {'id':'all_' + loc_html + '_drives'})
    comments= drive_div.find_all(string=lambda text:isinstance(text,Comment))
    table = BeautifulSoup(str(comments), 'lxml')
    rows = table.find_all('tr')[1:]
    for row in rows:
        cells = row.contents
        drive_dict = {}
        drive_dict['game_id'] = scrbox_dict['game_id']
        drive_dict['team_name'] = scrbox_dict[loc + '_team_name']
        drive_dict['team_id'] = scrbox_dict[loc + '_team_id']
        for cell in cells:
            cell_data = cell['data-stat']
            cell_str = cell.string
            if cell_data in ['drive_num', 'quarter', 'net_yds']:
                drive_dict[cell_data] = int(cell_str)
            elif cell_data == 'time_start':
                # "MM:SS" left in the quarter; relies on the 'quarter' cell
                # having been read earlier in the same row.
                str_split = cell_str.split(':')
                sec_left = int(str_split[0])*60 + int(str_split[1])
                sec_left_game = sec_left + (4-drive_dict['quarter'])*15*60
                drive_dict['sec_left_in_quarter'] = sec_left
                drive_dict['sec_into_quarter'] = 15*60 - sec_left
                drive_dict['sec_left_in_game'] = sec_left_game
                drive_dict['sec_into_game'] = 4*15*60 - sec_left_game
            elif cell_data == 'start_at':
                str_split = cell_str.strip().split(' ')
                if len(str_split) == 1:
                    # Bare yard line with no side token, handled the same
                    # way play_by_play handles its 'location' cell.
                    start_yd = int(str_split[0])
                    drive_dict['start_yrd'] = start_yd
                    drive_dict['start_side'] = -999
                else:
                    start_yd = int(str_split[1].lower())
                    drive_dict['start_yrd'] = start_yd
                    drive_dict['start_side'] = str_split[0].lower()
                # Starting on its own side, the team still has the rest of
                # the field to cover.
                # NOTE(review): this compares the table's abbreviation with
                # the URL-derived team_id; confirm they agree for every team.
                if drive_dict['start_side'] == drive_dict['team_id']:
                    start_yd = 100 - start_yd
                drive_dict['yds_to_td'] = start_yd
            elif cell_data == 'play_count_tip':
                drive_dict['total_plays'] = int(cell_str)
                # The tooltip reads like "8 Pass, 4 Rush, 0 Penalty".
                plays = cell.span['tip'].split(',')
                for i in plays:
                    play_sub = i.strip().split(' ')
                    drive_dict[play_sub[1] + '_plays'] = int(play_sub[0])
            elif cell_data == 'time_total':
                str_split = cell_str.split(':')
                drive_dict['drive_sec'] = int(str_split[0])*60 + int(str_split[1])
            elif cell_data == 'end_event':
                drive_dict[cell_data] = cell_str
            else:
                raise KeyError('Column Not Found')
        drive_list.append(drive_dict)
    return drive_list

In [960]:
def name_extract(cell):
    """Return ``(id, name)`` for a link-like cell.

    The id is the last path segment of ``cell['href']`` up to its first
    '.'; the name is ``cell.string``.
    """
    last_segment = cell['href'].rsplit('/', 1)[-1]
    return last_segment.partition('.')[0], cell.string

In [1109]:
def play_by_play(soup, scrbox_dict):
    """Parse the commented-out ``all_pbp`` table into one dict per play row.

    Parameters
    ----------
    soup : parsed box-score page exposing the BeautifulSoup ``find`` API.
    scrbox_dict : dict returned by ``scorebox``; supplies ``game_id``.

    Returns
    -------
    list of dict
        One dict for every row that has ten cells, a tag as its first
        child, and a sixth cell with mixed content (``.string`` is None).
        Play fields that were not filled in are set to ``-999``.

    Raises
    ------
    KeyError
        'Kicking Play Not Found' for a kick row matching no known sub-type.
    """
    pbp_list = []
    pbp_div = soup.find('div', {'id':'all_pbp'})
    comments= pbp_div.find_all(string=lambda text:isinstance(text,Comment))
    table = BeautifulSoup(str(comments), 'lxml')
    rows = table.find_all('tr')[2:]
    for row in rows:
        cells = row.contents
        pbp_dict = {}
        if cells[0].name is not None and len(cells)==10 and cells[5].string is None:
            pbp_dict['game_id'] = scrbox_dict['game_id']
            for cell in cells:
                cell_str = cell.string
                cell_data = cell['data-stat']
                if cell_data in ['quarter', 'down', 'yds_to_go', 'pbp_score_aw', 'pbp_score_hm']:
                    if cell_str is None:
                        pbp_dict[cell_data] = -99
                    else:
                        pbp_dict[cell_data] = int(cell_str)
                elif cell_data == 'qtr_time_remain':
                    if cell_str is not None:
                        # "MM:SS" left in the quarter; relies on 'quarter'
                        # having been read earlier in the same row.
                        str_split = cell_str.split(':')
                        pbp_dict['sec_left_in_quarter'] = int(str_split[0])*60 + int(str_split[1])
                        pbp_dict['sec_into_quarter'] = 15*60 - (int(str_split[0])*60 + int(str_split[1]))
                        pbp_dict['sec_left_in_game'] = int(str_split[0])*60 + int(str_split[1]) + (4-pbp_dict['quarter'])*15*60
                        pbp_dict['sec_into_game'] = 4*15*60 - (int(str_split[0])*60 + int(str_split[1]) + (4-pbp_dict['quarter'])*15*60)
                    else:
                        # np.nan is the canonical spelling; the np.NAN
                        # alias was removed in NumPy 2.0.
                        pbp_dict['sec_left_in_quarter'] = np.nan
                        pbp_dict['sec_into_quarter'] = np.nan
                        pbp_dict['sec_left_in_game'] = np.nan
                        pbp_dict['sec_into_game'] = np.nan
                elif cell_data == 'location':
                    if cell_str is None:
                        pbp_dict['loc_yrd'] = -999
                        pbp_dict['loc_side'] = -999
                    else:
                        str_split = cell_str.strip().split(' ')
                        if len(str_split) == 1:
                            # Bare yard line with no side token.
                            pbp_dict['loc_yrd'] = int(str_split[0])
                            pbp_dict['loc_side'] = -999
                        else:
                            pbp_dict['loc_yrd'] = int(str_split[1])
                            pbp_dict['loc_side'] = str_split[0].lower()
                elif cell_data == 'detail':
                    # cont mixes an anchor tag (cont[0]), player links and
                    # free text; the parsing below depends on their positions.
                    cont = cell.contents
                    play_str = ''.join(x.string for x in cont if x.string is not None)
                    pbp_dict['play_str'] = play_str
                    pbp_dict['play_count'] = cont[0]['name'].split('_')[-1]
                    if play_str.find('Timeout') >-1:
                        cont_str = cont[1].split(' ')
                        pbp_dict['play_type'] = 'timeout'
                        pbp_dict['timeout_num'] = int(cont_str[1][1:])
                        pbp_dict['timeout_by'] = ' '.join(cont_str[3:])
                    elif cont[1].name is None:
                        # Detail starts with text rather than a player link.
                        pbp_dict['play_type'] = 'penalty'
                    else:
                        action = cont[2].strip().split(' ')
                        if action[0] in ['kicks', 'punts'] or action[0].isnumeric():
                            pbp_dict['play_type'] = 'kick'
                            player_id, player_name = name_extract(cont[1])
                            pbp_dict['kicker_id'] = player_id
                            pbp_dict['kicker_name'] = player_name
                            if action[1] == 'off':
                                pbp_dict['play_subtype'] = 'kickoff'
                                kick_str = cont[2].strip().split(' ')
                                pbp_dict['play_yds'] = kick_str[2]
                                if kick_str[4] == 'touchback':
                                    pbp_dict['play_res'] = 'touchback'
                                else:
                                    kick_ret_str = cont[4].strip().split(' ')
                                    player_id, player_name = name_extract(cont[3])
                                    pbp_dict['kick_ret_id'] = player_id
                                    pbp_dict['kick_ret_name'] = player_name
                                    pbp_dict['kick_ret_yds'] = int(kick_ret_str[1])
                            elif action[0] == 'punts':
                                pbp_dict['play_subtype'] = 'punt'
                            elif action[1] == 'extra':
                                pbp_dict['play_subtype'] = 'xp'
                                if play_str.find('no good') > -1:
                                    pbp_dict['play_res'] = 'no_good'
                                else:
                                    pbp_dict['play_res'] = 'good'
                            elif action[0].isnumeric():
                                pbp_dict['play_subtype'] = 'field_goal'
                                if play_str.find('no good') > -1:
                                    pbp_dict['play_res'] = 'no_good'
                                else:
                                    pbp_dict['play_res'] = 'good'
                            else:
                                raise KeyError('Kicking Play Not Found')
                    # Second pass: scan each text fragment for keywords and
                    # read the player link before or after it.
                    for idx, srch in enumerate(cont):
                        str_srch = srch.string
                        if str_srch is not None:
                            str_srch = str_srch.lower()
                            if str_srch.find('tackle by') > -1:
                                tackle_id, tackle_name = name_extract(cont[idx+1])
                                pbp_dict['tackled_by_id'] = tackle_id
                                pbp_dict['tackled_by_name'] = tackle_name
                                # cont[idx+3] is read below, so it must
                                # exist: the guard is len(cont) > idx+3.
                                if len(cont) > idx+3 and cont[idx+2].find('and') > -1:
                                    tackle_asst_id, tackle_asst_name = name_extract(cont[idx+3])
                                    pbp_dict['tackle_asst_id'] = tackle_asst_id
                                    pbp_dict['tackle_asst_name'] = tackle_asst_name
                            if str_srch.find('sacked by') > -1:
                                sack_id, sack_name = name_extract(cont[idx+1])
                                pbp_dict['sacked_by_id'] = sack_id
                                pbp_dict['sacked_by_name'] = sack_name
                                pbp_dict['play_type'] = 'sack'
                            elif str_srch.find('pass') > -1 and str_srch.find('pass') < 5:
                                pbp_dict['play_type'] = 'pass'
                                passer_id, passer_name = name_extract(cont[idx-1])
                                pbp_dict['passer_id'] = passer_id
                                pbp_dict['passer_name'] = passer_name
                                try:
                                    rec_id, rec_name = name_extract(cont[idx+1])
                                    pbp_dict['rec_id'] = rec_id
                                    pbp_dict['rec_name'] = rec_name
                                except (IndexError, KeyError, TypeError, AttributeError):
                                    # Best effort: no receiver link follows
                                    # the pass text, so leave it unset.
                                    pass
                                if str_srch.find('incomplete') > -1:
                                    pbp_dict['play_res'] = 'incomplete'
                                    pbp_dict['play_yds'] = 0
                                else:
                                    pbp_dict['play_res'] = 'complete'
                                    pbp_dict['play_yds'] = int(cont[idx+2].strip().split()[1])
                            elif max(str_srch.find(x) for x in ['right', 'middle', 'left']) > -1:
                                pbp_dict['play_type'] = 'rush'
                                rush_id, rush_name = name_extract(cont[idx-1])
                                pbp_dict['rush_id'] = rush_id
                                pbp_dict['rush_name'] = rush_name
                                rush_splt = str_srch.split('for')
                                if str_srch.find('no gain') > -1:
                                    pbp_dict['play_yds'] = 0
                                else:
                                    pbp_dict['play_yds'] = int(rush_splt[-1].split('yard')[0].strip())
                                pbp_dict['play_subtype'] = rush_splt[0].strip()
                            elif str_srch.find('kneels') > -1:
                                pbp_dict['play_type'] = 'kneel'
                                kneel_id, kneel_name = name_extract(cont[idx-1])
                                pbp_dict['kneel_id'] = kneel_id
                                pbp_dict['kneel_name'] = kneel_name
                                if str_srch.find('no gain') > -1:
                                    pbp_dict['play_yds'] = 0
                                else:
                                    pbp_dict['play_yds'] = int(str_srch.split('for')[-1].split('yard')[0].strip())
                            if str_srch.find('fumble') > -1:
                                fmbl_id, fmbl_name = name_extract(cont[idx-1])
                                pbp_dict['fmbl_id'] = fmbl_id
                                pbp_dict['fmbl_name'] = fmbl_name
                                if str_srch.find('forced') > -1:
                                    fmbl_forc_by_id, fmbl_forc_by_name = name_extract(cont[idx+1])
                                    pbp_dict['fmbl_forc_by_id'] = fmbl_forc_by_id
                                    pbp_dict['fmbl_forc_by_name'] = fmbl_forc_by_name
                            if str_srch.find('recover') > -1:
                                recover_id, recover_name = name_extract(cont[idx+1])
                                pbp_dict['recover_id'] = recover_id
                                pbp_dict['recover_name'] = recover_name
                            if str_srch.find('intercept') > -1:
                                int_id, int_name = name_extract(cont[idx+1])
                                pbp_dict['int_id'] = int_id
                                pbp_dict['int_name'] = int_name
                                pbp_dict['play_yds'] = int(cont[idx+4].string.split('for')[-1].split('yard')[0].strip())
                                pbp_dict['play_res'] = 'interception'
                            if str_srch.find('penalty') > -1:
                                pen_on_id, pen_on_name = name_extract(cont[idx+1])
                                # A second penalty on the same play is
                                # stored under '_2'-suffixed keys.
                                if 'pen_on_id' in pbp_dict:
                                    pen_sfx = '_2'
                                else:
                                    pen_sfx = ''
                                pbp_dict['pen_on_id' + pen_sfx] = pen_on_id
                                pbp_dict['pen_on_name' + pen_sfx] = pen_on_name
                                pen_res_str = cont[idx+2].string.lower()
                                if pen_res_str.find('yard') > -1:
                                    pen_res = pen_res_str.split(':')[-1].split(', ')
                                    pbp_dict['pen_cause' + pen_sfx] = pen_res[0]
                                    pbp_dict['pen_res' + pen_sfx] = pen_res[1]
                                else:
                                    pen_res = pen_res_str.split(':')[-1].split('penalty')
                                    pbp_dict['pen_cause' + pen_sfx] = pen_res[0]
                                    pbp_dict['pen_res' + pen_sfx] = -999
                            if str_srch.find('touchdown') > -1:
                                pbp_dict['play_res'] = 'touchdown'
                    if play_str.find('no play') > -1:
                        pbp_dict['play_res'] = 'no_play'
                    # Give every row the same set of keys; unfilled ones get -999.
                    for field in ['play_type', 'play_subtype', 'timeout_num', 'timeout_by', 'kicker_id', 'kicker_name', 'play_yds',
                                 'kick_ret_id', 'kick_ret_name', 'kick_ret_yds', 'play_res', 'tackled_by_id', 'tackled_by_name',
                                 'sacked_by_name', 'sacked_by_id', 'passer_id', 'passer_name', 'rec_id', 'rec_name', 'rush_id',
                                 'rush_name', 'kneel_id', 'kneel_name', 'fmbl_id', 'fmbl_name', 'fmbl_forc_by_id',
                                  'fmbl_forc_by_name', 'recover_id', 'recover_name', 'tackle_asst_id', 'tackle_asst_name',
                                 'int_name', 'int_id', 'pen_on_id', 'pen_on_name', 'pen_cause', 'pen_res']:
                        if field not in pbp_dict:
                            pbp_dict[field] = -999
            pbp_list.append(pbp_dict)
    # Return only after every row has been visited; a return inside the
    # loop would stop after the first row.
    return pbp_list

In [542]:
yr = 2017
wk = 1
gurls = game_urls(yr, wk)
# Fetch and parse the first box score of the week. The context manager
# closes the HTTP response once parsing is done.
with urllib.request.urlopen(gurls[0]) as pg:
    soup = BeautifulSoup(pg, 'html.parser')

In [666]:
# scrbox_dict = scorebox(soup)
# scoring_list = scoring(soup, scrbox_dict)
# ginfo_dict = game_info(soup, scrbox_dict)
# off_list = officials(soup, scrbox_dict)
# summ_dict = game_summ(soup, scrbox_dict)
# prr_list = stats_table(soup, scrbox_dict, 'all_player_offense')
# def_list = stats_table(soup, scrbox_dict, 'all_player_defense')
# kp_ret_list = stats_table(soup, scrbox_dict, 'all_returns')
# kp_list = stats_table(soup, scrbox_dict, 'all_kicking')
# home_starters_list = starters(soup, scrbox_dict, 'home')
# away_starters_list = starters(soup, scrbox_dict, 'away')
# home_snaps_list = stats_table(soup, scrbox_dict, 'all_home_snap_counts')
# away_snaps_list = stats_table(soup, scrbox_dict, 'all_vis_snap_counts')
# pass_tgts_list = stats_table(soup, scrbox_dict, 'all_targets_directions')
# rush_dir_list = stats_table(soup, scrbox_dict, 'all_rush_directions')
# pass_tckl_list = stats_table(soup, scrbox_dict, 'all_pass_tackles')
# rush_tckl_list = stats_table(soup, scrbox_dict, 'all_rush_tackles')
# home_drives_list = drives(soup, scrbox_dict, 'home')
# away_drives_list = drives(soup, scrbox_dict, 'away')


503 ms ± 19.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [663]:
# Build the list in this cell so it does not depend on leftover kernel
# state (the only other assignment in the notebook is commented out).
away_drives_list = drives(soup, scorebox(soup), 'away')
away_drives_list

[{'game_id': '201709070nwe',
  'team_name': 'Kansas City Chiefs',
  'team_id': 'kan',
  'drive_num': 1,
  'quarter': 1,
  'sec_left_in_quarter': 728,
  'sec_into_quarter': 172,
  'sec_left_in_game': 3428,
  'sec_into_game': 172,
  'start_yrd': 25,
  'start_side': 'kan',
  'yds_to_td': 75,
  'total_plays': 1,
  'Pass_plays': 0,
  'Rush_plays': 1,
  'Penalty_plays': 0,
  'drive_sec': 8,
  'net_yds': 7,
  'end_event': 'Fumble'},
 {'game_id': '201709070nwe',
  'team_name': 'Kansas City Chiefs',
  'team_id': 'kan',
  'drive_num': 2,
  'quarter': 1,
  'sec_left_in_quarter': 565,
  'sec_into_quarter': 335,
  'sec_left_in_game': 3265,
  'sec_into_game': 335,
  'start_yrd': 10,
  'start_side': 'kan',
  'yds_to_td': 90,
  'total_plays': 12,
  'Pass_plays': 8,
  'Rush_plays': 4,
  'Penalty_plays': 0,
  'drive_sec': 382,
  'net_yds': 90,
  'end_event': 'Touchdown'},
 {'game_id': '201709070nwe',
  'team_name': 'Kansas City Chiefs',
  'team_id': 'kan',
  'drive_num': 3,
  'quarter': 2,
  'sec_left_i

In [370]:
# Build the dict in this cell so it does not depend on leftover kernel
# state (the only other assignment in the notebook is commented out).
ginfo_dict = game_info(soup, scorebox(soup))
pd.DataFrame([ginfo_dict])

Unnamed: 0,Roof,Surface,Weather,Won Toss,game_id
0,outdoors,fieldturf,"63 degrees, wind 8 mph",Chiefs (deferred),201709070nwe


In [290]:
# `df` was never defined above this cell. `play_desc` is produced by
# scoring(), so the frame is rebuilt from it here.
# NOTE(review): confirm this is the frame the cell was meant to show.
df = pd.DataFrame(scoring(soup, scorebox(soup)))
df.play_desc[2]

'Stephen Gostkowski 25 yard field goal'

# Code

In [8]:
yr = 2017
wk = 1
gurls = game_urls(yr, wk)
# Fetch and parse the first box score of the week. The context manager
# closes the HTTP response once parsing is done.
with urllib.request.urlopen(gurls[0]) as pg:
    soup = BeautifulSoup(pg, 'html.parser')

## scorebox

In [127]:
# Scratch version of the scorebox() logic, run step by step against `soup`.
scrbox_dict = {}
scrbox_div = soup.find('div', {'class':'scorebox'})
scrbox = scrbox_div.find_all('div', recursive=False)
team_str = ['home', 'away']
for idx, team in enumerate(team_str):
    team_name = scrbox[idx].find('a',{'itemprop':'name'})
    coach_name = scrbox[idx].find('div', {'class':'datapoint'}).find('a')
    score = scrbox[idx].find('div', {'class':'score'})
    # The element right after the score holds the record as "W-L" or "W-L-T".
    record = score.find_next_sibling().string.split('-')
    scrbox_dict[team + '_team_pg'] = team_name['href']
    scrbox_dict[team + '_team_id'] = team_name['href'].split('/')[-2]
    scrbox_dict[team + '_team_name'] = team_name.string
    scrbox_dict[team + '_team_score'] = int(score.string)
    scrbox_dict[team + '_team_coach_pg'] = coach_name['href']
    scrbox_dict[team + '_team_coach_id'] = coach_name['href'].split('/')[-1].split('.')[0]
    scrbox_dict[team + '_team_coach_name'] = coach_name.string
    scrbox_dict[team + '_team_wins'] = int(record[0])
    scrbox_dict[team + '_team_losses'] = int(record[1])
    if len(record) == 3:
        scrbox_dict[team + '_team_ties'] = int(record[2])
    else:
        scrbox_dict[team + '_team_ties'] = 0
# Remove this game's own result, leaving each team's record going in.
if scrbox_dict['home_team_score'] > scrbox_dict['away_team_score']:
    scrbox_dict['home_team_wins'] = scrbox_dict['home_team_wins'] - 1
    scrbox_dict['away_team_losses'] = scrbox_dict['away_team_losses'] - 1
elif scrbox_dict['home_team_score'] < scrbox_dict['away_team_score']:
    scrbox_dict['away_team_wins'] = scrbox_dict['away_team_wins'] - 1
    scrbox_dict['home_team_losses'] = scrbox_dict['home_team_losses'] - 1
else:
    scrbox_dict['away_team_ties'] = scrbox_dict['away_team_ties'] - 1
    scrbox_dict['home_team_ties'] = scrbox_dict['home_team_ties'] - 1

scrbox_meta = scrbox[2].find_all('div')
game_datetime = scrbox_meta[0].string + scrbox_meta[1].contents[1][1:]
# %I (12-hour clock) is required for %p to take effect; with %H the am/pm
# marker is ignored and "8:30pm" would parse as 08:30.
scrbox_dict['datetime'] = dt.strptime(game_datetime, '%A %b %d, %Y %I:%M%p')
for meta in scrbox_meta:
    if meta.contents[0].string =='Stadium':
        scrbox_dict['stadium_pg'] = meta.a['href']
        scrbox_dict['stadium_name'] = meta.a.string
        scrbox_dict['stadium_id'] = meta.a['href'].split('/')[-1].split('.')[0]
# team_dict


In [173]:
# Leftover scratch call: time.strptime() needs at least a string argument, so
# this raises TypeError when run; the recorded output below is the repr of the
# `time` module itself, not the result of this call.
time.strptime()

<module 'time' (built-in)>

## scoring

In [284]:
# Parse the scoring summary table into `scoring_list`, one dict per scoring
# play: quarter, clock, scoring team, running score and a breakdown of the
# play description. Fields that do not apply to a score type are set to -1.
scoring_list = []
scoring_div = soup.find('div', {'id':'all_scoring'})
rows = scoring_div.find_all('tr')
# The quarter cell is only filled on the first score of each quarter, so the
# last seen value is carried forward.
quarter = 1
for row in rows[1:]:
    tmp_dict = {}
    for cell in row.contents:
        cell_str = cell.string
        if cell['data-stat'] =='quarter' and cell_str is not None:
            quarter = int(cell_str)
        tmp_dict['quarter'] = quarter
        if cell['data-stat'] =='time':
            if cell_str is not None:
                # "MM:SS" left in the quarter; quarters are taken as 15 minutes
                # and the game as four quarters.
                str_split = cell_str.split(':')
                sec_left = int(str_split[0])*60 + int(str_split[1])
                sec_left_game = sec_left + (4-tmp_dict['quarter'])*15*60
                tmp_dict['sec_left_in_quarter'] = sec_left
                tmp_dict['sec_in_quarter'] = 15*60 - sec_left
                tmp_dict['sec_left_in_game'] = sec_left_game
                tmp_dict['sec_in_game'] = 4*15*60 - sec_left_game
            else:
                # np.nan is the canonical spelling of NaN.
                tmp_dict['sec_left_in_quarter'] = np.nan
                tmp_dict['sec_in_quarter'] = np.nan
                tmp_dict['sec_left_in_game'] = np.nan
                tmp_dict['sec_in_game'] = np.nan
        elif cell['data-stat'] =='team':
            # The cell text is matched as a substring of the full team names
            # taken from the scorebox.
            if scrbox_dict['home_team_name'].find(cell_str) > -1:
                tmp_dict['scoring_team_name'] = scrbox_dict['home_team_name']
                tmp_dict['scoring_team_id'] = scrbox_dict['home_team_id']
                tmp_dict['scoring_team_loc'] = 'home'
            elif scrbox_dict['away_team_name'].find(cell_str) > -1:
                tmp_dict['scoring_team_name'] = scrbox_dict['away_team_name']
                tmp_dict['scoring_team_id'] = scrbox_dict['away_team_id']
                tmp_dict['scoring_team_loc'] = 'away'
            else:
                raise KeyError('Team Not Found')
        elif cell['data-stat'] =='vis_team_score':
            tmp_dict['away_team_score'] = int(cell_str)
        elif cell['data-stat'] =='home_team_score':
            tmp_dict['home_team_score'] = int(cell_str)
        elif cell['data-stat'] =='description':
            # cont alternates player links and plain text; cont[1] is the text
            # after the first player link and decides the score type.
            cont = cell.contents
            if cont[1].find('field') > -1:
                tmp_dict['score_type'] = 'field_goal'
                tmp_dict['kicker_id'] = cont[0]['href'].split('/')[-1].split('.')[0]
                tmp_dict['kicker_name'] = cont[0].string
                tmp_dict['kick_success'] = 1
                tmp_dict['pass_from_id'] = -1
                tmp_dict['pass_from_name'] = -1
                tmp_dict['rec_by_id'] = -1
                tmp_dict['rec_by_name'] = -1
                tmp_dict['rush_by_id'] = -1
                tmp_dict['rush_by_name'] = -1
                tmp_dict['yards'] = int(cont[1].split(' ')[1])
            elif cont[1].find('pass') > -1:
                tmp_dict['score_type'] = 'pass'
                tmp_dict['pass_from_id'] = cont[0]['href'].split('/')[-1].split('.')[0]
                tmp_dict['pass_from_name'] = cont[0].string
                tmp_dict['rec_by_id'] = cont[2]['href'].split('/')[-1].split('.')[0]
                tmp_dict['rec_by_name'] = cont[2].string
                tmp_dict['rush_by_id'] = -1
                tmp_dict['rush_by_name'] = -1
                tmp_dict['yards'] = int(cont[1].split(' ')[1])
                # The last link is the kicker of the try; the trailing text
                # says whether the kick failed.
                tmp_dict['kicker_id'] = cont[-2]['href'].split('/')[-1].split('.')[0]
                tmp_dict['kicker_name'] = cont[-2].string
                tmp_dict['kick_success'] = 1 if cont[-1].find('failed') == -1 else 0
            elif cont[1].find('rush') > -1:
                tmp_dict['score_type'] = 'rush'
                tmp_dict['pass_from_id'] = -1
                tmp_dict['pass_from_name'] = -1
                tmp_dict['rec_by_id'] = -1
                tmp_dict['rec_by_name'] = -1
                tmp_dict['rush_by_id'] = cont[0]['href'].split('/')[-1].split('.')[0]
                tmp_dict['rush_by_name'] = cont[0].string
                tmp_dict['yards'] = int(cont[1].split(' ')[1])
                tmp_dict['kicker_id'] = cont[-2]['href'].split('/')[-1].split('.')[0]
                tmp_dict['kicker_name'] = cont[-2].string
                tmp_dict['kick_success'] = 1 if cont[-1].find('failed') == -1 else 0
            else:
                # Any other kind of score is not handled yet.
                raise KeyError('Play Not Found')
            tmp_dict['description'] = ' '.join(x.string.strip() for x in cont)
        tmp_dict['game_id'] = scrbox_dict['game_id']
    scoring_list.append(tmp_dict)
            
    
    

## Game Info

In [300]:
# Game info: the table is embedded in an HTML comment inside the
# 'all_game_info' div, so the comment text is re-parsed as its own document.
ginfo_div = soup.find('div', {'id': 'all_game_info'})
comments = ginfo_div.find_all(string=lambda text: isinstance(text, Comment))
table = BeautifulSoup(str(comments), 'lxml')
rows = table.find_all('tr')

# One label -> value entry per row after the header row; the betting rows are
# left out.
ginfo_dict = {}
for row in rows[1:]:
    row_lbl = row.contents[0].contents[0]
    row_val = row.contents[1].contents[0]
    if row_lbl in ['Vegas Line', 'Over/Under']:
        continue
    ginfo_dict[row_lbl] = row_val
ginfo_dict['game_id'] = scrbox_dict['game_id']

## Officials

In [335]:
# Officials: one dict per row of the commented-out officials table (the first
# row is skipped as the header).
off_div = soup.find('div', {'id': 'all_officials'})
comments = off_div.find_all(string=lambda text: isinstance(text, Comment))
table = BeautifulSoup(str(comments), 'lxml')
rows = table.find_all('tr')

off_list = []
for row in rows[1:]:
    title_cell = row.contents[0]
    ref_title = title_cell.string
    name_cell = row.contents[1]
    ref_pg = name_cell.a['href']
    off_dict = {
        'ref_title': ref_title,
        'ref_pg': ref_pg,
        # the id is the file name of the official's page without its extension
        'ref_id': ref_pg.split('/')[-1].split('.')[0],
        'ref_name': name_cell.string,
        'game_id': scrbox_dict['game_id'],
    }
    off_list.append(off_dict)

## Game Summary Stats

In [390]:
# Team summary stats: every table row becomes a 'home_<label>' and an
# 'away_<label>' entry, followed by the team identifiers from the scorebox.
summ_div = soup.find('div', {'id': 'all_team_stats'})
comments = summ_div.find_all(string=lambda text: isinstance(text, Comment))
table = BeautifulSoup(str(comments), 'lxml')
rows = table.find_all('tr')[1:]

summ_dict = {}
for row in rows:
    cont = row.contents
    row_lbl = cont[0].string
    # The visiting team's column is tagged 'vis_stat' on the page.
    for side, stat_name in (('home_', 'home_stat'), ('away_', 'vis_stat')):
        summ_dict[side + row_lbl] = row.find('td', {'data-stat': stat_name}).string

for key in ('home_team_name', 'home_team_id', 'away_team_name', 'away_team_id', 'game_id'):
    summ_dict[key] = scrbox_dict[key]

## Passing, Rushing and Receiving & Defense

In [517]:
# Player offense table ('all_player_offense'): one dict per player row, keyed
# by each cell's data-stat attribute.
prr_div = soup.find('div', {'id': 'all_player_offense'})
comments = prr_div.find_all(string=lambda text: isinstance(text, Comment))
table = BeautifulSoup(str(comments), 'lxml')
rows = table.find_all('tr')[2:]

prr_list = []
for row in rows:
    cells = row.contents
    # Rows whose first child is a bare text node rather than a tag are skipped.
    if cells[0].name is None:
        continue
    prr_dict = {}
    for cell in cells:
        lbl = cell['data-stat']
        cell_str = cell.string
        if cell_str is None:
            # cells without text are recorded as 0
            prr_dict[lbl] = 0
        elif lbl == 'player':
            prr_dict['player_id'] = cell['data-append-csv']
            prr_dict['player_name'] = cell_str
        elif lbl == 'team':
            prr_dict[lbl] = cell_str.lower()
        else:
            prr_dict[lbl] = float(cell_str)
    prr_dict['game_id'] = scrbox_dict['game_id']
    prr_list.append(prr_dict)

## Starters

In [559]:
# Starters for one side: `loc` selects both the 'all_<loc>_starters' div and
# the matching '<loc>_team_*' scorebox entries.
loc = 'home'
start_div = soup.find('div', {'id': f'all_{loc}_starters'})
comments = start_div.find_all(string=lambda text: isinstance(text, Comment))
table = BeautifulSoup(str(comments), 'lxml')
rows = table.find_all('tr')[1:]

start_list = []
for row in rows:
    cells = row.contents
    # first cell: player (id in data-append-csv), second cell: position
    start_dict = {
        'player_id': cells[0]['data-append-csv'],
        'player_name': cells[0].string,
        'pos': cells[1].string.lower(),
        'game_id': scrbox_dict['game_id'],
        'team_name': scrbox_dict[loc + '_team_name'],
        'team_id': scrbox_dict[loc + '_team_id'],
    }
    start_list.append(start_dict)

## Drives

In [654]:
"""
loc is either home or away
"""
# Drives for one side: one dict per drive with start clock, field position,
# play counts, duration, net yards and how the drive ended.
loc = 'home'
# The page names the visiting side's table 'vis' rather than 'away'.
loc_html = 'home' if loc != 'away' else 'vis'
drive_div = soup.find('div', {'id': 'all_' + loc_html + '_drives'})
comments = drive_div.find_all(string=lambda text: isinstance(text, Comment))
table = BeautifulSoup(str(comments), 'lxml')
rows = table.find_all('tr')[1:]

drive_list = []
for row in rows:
    cells = row.contents
    drive_dict = {
        'game_id': scrbox_dict['game_id'],
        'team_name': scrbox_dict[loc + '_team_name'],
        'team_id': scrbox_dict[loc + '_team_id'],
    }
    for cell in cells:
        cell_data = cell['data-stat']
        cell_str = cell.string
        if cell_data in ('drive_num', 'quarter', 'net_yds'):
            drive_dict[cell_data] = int(cell_str)
        elif cell_data == 'time_start':
            # "MM:SS" left in the quarter; relies on 'quarter' having been
            # read from an earlier cell of the same row.
            str_split = cell_str.split(':')
            sec_left = int(str_split[0])*60 + int(str_split[1])
            drive_dict['sec_left_in_quarter'] = sec_left
            drive_dict['sec_into_quarter'] = 15*60 - sec_left
            sec_left_game = sec_left + (4 - drive_dict['quarter'])*15*60
            drive_dict['sec_left_in_game'] = sec_left_game
            drive_dict['sec_into_game'] = 4*15*60 - sec_left_game
        elif cell_data == 'start_at':
            # "<SIDE> <yard line>", e.g. "NWE 27"
            str_split = cell_str.split(' ')
            start_yd = int(str_split[1])
            start_side = str_split[0].lower()
            drive_dict['start_yrd'] = start_yd
            drive_dict['start_side'] = start_side
            # Starting on the team's own side means the yard line is counted
            # from its own goal line.
            # NOTE(review): assumes the side abbreviation equals the team id
            # taken from the team page URL - confirm this holds for every team.
            own_side = start_side == drive_dict['team_id']
            drive_dict['yds_to_td'] = 100 - start_yd if own_side else start_yd
        elif cell_data == 'play_count_tip':
            drive_dict['total_plays'] = int(cell_str)
            # The tooltip lists the breakdown, e.g. "5 Pass, 4 Rush, 2 Penalty".
            for part in cell.span['tip'].split(','):
                play_sub = part.strip().split(' ')
                drive_dict[play_sub[1] + '_plays'] = int(play_sub[0])
        elif cell_data == 'time_total':
            str_split = cell_str.split(':')
            drive_dict['drive_sec'] = int(str_split[0])*60 + int(str_split[1])
        elif cell_data == 'end_event':
            drive_dict[cell_data] = cell_str
        else:
            raise KeyError('Column Not Found')
    drive_list.append(drive_dict)

## Play by Play

In [1100]:
# Parse the play-by-play table into `pbp_list`, one dict per play row.
# Each row contributes the game situation (quarter, clock, down, distance,
# field position, score) and a breakdown of the play description (play type,
# players involved, yards, result, penalties). Description fields that were
# not found for a play are filled with -999 at the end.
# `name_extract` is defined elsewhere in the notebook; as used here it takes a
# player link and returns an (id, name) pair.
pbp_list = []
pbp_div = soup.find('div', {'id':'all_pbp'})
comments= pbp_div.find_all(string=lambda text:isinstance(text,Comment))
# NOTE(review): str() of the result list is its repr, so escape sequences such
# as \' end up inside the parsed text (visible in play_str for names with an
# apostrophe). Left unchanged because the positional row/cell indexing below
# was written against the tree this produces.
table = BeautifulSoup(str(comments), 'lxml')
rows = table.find_all('tr')[2:]
row_ct = 0
for row in rows:
    cells = row.contents
    pbp_dict = {}
    # Only 10-cell rows that start with a tag and whose detail cell (index 5)
    # has mixed content are treated as plays.
    if cells[0].name is not None and len(cells)==10 and cells[5].string is None:
        pbp_dict['game_id'] = scrbox_dict['game_id']
        row_ct += 1
        for cell in cells:
            cell_str = cell.string
            cell_data = cell['data-stat']
            if cell_data in ['quarter', 'down', 'yds_to_go', 'pbp_score_aw', 'pbp_score_hm']:
                if cell_str is None:
                    pbp_dict[cell_data] = -99
                else:
                    pbp_dict[cell_data] = int(cell_str)
            elif cell_data == 'qtr_time_remain':
                if cell_str is not None:
                    str_split = cell_str.split(':')
                    pbp_dict['sec_left_in_quarter'] = int(str_split[0])*60 + int(str_split[1])
                    pbp_dict['sec_into_quarter'] = 15*60 - (int(str_split[0])*60 + int(str_split[1]))
                    pbp_dict['sec_left_in_game'] = int(str_split[0])*60 + int(str_split[1]) + (4-pbp_dict['quarter'])*15*60
                    pbp_dict['sec_into_game'] = 4*15*60 - (int(str_split[0])*60 + int(str_split[1]) + (4-pbp_dict['quarter'])*15*60)
                else:
                    # np.nan is the canonical spelling of NaN.
                    pbp_dict['sec_left_in_quarter'] = np.nan
                    pbp_dict['sec_into_quarter'] = np.nan
                    pbp_dict['sec_left_in_game'] = np.nan
                    pbp_dict['sec_into_game'] = np.nan
            elif cell_data == 'location':
                if cell_str is None:
                    pbp_dict['loc_yrd'] = -999
                    pbp_dict['loc_side'] = -999
                else:
                    str_split = cell_str.strip().split(' ')
                    if len(str_split) == 1:
                        # a bare yard line with no side
                        pbp_dict['loc_yrd'] = int(str_split[0])
                        pbp_dict['loc_side'] = -999
                    else:
                        pbp_dict['loc_yrd'] = int(str_split[1])
                        pbp_dict['loc_side'] = str_split[0].lower()
            elif cell_data == 'detail':
                # cont[0] is the <a name="pbp_N"> anchor; the rest alternates
                # player links and plain text.
                cont = cell.contents
                play_str = ''.join(x.string for x in cont if x.string is not None)
                pbp_dict['play_str'] = play_str
                pbp_dict['play_count'] = cont[0]['name'].split('_')[-1]
                if play_str.find('Timeout') >-1:
                    # "Timeout #<n> by <team>"
                    cont_str = cont[1].split(' ')
                    pbp_dict['play_type'] = 'timeout'
                    pbp_dict['timeout_num'] = int(cont_str[1][1:])
                    pbp_dict['timeout_by'] = ' '.join(cont_str[3:])
                elif cont[1].name is None:
                    # description starts with text instead of a player link
                    pbp_dict['play_type'] = 'penalty'                    
                else:
                    action = cont[2].strip().split(' ')
                    if action[0] in ['kicks', 'punts'] or action[0].isnumeric():
                        pbp_dict['play_type'] = 'kick'
                        player_id, player_name = name_extract(cont[1])
                        pbp_dict['kicker_id'] = player_id
                        pbp_dict['kicker_name'] = player_name
                        if action[1] == 'off':
                            pbp_dict['play_subtype'] = 'kickoff'
                            kick_str = cont[2].strip().split(' ')
                            # Stored as int so play_yds has one type across
                            # play kinds (it used to be a string only here).
                            pbp_dict['play_yds'] = int(kick_str[2])
                            if kick_str[4] == 'touchback':
                                pbp_dict['play_res'] = 'touchback'
                            else:
                                kick_ret_str = cont[4].strip().split(' ')
                                player_id, player_name = name_extract(cont[3])
                                pbp_dict['kick_ret_id'] = player_id
                                pbp_dict['kick_ret_name'] = player_name
                                pbp_dict['kick_ret_yds'] = int(kick_ret_str[1])
                        elif action[0] == 'punts':
                            pbp_dict['play_subtype'] = 'punt'
                        elif action[1] == 'extra':
                            pbp_dict['play_subtype'] = 'xp'
                            if play_str.find('no good') > -1:
                                pbp_dict['play_res'] = 'no_good'
                            else:
                                pbp_dict['play_res'] = 'good'
                        elif action[0].isnumeric():
                            # "<n> yard field goal ..."
                            pbp_dict['play_subtype'] = 'field_goal'
                            if play_str.find('no good') > -1:
                                pbp_dict['play_res'] = 'no_good'
                            else:
                                pbp_dict['play_res'] = 'good'
                        else:
                            raise KeyError('Kicking Play Not Found')
                # Second pass: scan every text fragment for keywords; the
                # player a keyword refers to is the neighbouring link.
                for idx, srch in enumerate(cont):
                    str_srch = srch.string
                    if str_srch is not None:
                        str_srch = str_srch.lower()
                        if str_srch.find('tackle by') > -1:
                            tackle_id, tackle_name = name_extract(cont[idx+1])
                            pbp_dict['tackled_by_id'] = tackle_id
                            pbp_dict['tackled_by_name'] = tackle_name
                            # cont[idx+3] must exist before it is read as the
                            # assisting tackler.
                            if len(cont) > idx+3 and cont[idx+2].find('and') > -1:
                                tackle_asst_id, tackle_asst_name = name_extract(cont[idx+3])
                                pbp_dict['tackle_asst_id'] = tackle_asst_id
                                pbp_dict['tackle_asst_name'] = tackle_asst_name
                        if str_srch.find('sacked by') > -1:
                            sack_id, sack_name = name_extract(cont[idx+1])
                            pbp_dict['sacked_by_id'] = sack_id
                            pbp_dict['sacked_by_name'] = sack_name
                            pbp_dict['play_type'] = 'sack'
                        elif str_srch.find('pass') > -1 and str_srch.find('pass') < 5:
                            # 'pass' near the start of the fragment, i.e. right
                            # after the passer's name
                            pbp_dict['play_type'] = 'pass'
                            passer_id, passer_name = name_extract(cont[idx-1])
                            pbp_dict['passer_id'] = passer_id
                            pbp_dict['passer_name'] = passer_name
                            try:
                                rec_id, rec_name = name_extract(cont[idx+1])
                                pbp_dict['rec_id'] = rec_id
                                pbp_dict['rec_name'] = rec_name
                            except Exception:
                                # Best effort: a pass without a receiver link
                                # simply has no receiver recorded.
                                pass
                            if str_srch.find('incomplete') > -1:
                                pbp_dict['play_res'] = 'incomplete'
                                pbp_dict['play_yds'] = 0
                            else:
                                pbp_dict['play_res'] = 'complete'
                                pbp_dict['play_yds'] = int(cont[idx+2].strip().split()[1])
                        elif max(str_srch.find(x) for x in ['right', 'middle', 'left']) > -1: 
                            pbp_dict['play_type'] = 'rush'
                            rush_id, rush_name = name_extract(cont[idx-1])
                            pbp_dict['rush_id'] = rush_id
                            pbp_dict['rush_name'] = rush_name
                            rush_splt = str_srch.split('for')
                            if str_srch.find('no gain') > -1:
                                pbp_dict['play_yds'] = 0
                            else:
                                pbp_dict['play_yds'] = int(rush_splt[-1].split('yard')[0].strip())
                            pbp_dict['play_subtype'] = rush_splt[0].strip()
                        elif str_srch.find('kneels') > -1:
                            pbp_dict['play_type'] = 'kneel'
                            kneel_id, kneel_name = name_extract(cont[idx-1])
                            pbp_dict['kneel_id'] = kneel_id
                            pbp_dict['kneel_name'] = kneel_name
                            if str_srch.find('no gain') > -1:
                                pbp_dict['play_yds'] = 0
                            else:
                                pbp_dict['play_yds'] = int(str_srch.split('for')[-1].split('yard')[0].strip())
                        if str_srch.find('fumble') > -1:
                            fmbl_id, fmbl_name = name_extract(cont[idx-1])
                            pbp_dict['fmbl_id'] = fmbl_id
                            pbp_dict['fmbl_name'] = fmbl_name
                            if str_srch.find('forced') > -1:
                                fmbl_forc_by_id, fmbl_forc_by_name = name_extract(cont[idx+1])
                                pbp_dict['fmbl_forc_by_id'] = fmbl_forc_by_id
                                pbp_dict['fmbl_forc_by_name'] = fmbl_forc_by_name
                        if str_srch.find('recover') > -1:
                            recover_id, recover_name = name_extract(cont[idx+1])
                            pbp_dict['recover_id'] = recover_id
                            pbp_dict['recover_name'] = recover_name
                        if str_srch.find('intercept') > -1:
                            int_id, int_name = name_extract(cont[idx+1])
                            pbp_dict['int_id'] = int_id
                            pbp_dict['int_name'] = int_name
                            pbp_dict['play_yds'] = int(cont[idx+4].string.split('for')[-1].split('yard')[0].strip())
                            pbp_dict['play_res'] = 'interception'
                        if str_srch.find('penalty') > -1:
                            pen_on_id, pen_on_name = name_extract(cont[idx+1])
                            # A second penalty on the same play gets a '_2'
                            # suffix on its keys.
                            if 'pen_on_id' in pbp_dict:
                                pen_sfx = '_2'
                            else:
                                pen_sfx = ''
                            pbp_dict['pen_on_id' + pen_sfx] = pen_on_id
                            pbp_dict['pen_on_name' + pen_sfx] = pen_on_name
                            pen_res_str = cont[idx+2].string.lower()
                            if pen_res_str.find('yard') > -1:
                                # ": <cause>, <n> yards ..."
                                pen_res = pen_res_str.split(':')[-1].split(', ')
                                pbp_dict['pen_cause' + pen_sfx] = pen_res[0]                            
                                pbp_dict['pen_res' + pen_sfx] = pen_res[1]
                            else:
                                pen_res = pen_res_str.split(':')[-1].split('penalty')
                                pbp_dict['pen_cause' + pen_sfx] = pen_res[0]                            
                                pbp_dict['pen_res' + pen_sfx] = -999
                        if str_srch.find('touchdown') > -1:
                            pbp_dict['play_res'] = 'touchdown'
                # 'no play' overrides whatever result was set above.
                if play_str.find('no play') > -1:
                    pbp_dict['play_res'] = 'no_play'
                # Fill every description field that was not set with -999 so
                # all plays share the same keys.
                for field in ['play_type', 'play_subtype', 'timeout_num', 'timeout_by', 'kicker_id', 'kicker_name', 'play_yds',
                             'kick_ret_id', 'kick_ret_name', 'kick_ret_yds', 'play_res', 'tackled_by_id', 'tackled_by_name',
                             'sacked_by_name', 'sacked_by_id', 'passer_id', 'passer_name', 'rec_id', 'rec_name', 'rush_id',
                             'rush_name', 'kneel_id', 'kneel_name', 'fmbl_id', 'fmbl_name', 'fmbl_forc_by_id', 
                              'fmbl_forc_by_name', 'recover_id', 'recover_name', 'tackle_asst_id', 'tackle_asst_name', 
                             'int_name', 'int_id', 'pen_on_id', 'pen_on_name', 'pen_cause', 'pen_res']:
                    if field not in pbp_dict:
                        pbp_dict[field] = -999
        pbp_list.append(pbp_dict)
print(row_ct)
                

195


In [1098]:
'abcd' + ''

'abcd'

In [1089]:
play_str

'Mike Gillislee right tackle for 2 yards. Penalty on Nate Solder: Offensive Holding (Offsetting), Penalty on Allen Bailey: Defensive Holding (Offsetting) (no play)'

In [1091]:
'abcd'.split(',')

['abcd']

In [1090]:
pen_res

[' Defensive Holding (Offsetting) (no play)']

In [1084]:
str_srch

': defensive pass interference, 12 yards (no play)'

In [1085]:
cont

[<a name="pbp_9"></a>,
 <a href="/players/B/BradTo00.htm">Tom Brady</a>,
 ' pass incomplete short left intended for ',
 <a href="/players/C/CookBr00.htm">Brandin Cooks</a>,
 '. Penalty on ',
 <a href="/players/M/MitcTe00.htm">Terrance Mitchell</a>,
 ': Defensive Pass Interference, 12 yards (no play)']

In [1087]:
cont[2].string.find('pass')

1

In [984]:
for i in rows[3].contents[5].contents:
    print(i.string)

None
James White
 left guard for 8 yards (tackle by 
Ron Parker
 and 
Derrick Johnson
)


In [902]:
# Collect the plain-text description of every row that has a detail cell at
# index 5; rows without one are skipped.
play_string = []
for i in rows:
    try:
        play_string.append(''.join(x.string for x in i.contents[5].contents if x.string is not None))
    except Exception:
        # Best effort on purpose, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare `except:` did.
        pass

In [903]:
play_string

['Cairo Santos kicks off 64 yards, returned by Dion Lewis for 26 yards (tackle by Kevin Pierre-Louis)',
 'Tom Brady pass incomplete deep left intended for Dwayne Allen',
 'Tom Brady pass complete short right to Rex Burkhead for 8 yards (tackle by Ron Parker)',
 'James White left guard for 8 yards (tackle by Ron Parker and Derrick Johnson)',
 'James White middle for 3 yards (tackle by Eric Berry)',
 'Tom Brady pass complete deep left to Brandin Cooks for 19 yards (tackle by Phillip Gaines)',
 'James White left tackle for 5 yards (tackle by Dee Ford and Derrick Johnson)',
 'Tom Brady pass complete short left to Danny Amendola for 16 yards (tackle by Phillip Gaines)',
 'Tom Brady pass incomplete short left intended for Brandin Cooks. Penalty on Terrance Mitchell: Defensive Pass Interference, 12 yards (no play)',
 'Mike Gillislee right tackle for 2 yards. Penalty on Nate Solder: Offensive Holding (Offsetting), Penalty on Allen Bailey: Defensive Holding (Offsetting) (no play)',
 'Tom Brady 

In [1064]:
# Descriptions that mention a fumble, each prefixed with its index in play_string.
tmp_list = [
    str(pos) + desc
    for pos, desc in enumerate(play_string)
    if 'fumble' in desc
]
len(tmp_list)

2

In [1108]:
pbp_list[9] #['play_str'].find('no play')

{'game_id': '201709070nwe',
 'quarter': 1,
 'sec_left_in_quarter': 741,
 'sec_into_quarter': 159,
 'sec_left_in_game': 3441,
 'sec_into_game': 159,
 'down': 1,
 'yds_to_go': 2,
 'loc_yrd': 2,
 'loc_side': 'kan',
 'play_str': 'Mike Gillislee right tackle for 2 yards. Penalty on Nate Solder: Offensive Holding (Offsetting), Penalty on Allen Bailey: Defensive Holding (Offsetting) (no play)',
 'play_count': '10',
 'play_type': 'rush',
 'rush_id': 'GillMi00',
 'rush_name': 'Mike Gillislee',
 'play_yds': 2,
 'play_subtype': 'right tackle',
 'pen_on_id': 'SoldNa00',
 'pen_on_name': 'Nate Solder',
 'pen_cause': ' offensive holding (offsetting), ',
 'pen_res': -999,
 'pen_on_id_2': 'BailAl00',
 'pen_on_name_2': 'Allen Bailey',
 'pen_cause_2': ' defensive holding (offsetting) (no play)',
 'pen_res_2': -999,
 'play_res': 'no_play',
 'timeout_num': -999,
 'timeout_by': -999,
 'kicker_id': -999,
 'kicker_name': -999,
 'kick_ret_id': -999,
 'kick_ret_name': -999,
 'kick_ret_yds': -999,
 'tackled_by_i

In [1105]:
# Plays on which a second penalty was recorded.
tmp_list = [play for play in pbp_list if 'pen_on_id_2' in play]
len(tmp_list)

1

In [1096]:
# Descriptions of every play classified as a kick.
tmp_list = [play['play_str'] for play in pbp_list if play['play_type'] == 'kick']
len(tmp_list)

38

In [1097]:
tmp_list

['Cairo Santos kicks off 64 yards, returned by Dion Lewis for 26 yards (tackle by Kevin Pierre-Louis)',
 'Stephen Gostkowski kicks extra point good',
 'Stephen Gostkowski kicks off 65 yards, touchback',
 'Cairo Santos kicks extra point good',
 'Cairo Santos kicks off 65 yards, touchback',
 'Stephen Gostkowski 25 yard field goal good',
 "Stephen Gostkowski kicks off 66 yards, returned by De\\'Anthony Thomas for 31 yards (tackle by Patrick Chung). Penalty on Brandon King: Unnecessary Roughness, 15 yards",
 'Dustin Colquitt punts 36 yards, fair catch by Danny Amendola',
 'Stephen Gostkowski kicks extra point good',
 "Stephen Gostkowski kicks off 71 yards, returned by De\\'Anthony Thomas for 29 yards (tackle by Jordan Richards)",
 "Dustin Colquitt punts 46 yards. Penalty on De\\'Anthony Thomas: Interference with Opportunity to, 15 yards",
 'Ryan Allen punts 39 yards. Penalty on Frank Zombo: Offensive Holding, 9 yards',
 'Cairo Santos kicks extra point good',
 'Cairo Santos kicks off 57 yar

In [874]:
rows[0].contents[5].contents[2].strip().split(' ')[4]

'returned'

In [877]:
pbp_list[0]

{'game_id': '201709070nwe',
 'quarter': 1,
 'sec_left_in_quarter': 900,
 'sec_into_quarter': 0,
 'sec_left_in_game': 3600,
 'sec_into_game': 0,
 'down': -99,
 'yds_to_go': -99,
 'loc_yrd': 35,
 'loc_side': 'kan',
 'pbp_count': '1',
 'play_type': 'kick',
 'kicker_id': 'SantCa01',
 'kicker_name': 'Cairo Santos',
 'play_subtype': 'kickoff',
 'play_yds': '64',
 'timeout_num': -999,
 'timeout_by': -999,
 'pbp_score_aw': 0,
 'pbp_score_hm': 0}

In [754]:
rows[102].contents[5].string is None

False

In [691]:
df = pd.DataFrame(pbp_list)

# Extra

In [647]:
drive_dict['team_id']

'nwe'

In [638]:
cell

<td class="right " data-stat="start_at">NWE 27</td>

In [648]:
rows[1].contents

[<th class="right " data-stat="drive_num" scope="row">2</th>,
 <td class="center " data-stat="quarter">1</td>,
 <td class="right " csk="720" data-stat="time_start">12:00</td>,
 <td class="right " data-stat="start_at">KAN 32</td>,
 <td class="right " data-stat="play_count_tip"><span class="tooltip" tip="4 Pass, 3 Rush, 1 Penalty">7</span></td>,
 <td class="right " csk="155" data-stat="time_total">2:35</td>,
 <td class="right " data-stat="net_yds">22</td>,
 <td class="center " data-stat="end_event">Downs</td>]

In [619]:
rows[0].contents[4].span['tip']

'5 Pass, 4 Rush, 2 Penalty'

In [656]:
drive_list

[{'game_id': '201709070nwe',
  'team_name': 'New England Patriots',
  'team_id': 'nwe',
  'drive_num': 1,
  'quarter': 1,
  'sec_left_in_quarter': 900,
  'sec_into_quarter': 0,
  'sec_left_in_game': 3600,
  'sec_into_game': 0,
  'start_yrd': 27,
  'start_side': 'nwe',
  'yds_to_td': 73,
  'total_plays': 10,
  'Pass_plays': 5,
  'Rush_plays': 4,
  'Penalty_plays': 2,
  'drive_sec': 172,
  'net_yds': 73,
  'end_event': 'Touchdown'},
 {'game_id': '201709070nwe',
  'team_name': 'New England Patriots',
  'team_id': 'nwe',
  'drive_num': 2,
  'quarter': 1,
  'sec_left_in_quarter': 720,
  'sec_into_quarter': 180,
  'sec_left_in_game': 3420,
  'sec_into_game': 180,
  'start_yrd': 32,
  'start_side': 'kan',
  'yds_to_td': 32,
  'total_plays': 7,
  'Pass_plays': 4,
  'Rush_plays': 3,
  'Penalty_plays': 1,
  'drive_sec': 155,
  'net_yds': 22,
  'end_event': 'Downs'},
 {'game_id': '201709070nwe',
  'team_name': 'New England Patriots',
  'team_id': 'nwe',
  'drive_num': 3,
  'quarter': 1,
  'sec_le

In [502]:
aa.name

'th'

In [501]:
bb.name is None

True

In [438]:
table.find_all('tr','data-row')

[]

In [405]:
rows[0].contents[2]['data-stat']

'pass_cmp'

In [381]:
rows[0].find('td',{'data-stat':'home_stat'}).string

'25'

In [232]:
rows[2].contents[3].contents

[<a href="/players/H/HarrDe03.htm">Demetrius Harris</a>,
 ' 7 yard pass from ',
 <a href="/players/S/SmitAl03.htm">Alex Smith</a>,
 ' (',
 <a href="/players/S/SantCa01.htm">Cairo Santos</a>,
 ' kick)']

In [105]:
scrbox_meta[2].strong.string

'Stadium'