<a href="https://colab.research.google.com/github/JeffHCross/xfl_data_scrape/blob/main/stats_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [198]:
## Imports
import time
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd

In [199]:
def build_players(tbl, team=None):
  """Build a roster dictionary from one player-name table.

  Args:
    tbl: Parsed table element exposing ``find_all('tr')``. Its first row must
      be the heading row whose text is exactly ``'PLAYER'``.
    team: Team label stored in every entry's ``'TEAM'`` field and used as the
      key of the team-total entry. When omitted, the module-level
      ``team_name`` set by the scraping loop is used, which is the original
      behaviour.

  Returns:
    A dict of ``{'TEAM', 'NUMBER', 'NAME'}`` dicts. When the table has at
    least one row after the heading, the first key is the team label (the
    ``'TEAM TOTAL'`` entry), followed by one key per ``data-pid`` of each row
    that contains a player-name cell. A table with only the heading row
    yields an empty dict.

  Raises:
    RuntimeError: If the heading row's text is not ``'PLAYER'``.
  """
  rows = tbl.find_all('tr')
  player_heading = rows[0]
  if player_heading.text != 'PLAYER':
    raise RuntimeError(f"Unexpected text found in player_heading {player_heading}")

  body_rows = rows[1:]
  if not body_rows:
    return {}

  # Resolve the team label only once it is actually needed, so the global
  # fallback is never touched for heading-only tables.
  if team is None:
    team = team_name

  d = {team: {'TEAM': team, 'NUMBER': '', 'NAME': 'TEAM TOTAL'}}
  for p in body_rows:
    # Rows without a player-name cell are skipped.
    name_cell = p.find(class_='pstats-pname-data smt-widget-sm')
    if name_cell is None:
      continue
    player_id = p.attrs['data-pid']
    if player_id in d:
      # First occurrence of a player id wins.
      continue
    d[player_id] = {
      'TEAM': team,
      'NUMBER': name_cell.find('numb').text,
      'NAME': name_cell.find('nam').text,
    }
  return d
#build_players(player_name_table)

In [200]:
def build_stats(tbl, stat_type, team=None):
  """Flatten one stats table into four parallel lists (long format).

  Args:
    tbl: Parsed table element exposing ``find_all('tr')``. The first row is
      skipped; every later row carries a ``data-pid`` attribute and child
      cells with ``data-stat-column`` / ``data-stat-value`` attributes.
    stat_type: Label recorded for every value taken from this table.
    team: Identifier substituted for rows whose ``data-pid`` is ``'TEAM'``.
      When omitted, the module-level ``team_name`` set by the scraping loop
      is used, which is the original behaviour.

  Returns:
    A tuple ``(player_ids, types, stat_names, values)`` of equal-length
    lists, one element per stat cell, in document order.
  """
  plyr_lst = []
  type_lst = []
  stat_lst = []
  valu_lst = []
  for p in tbl.find_all('tr')[1:]:
    # ``children`` also yields bare text nodes (e.g. whitespace between
    # cells); those have no ``attrs`` and must not be treated as stat cells.
    cells = [c for c in p.children if getattr(c, 'attrs', None) is not None]
    if not cells:
      continue
    # The id is the same for every cell in the row, so resolve it once.
    player_id = p.attrs['data-pid']
    if player_id == 'TEAM':
      player_id = team_name if team is None else team
    for cell in cells:
      plyr_lst.append(player_id)
      type_lst.append(stat_type)
      stat_lst.append(cell.attrs['data-stat-column'])
      valu_lst.append(cell.attrs['data-stat-value'])
  return plyr_lst, type_lst, stat_lst, valu_lst
#build_stats(player_stats_table,stat_type=stat_headers[i].text)

In [201]:
## globals
options = webdriver.ChromeOptions()
## no-sandbox needed if running as a Linux admin (root)
options.add_argument("--no-sandbox")
options.add_argument("--disable-extensions")
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
## players: one roster entry per player id, plus one team-total entry per team
players = {}
## stats: long-format columns, one element per scraped stat cell
stats = {'PLAYER_ID':[], 'TYPE':[], 'STAT':[], 'VALUE':[]}

## game loop
## No except clause: any error propagates unchanged, while the finally block
## still guarantees the browser is shut down.
try:
  for game_id in ['2302001-VGS-at-ARL','2302002-ORL-at-HOU','2302003-STL-at-SA', '2302004-SEA-at-DC']:
    game = f'https://api.xfl.com/game/w/PlayerGameStats?EventId={game_id}'
    driver.get(game)
    # Wait until stats tables have loaded
    WebDriverWait(driver, timeout=15).until(lambda d: d.find_element(By.CSS_SELECTOR,"div.pstats-pname-data.smt-widget-sm"))
    soup = BeautifulSoup(driver.page_source, "lxml")
    v_team = soup.find(id="pstat-v-show-btn").text
    h_team = soup.find(id="pstat-h-show-btn").text
    for table_id in ["pstats-v-tbls","pstats-h-tbls"]:
      # team_name is read as a global by build_players and build_stats
      team_name = v_team if table_id=="pstats-v-tbls" else h_team
      team_section = soup.find(id=table_id)
      stat_headers = team_section.find_all(class_=["smt-fnt-lg"])
      stat_tables = team_section.find_all(class_=["pstats-tbl-cont"])
      # Each header must pair with exactly one table. Raise explicitly rather
      # than assert, because asserts are removed when Python runs with -O.
      if len(stat_headers) != len(stat_tables):
        raise RuntimeError(f'stat_headers ({len(stat_headers)}) != stat_tables ({len(stat_tables)})')
      for stat_header, stat_table in zip(stat_headers, stat_tables):
        player_name_table = stat_table.find(class_=["pstats-tbl-pname"])
        players.update(build_players(player_name_table))
        player_stats_table = stat_table.find(class_=["pstats-tbl-vals"])
        pid, typ, sta, val = build_stats(player_stats_table,stat_type=stat_header.text)
        # extend() appends in place instead of re-copying the growing lists
        stats['PLAYER_ID'].extend(pid)
        stats['TYPE'].extend(typ)
        stats['STAT'].extend(sta)
        stats['VALUE'].extend(val)
    # Pause between page loads (presumably to go easy on the server)
    time.sleep(5)
finally:
  ## Quit the WebDriver
  driver.quit()

In [None]:
## Left-join every roster entry (indexed by player id) to its stat rows,
## sorted by the join key and renumbered from 0. Kept as a single trailing
## expression so the notebook cell displays the resulting frame.
(
  pd.DataFrame.from_dict(players, orient='index')
  .merge(
    pd.DataFrame(stats),
    how='left',
    left_index=True,
    right_on='PLAYER_ID',
    sort=True,
  )
  .reset_index(drop=True)
)