<a href="https://colab.research.google.com/github/JeffHCross/xfl_data_scrape/blob/main/XFL_stats_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
## Imports
import datetime
import time
import requests
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
def get_date_from_strings(date_list, default_year=2023):
  """Build a timezone-aware Pandas Timestamp from an HTML cell's date strings.

  The fragments are joined with single spaces and are expected to look like
  ``Saturday, February 18 3:00PM ET``.

  Args:
    date_list: Iterable of string fragments that make up the date/time text.
    default_year: Year inserted into the text when it contains no four-digit
      year of its own.

  Returns:
    A pd.Timestamp localized to US/Eastern.

  Raises:
    RuntimeError: If the joined text does not end with ``ET``.
    Any error raised by pd.Timestamp for unparseable text is propagated.
  """
  import re  ## local import: only this function needs it

  date_str = ' '.join(date_list)
  if not date_str.endswith('ET'):
    raise RuntimeError(f"Unexpected timezone {date_str[-2:]}")
  tz = 'US/Eastern'
  ## Drop the trailing 'ET' marker and the whitespace left in front of it
  date_str = date_str[:-2].strip()
  ## Detect a missing year explicitly instead of depending on pd.Timestamp
  ## raising for year-less text, so the parsed value is always returned.
  if re.search(r'\b(?:19|20)\d{2}\b', date_str) is None:
    date_as_list = date_str.split(' ')
    ## Insert the year right after the first token (the weekday in the expected format)
    date_as_list.insert(1, str(default_year))
    date_str = ' '.join(date_as_list)
  return pd.Timestamp(date_str, tz=tz)

def _game_id_from_cells(cells):
  """Return the game ID stored in the last link of a schedule row's last cell."""
  ## Game ID is the final path segment of the HREF attribute on the last A tag
  return cells[-1].find_all('a')[-1].attrs['href'].split('/')[-1]

def generate_game_list(completed_games_only = True, include_active_games = False):
  """Scrape the XFL schedule page and return the IDs of games that have started.

  Args:
    completed_games_only: When True, collect games until the first row whose
      sixth cell contains 'Find Tickets'.
    include_active_games: When True, collect games whose kickoff time (parsed
      from the Date/Time cell) is earlier than the current US/Eastern time.
      Must be the opposite of completed_games_only.

  Returns:
    List of game ID strings in the order they appear on the page.

  Raises:
    RuntimeError: If the two flags are equal, or the schedule page does not
      return HTTP 200.
  """
  if completed_games_only == include_active_games:
    raise RuntimeError("Arguments completed_games_only and include_active_games must be set to opposite values")
  url = 'https://www.xfl.com/schedule'
  ## XFL schedule is a static page, so we can just use requests.
  ## A timeout keeps an unresponsive server from hanging the call forever.
  r = requests.get(url, timeout=30)
  if r.status_code != 200:
    raise RuntimeError(f"Unable to pull XFL schedule from {url}")
  soup = BeautifulSoup(r.text, 'html.parser')
  ## Keep one flat list so a single break ends the scan across every block
  rows = [row
          for block in soup.find_all(class_=["s-article-block"])
          for row in block.find_all('tr')]
  ## Cell 3 holds the date/time; completed-only mode also reads cell 5
  min_cells = 6 if completed_games_only else 4
  games = []
  for row in rows:
    cells = row.find_all('td')
    ## Skip rows without the cells indexed below (e.g. header rows built from TH)
    if len(cells) < min_cells:
      continue
    date_cell = cells[3]
    ## Skip header row
    if date_cell.text.strip() == 'Date/Time':
      continue
    if completed_games_only:
      ## If we've found a Find Tickets link, we've passed all completed games
      if 'Find Tickets' in cells[5].text:
        break
    else: ## include_active_games
      ## Need to parse game date to determine if game has kicked off
      game_date = get_date_from_strings(list(date_cell.stripped_strings))
      if not (game_date < pd.Timestamp.now(tz='US/Eastern')):
        ## First game that has not kicked off yet; stop scanning
        break
    games.append(_game_id_from_cells(cells))
  return games
## Unit Test
#generate_game_list()

In [5]:
def build_players(tbl,team):
  """Map player IDs found in a player-name table to their TEAM/NUMBER/NAME.

  The first TR must have the text 'PLAYER'. If the table has any further
  rows, a 'TEAM TOTAL' entry keyed by the team name is included as well.
  Rows without a player-name element are ignored.

  Raises:
    RuntimeError: If the first row's text is not 'PLAYER'.
  """
  name_class = 'pstats-pname-data smt-widget-sm'
  all_rows = tbl.find_all('tr')
  player_heading = all_rows[0]
  if player_heading.text != 'PLAYER':
    raise RuntimeError(f"Unexpected text found in player_heading {player_heading}")
  roster = {}
  for entry in all_rows[1:]:
    ## Team-total entry is created as soon as there is at least one data row
    roster.setdefault(team, {'TEAM': team, 'NUMBER': '', 'NAME': 'TEAM TOTAL'})
    name_cell = entry.find(class_=name_class)
    if name_cell is None:
      continue
    pid = entry.attrs['data-pid']
    if pid in roster:
      ## Already recorded; keep the first occurrence
      continue
    ## PLAYER
    roster[pid] = {
      'TEAM': team,
      'NUMBER': name_cell.find('numb').text,
      'NAME': name_cell.find('nam').text,
    }
  return roster
## Unit Test
#test_html = '<tr><th>PLAYER</th></tr><tr data-pid="22026"><td><div class="pstats-pname-data smt-widget-sm"><numb>10</numb><nam>A. McCarron</nam></div></td></tr>'
#build_players(BeautifulSoup(test_html, "lxml"),'BATTLEHAWKS')

In [11]:
def build_stats(tbl, stat_type, team):
  """Flatten a stats table into four parallel lists, one entry per stat cell.

  Args:
    tbl: Parsed table; its first TR is skipped, every later TR must carry a
      'data-pid' attribute when it has stat cells.
    stat_type: Label recorded alongside every cell of this table.
    team: Value recorded as the player ID for rows whose 'data-pid' is 'TEAM'.

  Returns:
    Tuple (player_ids, types, stats, values) of equally long lists, where
    stats/values come from each cell's 'data-stat-column' and
    'data-stat-value' attributes.

  Raises:
    KeyError: If a row or cell lacks one of the attributes named above.
  """
  plyr_lst = []
  type_lst = []
  stat_lst = []
  valu_lst = []
  for row in tbl.find_all('tr')[1:]:
    ## Children without attrs are text nodes (e.g. whitespace between cells)
    ## and carry no stat data, so they are skipped instead of crashing.
    cells = [child for child in row.children if hasattr(child, 'attrs')]
    if not cells:
      continue
    ## Resolve the ID once per row; team-total rows are keyed by team name
    row_pid = row.attrs['data-pid']
    player_id = team if row_pid == 'TEAM' else row_pid
    for cell in cells:
      plyr_lst.append(player_id)
      type_lst.append(stat_type)
      stat_lst.append(cell.attrs['data-stat-column'])
      valu_lst.append(cell.attrs['data-stat-value'])
  return plyr_lst, type_lst, stat_lst, valu_lst
## Unit Test
#test_html = '<tr><th>C/ATT</th></tr><tr data-pid="22026"><td data-stat-column="C/ATT" data-stat-value="18/26">18/26</td></tr>'
#build_stats(BeautifulSoup(test_html, "lxml"),'PASSING','BATTLEHAWKS')

In [14]:
def player_game_stats(games):
  """Scrape player stats for each game ID and return them as one DataFrame.

  For every ID the PlayerGameStats page is loaded in headless Chrome, the
  visitor and home stat tables are parsed with build_players/build_stats,
  and the results are accumulated across games.

  Args:
    games: Iterable of game ID strings used as the EventId query value.

  Returns:
    DataFrame produced by left-merging the player records (TEAM, NUMBER,
    NAME) with the stat records on PLAYER_ID, sorted, with a fresh index.

  Raises:
    AssertionError: If a team's stat headers and stat tables differ in count.
    Errors from Selenium (e.g. the 15 second wait timing out) propagate.
  """
  ## globals
  options = webdriver.ChromeOptions()
  ## no-sandbox needed if running as a Linux admin
  options.add_argument("--no-sandbox")
  options.add_argument("--disable-extensions")
  options.add_argument("--headless")
  driver = webdriver.Chrome(options=options)
  players = {}
  stats = {'PLAYER_ID':[], 'TYPE':[], 'STAT':[], 'VALUE':[]}

  try:
    ## game loop
    for game_number, game_id in enumerate(games):
      if game_number:
        ## Pause between page loads; nothing to wait for before the first
        ## game or after the last one
        time.sleep(5)
      game = f'https://api.xfl.com/game/w/PlayerGameStats?EventId={game_id}'
      driver.get(game)
      # Wait until stats tables have loaded
      WebDriverWait(driver, timeout=15).until(lambda d: d.find_element(By.CSS_SELECTOR,"div.pstats-pname-data.smt-widget-sm"))
      soup = BeautifulSoup(driver.page_source, "lxml")
      v_team = soup.find(id="pstat-v-show-btn").text
      h_team = soup.find(id="pstat-h-show-btn").text
      for table_id, team_name in (("pstats-v-tbls", v_team), ("pstats-h-tbls", h_team)):
        team_tables = soup.find(id=table_id)
        stat_headers = team_tables.find_all(class_=["smt-fnt-lg"])
        stat_tables = team_tables.find_all(class_=["pstats-tbl-cont"])
        ## Explicit raise (same exception type as before) so the check is
        ## not stripped when Python runs with -O
        if len(stat_headers) != len(stat_tables):
          raise AssertionError(f'stat_headers ({len(stat_headers)}) != stat_tables ({len(stat_tables)})')
        for stat_header, stat_table in zip(stat_headers, stat_tables):
          player_name_table = stat_table.find(class_=["pstats-tbl-pname"])
          players.update(build_players(player_name_table,team_name))
          player_stats_table = stat_table.find(class_=["pstats-tbl-vals"])
          pid, typ, sta, val = build_stats(player_stats_table,stat_header.text,team_name)
          ## extend in place instead of rebuilding each list on every table
          stats['PLAYER_ID'].extend(pid)
          stats['TYPE'].extend(typ)
          stats['STAT'].extend(sta)
          stats['VALUE'].extend(val)
    ## games loop completed
    return pd.merge(left=pd.DataFrame.from_dict(players,orient='index'),
                  right=pd.DataFrame(stats),
                  how='left',
                  left_index=True,right_on='PLAYER_ID',
                  sort=True).reset_index(drop=True)
  finally:
    ## Quit the WebDriver, whether the scrape succeeded or raised
    driver.quit()
## Unit test
#player_game_stats(['2302003-STL-at-SA'])

In [None]:
## Collect the game IDs to scrape (default arguments: completed games only)
xfl_games = generate_game_list()
## Scrape player stats for those games into a single DataFrame
df = player_game_stats(xfl_games)

In [24]:
## Preview five rows; the fixed random_state makes the sample repeatable
df.sample(5,random_state=100)

Unnamed: 0,TEAM,NUMBER,NAME,PLAYER_ID,TYPE,STAT,VALUE
205,BRAHMAS,2.0,T. Bonds,20142,Defensive,FR,0.0
3010,BATTLEHAWKS,,TEAM TOTAL,BATTLEHAWKS,Fumbles,REC,1.0
2429,BATTLEHAWKS,10.0,A. McCarron,22026,Passing,INT,0.0
1337,DEFENDERS,39.0,S. Ramirez,21278,Defensive,TOT,4.0
917,RENEGADES,12.0,A. Killins,20889,Kick Returns,TD,0.0
