In this series of notebooks, I will attempt to do some introductory exploration of various roller derby statistics. We will use the publicly available stats on the FlatTrackStats website. First, I will build a table scraper tool using the BeautifulSoup4 package to parse the stats tables on the website. If not already installed, you will need pandas and BeautifulSoup4 in order to run this notebook. 


In [84]:
    import requests
    import pandas as pd
    import numpy as np
    from bs4 import BeautifulSoup
    from itertools import product
    from urllib.request import urlopen
   
    #First, define a class to parse HTML tables for bouts and players

    class HTMLTableParser:
       
        def parse_url(self, url):
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'lxml')
            return [(self.read_table(table)) for table in soup.find_all('table')]  
        
        def read_table(self, table_tag):
            rowspans = []  # track pending rowspans
            rows = table_tag.find_all('tr')

            # first scan, see how many columns we need
            colcount = 0
            column_names=[]
            for r, row in enumerate(rows):
                cells = row.find_all(['td', 'th'], recursive=False)
                colcount = max(
                    colcount,
                    sum(int(c.get('colspan', 1)) or 1 for c in cells[:-1]) + len(cells[-1:]) + len(rowspans))
                # update rowspan bookkeeping; 0 is a span to the bottom. 
                rowspans += [int(c.get('rowspan', 1)) or len(rows) - r for c in cells]
                rowspans = [s - 1 for s in rowspans if s > 1]

            # it doesn't matter if there are still rowspan numbers 'active'; no extra
            # rows to show in the table means the larger than 1 rowspan numbers in the
            # last table row are ignored.    
            # build an empty matrix for all possible cells
            table = [[None] * colcount for row in rows]


            # fill matrix from row data
            rowspans = {}  # track pending rowspans, column number mapping to count
            for row, row_elem in enumerate(rows):
                span_offset = 0  # how many columns are skipped due to row and colspans 
                for col, cell in enumerate(row_elem.find_all(['td', 'th'], recursive=False)):
                    # adjust for preceding row and colspans
                    col += span_offset
                    while rowspans.get(col, 0):
                        span_offset += 1
                        col += 1

                    # fill table data
                    rowspan = rowspans[col] = int(cell.get('rowspan', 1)) or len(rows) - row
                    colspan = int(cell.get('colspan', 1)) or colcount - col
                    # next column is offset by the colspan
                    span_offset += colspan - 1
                    value = cell.get_text()
                    points = len(cell.find_all("div", {"class": " point"}))
                    pens = cell.find_all("div", {"class": "penalty major"})  
                    if (value == '' or value == '&nbsp') and points != '':
                        value = str(points)
                    if len(pens) > 0:
                        value = ''
                        for pen in pens:
                            value += pen.find_all(text=True, recursive=False)[0] + " "
                    if len(cell.find_all("div", {"class": "lead"}))== 1:
                        if value == "0": value = ""
                        value += "Lead "
                    if len(cell.find_all("div", {"class": "leadloss"}))== 1:
                        if value == "0": value = ""
                        value += "LeadLoss "
                    if len(cell.find_all("div", {"class": "lost"}))== 1:
                        if value == "0": value = ""
                        value += "LeadLoss "                   
                    if len(cell.find_all("div", {"class": "call"}))== 1:
                        if value == "0": value = ""
                        value += "call "
                    if len(cell.find_all("div", {"class": "nopass"}))== 1:
                        if value == "0": value = ""
                        value += "np "    
                    if len(cell.find_all("div", {"class": "starpass"}))== 1:
                        if value == "0": value = ""
                        value += "sp "    
                    for drow, dcol in product(range(rowspan), range(colspan)):
                        try:
                            table[row + drow][col + dcol] = value
                            rowspans[col + dcol] = rowspan
                        except IndexError:
                            # rowspan or colspan outside the confines of the table
                            pass

                # update rowspan bookkeeping
                rowspans = {c: s - 1 for c, s in rowspans.items() if s > 1}
            npt = np.array(table)
            #df = pd.DataFrame(np.array(table), column_names) 
            return table

        def parse_url_todf(self, url):
            tables = self.parse_url(url)
            dfs = []
            if "combos" in url:
                for table in tables:
                    headers = table.pop(0)
                    df = pd.DataFrame(np.array(table), columns=np.array(headers))    
                    dfs.append(df)
            if "teams" in url:
                for table in tables:
                    headers = table.pop(0)
                    df = pd.DataFrame(np.array(table), columns=np.array(headers))    
                    dfs.append(df)
            else:
                headers = tables[0]
                headersnew = []
                for header in headers:
                    headernew = [i for i in header if i] 
                    headersnew.append(headernew)
                for i in range(len(tables)-1):
                    df = pd.DataFrame(np.array(tables[i+1]), columns=headersnew[i])    
                    dfs.append(df)
                
            return dfs    

In [85]:
def GetAllBouts(teamID):
    teamID = str(3393)
    npages = 0
    links=[]
    
    base_url = "http://flattrackstats.com/teams/"+teamID+"/bouts"
    text = urlopen(base_url).read()
    base_soup = BeautifulSoup(text)

    for listitem in base_soup.findAll('li', class_="pager-last last"):
        npages = int(listitem.findAll('a')[0]['href'][-1])
    
    
    for page in range(0,npages):
        url = "http://flattrackstats.com/teams/3393/bouts?page="+str(page)
        text = urlopen(url).read()
        soup = BeautifulSoup(text)
        for link in soup.findAll('a', class_="boutlink has-stats"):
            if "node" not in link['href']:
                linkname = link['href']
                linksize = len(linkname)
                linkstub = linkname[:linksize-8]
                links.append(linkstub)
    
    return links

[   Jam                 Jammer            Lead Pass2  3  4  5  6  7  8 Events  \
0    0                      0               0     0  0  0  0  0  0  0      0   
1    1          Gorges Curves               0     0  0  0  0  0  0  0    np    
2    1       Racer McChaseHer           Lead      0  0  0  0  0  0  0  call    
3    2          Gorges Curves           Lead      0  0  0  0  0  0  0  call    
4    2  Sarah Hipel (Killb...       LeadLoss      0  0  0  0  0  0  0    np    
..  ..                    ...             ...   ... .. .. .. .. .. ..    ...   
82  20  Meryl Slaughterbur...           Lead      0  0  0  0  0  0  0  call    
83  21  Zee "Loraine Acid"...               0     0  0  0  0  0  0  0      0   
84  21  Sarah Hipel (Killb...  Lead LeadLoss      0  0  0  0  0  0  0      0   
85  22             Pain Train               0     0  0  0  0  0  0  0      0   
86  22          Swift Justice           Lead      0  0  0  0  0  0  0  call    

   JamScore TotalScore  0  
0         

In [115]:
def GetAllScores(teamID):
    scoreframes = []
    teamID = str(3393)
    bouts = GetAllBouts(teamID)
    hp = HTMLTableParser()
    for bout in bouts:
        jampage = "http://flattrackstats.com"+bout+"jams"
        scores = hp.parse_url_todf(jampage)[0].iloc[1:,:-1]
        del scores['0']
        scoreframes.append(scores)
    allScores = pd.concat(scoreframes)
    print(allScores)
    return(allScores)

In [116]:
def GetAllLineups(teamID):
    lineupframes = []
    teamID = str(3393)
    bouts = GetAllBouts(teamID)
    hp = HTMLTableParser()
    for bout in bouts:
        jampage = "http://flattrackstats.com"+bout+"jams"
        lineups = hp.parse_url_todf(jampage)[1].iloc[1:,:-1]
        del lineups['0']
        lineupframes.append(lineups)
    allLineups = pd.concat(lineupframes)
    print(allLineups)
    return(allLineups)

In [117]:
def GetAllPenalties(teamID):
    penframes = []
    teamID = str(3393)
    bouts = GetAllBouts(teamID)
    hp = HTMLTableParser()
    for bout in bouts:
        jampage = "http://flattrackstats.com"+bout+"jams"
        pens = hp.parse_url_todf(jampage)[2].iloc[1:,:-1]
        del pens['0']
        penframes.append(scores)
    allPens = pd.concat(penframes)
    print(allPens)
    return(allPens)

In [114]:
GetAllLineups(str(3393))

   Jam                 Jammer            Lead              Blocker 1  \
1    1          Gorges Curves               0  Lora Wayman (Outa ...   
2    1       Racer McChaseHer           Lead              Julie Ruin   
3    2          Gorges Curves           Lead   Lora Wayman (Outa ...   
4    2  Sarah Hipel (Killb...       LeadLoss                  Perish   
5    3  Zee "Loraine Acid"...           Lead             Slammerhead   
..  ..                    ...             ...                    ...   
70  15             HellionBoi               0            Hell Camino   
71  16  Jenny the Jet Rodr...  Lead LeadLoss           LeeAnn Crimes   
72  16             Foxy Force               0            Hell Camino   
73  17  Jenny the Jet Rodr...               0         Jennifer Smith   
74  17     Kitty Liquorbottom  Lead LeadLoss             Hell Camino   

             Blocker 2              Blocker 3                  Pivot JamScore  \
1          Slammerhead              Bigg Rigg         

Unnamed: 0,Jam,Jammer,Lead,Blocker 1,Blocker 2,Blocker 3,Pivot,JamScore,TotalScore
1,1,Gorges Curves,0,Lora Wayman (Outa ...,Slammerhead,Bigg Rigg,Kelsey Khaos,0,0
2,1,Racer McChaseHer,Lead,Julie Ruin,Lazer Beam,Kelly Genei,Lollypops Em Hard�...,10,10
3,2,Gorges Curves,Lead,Lora Wayman (Outa ...,Betty T. KayO,Chainsaw,Kelsey Khaos,14,14
4,2,Sarah Hipel (Killb...,LeadLoss,Perish,Oi! Rish,Meryl Slaughterbur...,Cookie Rumble,0,10
5,3,"Zee ""Loraine Acid""...",Lead,Slammerhead,Betty T. KayO,Bigg Rigg,Chainsaw,4,18
...,...,...,...,...,...,...,...,...,...
70,15,HellionBoi,0,Hell Camino,Bratislava Bruiser,Amy Spears,kill basa,3,36
71,16,Jenny the Jet Rodr...,Lead LeadLoss,LeeAnn Crimes,Lady Fury,Fully Addomatic,Slammylou Harris,2,158
72,16,Foxy Force,0,Hell Camino,Bratislava Bruiser,Amy Spears,kill basa,0,36
73,17,Jenny the Jet Rodr...,0,Jennifer Smith,Jersey Jackhammer,Fully Addomatic,E-Money,8,166
