## Import Libraries

In [605]:
# The library Beautiful Soup will be used for the HTML parsing
from bs4 import BeautifulSoup

# General libraries
import os

## Rankings class - Handles a single HTML document

In [606]:
class Rankings:
    """Load one season's rankings page and collect per-event statistics.

    The HTML file is read from ``<year>/<html_fn>``. Every
    ``table.summarytable`` found in it is handed to ``Table`` together with
    the text of the ``h3.summarytitle`` heading at the same position.

    ``helmNames`` is the caller's list of helm names; it is passed through
    unchanged to each ``Table``. ``stats`` is a fresh dict per instance,
    keyed by event name.
    """

    def __init__(self, year, html_fn, helmNames):
        self.year = year
        self.html_fn = html_fn
        self.helmNames = helmNames
        self.stats = {}

        # Construction does all the work: read the file, then walk the tables.
        self.parse()
        self.processTables()

    def parse(self):
        """Read the HTML file into ``self.html`` and parse it into ``self.soup``."""
        source_path = os.path.join(self.year, self.html_fn)
        with open(source_path, "r") as f:
            self.html = f.read()
        self.soup = BeautifulSoup(self.html, "html.parser")

    def processTables(self):
        """Build a ``Table`` for every summary table in the document.

        Titles and tables are paired by position, so a document with more
        tables than titles raises ``IndexError``.
        """
        headings = self.soup.find_all("h3", "summarytitle")
        tables = self.soup.find_all("table", "summarytable")
        for position, summaryTable in enumerate(tables):
            # Table does its processing in its constructor and records its
            # results in self.stats, so the instance itself is not kept.
            Table(headings[position].string, summaryTable, self.helmNames, self.stats)

    def outputStats(self, stats_fn):
        """Write every collected event's statistics to ``<year>/<stats_fn>``."""
        target_path = os.path.join(self.year, stats_fn)
        with open(target_path, "w") as f:
            for event in self.stats.values():
                event.outputStats(f)

## Table class - Handles a single HTML table

In [607]:
class Table:
    """Parse one HTML rankings table and record who attended each event.

    The table is expected to expose the Beautiful Soup ``Tag`` interface:
    ``find_all("col")`` / ``find_all("th")`` for the column descriptions and
    ``tbody.find_all("tr")`` for the competitor rows.

    Parameters:
        title: heading text for the table (stored, not otherwise used here).
        table: the table element to parse.
        helmNames: shared list of helm names, passed on to each new ``Event``.
        stats: shared dict mapping event name -> ``Event``; missing events
            are created and every attendance is recorded through it.

    All processing happens during construction.
    """

    # Age-group codes that are reported; anything else is recorded as None.
    _AGE_GROUPS = ('U17', 'U20', 'M', 'V')

    def __init__(self, title, table, helmNames, stats):
        self.title = title
        self.table = table
        self.helmNames = helmNames
        self.stats = stats

        self.process()

    def process(self):
        """Run the parsing steps in the order they depend on each other."""
        self.processHeadings()
        self.storeColumnIndices()
        self.storeEvents()
        self.generateStats()

    def processHeadings(self):
        """Collect the first CSS class of every <col> and the text of every <th>."""
        self.colClasses = [col["class"][0] for col in self.table.find_all("col")]
        self.colHeaders = [th.string for th in self.table.find_all("th")]

    def storeColumnIndices(self):
        """Locate the fixed columns by their <col> class.

        Raises:
            ValueError: if a mandatory column (rank, fleet, helmname,
                helmagegroup) is missing.
        """
        self.rankIdx = self.colClasses.index("rank")
        self.fleetIdx = self.colClasses.index("fleet")
        self.helmNameIdx = self.colClasses.index("helmname")
        self.helmAgeIdx = self.colClasses.index("helmagegroup")
        # The sex column is optional; only a missing column is tolerated.
        try:
            self.helmSexIdx = self.colClasses.index("helmsex")
        except ValueError:
            self.helmSexIdx = None

    def storeEvents(self):
        """Record every "race" column and make sure it has an Event in stats."""
        self.eventNames = []
        self.eventIdx = {}
        for i, colClass in enumerate(self.colClasses):
            if colClass != "race":
                continue
            eventName = self.colHeaders[i]
            self.eventNames.append(eventName)
            self.eventIdx[eventName] = i

            # `in` works on Python 2 and 3; dict.has_key was removed in 3.
            if eventName not in self.stats:
                self.stats[eventName] = Event(eventName, self.helmNames)

    @staticmethod
    def _normaliseAge(helmAge):
        """Map a raw age cell onto 'U17', 'U20', 'M', 'V' or None."""
        if helmAge is None:
            # Empty cell: no age group can be reported.
            return None
        # Kids have a year of birth (20xx) rather than a U17 code
        if len(helmAge) == 4 and helmAge[:2] == '20':
            return 'U17'
        # Super veterans are grouped with Veterans
        if helmAge == 'SV':
            return 'V'
        if helmAge in Table._AGE_GROUPS:
            return helmAge
        return None

    @staticmethod
    def _normaliseSex(helmSex):
        """Map a raw sex cell onto 'M', 'F' or 'U' (unknown)."""
        # Raceboard fleet has lightweight (LW) and heavyweight (HW) for men
        if helmSex in ('LW', 'HW'):
            return 'M'
        if helmSex in ('M', 'F'):
            return helmSex
        return 'U'

    @staticmethod
    def _attended(result):
        """Return True unless the result cell marks the helm as not present.

        Cells containing 'DNC' or 'ABS' do not count as attendance; every
        other non-empty value (including 'ATT' and 'RDG') does. An empty
        cell (``None``) is treated as not attended.
        """
        if result is None:
            return False
        return 'DNC' not in result and 'ABS' not in result

    def generateStats(self):
        """Walk every competitor row and register attendance per event."""
        for row in self.table.tbody.find_all("tr"):
            # Query the cells once per row and reuse them for every column.
            children = row.findChildren("td")
            helmName = children[self.helmNameIdx].string
            helmAge = self._normaliseAge(children[self.helmAgeIdx].string)

            # Helm sex is not always known (e.g. Inland rankings)
            if self.helmSexIdx is not None:
                helmSex = self._normaliseSex(children[self.helmSexIdx].string)
            else:
                helmSex = 'U'

            # Process all events for the competitor
            for eventName in self.eventNames:
                result = children[self.eventIdx[eventName]].string
                if self._attended(result):
                    self.stats[eventName].addAttendee(helmName, helmAge, helmSex)

    def outputStats(self, f=None):
        """Write the statistics of this table's events to ``f``.

        Parameters:
            f: writable text file object; defaults to ``sys.stdout``.
        """
        if f is None:
            import sys
            f = sys.stdout
        for eventName in self.eventNames:
            self.stats[eventName].outputStats(f)

## Event class - Keeps count of competitors at a single event / venue

In [608]:
class Event:
    """Attendance tallies for a single event / venue.

    ``helmNames`` is a caller-owned list of helm names already seen; a helm
    who is not yet in it is counted as a newcomer and appended to it.
    """

    # Which counter attribute each sex / age code increments.
    _SEX_COUNTERS = {'M': 'males', 'F': 'females', 'U': 'unknowns'}
    _AGE_COUNTERS = {'U17': 'u17s', 'U20': 'u20s', 'M': 'masters', 'V': 'veterans'}

    def __init__(self, title, helmNames):
        self.title = title
        self.helmNames = helmNames

        # Headline counts
        self.competitors = 0
        self.newcomers = 0
        # Breakdown by sex
        self.males = 0
        self.females = 0
        self.unknowns = 0
        # Breakdown by age group
        self.u17s = 0
        self.u20s = 0
        self.masters = 0
        self.veterans = 0

    def addAttendee(self, helmName, helmAge, helmSex):
        """Count one competitor; unrecognised age or sex codes are not tallied."""
        self.competitors += 1

        if helmName not in self.helmNames:
            self.helmNames.append(helmName)
            self.newcomers += 1

        lookups = ((helmSex, self._SEX_COUNTERS), (helmAge, self._AGE_COUNTERS))
        for code, counters in lookups:
            counter = counters.get(code)
            if counter is not None:
                setattr(self, counter, getattr(self, counter) + 1)

    def outputStats(self, f):
        """Write the tallies to ``f`` as tab-separated ``label<TAB>value`` lines.

        The male / female counts are replaced by '?' unless the males
        outnumber the competitors of unknown sex. A blank line ends the
        record.
        """
        if self.males > self.unknowns:
            maleText, femaleText = str(self.males), str(self.females)
        else:
            maleText, femaleText = '?', '?'

        report = (
            ('Venue', self.title),
            ('Total Competitors', str(self.competitors)),
            ('Newcomers', str(self.newcomers)),
            ('No of Males', maleText),
            ('No of Females', femaleText),
            ('Under 17', str(self.u17s)),
            ('Under 20', str(self.u20s)),
            ('Master', str(self.masters)),
            ('Veteran', str(self.veterans)),
        )
        for label, value in report:
            f.write(label + '\t' + value + '\n')
        f.write('\n')


## Inland Events Summary

In [609]:
fn = "inland-rankings.htm"
helmNames = []

# The 2015-2017 pages are parsed only for their effect on the shared
# helmNames list: each Rankings object keeps its own stats, and only the
# 2018 object is written out below.
Rankings("2015", fn, helmNames)
Rankings("2016", fn, helmNames)
Rankings("2017", fn, helmNames)
rankings = Rankings("2018", fn, helmNames)

# Written to 2018/inland-summary.csv (outputStats joins year and filename).
rankings.outputStats("inland-summary.csv")

## Cup Events Summary

In [610]:
fn = "nat-champs-ranking.htm"
helmNames = []

# The 2015-2017 pages are parsed only for their effect on the shared
# helmNames list: each Rankings object keeps its own stats, and only the
# 2018 object is written out below.
Rankings("2015", fn, helmNames)
Rankings("2016", fn, helmNames)
Rankings("2017", fn, helmNames)
rankings = Rankings("2018", fn, helmNames)

# Written to 2018/nat-champs-summary.csv (outputStats joins year and filename).
rankings.outputStats("nat-champs-summary.csv")

## Slalom Events Summary

In [611]:
fn = "slalom-rankings.htm"
helmNames = []

# The 2015-2017 pages are parsed only for their effect on the shared
# helmNames list: each Rankings object keeps its own stats, and only the
# 2018 object is written out below.
Rankings("2015", fn, helmNames)
Rankings("2016", fn, helmNames)
Rankings("2017", fn, helmNames)
rankings = Rankings("2018", fn, helmNames)

# Written to 2018/slalom-summary.csv (outputStats joins year and filename).
rankings.outputStats("slalom-summary.csv")