In [1]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import pandas as pd
import statistics
import matplotlib.pyplot as plt

In [2]:
base_url = "https://www.thegreyhoundrecorder.com.au"
results_url = "https://www.thegreyhoundrecorder.com.au/results/"

In [3]:
# TODO: organise the fetched pages by track? record meeting dates?
def get_pages_soups(urls: list[str]):
    """Fetch several pages and parse each one into a BeautifulSoup document.

    Delegates to ``get_page_soup`` so the request headers and the parsing
    step live in one place instead of being duplicated here.

    Args:
        urls: Page URLs to download.

    Returns:
        A list with one parsed document per URL, in the same order as
        ``urls`` (empty when ``urls`` is empty).

    Raises:
        Whatever ``get_page_soup`` raises for a URL; the first failure aborts
        the whole batch.
    """
    return [get_page_soup(url) for url in urls]

def get_page_soup(url: str):
    """Download one page and parse it into a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``BeautifulSoup`` tree built from the response body.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for HTTP
            error statuses).
        UnicodeDecodeError: If the body cannot be decoded as UTF-8.
    """
    # A browser-like User-Agent is sent with the request.
    # NOTE(review): presumably the site rejects urllib's default agent - confirm.
    request = Request(url, headers={"User-Agent": "Mozilla/5.0"})

    with urlopen(request) as webpage:
        content = webpage.read().decode()

    # Name the parser explicitly. Without it bs4 picks whichever parser is
    # installed (and warns), so the same page can yield a different tree on
    # another machine. "html.parser" ships with Python, so no extra dependency.
    return BeautifulSoup(content, "html.parser")

In [4]:
# race refers to dogContainer elements (one for each race in a meeting)
def extract_distance(race):
    """Return the race distance taken from the race title.

    The title is the text of the element ``race.find`` returns for the
    ``font14`` class. Its last whitespace-separated token is returned with
    any surrounding parentheses removed, e.g. ``"(450m)"`` -> ``"450m"``.
    """
    title_node = race.find(attrs={"class": "font14"})
    last_token = title_node.text.split()[-1]
    return last_token.strip("()")

def extract_race_grade(race):
    """Return the grade label shown in the race header.

    Takes the second ``div`` matched by ``.runnerFormHeader div``, keeps the
    text before the first ``$`` and drops that text's last three characters.
    """
    header_cells = race.select(".runnerFormHeader div")
    before_prize, _, _ = header_cells[1].text.partition('$')
    # NOTE(review): the three trimmed characters are presumably a " - "
    # separator in front of the prize money - confirm against the page markup.
    return before_prize[:-3]

def extract_fastest_split(race):
    """Return the fastest split time shown in the race header, if present.

    Reads the third ``div`` matched by ``.runnerFormHeader div``, takes its
    second whitespace-separated token, strips commas from both ends and
    converts the remainder to ``float``.

    Returns:
        The split as a float, or ``None`` when the header has no third div,
        the div has no second token, or the token is not a number.
    """
    race_header = race.select(".runnerFormHeader div")
    try:
        return float(race_header[2].text.split()[1].strip(','))
    except (IndexError, ValueError):
        # Only "value absent" (IndexError) and "value unparsable" (ValueError)
        # count as a missing split. A bare except would also swallow
        # KeyboardInterrupt / SystemExit and hide unrelated bugs.
        return None

def extract_win_time(race):
    """Return the winning time of a race as a float.

    Uses the second ``div`` matched by
    ``.runnerContainer .runnerSubDetails div``: the text is stripped, its
    first four characters are skipped and the rest is parsed with ``float``.

    Raises:
        IndexError: If fewer than two matching divs exist.
        ValueError: If the remaining text is not a number.
    """
    detail_cells = race.select(".runnerContainer .runnerSubDetails div")
    cell_text = detail_cells[1].text.strip()
    # NOTE(review): the four skipped characters are presumably a fixed label
    # in front of the time - confirm against the page markup.
    return float(cell_text[4:])

In [5]:
def get_races(soup):
    """Return every element of ``soup`` matching the ``.dogContainer`` selector."""
    race_containers = soup.select(".dogContainer")
    return race_containers

In [6]:
# scrapes times organised by distance and grade
def scrape_times_by_grade(races):
    """Collect win times and fastest splits, grouped by distance then grade.

    Args:
        races: Iterable of race elements accepted by the ``extract_*`` helpers.

    Returns:
        A ``(win_times, split_times)`` pair of nested dicts shaped
        ``{distance: {grade: [float, ...]}}``. Every distance/grade seen gets
        an entry in both dicts, even when its list stays empty.
    """
    win_times = {}
    split_times = {}

    for race in races:
        distance = extract_distance(race)
        grade = extract_race_grade(race)
        fastest_split = extract_fastest_split(race)
        win_time = extract_win_time(race)

        # Create the distance/grade buckets on first sight in both dicts.
        win_bucket = win_times.setdefault(distance, {}).setdefault(grade, [])
        split_bucket = split_times.setdefault(distance, {}).setdefault(grade, [])

        # A win time equal to 0 is not recorded.
        if win_time != 0:
            win_bucket.append(win_time)

        if fastest_split is not None:
            split_bucket.append(fastest_split)

    return win_times, split_times

In [7]:
def get_single_track_results_links(track: str):
    """Return absolute URLs of the result pages listed for one track.

    Args:
        track: Track name as it appears in the results URL path.

    Returns:
        One URL per anchor matched by ``.results.table-striped a`` on the
        track's results page, each built as ``base_url`` plus the anchor's
        ``href``, in page order.
    """
    track_page_url = "".join(
        ("https://www.thegreyhoundrecorder.com.au/results/", track, '/')
    )
    soup = get_page_soup(track_page_url)

    track_urls = []
    for link_tag in soup.select(".results.table-striped a"):
        track_urls.append(base_url + link_tag["href"])

    return track_urls

def get_result_page_links_by_track():
    """Return the result-page URLs of every track listed on the results page.

    Fetches the notebook-level ``results_url`` page, then calls
    ``get_single_track_results_links`` once per anchor matched by
    ``.row.resultTracks a``.

    Returns:
        A dict mapping each anchor's text to the list of result-page URLs
        found for that track. If two anchors share the same text, the later
        one wins.
    """
    # The function used to re-declare ``base_url`` (never read here) and
    # ``results_url`` (same value as the notebook constant); both shadowing
    # locals are gone in favour of the shared constant.
    results_page_soup = get_page_soup(results_url)

    # all results by track
    results_by_track = results_page_soup.select(".row.resultTracks a")

    # NOTE(review): the anchor's visible text is passed on as the URL path
    # segment. This presumably only works when the text equals the URL slug
    # (no spaces or capitals) - verify against the anchors' href values.
    return {
        results_tag.text: get_single_track_results_links(results_tag.text)
        for results_tag in results_by_track
    }

In [8]:
# combine the times organised by grade
# for a distance
# e.g. {'390m': {'Maiden': [22.63], ...}}
def combine_grade_time_dicts(times_by_grade1, times_by_grade2):
    """Merge two ``{grade: [times]}`` dicts without modifying either input.

    Args:
        times_by_grade1: First mapping; its times come first for shared grades.
        times_by_grade2: Second mapping.

    Returns:
        A new dict holding every grade from both inputs. Shared grades map to
        dict1's times followed by dict2's. Keys are ordered as in
        ``times_by_grade2 | times_by_grade1``: dict2's grades first, then the
        grades only dict1 has. All lists are fresh copies.
    """
    # Start from copies of dict2's lists so the result never aliases an input.
    merged = {grade: list(times) for grade, times in times_by_grade2.items()}

    for grade, times in times_by_grade1.items():
        # The old in-place ``+=`` extended dict1's own lists, silently
        # changing the caller's data; build a new list instead.
        merged[grade] = list(times) + merged.get(grade, [])

    return merged


def combine_distance_time_dicts(times_by_distance1, times_by_distance2):
    """Merge two ``{distance: {grade: [times]}}`` dicts without modifying them.

    Args:
        times_by_distance1: First mapping; its times come first wherever a
            distance and grade are shared.
        times_by_distance2: Second mapping.

    Returns:
        A new nested dict covering every distance from both inputs. Distances
        are ordered as in ``times_by_distance2 | times_by_distance1`` and each
        per-distance dict is produced by ``combine_grade_time_dicts``.
    """
    # Merging with an empty dict yields an independent copy of each entry.
    merged = {
        distance: combine_grade_time_dicts({}, by_grade)
        for distance, by_grade in times_by_distance2.items()
    }

    for distance, by_grade in times_by_distance1.items():
        merged[distance] = combine_grade_time_dicts(
            by_grade,
            times_by_distance2.get(distance, {}),
        )

    return merged

In [9]:
def get_stdev(values):
    """Return the sample standard deviation of ``values``.

    ``None`` is returned for fewer than two values, where
    ``statistics.stdev`` has nothing to compute.
    """
    has_enough_values = len(values) >= 2
    return statistics.stdev(values) if has_enough_values else None

def get_median(values):
    """Return the median of ``values``, or ``None`` for fewer than two values.

    NOTE(review): a single value also yields ``None`` even though its median
    is defined - presumably so that no standard is reported from one race;
    confirm this is intended.
    """
    if len(values) < 2:
        return None
    return statistics.median(values)

def get_mean(values):
    """Return the float mean of ``values``, or ``None`` for fewer than two values.

    NOTE(review): a single value also yields ``None`` even though its mean is
    defined - presumably so that no standard is reported from one race;
    confirm this is intended.
    """
    if len(values) < 2:
        return None
    return statistics.fmean(values)

def get_times(meet_url: str):
    """Scrape one meeting page and return its times grouped by distance and grade.

    Args:
        meet_url: URL of a meeting's result page.

    Returns:
        The ``(win_times, fastest_split_times)`` pair produced by
        ``scrape_times_by_grade`` for the races found on that page.
    """
    meeting_races = get_races(get_page_soup(meet_url))
    return scrape_times_by_grade(meeting_races)

def _summarise_times(times_by_distance):
    """Turn ``{distance: {grade: [times]}}`` into per-grade mean/stdev/median dicts."""
    return {
        dist: {
            grade: {
                "mean": get_mean(times),
                "stdev": get_stdev(times),
                "median": get_median(times),
            }
            for grade, times in times_by_grade.items()
        }
        for dist, times_by_grade in times_by_distance.items()
    }


def get_standard_times_single_track(track: str, n_meets: int = 20):
    """Compute summary win and fastest-split times for one track.

    Args:
        track: Track name as it appears in the results URL path.
        n_meets: How many result links to use, counted from the start of the
            list returned by ``get_single_track_results_links``. Defaults to
            20, the value that used to be hard-coded. ``None`` uses every link.

    Returns:
        A ``(standard_times, standard_sect_times)`` pair of nested dicts
        shaped ``{distance: {grade: {"mean": ..., "stdev": ..., "median": ...}}}``.
        Each statistic is ``None`` where its helper returns ``None``.
    """
    win_times = {}
    sect_times = {}

    # NOTE(review): the first links are presumably the most recent meetings -
    # confirm the ordering of the track's results page.
    for meet_url in get_single_track_results_links(track)[:n_meets]:
        meet_win_times, meet_sect_times = get_times(meet_url)
        win_times = combine_distance_time_dicts(win_times, meet_win_times)
        sect_times = combine_distance_time_dicts(sect_times, meet_sect_times)

    # One shared helper replaces the two copy-pasted summary loops.
    return _summarise_times(win_times), _summarise_times(sect_times)

In [10]:
# Accumulate the raw {distance: {grade: [times]}} lists (no summary
# statistics) for Ballarat from the first 20 result links.
# NOTE(review): the following cell rebinds standard_times and
# standard_sect_times from get_standard_times_single_track("ballarat"), which
# repeats these fetches. This cell looks redundant unless something else
# reads `results`, `s_times` or `sect_times` - confirm before removing it.
standard_times = {}
standard_sect_times = {}
results = get_single_track_results_links("ballarat")

# get times from last 20 meets
for r in results[:20]:
    # each meeting yields (win times, fastest-split times) by distance and grade
    s_times, sect_times = get_times(r)
    standard_times = combine_distance_time_dicts(standard_times, s_times)
    standard_sect_times = combine_distance_time_dicts(standard_sect_times, sect_times)

In [11]:
# Summary statistics (mean / stdev / median per distance and grade) of win
# times and fastest-split times for Ballarat; rebinds both names.
standard_times, standard_sect_times = get_standard_times_single_track("ballarat")

In [12]:
# avg. win and sect times formatted in table:
# one column per distance, one row per grade.
# NOTE: each cell holds the whole {"mean", "stdev", "median"} dict rather than
# a single number; grades not run at a distance are left empty (NaN).
std_time_df = pd.DataFrame(standard_times)
std_stime_df = pd.DataFrame(standard_sect_times)

In [13]:
# Display the win-time table as the cell's output.
std_time_df

Unnamed: 0,450m,390m,545m,660m
Maiden,"{'mean': 25.688333333333333, 'stdev': 0.293422...","{'mean': 22.314166666666665, 'stdev': 0.171647...",,
Restricted Win Final,"{'mean': None, 'stdev': None, 'median': None}",,"{'mean': None, 'stdev': None, 'median': None}",
Mixed 4/5,"{'mean': 25.3125, 'stdev': 0.22243779226689747...","{'mean': 22.076363636363638, 'stdev': 0.077623...","{'mean': 31.104000000000003, 'stdev': 0.341730...",
Grade 5,"{'mean': 25.490526315789474, 'stdev': 0.266363...","{'mean': 22.229310344827585, 'stdev': 0.183106...","{'mean': 31.105, 'stdev': 0.2906372309254267, ...",
Mixed 6/7,"{'mean': 25.506, 'stdev': 0.22858015856344355,...","{'mean': 22.405714285714286, 'stdev': 0.119980...",,
Restricted Win,"{'mean': 25.673333333333336, 'stdev': 0.015275...","{'mean': 22.14, 'stdev': 0.20880613017821065, ...",,
Tier 3 - Maiden,"{'mean': 25.966, 'stdev': 0.33080205561634585,...","{'mean': 22.517000000000003, 'stdev': 0.310127...",,
Tier 3 - Restricted Win,"{'mean': 25.78833333333333, 'stdev': 0.1788202...","{'mean': 22.64, 'stdev': 0.2651414716712566, '...",,
Grade 5 T3,"{'mean': 25.84, 'stdev': 0.2775718895100034, '...","{'mean': 22.521818181818183, 'stdev': 0.188771...","{'mean': 31.6425, 'stdev': 0.4935838327984406,...",
Grade 6,"{'mean': None, 'stdev': None, 'median': None}","{'mean': None, 'stdev': None, 'median': None}",,
