In [2]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import pandas as pds

In [3]:
# Site root; hrefs scraped from pages are appended to this to form full URLs.
base_url = "https://www.thegreyhoundrecorder.com.au"
# Results index page, derived from the site root.
results_url = base_url + "/results/"

In [4]:
# organise by tracks? record dates?
def get_pages_soups(urls: list[str]):
    """Download and parse every page in ``urls``.

    Each URL is fetched through ``get_page_soup`` so the request headers and
    parsing logic live in one place instead of being duplicated here.

    Args:
        urls: Page URLs to fetch, processed sequentially in the given order.

    Returns:
        A list of parsed pages (``BeautifulSoup`` objects), one per URL, in
        the same order as ``urls``. An empty ``urls`` gives an empty list.
    """
    return [get_page_soup(url) for url in urls]

def get_page_soup(url: str):
    """Fetch ``url`` and return its HTML parsed as a ``BeautifulSoup`` object.

    Args:
        url: Address of the page to download.

    Returns:
        The parsed page.

    Raises:
        urllib.error.URLError: If the request fails (raised by ``urlopen``).
    """
    # A browser-like User-Agent is sent with every request.
    request = Request(url, headers={"User-Agent": "Mozilla/5.0"})

    with urlopen(request) as webpage:
        # Honour the charset declared by the server; fall back to UTF-8 when
        # the response does not declare one.
        charset = webpage.headers.get_content_charset() or "utf-8"
        content = webpage.read().decode(charset, errors="replace")

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's "no parser was
    # explicitly specified" warning).
    return BeautifulSoup(content, "html.parser")

In [5]:
# race refers to dogContainer elements (one for each race in a meeting)
def extract_distance(race):
    """Return the race distance label taken from the race title.

    The title is the text of the first element with class ``font14``; its
    last whitespace-separated token is returned with any surrounding
    parentheses removed (e.g. ``"(390m)"`` becomes ``"390m"``).
    """
    title_tag = race.find(attrs={"class": "font14"})
    last_token = title_tag.text.split()[-1]
    return last_token.strip("()")

def extract_race_grade(race):
    """Return the grade label from the second ``.runnerFormHeader`` div.

    Everything before the first ``'$'`` in that div's text is kept, and the
    last three characters of that prefix are then dropped.
    """
    header_divs = race.select(".runnerFormHeader div")
    before_dollar, _, _ = header_divs[1].text.partition('$')
    return before_dollar[:-3]

def extract_fastest_split(race):
    """Return the fastest split time from the race header, or ``None``.

    The value is the second whitespace-separated token of the third
    ``.runnerFormHeader`` div, with any leading/trailing commas stripped.

    Returns:
        The split as a ``float``, or ``None`` when the header has fewer than
        three divs, the div has fewer than two tokens, or the token is not a
        valid number.
    """
    race_header = race.select(".runnerFormHeader div")
    try:
        return float(race_header[2].text.split()[1].strip(','))
    except (IndexError, ValueError):
        # Only a missing or malformed split is treated as "no split". Unlike a
        # bare ``except``, this does not swallow KeyboardInterrupt or hide
        # unrelated programming errors.
        return None

def extract_win_time(race):
    """Return the time read from the second runner sub-detail div as a float.

    The div's text is stripped of surrounding whitespace, its first four
    characters are skipped, and the remainder is converted with ``float``.
    """
    sub_detail_divs = race.select(".runnerContainer .runnerSubDetails div")
    stripped_text = sub_detail_divs[1].text.strip()
    return float(stripped_text[4:])

In [6]:
def get_races(soup):
    """Return every ``.dogContainer`` element found in ``soup``."""
    race_containers = soup.select(".dogContainer")
    return race_containers

In [7]:
# scrapes times organised by distance only
def scrape_times(races):
    """Collect winning times and fastest splits grouped by distance.

    Args:
        races: Iterable of race elements accepted by the ``extract_*`` helpers.

    Returns:
        A pair ``(win_times, split_times)`` of dicts mapping each distance to
        a list with one entry per race. A split that could not be extracted
        is stored as ``None``, so both lists for a distance have equal length.
    """
    win_times_by_distance = {}
    split_times_by_distance = {}

    for race in races:
        distance = extract_distance(race)
        fastest_split = extract_fastest_split(race)
        win_time = extract_win_time(race)

        # setdefault creates the per-distance list on first sight.
        win_times_by_distance.setdefault(distance, []).append(win_time)
        split_times_by_distance.setdefault(distance, []).append(fastest_split)

    return win_times_by_distance, split_times_by_distance

# scrapes times organised by distance and grade
def scrape_times_by_grade(races):
    """Collect winning times and fastest splits grouped by distance and grade.

    Args:
        races: Iterable of race elements accepted by the ``extract_*`` helpers.

    Returns:
        A pair ``(win_times, split_times)`` of nested dicts shaped
        ``{distance: {grade: [times]}}``. Every race contributes a winning
        time; a split is recorded only when one could be extracted, but the
        grade's (possibly empty) split list is always created.
    """
    race_time_dict = {}
    sect_time_dict = {}

    for race in races:
        distance = extract_distance(race)
        grade = extract_race_grade(race)
        fastest_split = extract_fastest_split(race)
        win_time = extract_win_time(race)

        # Create the distance/grade buckets in both dicts on first sight.
        grade_win_times = race_time_dict.setdefault(distance, {}).setdefault(grade, [])
        grade_split_times = sect_time_dict.setdefault(distance, {}).setdefault(grade, [])

        grade_win_times.append(win_time)

        # Identity comparison is the idiomatic (PEP 8) way to test for None.
        if fastest_split is not None:
            grade_split_times.append(fastest_split)

    return race_time_dict, sect_time_dict

In [8]:
def get_result_page_soups_by_track():
    """Download and parse every result page of every track.

    The result-page URLs are collected by ``get_result_page_links_by_track``
    (rather than repeating the same scraping loop here) and each URL list is
    then fetched with ``get_pages_soups``.

    Returns:
        A dict ``{track_name: [parsed result pages]}``.
    """
    track_urls = get_result_page_links_by_track()

    return {track: get_pages_soups(urls) for track, urls in track_urls.items()}

def get_result_page_links_by_track():
    """Collect the result-page URLs listed for every track.

    Uses the module-level ``base_url`` and ``results_url`` constants instead
    of private copies, so the site address is defined in a single place.

    Returns:
        A dict mapping the link text of each track on the results index page
        to the list of absolute result-page URLs found on that track's page.
    """
    track_urls = {}

    results_page_soup = get_page_soup(results_url)

    # One anchor per track on the results index page.
    results_by_track = results_page_soup.select(".row.resultTracks a")

    # Visit each track page and gather the links in its results table.
    for results_tag in results_by_track:
        track_result_page_soup = get_page_soup(base_url + results_tag["href"])
        result_links = track_result_page_soup.select(".results.table-striped a")
        track_urls[results_tag.text] = [base_url + link["href"] for link in result_links]

    return track_urls

def get_single_track_results_links(track: str):
    """Return the absolute result-page URLs listed for one track.

    Args:
        track: Track name as it appears in the site's URL path
            (e.g. ``"ballarat"``).

    Returns:
        A list of absolute URLs, one per link in the track's results table.
    """
    # Build the track page address from the shared results_url constant
    # rather than repeating the site address as a literal.
    soup = get_page_soup(results_url + track + '/')

    result_links = soup.select(".results.table-striped a")
    track_urls = [base_url + link_tag["href"] for link_tag in result_links]

    return track_urls

In [9]:
# combine two dicts of times organised by grade, i.e. the inner dicts
# for a single distance
# e.g. the {'Maiden': [22.63], ...} part of {'390m': {'Maiden': [22.63], ...}}
def combine_grade_time_dicts(times_by_grade1, times_by_grade2):
    """Merge two ``{grade: [times]}`` dicts into a new dict.

    For a grade present in both inputs the result holds the times from
    ``times_by_grade1`` followed by those from ``times_by_grade2``. Neither
    input is modified and the returned lists are fresh copies, so calling
    this repeatedly (e.g. re-running a notebook cell) never double-counts.

    Key order matches ``times_by_grade2 | times_by_grade1``: grades of the
    second dict first, then grades found only in the first.

    Returns:
        A new dict containing every grade from either input.
    """
    # Start from copies of the second dict's lists to fix the key order.
    merged = {grade: list(times) for grade, times in times_by_grade2.items()}

    for grade, times in times_by_grade1.items():
        # First-dict times lead; second-dict times (if any) follow.
        merged[grade] = list(times) + list(times_by_grade2.get(grade, []))

    return merged

def combine_distance_time_dicts(times_by_distance1, times_by_distance2):
    """Merge two ``{distance: {grade: [times]}}`` dicts into a new dict.

    Distances present in both inputs have their grade dicts merged with
    ``combine_grade_time_dicts``; distances present in only one input are
    copied across. Neither input (nor any of its inner dicts or lists) is
    modified or shared with the result, so repeated calls with the same
    arguments give the same answer.

    Key order matches ``times_by_distance2 | times_by_distance1``.

    Returns:
        A new nested dict containing every distance from either input.
    """
    def _copy_grades(times_by_grade):
        # Copy one {grade: [times]} dict including its lists.
        return {grade: list(times) for grade, times in times_by_grade.items()}

    merged = {
        distance: _copy_grades(grades)
        for distance, grades in times_by_distance2.items()
    }

    for distance, grades in times_by_distance1.items():
        if distance in times_by_distance2:
            # Copies are passed so the inputs stay untouched regardless of
            # how the grade-level merge treats its arguments.
            merged[distance] = combine_grade_time_dicts(
                _copy_grades(grades),
                _copy_grades(times_by_distance2[distance])
                )
        else:
            merged[distance] = _copy_grades(grades)

    return merged

In [21]:
def get_standard_times_single_track(track: str, max_meetings: int = 20):
    """Scrape and combine times for the first result pages of one track.

    Args:
        track: Track name as used in the site's URL path (e.g. ``"ballarat"``).
        max_meetings: How many result pages to scrape, taken from the start
            of the list returned by ``get_single_track_results_links``.
            Defaults to 20 (the previously hard-coded limit); ``None`` scrapes
            every listed page.

    Returns:
        A pair ``(standard_times, standard_sect_times)`` of nested dicts
        shaped ``{distance: {grade: [times]}}`` holding the winning times and
        fastest splits accumulated over the scraped pages.
    """
    standard_times = {}
    standard_sect_times = {}
    results = get_single_track_results_links(track)

    for result_url in results[:max_meetings]:
        result_soup = get_page_soup(result_url)
        races = get_races(result_soup)
        times, sect_times = scrape_times_by_grade(races)
        standard_times = combine_distance_time_dicts(
            standard_times,
            times
            )
        standard_sect_times = combine_distance_time_dicts(
            standard_sect_times,
            sect_times
            )

    return standard_times, standard_sect_times

In [22]:
# Scrape Ballarat's result pages (network access required).
standard_times, standard_sect_times = get_standard_times_single_track(
    track="ballarat"
)

In [24]:
# Display the fastest-split times gathered above, keyed by distance then grade.
# NOTE(review): the saved output below contains a None entry, which the None
# check in scrape_times_by_grade would not append; the output may predate
# that check. Re-run the notebook to refresh it.
standard_sect_times

{'390m': {'Maiden': [8.47,
   8.35,
   8.43,
   8.52,
   8.38,
   8.45,
   8.54,
   8.47,
   8.58,
   8.39,
   8.56,
   8.38,
   8.59,
   8.43,
   8.51],
  '7': [8.54, 8.48, 8.47, 8.48, 8.38, 8.66],
  'Grade 5': [8.51,
   8.36,
   8.42,
   8.19,
   8.16,
   8.35,
   8.41,
   8.51,
   8.36,
   8.35,
   8.42,
   8.5,
   8.39,
   8.47,
   8.42,
   8.36,
   8.38,
   8.36,
   None,
   8.27,
   8.29,
   8.29,
   8.25,
   8.31,
   8.27,
   8.42],
  'Mixed 4/5': [8.43, 8.3, 8.37, 8.27, 8.36, 8.25, 8.35, 8.28, 8.34],
  'Grade 6': [8.47, 8.35, 8.36],
  'Tier 3 - Maiden': [8.55,
   8.41,
   8.53,
   8.45,
   8.53,
   8.44,
   8.46,
   8.46,
   8.6,
   8.73,
   8.64],
  'Tier 3 - Grade 7': [8.61,
   8.42,
   8.41,
   8.59,
   8.58,
   8.55,
   8.47,
   8.33,
   8.54,
   8.29,
   8.56,
   8.7],
  'Tier 3 - Grade 6': [8.49, 8.48, 8.4, 8.43],
  'Grade 5 T3': [8.44,
   8.39,
   8.5,
   8.46,
   8.45,
   8.57,
   8.48,
   8.4,
   8.47,
   8.52,
   8.5,
   8.41,
   8.42,
   8.49,
   8.4,
   8.42,
   8.5

In [17]:
# Scrape two individual Ballarat result pages so their times can be combined
# by hand in the next cell (network access required).
results = get_single_track_results_links(track="ballarat")

# Pages are fetched in the same order as before: index 0, then index 2.
result_soup, result_soup2 = [get_page_soup(results[idx]) for idx in (0, 2)]
races, races2 = [get_races(page) for page in (result_soup, result_soup2)]

(times, sect_times), (times2, sect_times2) = [
    scrape_times_by_grade(meeting_races) for meeting_races in (races, races2)
]

In [23]:
# Dump the combined winning times of the two scraped meetings to a text file.
with open("output.txt", mode='w') as f:
    print(combine_distance_time_dicts(times, times2), end="", file=f)

# NOTE(review): `times2` was scraped from results[2], but results[1] is what
# gets printed here; confirm which page was meant.
print(results[0], results[1], sep="\n")

https://www.thegreyhoundrecorder.com.au/results/ballarat/82333
https://www.thegreyhoundrecorder.com.au/results/ballarat/82309
