In [2]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

In [17]:
base_url = "https://www.thegreyhoundrecorder.com.au"
results_url = "https://www.thegreyhoundrecorder.com.au/results/"

In [28]:
# organise by tracks? record dates?
def get_pages_soups(urls: list[str]):
    soups = []
    
    for url in urls:
        r = Request(url, headers={"User-Agent": "Mozilla/5.0"})

        with urlopen(r) as webpage:
            content = webpage.read().decode()
            soups.append(BeautifulSoup(content))

    return soups

def get_page_soup(url: str):
    r = Request(url, headers={"User-Agent": "Mozilla/5.0"})

    with urlopen(r) as webpage:
        content = webpage.read().decode()
        soup = BeautifulSoup(content)

    return soup

In [29]:
# race refers to dogContainer elements (one for each race in a meeting)
def extract_distance(race):
    race_title = race.find(attrs={"class": "font14"}).text
    return race_title.split()[-1].strip("()")

def extract_race_grade(race):
    race_header = race.select(".runnerFormHeader div")
    race_class =  race_header[1].text.split('$')[0]
    return race_class[:-3]

def extract_fastest_split(race):
    race_header = race.select(".runnerFormHeader div")
    return float(race_header[2].text.split()[1].strip(','))

def extract_win_time(race):
    first_row = race.select(".runnerContainer .runnerSubDetails div")[1].text
    time = first_row.strip()[4:]
    return float(time)

In [30]:
def get_races(soup):
    return soup.select(".dogContainer")

In [34]:
# scrapes times organised by distance only
def scrape_times(races):
    race_time_dict = {}
    sect_time_dict = {}

    for race in races:
        distance = extract_distance(race)
        fastest_split = extract_fastest_split(race)
        win_time = extract_win_time(race)

        if distance not in race_time_dict:
            race_time_dict[distance] = []
            sect_time_dict[distance] = []

        race_time_dict[distance].append(win_time)
        sect_time_dict[distance].append(fastest_split)

    return race_time_dict, sect_time_dict

# scrapes times organised by distance and grade
def scrape_times_by_grade(races):
    race_time_dict = {}
    sect_time_dict = {}
    
    for race in races:
        distance = extract_distance(race)
        grade = extract_race_grade(race)
        fastest_split = extract_fastest_split(race)
        win_time = extract_win_time(race)

        if distance not in race_time_dict:
            race_time_dict[distance] = {}
            sect_time_dict[distance] = {}

        if grade not in race_time_dict[distance]:
            race_time_dict[distance][grade] = []
            sect_time_dict[distance][grade] = []

        race_time_dict[distance][grade].append(win_time)
        sect_time_dict[distance][grade].append(fastest_split)

    return race_time_dict, sect_time_dict

In [32]:
def get_result_page_soups_by_track():
    track_urls = {}

    results_page_soup = get_page_soup(results_url)

    # all results by track
    results_by_track = results_page_soup.select(".row.resultTracks a")
    
    # add all urls of results in track page
    for results_tag in results_by_track:
        track_result_page_soup = get_page_soup(base_url + results_tag["href"])
        result_links = track_result_page_soup.select(".results.table-striped a")
        track_urls[results_tag.text] = [base_url + link["href"] for link in result_links]

    # curr structure
    # {track_name: [result_pages_soups]}
    return {track:get_pages_soups(urls) for track, urls in track_urls.items()}

def get_result_page_links_by_track():
    track_urls = {}

    base_url = "https://www.thegreyhoundrecorder.com.au"
    results_url = "https://www.thegreyhoundrecorder.com.au/results/"
    results_page_soup = get_page_soup(results_url)

    # all results by track
    results_by_track = results_page_soup.select(".row.resultTracks a")
    
    # add all urls of results in track page
    for results_tag in results_by_track:
        track_result_page_soup = get_page_soup(base_url + results_tag["href"])
        result_links = track_result_page_soup.select(".results.table-striped a")
        track_urls[results_tag.text] = [base_url + link["href"] for link in result_links]

    return track_urls

def get_single_track_results_links(url: str):
    soup = get_page_soup(url)

    result_links = soup.select(".results.table-striped a")
    track_urls = [base_url + link_tag["href"] for link_tag in result_links]

    return track_urls

# get standard time for one track
def get_track_standard_time(track_soups):
    track_standard_race_times = {}
    track_standard_sect_times = {}

    race_times_by_distance = {}
    sect_times_by_distance = {}

    for meet in track_soups:
        races = get_races(meet)
        race_times_dict, sect_times_dict = scrape_times(races)

        for distance in race_times_dict:
            if distance not in race_times_by_distance:
                race_times_by_distance[distance] = []
                sect_times_by_distance[distance] = []

        for distance in race_times_dict:
            race_times_by_distance[distance] += race_times_dict[distance]
            sect_times_by_distance[distance] += sect_times_dict[distance]

    for distance in race_times_by_distance:
        race_times_list = race_times_by_distance[distance]
        sect_times_list = sect_times_by_distance[distance]

        track_standard_race_times[distance] = sum(race_times_list) / len(race_times_list)
        track_standard_sect_times[distance] = sum(sect_times_list) / len(sect_times_list)

    return track_standard_race_times, track_standard_sect_times
    

# for multiple tracks        
# track_soups is dictionary of results' page html soups by track 
def get_standard_times(track_soups):
    track_standard_race_times = {track:{} for track in track_soups}
    track_standard_sect_times = {track:{} for track in track_soups}

    for track in track_soups:
        race_times_by_distance = {}
        sect_times_by_distance = {}

        for meet in track_soups[track]:
            races = get_races(meet)
            race_times_dict, sect_times_dict = scrape_times(races)
            
            for distance in race_times_dict:
                if distance not in race_times_by_distance:
                    race_times_by_distance[distance] = []
                    sect_times_by_distance[distance] = []

            for distance in race_times_dict:
                race_times_by_distance[distance] += race_times_dict[distance]
                sect_times_by_distance[distance] += sect_times_dict[distance]

        for distance in race_times_by_distance:
            race_times_list = race_times_by_distance[distance]
            sect_times_list = sect_times_by_distance[distance]

            track_standard_race_times[track][distance] = sum(race_times_list) / len(race_times_list)
            track_standard_sect_times[track][distance] = sum(sect_times_list) / len(sect_times_list)

    return track_standard_race_times, track_standard_sect_times


In [20]:
track_links = get_single_track_results_links("https://www.thegreyhoundrecorder.com.au/results/ballarat/")

In [22]:
link_soups = get_pages_soups(track_links)

In [35]:
race_standard_times, sect_standard_times = get_track_standard_time(link_soups)

ValueError: could not convert string to float: 'N/A'