In [26]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime

In [2]:
def get_page_soup(url: str, timeout: float = 30.0, parser=None):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait on the network before ``urlopen`` gives up.
            Without a timeout a stalled server would block the kernel forever.
        parser: Parser name handed to ``BeautifulSoup`` (for example
            ``"html.parser"``). ``None`` keeps BeautifulSoup's automatic
            parser selection.

    Returns:
        The parsed ``BeautifulSoup`` document.

    Raises:
        urllib.error.URLError: if the request fails or times out.
    """
    # A browser-like User-Agent is sent instead of urllib's default one
    # (presumably the site rejects the default agent -- confirm).
    request = Request(url, headers={"User-Agent": "Mozilla/5.0"})

    with urlopen(request, timeout=timeout) as webpage:
        # Honour the charset declared in Content-Type; fall back to UTF-8
        # when the server does not declare one.
        charset = webpage.headers.get_content_charset() or "utf-8"
        content = webpage.read().decode(charset)

    return BeautifulSoup(content, parser)

In [3]:
# Site root; page paths and relative hrefs are appended to this to form full URLs.
base_url = "https://www.thegreyhoundrecorder.com.au"

In [4]:
# Fetch and parse one dog's profile page to use as a sample while exploring the markup.
soup = get_page_soup(base_url + "/greyhounds/fernando-bale")

In [6]:
# Direct child tags of the first <p> inside the element with id "dogProfileContainer".
profile = soup.find(id="dogProfileContainer").p.find_all(recursive=False)

In [16]:
# The text node right after the first profile tag; its second whitespace-separated
# token is kept, with any non-breaking spaces (\xa0) trimmed from both ends.
# `next_sibling` is the supported spelling of the deprecated `nextSibling` alias.
a = profile[0].next_sibling.split()[1].strip("\xa0")

In [27]:
# Parse the extracted token as a day-month-year date (e.g. "12-03-2013").
d = datetime.strptime(a, "%d-%m-%Y")

datetime.datetime(2013, 3, 12, 0, 0)

In [35]:
# Age in whole years, computed on calendar fields so leap years are exact:
# subtract one year when this year's anniversary of `d` has not been reached yet.
today = datetime.now()
today.year - d.year - ((today.month, today.day) < (d.month, d.day))

9

In [30]:
def _leading_int(text: str) -> int:
    """Return the integer spelled by the digits at the start of ``text``.

    All leading digits are used, so a value such as ``"10th"`` yields 10
    rather than being cut down to its first character.

    Raises:
        ValueError: if ``text`` does not begin with a digit.
    """
    match = re.match(r"\s*(\d+)", text)
    if match is None:
        raise ValueError(f"expected a leading integer, got {text!r}")
    return int(match.group(1))


def scrape_dog_info(soup: BeautifulSoup) -> dict:
    """Extract a dog's profile, trainer and race history from its profile page.

    Relies on the module-level ``base_url`` to turn relative hrefs into
    absolute links.

    Args:
        soup: Parsed profile page containing an element with id
            "dogProfileContainer".

    Returns:
        A dict with the keys ``name``, ``breeding_links``, ``breeding``,
        ``dog_type``, ``trainer_link``, ``trainer_name``, ``win_rate``
        (overall, as a fraction) and ``race_records`` (one dict per row of
        the results table).

    Raises:
        ValueError: if a numeric cell cannot be parsed.
    """
    # find name and breeding info
    profile = soup.find(id="dogProfileContainer")
    name = profile.h1.text

    # A single traversal feeds both the link list and the display string.
    breeding_anchors = profile.span.find_all('a')
    breeding_links = [base_url + link["href"] for link in breeding_anchors]
    breeding = ', '.join(link.text for link in breeding_anchors)

    # find dog and trainer info
    info_block = profile.p.find_all(recursive=False)
    dog_type = info_block[0].text

    trainer = info_block[2]
    trainer_link = base_url + trainer["href"]
    trainer_name = trainer.text

    # find racing records
    record = profile.find(id="fragment-1").find_all(recursive=False)

    # Overall win rate only; per-track and per-distance rates are not derived here.
    # The last cell of the first row holds the percentage; [:-1] drops its
    # trailing character (presumably "%") before converting to a fraction.
    win_rate_text = record[1].find("tr").find_all(recursive=False)[-1].text
    win_rate = float(win_rate_text[:-1]) / 100

    race_records = []
    for result in record[2].tbody.find_all(recursive=False):
        cells = result.find_all("td", recursive=False)
        race_records.append({
            "race_link": base_url + cells[0].a["href"],
            "track": cells[1]["title"],
            # Use every leading digit so multi-digit places are not truncated.
            "place": _leading_int(cells[2].text),
            "box": int(cells[3].text),
            "distance": int(cells[4].text),
            "grade": cells[5].text,
            "time": float(cells[6].text),
            "win_time": float(cells[7].text),
            # best time for that specific dist and grade in the meet
            "bon": float(cells[8].text),
            # [:-1] drops the trailing character (presumably a unit suffix -- confirm)
            "margin": float(cells[9].text[:-1]),
            "pir": cells[10].text,
            # Column 11 is skipped; [1:] drops the leading character of
            # column 12 (presumably a currency symbol -- confirm).
            "sp": float(cells[12].text[1:]),
        })

    return {
        "name": name,
        "breeding_links": breeding_links,
        "breeding": breeding,
        "dog_type": dog_type,
        "trainer_link": trainer_link,
        "trainer_name": trainer_name,
        "win_rate": win_rate,
        "race_records": race_records
    }

In [29]:
def get_run_stats(soup):
    """Collect per-track distance statistics from a dog's profile page.

    Args:
        soup: Parsed profile page containing "dogProfileContainer" ->
            "fragment-2" -> "distanceTimeStats".

    Returns:
        A dict keyed by the last CSS class of each stats table. Each value is
        a list with one dict per table row holding ``distance`` and ``starts``
        (both kept as the raw cell text), ``win_rate`` (column 2 divided by
        starts, or ``None`` when there are no starts) and ``best_time``
        (column 6 as a float).
    """
    profile = soup.find(id="dogProfileContainer")
    stats = (
        profile
        .find(id="fragment-2")
        .find(id="distanceTimeStats")
        .find_all(class_=re.compile("datagrid stats hide marginBottom10"))
    )

    run_stats = {}
    for track in stats:
        track_stats = []
        for row in track.tbody.find_all(recursive=False):
            values = row.find_all(recursive=False)
            starts = int(values[1].text)
            track_stats.append({
                "distance": values[0].text,
                "starts": values[1].text,
                # A row with zero starts has no defined rate; report None
                # instead of dividing by zero.
                "win_rate": int(values[2].text) / starts if starts != 0 else None,
                "best_time": float(values[6].text),
            })
        # The table's last class names the track (no space for two-word locations).
        track_loc = track["class"][-1]
        run_stats[track_loc] = track_stats

    return run_stats

In [26]:
# only gets overall stats
def get_box_stats(soup):
    """Collect per-box start and win-rate figures from a dog's profile page.

    Args:
        soup: Parsed profile page containing "dogProfileContainer" ->
            "fragment-2" -> "boxTrackStats".

    Returns:
        A dict keyed by the last CSS class of each stats table. Each value is
        a list with one dict per table row: ``box`` (int), ``starts`` (int)
        and ``win_rate`` (column 2 divided by starts, or ``None`` when the
        box has no starts).
    """
    container = soup.find(id="dogProfileContainer")
    tables = (
        container
        .find(id="fragment-2")
        .find(id="boxTrackStats")
        .find_all(class_=re.compile("datagrid stats hide"))
    )

    def parse_row(row):
        # One table row -> one stats record; the win cell is only read when
        # there is at least one start.
        cells = row.find_all(recursive=False)
        box = int(cells[0].text)
        starts = int(cells[1].text)
        win_rate = int(cells[2].text) / int(cells[1].text) if starts != 0 else None
        return {"box": box, "starts": starts, "win_rate": win_rate}

    box_stats = {}
    for table in tables:
        parsed_rows = [parse_row(row) for row in table.tbody.find_all(recursive=False)]
        # be aware overall will also be a key here;
        # the class name has no space for two word locations
        location = table["class"][-1]
        box_stats[location] = parsed_rows

    return box_stats