# Web-crawling from HKJC

In [None]:
import requests
from bs4 import BeautifulSoup


from urllib.parse import urljoin

# Absolute URLs of the individual horse pages, filled in by the loop below.
horse_urls = list()

# Fetch the listing page. A timeout keeps a stalled connection from hanging
# the notebook forever, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the listing.
url = "https://racing.hkjc.com/racing/information/English/Horse/HorseFormerName.aspx"
response = requests.get(url, timeout=30)
response.raise_for_status()

# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.content, "html.parser")

# Find the <div> elements with the class name "commContent"
comm_divs = soup.find_all("div", class_="commContent")

# Walk every <a> element inside those <div> elements
for div in comm_divs:
    for link in div.find_all("a"):
        href = link.get("href")
        # Keep only the links that carry a HorseId query parameter.
        if href and "HorseId=" in href:
            # Resolve the href against the page URL instead of concatenating
            # strings: the hrefs are root-relative ("/racing/..."), so plain
            # concatenation with a base ending in "/" yields "//racing/...".
            horse_urls.append(urljoin(url, href) + "&Option=1")

In [None]:
# Sanity check: show the first five collected horse-page URLs.
print(horse_urls[0:5])

['https://racing.hkjc.com//racing/information/English/Horse/Horse.aspx?HorseId=HK_2022_H037&Option=1', 'https://racing.hkjc.com//racing/information/English/Horse/Horse.aspx?HorseId=HK_2022_H342&Option=1', 'https://racing.hkjc.com//racing/information/English/Horse/Horse.aspx?HorseId=HK_2020_E144&Option=1', 'https://racing.hkjc.com//racing/information/English/Horse/Horse.aspx?HorseId=HK_2022_H331&Option=1', 'https://racing.hkjc.com//racing/information/English/Horse/Horse.aspx?HorseId=HK_2021_G196&Option=1']


In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem so results can be saved there.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

def scrape_form_records(urls, *, timeout=30):
    """Scrape table rows from each page in *urls* into one DataFrame.

    Every URL is fetched, and each ``<tr>`` whose ``bgcolor`` attribute is
    ``#F8F4EF`` or ``#E7E4DF`` becomes one record made of the stripped text
    of its ``<td>`` cells. Records from all pages are concatenated in the
    order the URLs are given.

    Args:
        urls: Iterable of page URLs to fetch. An empty iterable gives an
            empty DataFrame that still carries all column names.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a single stalled connection would hang the
            whole scrape.

    Returns:
        pandas.DataFrame with one row per matched ``<tr>`` and the 20
        columns listed below; every value is kept as a string.

    Raises:
        requests.RequestException: If a request times out, fails, or
            returns an HTTP error status (raised by ``raise_for_status``).
    """
    # NOTE(review): assumes every matched row has at most 20 <td> cells;
    # pandas rejects the data if any row is longer than this list.
    columns = [
        'Race_Index', 'Place', 'Date',
        'RC/Track/Course', 'Distance', 'Going',
        'Race_Class', 'Draw', 'Rating', 'Trainer',
        'Jockey', 'LBW', 'Win_Odds',
        'Actual_Weight', 'Running_Position',
        'Finish_Time', 'Declared_Horse_Weight',
        'Gear', 'Video_Replay', 'Video_Replay_2',
    ]

    data = []

    for url in urls:
        # Send a GET request; fail loudly rather than parse an error page.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Create a BeautifulSoup object to parse the HTML content
        soup = BeautifulSoup(response.content, "html.parser")

        # Find the <tr> elements carrying one of the two bgcolor values
        rows = soup.find_all("tr", {'bgcolor': ['#F8F4EF', '#E7E4DF']})

        # One list of stripped cell texts per row
        for row in rows:
            data.append([cell.text.strip() for cell in row.find_all('td')])

    # Create a pandas DataFrame from the extracted data
    return pd.DataFrame(data, columns=columns)



In [None]:
# Scrape every collected horse page into one DataFrame, then print
# pandas' summary statistics for it.
table = scrape_form_records(urls=horse_urls)
print(table.describe())

       Race_Index  Place      Date  RC/Track/Course Distance  Going  \
count       15210  15210     15210            15210    15210  15210   
unique        838     24       496               17       12     11   
top           449     01  12/02/24  ST / Turf / "A"     1200      G   
freq           37   1599       136             1912     6180  11013   

       Race_Class   Draw Rating   Trainer    Jockey    LBW Win_Odds  \
count       15210  15210  15210     15210     15210  15210    15210   
unique         15     15    123        27        65    155      353   
top             4      5     52  A S Cruz  Z Purton  1-1/4       10   
freq         6882   1294   1240      1031      1229    606      575   

       Actual_Weight Running_Position Finish_Time Declared_Horse_Weight  \
count          15210            15210       15210                 15210   
unique            30             4983        3104                   375   
top              126               --          --              

In [None]:
# Write the scraped records to CSV. The path is relative, so it is resolved
# against the notebook's current working directory.
table.to_csv(path_or_buf='./drive/MyDrive/aist4010_project/form_records.csv')