In [1]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Fail fast instead of hanging forever on a stalled connection, and stop on
# HTTP errors so an error page is never parsed as if it were the statistics.
response = requests.get('http://www.racebase.co.nz/jockthis.htm', timeout=30)
response.raise_for_status()

---
HTML and XML documents declare their own encoding, and we can inspect `content` to check it. The response also has an `encoding` attribute, but that is only an educated guess made by `requests`, so it's worth verifying against the document itself.

In [3]:
# Raw, undecoded bytes of the body; the <meta> tag in them declares the page's charset.
response.content

b'<html>\n<head>\n<meta http-equiv="Content-Type"\ncontent="text/html; charset=iso-8859-1">\n<title>Jockey Premiership This Season</title>\n</head>\n<body bgcolor="#F4FFFC">\n<font face="Arial" Size="3" color="black">\n<table border="1" width="100%">\n<tr>\n<td background="grainblue.jpg">&nbsp;</td>\n<td width="1024" bgcolor="#F4FFFC">\n<p><font color=#000000" size="1" face="Lucida Handwriting">CopyRight\n</font><font color="#0000FF" size="1" face="Lucida Handwriting">R</font><font\ncolor="#000000" size="1" face="Lucida Handwriting">ace</font><font\ncolor="#FF0000" size="1" face="Lucida Handwriting">B</font><font\ncolor="#000000" size="1" face="Lucida Handwriting">ase 25 Jan 2024</font></p>\n<p align="center"><font color="#000000" size = "2">\n<img src="nzracing2.jpg"></font></p>\n<table border="0" width="100%" bgcolor="#FFFFFF" id="table1"><tr>\n<td background="lgren014.jpg" bgcolor="#FFFFFF">\n<p align="center"><font size="4">New Zealand Jockey Premiership This Season\n</font></td></

In [4]:
# Encoding requests will use when decoding the body into response.text.
response.encoding

'ISO-8859-1'

In [5]:
# Parse the decoded page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.text, features='html.parser')

# Just like what we see in the browser's inspector:

In [6]:
# prettify() returns the parsed tree as an indented string; print it so the newlines render.
print(soup.prettify())

<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <title>
   Jockey Premiership This Season
  </title>
 </head>
 <body bgcolor="#F4FFFC">
  <font color="black" face="Arial" size="3">
   <table border="1" width="100%">
    <tr>
     <td background="grainblue.jpg">
     </td>
     <td bgcolor="#F4FFFC" width="1024">
      <p>
       <font color='#000000"' face="Lucida Handwriting" size="1">
        CopyRight
       </font>
       <font color="#0000FF" face="Lucida Handwriting" size="1">
        R
       </font>
       <font color="#000000" face="Lucida Handwriting" size="1">
        ace
       </font>
       <font color="#FF0000" face="Lucida Handwriting" size="1">
        B
       </font>
       <font color="#000000" face="Lucida Handwriting" size="1">
        ase 25 Jan 2024
       </font>
      </p>
      <p align="center">
       <font color="#000000" size="2">
        <img src="nzracing2.jpg"/>
       </font>
      </p>
      <table bgcolor="#FF

---
Now, there are two very helpful methods in `BeautifulSoup` that help us navigate the HTML: `find` and `find_all`. As their names suggest, `find` returns the first occurrence of a tag, while `find_all` returns a list with every occurrence of the tag you're looking for. Knowing that, I've used `find_all` to get the third table on the page, which is the statistics table we need in order to create our pandas version.

In [7]:
# The statistics live in the third <table> on the page (index 2).
page_tables = soup.find_all('table')
statistics_table = page_tables[2]

Using the same principle, let's go to the row where the table header is located.

In [8]:
# Row index 1 of the statistics table holds the column headers.
statistics_table.find_all('tr')[1]

<tr>
<td bgcolor="#EAEAEA" width="30"><b><font size="2"></font></b></td>
<td bgcolor="#EAEAEA" width="200"><b><font size="2">Jockey</font></b></td>
<td bgcolor="#EAEAEA" width="90"><b><font size="2">Rides</font></b></td>
<td bgcolor="#EAEAEA" width="90"><b><font size="2">Wins</font></b></td>
<td bgcolor="#EAEAEA" width="90"><b><font size="2">2nds</font></b></td>
<td bgcolor="#EAEAEA" width="90"><b><font size="2">3rds</font></b></td>
<td bgcolor="#EAEAEA" width="90"><b><font size="2">S Rate</font></b></td>
<td bgcolor="#EAEAEA" width="90"><b><font size="2">UDR</font></b></td>
</tr>

In [9]:
# Keep a handle on the header row (the second <tr>) for the next steps.
table_rows = statistics_table.find_all('tr')
table_titles = table_rows[1]

In [10]:
# All the text of the header row, with the leading and trailing whitespace trimmed.
table_titles.text.strip()

'Jockey\nRides\nWins\n2nds\n3rds\nS Rate\nUDR'

In [11]:
# Read the header row straight from statistics_table rather than from
# table_titles: this cell rebinds table_titles to a list of strings, so
# iterating over it here would fail the second time the cell is run.
table_titles = [title.text.strip() for title in statistics_table.find_all('tr')[1]]

In [12]:
# Check the result: the blank entries come from the whitespace between the
# <td> tags and from the unlabeled first cell.
table_titles

['',
 '',
 '',
 'Jockey',
 '',
 'Rides',
 '',
 'Wins',
 '',
 '2nds',
 '',
 '3rds',
 '',
 'S Rate',
 '',
 'UDR',
 '']

In [13]:
# Drop the blank entries so only the real column names remain.
table_titles = [name for name in table_titles if name]

# Let's create our dataframe
But before that, notice that the jockeys are ranked by victories, yet the rank column has no name. Let's add one.

In [14]:
# Name the unlabeled first column. The guard keeps the cell idempotent:
# re-running it must not insert a second 'Rank'.
if 'Rank' not in table_titles:
    table_titles.insert(0, 'Rank')

# Pass the list itself: wrapping it in another list (columns=[table_titles])
# makes pandas build MultiIndex columns instead of a flat Index.
statistics_df = pd.DataFrame(columns=table_titles)

In [15]:
# Empty frame for now: only the column headers exist.
statistics_df

Unnamed: 0,Rank,Jockey,Rides,Wins,2nds,3rds,S Rate,UDR


In [16]:
# Where does the data for our columns live? The first data row is at index 2,
# right after the header row.
statistics_table.find_all('tr')[2]

<tr>
<td width="30"><font size="2">1</font></td>
<td width="200"><font size="2">W KENNEDY</font></td>
<td width="90"><font size="2">417</font></td>
<td width="90"><font size="2">79</font></td>
<td width="90"><font size="2">46</font></td>
<td width="90"><font size="2">38</font></td>
<td width="90"><font size="2">18.9%</font></td>
<td width="90"><font size="2">0.281</font></td>
</tr>

In [17]:
# The last row is a regular data row too, so there is no footer to trim.
statistics_table.find_all('tr')[-1]

<tr>
<td width="30"><font size="2">144</font></td>
<td width="200"><font size="2">V RATHOAR</font></td>
<td width="90"><font size="2">11</font></td>
<td width="90"><font size="2">0</font></td>
<td width="90"><font size="2">0</font></td>
<td width="90"><font size="2">2</font></td>
<td width="90"><font size="2">0.0%</font></td>
<td width="90"><font size="2">0.061</font></td>
</tr>

In [18]:
# Everything after the header row (index 1) is jockey data.
table_rows = statistics_table.find_all('tr')
column_data = table_rows[2:]

In [19]:
# Step by step: first walk over each row and pull out the text of every child,
# stripping the surrounding whitespace.
# As with the titles, the blank strings left behind have to be filtered out.
for table_row in column_data:
    cell_texts = (child.text.strip() for child in table_row)
    print([text for text in cell_texts if text])

['1', 'W KENNEDY', '417', '79', '46', '38', '18.9%', '0.281']
['2', 'M MCNAB', '377', '72', '63', '54', '19.1%', '0.332']
['3', 'J DOYLE', '543', '63', '69', '63', '11.6%', '0.225']
['4', 'O BOSSON', '207', '56', '30', '21', '27.1%', '0.385']
['5', 'C GRYLLS', '404', '50', '44', '54', '12.4%', '0.229']
['6', 'A COMIGNAGHI', '320', '44', '54', '38', '13.8%', '0.271']
['7', 'L ALLPRESS', '359', '44', '38', '40', '12.3%', '0.219']
['8', 'K ASANO', '349', '37', '33', '29', '10.6%', '0.186']
['9', 'J FAWCETT', '305', '37', '31', '20', '12.1%', '0.200']
['10', 'S SPRATT', '360', '36', '40', '36', '10.0%', '0.195']
['11', 'S WEATHERLEY', '278', '34', '35', '26', '12.2%', '0.223']
['12', 'L SUTHERLAND', '334', '33', '25', '29', '9.9%', '0.169']
['13', 'J PARKES', '185', '30', '15', '11', '16.2%', '0.227']
['14', 'K WILLIAMS', '265', '28', '25', '44', '10.6%', '0.213']
['15', 'M HASHIZUME', '334', '28', '20', '36', '8.4%', '0.153']
['16', 'N PARMAR', '260', '27', '22', '21', '10.4%', '0.178']
[

In [20]:
# Collect every row first and build the frame once: growing a DataFrame with
# .loc inside a loop is slow, and re-running the cell would append duplicates.
expected_width = len(statistics_df.columns)
rows = []
for row in column_data:
    # Query the <td> cells directly: the whitespace between tags is skipped,
    # while a genuinely empty cell still keeps its column position.
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if len(cells) != expected_width:
        raise ValueError(f'expected {expected_width} cells, got {len(cells)}: {cells}')
    rows.append(cells)

statistics_df = pd.DataFrame(rows, columns=statistics_df.columns)

In [21]:
# Create the target folder first: to_csv does not create missing directories.
output_path = Path('data/horse_stats.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
statistics_df.to_csv(output_path, index=False)