In this notebook, we'll demonstrate how to automatically retrieve data from Draft Guru. The cells below scrape AFL player contract tables from footywire.com.

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from collections import ChainMap
import math
import numpy as np

In [2]:
def make_soup(website):
    """Download a web page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    website : str
        Address of the page. If it does not start with ``http://`` or
        ``https://`` (case-insensitive), ``https://`` is prepended.

    Returns
    -------
    BeautifulSoup
        The response body parsed with ``html.parser``.

    Notes
    -----
    The HTTP status code is not checked, so an error page is parsed like
    any other page.
    """
    # Check the scheme *prefix* rather than looking for 'https' anywhere in
    # the string: a substring test would turn 'http://host' into
    # 'https://http://host' and would skip the prefix for a bare host whose
    # name or query string merely contains 'https'.
    if website.lower().startswith(('http://', 'https://')):
        url = website
    else:
        url = 'https://' + website

    page = requests.get(url, timeout=120)
    try:
        return BeautifulSoup(page.content, 'html.parser')
    finally:
        # Release the connection even if parsing raises.
        page.close()

In [3]:
# Team slugs; each one is appended to the FootyWire URL further down.
TEAMS = [
    "adelaide-crows",
    "brisbane-lions",
    "carlton-blues",
    "collingwood-magpies",
    "essendon-bombers",
    "fremantle-dockers",
    "gold-coast-suns",
    "greater-western-sydney-giants",
    "hawthorn-hawks",
    "kangaroos",
    "st-kilda-saints",
    "sydney-swans",
    "west-coast-eagles",
    "western-bulldogs",
    "port-adelaide-power",
    "melbourne-demons",
    "geelong-cats",
    "richmond-tigers",
]
# Sanity check: number of distinct slugs in the list.
len(set(TEAMS))

18

In [4]:
# Show the first team slug in the list.
TEAMS[0]

'adelaide-crows'

In [5]:

# Build the page address for the first team, fetch it, and display the parsed HTML.
URL = f"https://www.footywire.com/afl/footy/to-{TEAMS[0]}"
soup = make_soup(URL)
soup


<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">

<html>
<head>
<title>Adelaide Crows AFL Player Contracts</title>
<meta content="Details of AFL player contracts for the Adelaide Crows." name="description"/>
<meta content="Adelaide Crows AFL Player Contracts" name="keywords"/>
<script>
var ad2Div;
var ad2Stuck = false;

var ad2Reversed;

var headerDiv;
var headerHeight;

var contentPageCell;
var contentPageRect;
var ad2Rect;
var ad2BottomPx;

var scrollOffset = window.scrollY;
var lastScrollOffset = window.scrollY;
var lastScrollIncrement = 0;
var scrolledDown = false;
var scrollReversed = false;

var contentPageCellHeightChecked = false;

function setAd2Properties() {
  ad2Div = document.getElementById("scrollad");
  if (ad2Div) {
    ad2Stuck = false;
    ad2Reversed = false;
  }
}

function setHeaderProperties() {
  headerDiv = document.getElementById("header");
  if (headerDiv) {
    headerHeight = headerDiv.offsetHeight;
  }
}
function setContentCellProperties() {


In [6]:


# Pick out the table by its width attribute ('688').
# NOTE: it would actually be better to search for "td = content page cell"
# rather than relying on the width.
tables_width_688 = [tbl for tbl in soup.find_all('table') if tbl.get('width') == '688']
contract_table_html = tables_width_688[0]
contract_table_html

<table border="0" cellpadding="0" cellspacing="0" width="688">
<tr>
<td class="lbnorm" height="28" width="25%"> Name</td>
<td class="bnorm" width="25%">Final Year</td>
<td class="bnorm" width="25%">Years Service*</td>
<td class="bnorm" width="25%">Status</td>
</tr>
<tr class="darkcolor" onmouseout="this.className='darkcolor';" onmouseover="this.className='highlightcolor';">
<td align="left" height="24"> <a href="pp-adelaide-crows--andrew-mcpherson">Andrew McPherson</a></td>
<td align="center">2023</td>
<td align="center">6</td>
<td align="center">Non-Free Agent</td>
</tr>
<tr class="lightcolor" onmouseout="this.className='lightcolor';" onmouseover="this.className='highlightcolor';">
<td align="left" height="24"> <a href="pp-adelaide-crows--ben-keays">Ben Keays</a></td>
<td align="center">2024</td>
<td align="center">5</td>
<td align="center">Non-Free Agent</td>
</tr>
<tr class="darkcolor" onmouseout="this.className='darkcolor';" onmouseover="this.className='highlightcolor';">
<td align

In [83]:



# Flatten every <td> to its text, dropping non-breaking spaces.
flat_table_data = [cell.text.replace('\xa0', '') for cell in contract_table_html.find_all('td')]

# The first four cells are taken as the column headings; the rest is row data.
header, body = flat_table_data[0:4], flat_table_data[4:]

# Deal the body cells out to the columns in round-robin order.
data_dict = {column: [] for column in header}
for position, value in enumerate(body):
    data_dict[header[position % len(header)]].append(value)

pd.DataFrame(data_dict)

Unnamed: 0,Name,Final Year,Years Service*,Status
0,Andrew McPherson,2023,6,Non-Free Agent
1,Ben Keays,2024,5,Non-Free Agent
2,Billy Dowling,2024,2,Non-Free Agent
3,Brayden Cook,2024,4,Non-Free Agent
4,Brodie Smith,2024,14,Unrestricted Free Agent
5,Chayce Jones,2023,5,Non-Free Agent
6,Darcy Fogarty,2025,8,Restricted Free Agent
7,Elliott Himmelberg,2024,8,Restricted Free Agent
8,Harry Schoenberg,2023,4,Non-Free Agent
9,Hugh Bond,2024,2,Non-Free Agent


In [88]:
def get_table_team(team):
    """Scrape the contracts table for one team into a DataFrame.

    Parameters
    ----------
    team : str
        Team slug appended to the FootyWire URL, e.g. ``'adelaide-crows'``.

    Returns
    -------
    pandas.DataFrame
        One column per heading taken from the first four table cells, filled
        from the remaining cells in order, plus a ``team`` column holding
        ``team`` on every row.

    Raises
    ------
    IndexError
        If the page has no table with ``width="688"``, or that table has
        no cells at all.
    ValueError
        If the body cells do not divide evenly among the headings.
    """
    print(f'Scraping contracts for {team}')

    # Get the URL
    url = "https://www.footywire.com/afl/footy/to-" + team
    soup = make_soup(url)

    # Extract the table: we look for the one whose width attribute is exactly '688'.
    candidates = [tbl for tbl in soup.find_all('table') if tbl.get('width') == '688']
    if not candidates:
        # Same exception type as the bare [0] lookup, but with context.
        raise IndexError(f"No table with width='688' found at {url}")
    contract_table_html = candidates[0]

    # For each cell, extract the text and strip '\xa0' (non-breaking space).
    flat_table_data = [cell.text.replace('\xa0', '') for cell in contract_table_html.find_all('td')]

    # Assume the first 4 entries are the column headings; the rest is the body.
    header = flat_table_data[0:4]
    body = flat_table_data[4:]
    if not header:
        raise IndexError(f"Table with width='688' at {url} contains no cells")
    if len(body) % len(header) != 0:
        # Fail here with a clear message instead of deep inside pandas.
        raise ValueError(
            f"Table at {url} has {len(body)} body cells, which is not a "
            f"multiple of its {len(header)} header cells"
        )

    # Deal the body cells out to the columns in round-robin order
    # (every len(header)-th cell goes back to the first column).
    data_dict = {column: [] for column in header}
    for position, value in enumerate(body):
        data_dict[header[position % len(header)]].append(value)

    data_dict['team'] = [team] * len(data_dict[header[0]])

    return pd.DataFrame(data_dict)
    
    
# Scrape the first three teams and stack their tables row-wise.
team_frames = [get_table_team(team) for team in TEAMS[0:3]]
pd.concat(team_frames, axis=0)


Scraping contracts for adelaide-crows
Scraping contracts for brisbane-lions
Scraping contracts for carlton-blues


Unnamed: 0,Name,Final Year,Years Service*,Status,team
0,Andrew McPherson,2023,6,Non-Free Agent,adelaide-crows
1,Ben Keays,2024,5,Non-Free Agent,adelaide-crows
2,Billy Dowling,2024,2,Non-Free Agent,adelaide-crows
3,Brayden Cook,2024,4,Non-Free Agent,adelaide-crows
4,Brodie Smith,2024,14,Unrestricted Free Agent,adelaide-crows
...,...,...,...,...,...
40,Sam Philp,2023,4,Non-Free Agent,carlton-blues
41,Sam Walsh,2026,8,Restricted Free Agent,carlton-blues
42,Tom De Koning,2023,6,Non-Free Agent,carlton-blues
43,Zac Fisher,2025,9,Restricted Free Agent,carlton-blues
