In [1]:
# Import the required libraries for webscraping
from bs4 import BeautifulSoup
import requests
import re
# Import required utility libraries
import os

### Get urls from the input folder.
It is assumed that the input folder is at the same level as src, and that the file holding the urls is called from_urls. It is also assumed that the urls are stored one per line in the input file.\
It is also assumed that only the first two lines of the input file will be valid URLs. No verification is carried out on the input data as it is outside the assignment scope


In [2]:
# Map each season key to its source URL. The values start empty and are filled,
# in key order, from consecutive lines of the input file.
url_dict = {
    'season_126': '',
    'season_womens_seven': ''
}

# Define the import file path: <parent of the working directory>/input/from_urls
INPUT_FILE_NAME = 'from_urls'

url_file_fn = os.path.join(
    os.path.abspath('..'),
    'input',
    INPUT_FILE_NAME
)

# Read one URL per season key. The `with` block guarantees the file is closed
# even if reading fails part-way through.
with open(url_file_fn, 'r') as url_file:
    for season_key in url_dict:
        line = url_file.readline()
        # readline() returns '' at end of file; in that case the key keeps its
        # empty-string placeholder.
        if line:
            url_dict[season_key] = line.strip()

### Request data from Wikipedia for AFL Season 126 and AFL Women's Season Seven
The raw data will also be saved into a dictionary of two BeautifulSoup objects

In [3]:
# Parsed page for each season, keyed like url_dict; filled in by the loop below.
bs_dict = {
    'season_126': None,
    'season_womens_seven': None
}

# Upper bound, in seconds, on how long a single request may wait for the server.
# Without a timeout requests.get() can block indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Download every page in url_dict and store its parsed form in bs_dict.
for season_key, season_url in url_dict.items():
    # The context manager releases the connection even if parsing raises.
    with requests.get(season_url, timeout=REQUEST_TIMEOUT_SECONDS) as response:
        # Fail loudly on 4xx/5xx instead of parsing an error page as if it
        # were the article.
        response.raise_for_status()
        bs_dict[season_key] = BeautifulSoup(response.text, 'html.parser')


### Extract the tables under "Home-and-away season" from the BeautifulSoup objects

In [10]:
def extract_round_and_game_elements(soup_obj):
    """Collect the raw cell text of every game found in a page's round tables.

    Round tables are located by their exact inline ``style`` attribute. Inside
    each table only ``<tr>`` rows that carry no ``style`` attribute are
    inspected, and a row counts as a game when it has exactly six ``<td>``
    cells and its first cell contains no ``<ul>``.

    Args:
        soup_obj: Parsed page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        list[dict]: One dict per game, in document order, with the keys
        ``round_number``, ``datetime``, ``home``, ``result``, ``away`` and
        ``location``. Every value is the unmodified ``.text`` of its element;
        ``round_number`` is the text of the table's first ``<th>``.
    """
    round_table_style = 'width: 100%; background-color: #f1f5fc; border: 2px solid #D0E5F5'

    result_round_elements = []
    for round_table in soup_obj.find_all('table', style=round_table_style):
        # `.tbody` is None when the markup has no <tbody> element; search the
        # table itself in that case instead of failing on None.
        rows_root = round_table.tbody
        if rows_root is None:
            rows_root = round_table

        header = rows_root.find('th')
        if header is None:
            # Without a header cell there is no round label to attach to the
            # games, so the table is skipped.
            continue
        round_number = header.text

        # style=<callable> receives the attribute value; None means the row
        # has no style attribute at all.
        for row in rows_root.find_all('tr', style=lambda value: value is None):
            cells = row.find_all('td')
            if len(cells) != 6 or cells[0].find('ul') is not None:
                continue
            # Only the first five of the six cells are used.
            game_datetime, home, result, away, location = cells[:5]
            result_round_elements.append(
                {
                    'round_number': round_number,
                    'datetime': game_datetime.text,
                    'home': home.text,
                    'result': result.text,
                    'away': away.text,
                    'location': location.text
                }
            )
    return result_round_elements

In [11]:
# The round tables sit inside the article body at
# body -> div.mw-page-container -> div.mw-content-container
# -> div.vector-body -> div.mw-body-content -> div.mw-parser-output,
# and are the tables there that do not use the "wikitable" class.

# Per-season list of extracted game dicts, keyed like bs_dict.
round_dict = {
    'season_126': [],
    'season_womens_seven': []
}
round_dict.update(
    (season_key, extract_round_and_game_elements(season_soup))
    for season_key, season_soup in bs_dict.items()
)


In [15]:
# Dump every extracted field of every game, season by season, for inspection.
for season_games in round_dict.values():
    for game in season_games:
        for field_text in game.values():
            print(field_text)

Round 1

Wednesday, 16 March (7:10 pm)

Melbourne 14.13 (97)

def.

Western Bulldogs 11.5 (71)

MCG (crowd: 58,002)

Round 1

Thursday, 17 March (7:25 pm)

Carlton 14.17 (101)

def.

Richmond 11.10 (76)

MCG (crowd: 72,179)

Round 1

Friday, 18 March (7:50 pm)

St Kilda 12.13 (85)

def. by

Collingwood 15.12 (102)

Marvel Stadium (crowd: 40,129)

Round 1

Saturday, 19 March (2:10 pm)

Geelong 20.18 (138)

def.

Essendon 11.6 (72)

MCG (crowd: 54,495)

Round 1

Saturday, 19 March (5:10 pm)

Greater Western Sydney 13.14 (92)

def. by

Sydney 17.10 (112)

Accor Stadium (crowd: 25,572)

Round 1

Saturday, 19 March (7:10 pm)

Brisbane Lions 11.14 (80)

def.

Port Adelaide 10.9 (69)

The Gabba (crowd: 25,100)

Round 1

Sunday, 20 March (1:10 pm)

Hawthorn 11.12 (78)

def.

North Melbourne 8.10 (58)

MCG (crowd: 38,279)

Round 1

Sunday, 20 March (3:40 pm)

Adelaide 12.10 (82)

def. by

Fremantle 11.17 (83)

Adelaide Oval (crowd: 28,186)

Round 1

Sunday, 20 March (4:40 pm)

West Coast 12.8 (

### Extract the game information from the round_dict and save to a dictionary of games

In [None]:
# Patterns for the whitespace-normalised text of each scraped cell. Raw strings
# keep regex escapes such as \d and \( from being read as string escapes.
_ROUND_RE = re.compile(r'^Round\s+(\d+)')
_DATETIME_RE = re.compile(
    r'^(?P<day>[^,]+),\s*(?P<date>[^(]+?)\s*(?:\((?P<time>[^)]*)\))?$'
)
_DAY_MONTH_RE = re.compile(r'(\d{1,2})\s+([A-Za-z]+)')
_TEAM_RE = re.compile(r'^(?P<name>.+?)\s+\d+\.\d+\s*\((?P<points>\d+)\)')
_VENUE_RE = re.compile(r'^([^(]*)')
_CROWD_RE = re.compile(r'\(\s*crowd:\s*([\d,]+)')


def _clean(text):
    """Collapse every whitespace run (non-breaking spaces included) to one space."""
    return ' '.join(text.split())


def _parse_datetime(text):
    """Split 'Friday, 19 August (7:50 pm)' into ('Friday', '19-Aug', '7:50 pm')."""
    match = _DATETIME_RE.match(text)
    if match is None:
        # Unrecognised layout: keep the text in the date column so it is not lost.
        return '', text, ''
    date_text = match.group('date')
    day_month = _DAY_MONTH_RE.search(date_text)
    if day_month is not None:
        date_text = f'{day_month.group(1)}-{day_month.group(2)[:3]}'
    return match.group('day').strip(), date_text, (match.group('time') or '').strip()


def _parse_team(text):
    """Split 'Brisbane Lions 8.9 (57)' into ('Brisbane Lions', '57')."""
    match = _TEAM_RE.match(text)
    if match is None:
        # No score found: keep the whole text as the name, leave points empty.
        return text, ''
    return match.group('name'), match.group('points')


def _parse_location(text):
    """Split 'The Gabba (crowd: 32,172)' into ('The Gabba', '32172')."""
    venue = _VENUE_RE.match(text).group(1).strip()
    crowd_match = _CROWD_RE.search(text)
    # Drop the thousands separator so the attendance is a plain number.
    crowd = crowd_match.group(1).replace(',', '') if crowd_match else ''
    return venue, crowd


def _to_csv_row(fields):
    """Join fields into one CSV line without a line terminator, quoting only when needed."""
    import csv
    import io

    buffer = io.StringIO()
    csv.writer(buffer, lineterminator='').writerow(fields)
    return buffer.getvalue()


def extract_game_elements_to_strings(games):
    """Turn scraped game dicts into CSV rows, one string per game.

    Each row holds eleven values in this order: round number, day, date
    (``day-Mon``), time, first team name, first team points, first team win
    status, second team name, second team points, game location, attendance.
    A part that cannot be found in the text is left empty rather than raising.

    Args:
        games: Iterable of dicts with the string keys ``round_number``,
            ``datetime``, ``home``, ``result``, ``away`` and ``location``.

    Returns:
        list[str]: CSV-formatted rows in input order, without trailing
        newlines. Games whose ``round_number`` text does not start with
        ``Round <number>`` are skipped.
    """
    game_string_list = []
    for game in games:
        round_match = _ROUND_RE.match(_clean(game['round_number']))
        if round_match is None:
            # Not a numbered round, so there is no round number to report.
            continue
        day, date, time = _parse_datetime(_clean(game['datetime']))
        home_name, home_points = _parse_team(_clean(game['home']))
        away_name, away_points = _parse_team(_clean(game['away']))
        venue, crowd = _parse_location(_clean(game['location']))
        game_string_list.append(_to_csv_row([
            round_match.group(1),
            day,
            date,
            time,
            home_name,
            home_points,
            _clean(game['result']),
            away_name,
            away_points,
            venue,
            crowd
        ]))
    return game_string_list

In [20]:
# Walk round_dict and turn every game into one CSV row holding the values
# listed below (RegEx is used here per example 3.5 Part 3).
"""
Round Number (example: "1")
Day of the game (example: "Friday")
Date of the game (example: "19-Aug")
Time of the game (example: "7:50 pm")
First team name (example: "Brisbane Lions")
First team score in points only (example: "57")
First team win status (either "def. by" or "def.")
Second team name (example: "Melbourne")
Second team score in points only (example: "115")
Game location (example: "The Gabba")
Stadium Attendees (example: "32172")
"""

# Header line of the CSV: one title per value listed above, in the same order.
csv_title_row = ','.join((
    'Round Number',
    'Day of the game',
    'Date of the game',
    'Time of the game',
    'First team name',
    'First team score',
    'First team win status',
    'Second team name',
    'Second team score',
    'Game location',
    'Stadium Attendees',
))

# The header comes first, followed by each season's rows in round_dict order.
game_string_list = [csv_title_row]
for season_games in round_dict.values():
    game_string_list += extract_game_elements_to_strings(season_games)

### Save the result to a csv output
It is assumed that the output folder is at the same level as the src folder \
The result is written to a csv file named scraped_game_table.csv in that folder \
The file will be overwritten regardless of its existing contents and no verification of existence will be carried out\
since it is outside the assignment scope

In [21]:
# Define the output file path: <parent of the working directory>/output/scraped_game_table.csv
OUTPUT_FILE_NAME = 'scraped_game_table.csv'
scraped_game_table_fn = os.path.join(
    os.path.abspath('..'),
    'output',
    OUTPUT_FILE_NAME
)

# Write every row of game_string_list on its own line. Mode 'w' overwrites any
# existing file, and the `with` block closes the file even if a write fails.
with open(scraped_game_table_fn, 'w') as result_file:
    result_file.writelines(f"{result_str}\n" for result_str in game_string_list)