In [1]:
# Import the required libraries for webscraping
from bs4 import BeautifulSoup
import requests
import re
# Import required OS utility libraries for input and output file navigation
import os

### 1. Get urls from the input folder.
It is assumed that the input folder is at the same level as src, and that the file holding the URLs is called from_urls. It is also assumed that the URLs are stored one per line in the input file.\
It is also assumed that only the first two lines of the input file will be valid URLs. No verification is carried out on the input data file, as it is outside the assignment scope.

In [2]:
# Season keys, in the order their URLs are expected to appear in the input file.
# Each value stays '' until a line has been read for it.
url_dict = {
    'season_126': '',
    'season_womens_seven': ''
}

# Define the import file path
INPUT_FILE_NAME = 'from_urls'

# Build <parent of the current working directory>/input/<INPUT_FILE_NAME>
url_file_fn = os.path.join(
    os.path.abspath('..'),
    'input',
    INPUT_FILE_NAME
)

# Read one line per season key, in dictionary order. The context manager
# closes the file even if reading fails part-way through.
with open(url_file_fn, 'r') as url_file:
    for season_key in url_dict:
        line = url_file.readline()
        # readline() returns '' at end of file; the URL is left empty in that case
        if line:
            url_dict[season_key] = line.strip()

### 2. Request data from Wikipedia for AFL Season 126 and AFL Women's Season Seven
The raw data will also be saved into a dictionary of two BeautifulSoup objects

In [3]:
# Initialize the BeautifulSoup object dictionary
bs_dict = {
    'season_126': None,
    'season_womens_seven': None
}

# Upper bound, in seconds, on how long a request may wait for the server.
# Without a timeout requests.get() can block indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Iterate through url_dict, download each page and parse it into one
# BeautifulSoup object per season, stored under the same key in bs_dict
for season_key, season_url in url_dict.items():
    # The context manager releases the connection even if parsing raises
    with requests.get(season_url, timeout=REQUEST_TIMEOUT_SECONDS) as response:
        # Fail loudly on 4xx/5xx instead of silently parsing an error page
        response.raise_for_status()
        bs_dict[season_key] = BeautifulSoup(response.text, 'html.parser')


### 3. Extract the tables containing game rows from the BeautifulSoup objects
Both the Pre-season and Final Series tables are considered here and scraped. Note that only the pre-season games have a round number; the "rounds" in the final series are instead named Qualifying finals, Elimination finals, etc. Regardless of the different naming conventions, these final series rounds are treated in the same way as the pre-season rounds, and both will be referred to as "rounds".

In [4]:
def extract_round_and_game_elements(soup_obj):
    """
    Extract the raw text of every game from a parsed season page.

    Three steps are involved:
      1. find every round table (pre-season and final series alike) by its
         inline style;
      2. within each round table, collect the rows that carry no style
         attribute, which are the candidate game rows;
      3. keep the rows that have exactly six td cells and whose first cell
         holds no ul element, and copy the text of the first five cells
         into a dictionary.

    :param soup_obj: BeautifulSoup object (or any tag-like object exposing
        find_all / find / text / tbody in the same way)
    :return: a list of dictionaries, one per game, with the keys
        'round_number', 'datetime', 'home', 'result', 'away' and 'location'.
        Values are the unmodified cell texts. 'round_number' is the text of
        the table's first th element, or '' when the table has none.
    """
    # all round tables are with the same unique style
    round_table_style = 'width: 100%; background-color: #f1f5fc; border: 2px solid #D0E5F5'

    result_round_elements = []
    for round_table in soup_obj.find_all('table', style=round_table_style):
        # A table whose markup has no <tbody> yields None here; search the
        # table itself in that case instead of failing on None.
        table_body = round_table.tbody
        container = table_body if table_body is not None else round_table

        # the first th element of the table holds the round label
        header_cell = container.find('th')
        round_number = header_cell.text if header_cell is not None else ''

        # games are in the tr elements without a style
        for game_row in container.find_all('tr', style=lambda x: x is None):
            cells = game_row.find_all('td')
            # each game has 6 td elements to describe its attributes; footer
            # comments are also unstyled rows but contain a ul element
            if len(cells) != 6 or cells[0].find('ul') is not None:
                continue
            # the sixth cell is not part of the extracted attributes
            game_datetime, home, result, away, location = cells[:5]
            result_round_elements.append(
                {
                    'round_number': round_number,
                    'datetime': game_datetime.text,
                    'home': home.text,
                    'result': result.text,
                    'away': away.text,
                    'location': location.text
                }
            )
    return result_round_elements

In [5]:
# Walk every season's parsed page, pull out the ROUND tables it contains
# and flatten the games they hold into one list per season

# Per-season lists of game dictionaries; empty until filled in below
round_dict = {
    'season_126': [],
    'season_womens_seven': []
}

# Replace each placeholder with the games extracted from that season's soup
round_dict.update(
    (season_key, extract_round_and_game_elements(season_soup))
    for season_key, season_soup in bs_dict.items()
)

### 4. Use RegEx to extract and cleanse the game information from the round_dict and save to a final list of strings
The final list of strings has one element per game: a string formed by joining the RegEx-matched game attribute strings with comma delimiters.

In [59]:
def extract_game_elements_to_strings(games):
    """
    Turn raw game dictionaries into comma-delimited strings.

    Each attribute of a game is matched against its regex and the captured
    groups are joined with commas in this order: round, day, date, time,
    home team, home score, result, away team, away score, location, crowd.
    Commas inside the crowd figure are removed.

    The round label is the only attribute allowed to miss its pattern: a
    label that is not of the form "Round <n>" is emitted stripped but
    otherwise unchanged.

    :param games: a list of dictionaries, each dictionary contains the information of a game
    :return: a list of strings, each string contains the information of a game
    :raises ValueError: if any attribute other than the round label does not
        match its pattern; the message names the attribute and shows the raw text
    """
    # defined the regex required to extract and format the detailed attributes
    # the logics behind these regexes are explained in the report
    re_pattern_dict = {
        'round_number': r"^Round (\d+)( .+)*?$",
        'datetime': r"^.*?(\w+), (.+) \((.+)\)$",
        'home': r"^(.+) \d+\.\d+ \((.+)\)$",
        'result': r"^(.+)$",
        'away': r"^(.+) \d+\.\d+ \((.+)\)$",
        'location': r"^(.+) +\(crowd\:.(.+)\)$"
    }

    game_string_list = []
    for game in games:
        # fresh match results for every game, keyed like re_pattern_dict
        matched = {}
        for k, pattern in re_pattern_dict.items():
            found = re.match(pattern, game[k])
            # Report which field of which game is malformed instead of
            # failing later on None.group() with no context.
            if found is None and k != 'round_number':
                raise ValueError(
                    f"Cannot parse {k!r} for a game in round "
                    f"{game['round_number'].strip()!r}: {game[k]!r}"
                )
            matched[k] = found

        # Round Number: the digits when present, else the stripped label
        round_match = matched['round_number']
        if round_match:
            round_label = round_match.group(1)
        else:
            round_label = game['round_number'].strip()

        # join the matched results with commas
        game_string_list.append(','.join([
            round_label,
            # Day, date and time of the game
            matched['datetime'].group(1),
            matched['datetime'].group(2),
            matched['datetime'].group(3),
            # Home team name and score
            matched['home'].group(1),
            matched['home'].group(2),
            # Result
            matched['result'].group(1),
            # Away team name and score
            matched['away'].group(1),
            matched['away'].group(2),
            # Game location
            matched['location'].group(1),
            # Game attendees, without thousands separators
            matched['location'].group(2).replace(',', '')
        ]))
    return game_string_list

In [60]:
# For every game held in round_dict the following fields are emitted, in order:
#   Round Number (example: "1")
#   Day of the game (example: "Friday")
#   Date of the game (example: "19-Aug")
#   Time of the game (example: "7:50 pm")
#   First team name (example: "Brisbane Lions")
#   First team score in points only (example: "57")
#   First team win status (either "def. by" or "def.")
#   Second team name (example: "Melbourne")
#   Second team score in points only (example: "115")
#   Game location (example: "The Gabba")
#   Stadium Attendees (example: "32172")

# Column titles of the csv file, in output order
column_titles = (
    'Round Number',
    'Day of the game',
    'Date of the game',
    'Time of the game',
    'First team name',
    'First team score',
    'First team win status',
    'Second team name',
    'Second team score',
    'Game location',
    'Stadium Attendees',
)
csv_title_row = ','.join(column_titles)

# The csv rows start with the title row; every season's game strings follow,
# season by season in round_dict order
game_string_list = [csv_title_row]
for season_games in round_dict.values():
    game_string_list += extract_game_elements_to_strings(season_games)

### 5. Save the result to a csv output
It is assumed that the output folder is at the same level as the src folder \
It is also assumed that a csv output file named scraped_game_table.csv will exist and contain the result \
The file will be overwritten regardless of its existing contents and no verification of existence will be carried out\
since it is outside the assignment scope

In [62]:
# Define the output file path
OUTPUT_FILE_NAME = 'scraped_game_table.csv'
# Build <parent of the current working directory>/output/<OUTPUT_FILE_NAME>
scraped_game_table_fn = os.path.join(
    os.path.abspath('..'),
    'output',
    OUTPUT_FILE_NAME
)

# Write one csv row per line, overwriting any existing file. The context
# manager flushes and closes the file even if a write fails part-way through.
with open(scraped_game_table_fn, 'w') as result_file:
    for result_str in game_string_list:
        result_file.write(f"{result_str}\n")