In [3]:
# File: 1_data_collection.ipynb

# Import necessary libraries
import requests  # For fetching HTML content
from bs4 import BeautifulSoup  # For parsing HTML
import pandas as pd  # For structuring data
import os  # For managing file paths
import time  # For adding delays between requests

# Build one "Scores and Fixtures" URL per season.
# The season label (e.g. "2018-2019") is substituted into the URL template twice.
base_url = "https://fbref.com/en/comps/9/{season}/schedule/{season}-Premier-League-Scores-and-Fixtures"
seasons = [f"{start_year}-{start_year + 1}" for start_year in range(2018, 2024)]
urls = [base_url.format(season=label) for label in seasons]

# Echo every generated URL so it can be checked by eye
print("Generated URLs:")
for url in urls:
    print(url)

Generated URLs:
https://fbref.com/en/comps/9/2018-2019/schedule/2018-2019-Premier-League-Scores-and-Fixtures
https://fbref.com/en/comps/9/2019-2020/schedule/2019-2020-Premier-League-Scores-and-Fixtures
https://fbref.com/en/comps/9/2020-2021/schedule/2020-2021-Premier-League-Scores-and-Fixtures
https://fbref.com/en/comps/9/2021-2022/schedule/2021-2022-Premier-League-Scores-and-Fixtures
https://fbref.com/en/comps/9/2022-2023/schedule/2022-2023-Premier-League-Scores-and-Fixtures
https://fbref.com/en/comps/9/2023-2024/schedule/2023-2024-Premier-League-Scores-and-Fixtures


In [4]:
# Test fetching HTML content for one season
test_url = urls[0]  # Use the first season's URL for testing

# requests has no default timeout: without one, a stalled connection would
# block this cell (and the kernel) indefinitely. With a timeout set, a server
# that does not respond in time raises requests.exceptions.Timeout instead.
REQUEST_TIMEOUT_SECONDS = 30
response = requests.get(test_url, timeout=REQUEST_TIMEOUT_SECONDS)

# Check if the request was successful
if response.status_code == 200:
    print("Successfully fetched HTML content!")
else:
    print(f"Failed to fetch HTML content. Status code: {response.status_code}")

# Print the first 500 characters of the HTML to verify
print("First 500 characters of the HTML:")
print(response.text[:500])

Successfully fetched HTML content!
First 500 characters of the HTML:
    
      
<!DOCTYPE html>
<html data-version="klecko-" data-root="/home/fb/deploy/www/base" lang="en" class="no-js" >
<head>
    <meta charset="utf-8">
    <meta http-equiv="x-ua-compatible" content="ie=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=2.0" />
    <link rel="dns-prefetch" href="https://cdn.ssref.net/req/202504030" />
<script>
/* https://docs.osano.com/hc/en-us/articles/22469433444372-Google-Consent-Mode-v2  */
  window.dataLayer = w


In [5]:
# Build a parse tree from the raw response bytes and look up the first
# <table> element carrying the "stats_table" class.
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table', attrs={'class': 'stats_table'})

# Report the outcome; on success, show the first 3 <tr> elements as a sanity check
if not table:
    print("No table found.")
else:
    print("Table found!")
    leading_rows = table.find_all('tr')[:3]
    print(leading_rows)

Table found!
[<tr> <th aria-label="Matchweek Number" class="poptip sort_default_asc center" data-stat="gameweek" data-tip="&lt;strong&gt;Matchweek Number&lt;/strong&gt;&lt;br&gt;Matchweek Number" scope="col">Wk</th> <th aria-label="Day" class="poptip sort_default_asc center" data-stat="dayofweek" data-tip="Day of week" scope="col">Day</th> <th aria-label="Date" class="poptip sort_default_asc center" data-stat="date" data-tip="Date listed is local to the match" scope="col">Date</th> <th aria-label="Time" class="poptip sort_default_asc center" data-stat="start_time" data-tip="Time listed is local to the match venue&lt;br&gt;Time is written in the 24-hour notation&lt;br&gt;Your local time is in (·) " scope="col">Time</th> <th aria-label="Home" class="poptip sort_default_asc center" data-stat="home_team" scope="col">Home</th> <th aria-label="xG: Expected Goals" class="poptip center" data-filter="1" data-name="xG: Expected Goals" data-stat="home_xg" data-tip="&lt;strong&gt;xG: Expected Goal

In [9]:
# Collect the distinct data-stat values of the table's <th> cells.
# dict.fromkeys keeps the first occurrence of each key in insertion order,
# so repeated data-stat values are dropped while the column order is kept.
stat_keys = (
    th['data-stat']
    for th in table.find_all('th')
    if 'data-stat' in th.attrs
)
headers = list(dict.fromkeys(stat_keys))

print("Column headers:", headers)

Column headers: ['gameweek', 'dayofweek', 'date', 'start_time', 'home_team', 'home_xg', 'score', 'away_xg', 'away_team', 'attendance', 'venue', 'referee', 'match_report', 'notes']


In [10]:
# Extract data rows as lists of stripped cell text.
# Only genuine data rows are kept:
#   * rows without any <td> are header-only rows (e.g. a header repeated
#     inside the table body) and would otherwise be stored as a record;
#   * rows whose cells are all empty are visual spacers and carry no data.
# NOTE(review): whether this particular page contains such rows is not visible
# in the captured output; the filters are no-ops when it does not.
rows = []
for row in table.find_all('tr')[1:]:  # Skip the leading header row
    if row.find('td') is None:
        continue  # header-only row, not data
    cells = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
    if not any(cells):
        continue  # blank spacer row
    rows.append(cells)

print("First few rows of data:")
print(rows[:3])

First few rows of data:
[['1', 'Fri', '2018-08-10', '20:00', 'Manchester Utd', '1.5', '2–1', '1.8', 'Leicester City', '74,439', 'Old Trafford', 'Andre Marriner', 'Match Report', ''], ['1', 'Sat', '2018-08-11', '12:30', 'Newcastle Utd', '1.0', '1–2', '2.0', 'Tottenham', '51,749', "St. James' Park", 'Martin Atkinson', 'Match Report', ''], ['1', 'Sat', '2018-08-11', '15:00', 'Fulham', '0.7', '0–2', '1.0', 'Crystal Palace', '24,821', 'Craven Cottage', 'Mike Dean', 'Match Report', '']]
