Project goals:
- Scrape football data from Premier League site.
- Process must be automated.
- Output must be a dataframe of the top 10 goal scorers for the 2025/26 season, showing the club each player plays for, the number of goals the player has scored, the number of goals scored by that club, and a player vs club goals ratio.

The following is the relevant code to produce this output (should take about a minute to run):

In [1]:
# Import relevant libraries
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
import requests
import re
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
import pandas as pd
import json
import subprocess

# Start the WebDriver session.
# Headless mode runs Firefox without opening a window (GUI - Graphical User
# Interface). It is recommended once the system is fully automated because it
# cuts down on computational expense.
options = Options()
options.add_argument("--headless")  # run without popping up the GUI

# Initialise the Firefox webdriver.
# The options object is passed by keyword so the call stays correct across
# Selenium 4 releases: the first positional parameter of webdriver.Firefox has
# not always been 'options', so a positional argument is not reliable.
driver = webdriver.Firefox(options=options)

# Open the desired URL
driver.get("https://www.premierleague.com/")

# Give the landing page time to load before interacting with it
time.sleep(5)

# ACCEPT ALL COOKIES

# The consent button is identified by its visible text
cookies_locator = (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")

try:
    # Wait (up to 15 seconds) for the button to be present in the DOM
    cookies_wait = WebDriverWait(driver, 15)
    accept_cookies_button = cookies_wait.until(
        EC.presence_of_element_located(cookies_locator))

    # Bring the button into view before interacting with it
    driver.execute_script("arguments[0].scrollIntoView({inline: 'center'});", accept_cookies_button)

    # Queue the pointer actions (hover, then click) and run them in one go
    cookie_actions = ActionChains(driver)
    cookie_actions.move_to_element(accept_cookies_button)
    cookie_actions.click(accept_cookies_button)
    cookie_actions.perform()
except TimeoutException:
    # The wait expired without the button showing up; carry on without it
    print('Accept All Cookies button did not appear in time')
except NoSuchElementException:
    print('Accept All Cookies button was not found')

# EXIT POP-UP

# XPath matching any element whose class contains a name commonly used for
# pop-up close controls, or whose text is exactly 'x' / 'X'
close_advert_xpath = "//*[contains(@class, 'close') or \
                                                       contains(@class, 'dismiss') or \
                                                       contains(@class, 'exit') or \
                                                       contains(@class, 'cancel') or \
                                                       contains(@class, 'hide') or \
                                                       text() = 'x' or \
                                                       text() = 'X']"

try:
    # Wait (up to 15 seconds) for the first matching element to be present
    advert_wait = WebDriverWait(driver, 15)
    close_advert_button = advert_wait.until(
        EC.presence_of_element_located((By.XPATH, close_advert_xpath)))

    # Only interact with the element when it is actually visible on the page
    advert_is_visible = close_advert_button.is_displayed()
    if advert_is_visible:
        # Bring the "Close Advert" button into view
        driver.execute_script("arguments[0].scrollIntoView({inline: 'center'});", close_advert_button)

        # Hover over the button, then click it
        advert_actions = ActionChains(driver)
        advert_actions.move_to_element(close_advert_button)
        advert_actions.click(close_advert_button)
        advert_actions.perform()
except TimeoutException:
    # Nothing matched within the wait; carry on without closing anything
    print('Close Advert button did not appear in time')
except NoSuchElementException:
    print('Close Advert button was not found')

# Re-usable lookup for the 'Statistics' tab in the main navigation.
# The element has to be located again after every navigation because
# previously found DOM elements become stale.
def get_stats_tab(driver):
    """Wait for the main-nav 'Statistics' tab to be clickable and return it.

    Waits up to 10 seconds. WebDriverWait raises TimeoutException if the
    element does not become clickable in that time.
    """
    # [self::a or self::button] = match if the element is an 'a' or 'button' tag
    stats_tab_locator = (By.XPATH, "//*[self::a or self::button][contains(@class, 'main-nav') and \
                                                           .//text()[contains(normalize-space(), 'Statistics')]]")
    stats_tab_wait = WebDriverWait(driver, 10)
    stats_tab = stats_tab_wait.until(EC.element_to_be_clickable(stats_tab_locator))
    return stats_tab

# Navigate to the player and club 'Goals' leaderboards and capture their HTML.
# Produces: html_player_goals, html_club_goals_1, html_club_goals_2.

# CSS selector for one row of a stats table
STAT_ROW_SELECTOR = "a.stats-table__stat-row"


def _first_stat_row_text(drv):
    """Return the text of the first stats-table row, or None if it cannot be read."""
    try:
        return drv.find_element(By.CSS_SELECTOR, STAT_ROW_SELECTOR).text
    except WebDriverException:
        # Row missing or replaced mid-read (e.g. while the table re-renders)
        return None


# The whole navigation runs inside try/finally so the browser session is
# always closed, even when a wait times out or a selector no longer matches.
try:
    # Calling 'get_stats_tab' function and clicking the stats tab
    get_stats_tab(driver).click()

    # Click players 'Goals' stats board
    goals_board_players = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//a[contains(@class, 'stats-leaderboard__header-link') and \
                                                           contains(@href, 'players') and \
                                                           .//span[contains(normalize-space(), 'Goals')]]")))

    # Using the ActionChains() class to hover over and click the goals board
    ActionChains(driver).move_to_element(goals_board_players).click(goals_board_players).perform()

    # Wait for the player stats table to load before scraping
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, STAT_ROW_SELECTOR)))

    # Extract player goal stats
    html_player_goals = driver.page_source

    # Go back to stats tab
    get_stats_tab(driver).click()

    # Click clubs 'Goals' stats board
    goals_board_clubs = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//a[contains(@class, 'stats-leaderboard__header-link') and \
                                                           contains(@href, 'clubs') and \
                                                           .//span[contains(normalize-space(), 'Goals')]]")))

    # Scroll 'goals_board_clubs' into view
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", goals_board_clubs)

    # Using the ActionChains() class to hover over and click the goals board
    ActionChains(driver).move_to_element(goals_board_clubs).click(goals_board_clubs).perform()

    # Wait for the club stats table to load before scraping
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, STAT_ROW_SELECTOR)))

    # Extract first page of club goal stats
    html_club_goals_1 = driver.page_source

    # Remember what the first row looks like on page one. After paging, a plain
    # presence check would pass immediately (page-one rows are already in the
    # DOM), so the change of this text is used as the "page two loaded" signal.
    first_row_page_1 = _first_stat_row_text(driver)

    # Click on arrow to go to next page (using CSS selector)
    # Right arrow will be the last arrow, hence the '-1' list index
    next_page_button = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "button.stats-leaderboard-table__btn")))[-1]

    # Using the ActionChains() class to hover over and click the next-page arrow
    ActionChains(driver).move_to_element(next_page_button).click(next_page_button).perform()

    # Wait until the first row differs from page one before scraping
    try:
        WebDriverWait(driver, 20).until(
            lambda drv: _first_stat_row_text(drv) not in (None, first_row_page_1))
    except TimeoutException:
        # Keep going (as before), but make the possibly duplicated page visible
        print('Club stats table did not change after clicking next page')

    # Extract second page of club goal stats
    html_club_goals_2 = driver.page_source
finally:
    # quit session
    driver.quit()

# Wrap both club pages in a single <div> so one soup object covers
# 'html_club_goals_1' and 'html_club_goals_2'
club_html_parts = ["<div>", html_club_goals_1, html_club_goals_2, "</div>"]
combined_club_goals = "".join(club_html_parts)

# Parse the html (both soups use the lxml parser)
html_parser = 'lxml'
soup_player_goals = BeautifulSoup(html_player_goals, html_parser)
soup_club_goals = BeautifulSoup(combined_club_goals, html_parser)

# Extract player name, club and goals scored from 'soup_player_goals', then
# store the result as a dataframe ('player_df').

# Each <a class="stats-table__stat-row"> element holds one player's details
player_details = soup_player_goals.find_all('a', class_='stats-table__stat-row')

# One dictionary per player is collected here
player_data = []

for player_deet in player_details:
    # Player name (.text returns just the text of the tag)
    name = player_deet.find('span', class_='stats-table__name').text.strip()

    # Club name: it is not inside the 'club-badge' div itself but in the text
    # node that follows it, hence 'find_next_sibling(string=True)'
    club = player_deet.find('div', class_='club-badge')
    club = club.find_next_sibling(string=True).strip()

    # Goals scored, converted to an integer so it can be used in calculations
    player_goals = int(player_deet.find('span', class_='stat-value-cell').text.strip())

    player_data.append({
        'Name': name,
        'Club': club,
        'Player goals': player_goals,
    })

# Convert the list of dictionaries into a Pandas dataframe. The columns are
# named explicitly so the frame still has a 'Club' column when no rows were
# scraped.
player_df = pd.DataFrame(player_data, columns=['Name', 'Club', 'Player goals'])

# Extract club name and goals scored from 'soup_club_goals', then store the
# result as a dataframe ('club_df').

# Each <a class="stats-table__stat-row"> element holds one club's details
club_details = soup_club_goals.find_all('a', class_='stats-table__stat-row')

# One dictionary per club is collected here
club_data = []

for club_deet in club_details:
    # Club name
    club = club_deet.find('span', class_='stats-table__name').text.strip()

    # Club goals, converted to an integer so it can be used in calculations
    club_goals = int(club_deet.find('span', class_='stat-value-cell').text.strip())

    # Append this club's record to the 'club_data' list
    club_data.append({
        'Club': club,
        'Club goals': club_goals,
    })

# Convert the list of dictionaries into a Pandas dataframe. The columns are
# named explicitly so the frame still has a 'Club' column when no rows were
# scraped.
club_df = pd.DataFrame(club_data, columns=['Club', 'Club goals'])

# Keep one row per club: a club that was captured more than once would
# otherwise duplicate player rows when this frame is joined on 'Club'.
club_df = club_df.drop_duplicates(subset='Club').reset_index(drop=True)

# Left join on 'Club': every player row is kept and the matching club's
# goal total is attached to it
pc_df = player_df.merge(club_df, how="left", on="Club")

# Add a column holding the player vs club goal ratio, rounded to 2 decimals
goal_ratio = pc_df["Player goals"] / pc_df["Club goals"]
pc_df["Player-club ratio"] = goal_ratio.round(2)

# Display the final dataframe (notebook cell output)
pc_df

Unnamed: 0,Name,Club,Player goals,Club goals,Player-club ratio
0,Erling Haaland,Manchester City,17,38,0.45
1,Igor Thiago,Brentford,11,22,0.5
2,Hugo Ekitiké,Liverpool,7,26,0.27
3,Danny Welbeck,Brighton and Hove Albion,7,25,0.28
4,Jean-Philippe Mateta,Crystal Palace,7,20,0.35
5,Phil Foden,Manchester City,7,38,0.18
6,Bryan Mbeumo,Manchester United,6,26,0.23
7,Antoine Semenyo,Bournemouth,6,21,0.29
8,Richarlison,Tottenham Hotspur,6,25,0.24
9,Nick Woltemade,Newcastle United,5,21,0.24


The following code saves the dataframe as an Excel file and a JSON file, if required (the local save paths must be specified):

In [3]:
# Convert to an Excel file (enter desired path + file-save-name in 'pathsave')
# index=False: don't save row numbers in the Excel file
pathsave = 'C:\\Users\\mufar\\Downloads\\Player Club Goal Stats.xlsx'
pc_df.to_excel(pathsave, index=False)

# Convert to JSON and save to local drive.
# Missing values (e.g. a player whose club had no match in the left join) are
# replaced with None so they are written as 'null'; json.dump would otherwise
# emit the token NaN, which is not valid JSON.
pc_dict = [
    {column: (None if pd.isna(value) else value) for column, value in record.items()}
    for record in pc_df.to_dict(orient="records")
]
# enter desired path + file-save-name
with open("C:\\Users\\mufar\\Downloads\\Player Club Goal Stats.json", "w", encoding="utf-8") as json_file:
    json.dump(pc_dict, json_file, indent=4)