# Premier League Data Web Scrapping 

In [None]:
# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import cloudscraper  # to enable javascript rendering incase the website the website restricts scraping
import time

## Extract Links to each teams statistics

This project part is focused on:

    - extract the links of each team stats

    - extract the statistics of the first team (scores and fixtures)
    
    - load the data to a dataframe

In [2]:
# initialize the cloud scrapper to bypass javascript restrictions
scraper = cloudscraper.create_scraper()

In [None]:
# site url 
url = "https://fbref.com/en/comps/9/Premier-League-Stats"

# request the data
response = scraper.get(url)

# initialize BeautifulSoup with HTML
soup = BeautifulSoup(response.content, 'html.parser') # alternatively response.content can be used to ouput the same contents
soup

~ the scrapped HTML document contains links to the statistics of each of the 20 premier league teams. 
extract the links 

In [5]:
# use css selector to narrow down to the class
stats_tables = soup.select('table.stats_table')[0] # select only the first
stats_tables

<table class="stats_table sortable min_width force_mobilize" data-cols-to-freeze=",2" id="results2024-202591_overall"> <caption>Premier League Table</caption> <colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup> <thead> <tr> <th aria-label="Rank" class="poptip sort_default_asc center" data-stat="rank" data-tip="&lt;strong&gt;Rank&lt;/strong&gt;&lt;br&gt;Squad finish in competition&lt;br&gt;Finish within the league or competition.&lt;br&gt;For knockout competitions may show final round reached.&lt;br&gt;Colors and arrows represent promotion/relegation or qualifiation for continental cups.&lt;br&gt;Trophy indicates team won league whether by playoffs or by leading the table.&lt;br&gt;Star indicates topped table in league USING another means of naming champion." scope="col">Rk</th> <th aria-label="Squad" class="poptip sort_default_asc center" data-stat="team" scope="col">Squad</th> <th aria-label="Matches P

In [6]:
# find all tags with the links 
links = stats_tables.find_all('a')
links

[<a href="/en/squads/822bd0ba/Liverpool-Stats">Liverpool</a>,
 <a href="/en/players/e342ad68/Mohamed-Salah">Mohamed Salah</a>,
 <a href="/en/players/7a2e46a8/Alisson">Alisson</a>,
 <a href="/en/squads/18bb7c10/Arsenal-Stats">Arsenal</a>,
 <a href="/en/players/fed7cb61/Kai-Havertz">Kai Havertz</a>,
 <a href="/en/players/98ea5115/David-Raya">David Raya</a>,
 <a href="/en/squads/b8fd03ef/2024-2025/Manchester-City-Stats">Manchester City</a>,
 <a href="/en/players/1f44ac21/Erling-Haaland">Erling Haaland</a>,
 <a href="/en/players/3bb7b8b4/Ederson">Ederson</a>,
 <a href="/en/squads/cff3d9bb/2024-2025/Chelsea-Stats">Chelsea</a>,
 <a href="/en/players/dc7f8a28/Cole-Palmer">Cole Palmer</a>,
 <a href="/en/players/6a713852/Robert-Sanchez">Robert Sánchez</a>,
 <a href="/en/squads/b2b47a98/Newcastle-United-Stats">Newcastle Utd</a>,
 <a href="/en/players/8e92be30/Alexander-Isak">Alexander Isak</a>,
 <a href="/en/players/4b40d9ca/Nick-Pope">Nick Pope</a>,
 <a href="/en/squads/8602292d/Aston-Villa-Sta

In [7]:
# select only the 'href' property of the links
links = [l.get('href') for l in links]
links

['/en/squads/822bd0ba/Liverpool-Stats',
 '/en/players/e342ad68/Mohamed-Salah',
 '/en/players/7a2e46a8/Alisson',
 '/en/squads/18bb7c10/Arsenal-Stats',
 '/en/players/fed7cb61/Kai-Havertz',
 '/en/players/98ea5115/David-Raya',
 '/en/squads/b8fd03ef/2024-2025/Manchester-City-Stats',
 '/en/players/1f44ac21/Erling-Haaland',
 '/en/players/3bb7b8b4/Ederson',
 '/en/squads/cff3d9bb/2024-2025/Chelsea-Stats',
 '/en/players/dc7f8a28/Cole-Palmer',
 '/en/players/6a713852/Robert-Sanchez',
 '/en/squads/b2b47a98/Newcastle-United-Stats',
 '/en/players/8e92be30/Alexander-Isak',
 '/en/players/4b40d9ca/Nick-Pope',
 '/en/squads/8602292d/Aston-Villa-Stats',
 '/en/players/aed3a70f/Ollie-Watkins',
 '/en/players/7956236f/Emiliano-Martinez',
 '/en/squads/e4a775cb/Nottingham-Forest-Stats',
 '/en/players/4e9a0555/Chris-Wood',
 '/en/players/834b5c4c/Matz-Sels',
 '/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 '/en/players/ce5143da/Danny-Welbeck',
 '/en/players/e8832875/Joao-Pedro',
 '/en/players/cf134113/Bart-

In [8]:
# filter to output links with only squads in them
links = [l for l in links if 'squads'in l]
links

['/en/squads/822bd0ba/Liverpool-Stats',
 '/en/squads/18bb7c10/Arsenal-Stats',
 '/en/squads/b8fd03ef/2024-2025/Manchester-City-Stats',
 '/en/squads/cff3d9bb/2024-2025/Chelsea-Stats',
 '/en/squads/b2b47a98/Newcastle-United-Stats',
 '/en/squads/8602292d/Aston-Villa-Stats',
 '/en/squads/e4a775cb/Nottingham-Forest-Stats',
 '/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 '/en/squads/4ba7cbea/Bournemouth-Stats',
 '/en/squads/cd051869/Brentford-Stats',
 '/en/squads/fd962109/Fulham-Stats',
 '/en/squads/47c64c55/Crystal-Palace-Stats',
 '/en/squads/d3fd31cc/Everton-Stats',
 '/en/squads/7c21e445/West-Ham-United-Stats',
 '/en/squads/19538871/Manchester-United-Stats',
 '/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
 '/en/squads/361ca564/Tottenham-Hotspur-Stats',
 '/en/squads/a2d435b3/Leicester-City-Stats',
 '/en/squads/b74092de/Ipswich-Town-Stats',
 '/en/squads/33c895d4/Southampton-Stats']

In [9]:
# format the links accordingly(using https prefix to create absolute links)
team_links = [f"https://fbref.com{l}" for l in links]
team_links


['https://fbref.com/en/squads/822bd0ba/Liverpool-Stats',
 'https://fbref.com/en/squads/18bb7c10/Arsenal-Stats',
 'https://fbref.com/en/squads/b8fd03ef/2024-2025/Manchester-City-Stats',
 'https://fbref.com/en/squads/cff3d9bb/2024-2025/Chelsea-Stats',
 'https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats',
 'https://fbref.com/en/squads/8602292d/Aston-Villa-Stats',
 'https://fbref.com/en/squads/e4a775cb/Nottingham-Forest-Stats',
 'https://fbref.com/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 'https://fbref.com/en/squads/4ba7cbea/Bournemouth-Stats',
 'https://fbref.com/en/squads/cd051869/Brentford-Stats',
 'https://fbref.com/en/squads/fd962109/Fulham-Stats',
 'https://fbref.com/en/squads/47c64c55/Crystal-Palace-Stats',
 'https://fbref.com/en/squads/d3fd31cc/Everton-Stats',
 'https://fbref.com/en/squads/7c21e445/West-Ham-United-Stats',
 'https://fbref.com/en/squads/19538871/Manchester-United-Stats',
 'https://fbref.com/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
 'ht

Now that we have the links to the individual clubs statistics, get the data from them.
    - data selected from the html documents in the documents above will be scores & fixture data and 
    - some data columns from the shootings table

pandas read_html method will be used to get the data from the HTML document (id for matching data from the specific table)
columns to select from shootings table:
    

In [10]:
# scrape the score and fixtures table from the first link('Liverpool')

import cloudscraper
# use cloudsscraper to scrape from the team_links
team_link = team_links[0]

# initialise the cloudscraper and scrape data from the link
scraper = cloudscraper.create_scraper()

response = scraper.get(team_link)


In [11]:
# find the scores and fixtures table using pandas
table_sf = pd.read_html(response.text, match= 'Scores & Fixtures') # use 'Scores & Fixtures' caption to match the table

# 
stats_table = table_sf[0] 
stats_table.head(10)

  table_sf = pd.read_html(response.text, match= 'Scores & Fixtures') # use 'Scores & Fixtures' caption to match the table


Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Opp Formation,Referee,Match Report,Notes
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2,0,Ipswich Town,2.6,0.5,62.0,30014.0,Virgil van Dijk,4-2-3-1,4-2-3-1,Tim Robinson,Match Report,
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2,0,Brentford,2.5,0.5,62.0,60017.0,Virgil van Dijk,4-2-3-1,4-4-2,Stuart Attwell,Match Report,
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3,0,Manchester Utd,1.8,1.4,47.0,73738.0,Virgil van Dijk,4-2-3-1,4-2-3-1,Anthony Taylor,Match Report,
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,0.9,0.4,68.0,60344.0,Virgil van Dijk,4-2-3-1,4-2-3-1,Michael Oliver,Match Report,
4,2024-09-17,21:00,Champions Lg,League phase,Tue,Away,W,3,1,it Milan,3.1,0.6,51.0,59826.0,Virgil van Dijk,4-2-3-1,4-2-3-1,Espen Eskås,Match Report,
5,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Home,W,3,0,Bournemouth,2.0,1.1,58.0,60347.0,Virgil van Dijk,4-2-3-1,4-2-3-1,Tony Harrington,Match Report,
6,2024-09-25,20:00,EFL Cup,Third round,Wed,Home,W,5,1,West Ham,,,61.0,60044.0,Joe Gomez,4-2-3-1,4-2-3-1,Andy Madley,Match Report,
7,2024-09-28,17:30,Premier League,Matchweek 6,Sat,Away,W,2,1,Wolves,2.5,0.6,55.0,31413.0,Virgil van Dijk,4-2-3-1,4-1-4-1,Anthony Taylor,Match Report,
8,2024-10-02,20:00,Champions Lg,League phase,Wed,Home,W,2,0,it Bologna,1.2,0.6,51.0,59816.0,Virgil van Dijk,4-2-3-1,4-1-4-1,Nikola Dabanović,Match Report,
9,2024-10-05,12:30,Premier League,Matchweek 7,Sat,Away,W,1,0,Crystal Palace,1.4,0.6,68.0,25185.0,Virgil van Dijk,4-2-3-1,3-4-3,Simon Hooper,Match Report,


In [12]:
stats_table.shape

(56, 20)

## Extract Shooting data
This part focused on: 

    - on extracting the shooting_stats link from the first team_links(team_links[0])

    - parsing the shooting_stats link to exract information on the shooting_stats table

    - creating shooting_table with pandas
    
    - merging the stats_table and specific column on the shooting_table

In [13]:
# extract the html document from  team_link and parse it

# extract html
response = scraper.get(team_link)

# parse
soup = BeautifulSoup(response.text, 'html.parser')

In [14]:
# extract the shooting stats link
links = soup.find_all('a')

#list comprehension to extract the shooting_stats link
links = [l.get('href') for l in links]


In [15]:
# Filter for keeper match logs
shooting_stats = [l for l in links if l and '/all_comps/shooting' in l]
shooting_stats
#href="/en/squads/822bd0ba/2024-2025/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions"

['/en/squads/822bd0ba/2024-2025/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions',
 '/en/squads/822bd0ba/2024-2025/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions',
 '/en/squads/822bd0ba/2024-2025/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions',
 '/en/squads/822bd0ba/2024-2025/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions']

In [16]:
shooting_stats = [f'https://fbref.com{l}' for l in shooting_stats]
shooting_stats 

['https://fbref.com/en/squads/822bd0ba/2024-2025/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions',
 'https://fbref.com/en/squads/822bd0ba/2024-2025/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions',
 'https://fbref.com/en/squads/822bd0ba/2024-2025/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions',
 'https://fbref.com/en/squads/822bd0ba/2024-2025/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions']

In [17]:
# use scrapper to get data
data = scraper.get(shooting_stats[0])

shooting_table = pd.read_html(data.text, attrs={"id":"matchlogs_for"})
shooting_table = shooting_table[0]

  shooting_table = pd.read_html(data.text, attrs={"id":"matchlogs_for"})


In [18]:
shooting_table.shape

(57, 26)

In [19]:
# drop extra level
shooting_table.columns = shooting_table.columns.droplevel()
shooting_table.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2,0,Ipswich Town,...,14.8,0.0,0,0,2.6,2.6,0.15,-0.6,-0.6,Match Report
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2,0,Brentford,...,13.6,1.0,0,0,2.5,2.5,0.14,-0.5,-0.5,Match Report
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3,0,Manchester Utd,...,13.4,0.0,0,0,1.8,1.8,0.16,1.2,1.2,Match Report
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,...,14.9,0.0,0,0,0.9,0.9,0.07,-0.9,-0.9,Match Report
4,2024-09-17,21:00,Champions Lg,League phase,Tue,Away,W,3,1,it Milan,...,15.7,1.0,0,0,3.1,3.1,0.14,-0.1,-0.1,Match Report


In [20]:
shooting_table.shape

(57, 26)

In [21]:
# concatenate the dataframes on date column
epl_stats_2024 = stats_table.merge(shooting_table[['Date','Gls','Sh','SoT','SoT%', 'G/Sh','G/SoT','Dist','FK','PK','PKatt' ]], on = 'Date')
epl_stats_2024.head(10)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Gls,Sh,SoT,SoT%,G/Sh,G/SoT,Dist,FK,PK,PKatt
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2,0,Ipswich Town,...,2,18.0,5.0,27.8,0.11,0.4,14.8,0.0,0,0
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2,0,Brentford,...,2,19.0,8.0,42.1,0.11,0.25,13.6,1.0,0,0
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3,0,Manchester Utd,...,3,11.0,3.0,27.3,0.27,1.0,13.4,0.0,0,0
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,...,0,14.0,5.0,35.7,0.0,0.0,14.9,0.0,0,0
4,2024-09-17,21:00,Champions Lg,League phase,Tue,Away,W,3,1,it Milan,...,3,23.0,11.0,47.8,0.13,0.27,15.7,1.0,0,0
5,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Home,W,3,0,Bournemouth,...,3,19.0,12.0,63.2,0.16,0.25,16.6,0.0,0,0
6,2024-09-25,20:00,EFL Cup,Third round,Wed,Home,W,5,1,West Ham,...,5,21.0,11.0,52.4,0.24,0.45,,,0,0
7,2024-09-28,17:30,Premier League,Matchweek 6,Sat,Away,W,2,1,Wolves,...,2,9.0,5.0,55.6,0.11,0.2,18.0,1.0,1,1
8,2024-10-02,20:00,Champions Lg,League phase,Wed,Home,W,2,0,it Bologna,...,2,9.0,4.0,44.4,0.22,0.5,15.4,0.0,0,0
9,2024-10-05,12:30,Premier League,Matchweek 7,Sat,Away,W,1,0,Crystal Palace,...,1,16.0,4.0,25.0,0.06,0.25,18.9,0.0,0,0


In [3]:
import pandas as pd

In [22]:
print(type(epl_stats_2024))

<class 'pandas.core.frame.DataFrame'>


In [25]:
# save df to csv
epl_stats_2024.to_csv('liverpool_stats_2024.csv', index = False)

# Exracting the data for the rest of the teams using loops

    - 