In [1]:
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup

In [2]:
# Define the webscraping URL
standingsUrl = "https://fbref.com/en/comps/9/2021-2022/2021-2022-Premier-League-Stats"
# Download the html of the page
data = requests.get(standingsUrl)
# Initialize BeautifulSoup
soup = BeautifulSoup(data.text)
# Use CSS selecting to choose the table from the URL
standingsTable = soup.select('table.stats_table')[0]
# Find the 'a' tags, get the href properties for each link, and filter the links to remove unneeded links
links = standingsTable.find_all('a')
links = [l.get("href") for l in links]
links = [l for l in links if '/squads/' in l]

In [3]:
# Turn links into full URLs
teamUrls = [f"https://fbref.com{l}" for l in links]
teamUrls

['https://fbref.com/en/squads/b8fd03ef/2021-2022/Manchester-City-Stats',
 'https://fbref.com/en/squads/822bd0ba/2021-2022/Liverpool-Stats',
 'https://fbref.com/en/squads/cff3d9bb/2021-2022/Chelsea-Stats',
 'https://fbref.com/en/squads/361ca564/2021-2022/Tottenham-Hotspur-Stats',
 'https://fbref.com/en/squads/18bb7c10/2021-2022/Arsenal-Stats',
 'https://fbref.com/en/squads/19538871/2021-2022/Manchester-United-Stats',
 'https://fbref.com/en/squads/7c21e445/2021-2022/West-Ham-United-Stats',
 'https://fbref.com/en/squads/a2d435b3/2021-2022/Leicester-City-Stats',
 'https://fbref.com/en/squads/d07537b9/2021-2022/Brighton-and-Hove-Albion-Stats',
 'https://fbref.com/en/squads/8cec06e1/2021-2022/Wolverhampton-Wanderers-Stats',
 'https://fbref.com/en/squads/b2b47a98/2021-2022/Newcastle-United-Stats',
 'https://fbref.com/en/squads/47c64c55/2021-2022/Crystal-Palace-Stats',
 'https://fbref.com/en/squads/cd051869/2021-2022/Brentford-Stats',
 'https://fbref.com/en/squads/8602292d/2021-2022/Aston-Vill

In [4]:
# Focus on 1st link and figure out what is needed from it
teamUrl = teamUrls[0]
data = requests.get(teamUrl)
# Use pandas to read the table out of the html & turn it into a dataframe
matches = pd.read_html(data.text, match = "Scores & Fixtures")
# Now you can view the pandas dataframe
matches[0]

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2021-08-07,17:15,Community Shield,FA Community Shield,Sat,Neutral,L,0,1,Leicester City,,,57,,Fernandinho,4-3-3,Paul Tierney,Match Report,
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,1.9,1.3,64,58262.0,Fernandinho,4-3-3,Anthony Taylor,Match Report,
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,2.7,0.1,67,51437.0,İlkay Gündoğan,4-3-3,Graham Scott,Match Report,
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,3.8,0.1,80,52276.0,İlkay Gündoğan,4-3-3,Martin Atkinson,Match Report,
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,2.9,0.8,61,32087.0,İlkay Gündoğan,4-3-3,Paul Tierney,Match Report,
5,2021-09-15,20:00,Champions Lg,Group stage,Wed,Home,W,6,3,de RB Leipzig,2.1,0.6,51,38062.0,Rúben Dias,4-3-3,Serdar Gözübüyük,Match Report,
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,1.1,0.4,63,52698.0,Fernandinho,4-3-3,Jonathan Moss,Match Report,
7,2021-09-21,19:45,EFL Cup,Third round,Tue,Home,W,6,1,Wycombe,,,79,30959.0,Kevin De Bruyne,4-3-3,Robert Jones,Match Report,
8,2021-09-25,12:30,Premier League,Matchweek 6,Sat,Away,W,1,0,Chelsea,1.7,0.3,60,40036.0,Rúben Dias,4-3-3,Michael Oliver,Match Report,
9,2021-09-28,21:00,Champions Lg,Group stage,Tue,Away,L,0,2,fr Paris S-G,1.9,0.8,54,37350.0,Rúben Dias,4-3-3,Carlos del Cerro,Match Report,


In [5]:
# More data is needed (shooting data) so repeat previous steps but for the shooting data
soup = BeautifulSoup(data.text)
links = soup.find_all('a')
links = [l.get("href") for l in links]
links = [l for l in links if l and 'all_comps/shooting/' in l]
links

['/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions']

In [6]:
# Download the html of the shooting page
data = requests.get(f"http://fbref.com{links[0]}")
# Read in the shooting data and turn into dataframe
shootingStats = pd.read_html(data.text, match = "Shooting")[0]
# There are 2 index levels so drop 1
shootingStats.columns = shootingStats.columns.droplevel()
# Now you can view the shooting dataframe
shootingStats.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2021-08-07,17:15,Community Shield,FA Community Shield,Sat,Neutral,L,0,1,Leicester City,...,,,0,0,,,,,,Match Report
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,16.9,1.0,0,0,1.9,1.9,0.11,-1.9,-1.9,Match Report
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,17.3,1.0,0,0,2.7,2.7,0.17,1.3,1.3,Match Report
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,14.3,0.0,0,0,3.8,3.8,0.15,1.2,1.2,Match Report
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,14.0,0.0,0,0,2.9,2.9,0.12,-1.9,-1.9,Match Report


In [7]:
matches = matches[0]

In [8]:
# Combine the 2 dataframes into 1. Avoid duplicate columns
teamData = matches.merge(shootingStats[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on = "Date")
teamData.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2021-08-07,17:15,Community Shield,FA Community Shield,Sat,Neutral,L,0,1,Leicester City,...,4-3-3,Paul Tierney,Match Report,,12,3,,,0,0
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,4-3-3,Anthony Taylor,Match Report,,18,4,16.9,1.0,0,0
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,4-3-3,Graham Scott,Match Report,,16,4,17.3,1.0,0,0
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,4-3-3,Martin Atkinson,Match Report,,25,10,14.3,0.0,0,0
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,4-3-3,Paul Tierney,Match Report,,25,8,14.0,0.0,0,0


In [9]:
# So far the standings have been scraped,
# The data has been downloaded,
# And the data for a single team from a single season has been combined into 1 dataframe
# Next the same will be done but for multiple teams and multiple seasons

In [10]:
# Pick which years to scrape and put it into a list
years = list(range(2022, 2020, -1))
years

[2022, 2021]

In [11]:
# Initialize a list that will contain several dataframes that will be combined into 1
allMatches = []
# Define the starting URL
standingsUrl = "https://fbref.com/en/comps/9/11160/2021-2022-Premier-League-Stats"

In [17]:
# Here is the for-loop that will scrape data for multiple seasons and teams
# Many of the previous steps are done again here

for year in years:
    data = requests.get(standingsUrl)
    soup = BeautifulSoup(data.text)
    standingsTable = soup.select('table.stats_table')[0]
    
    links = [l.get("href") for l in standingsTable.find_all('a')]
    links = [l for l in links if '/squads/' in l]
    teamUrls = [f"https://fbref.com{l}" for l in links]
    
# Grab the URL for previous seasons
    previousSeason = soup.select("a.prev")[0].get("href")
    standingsUrl = f"http://fbref.com/{previousSeason}"
    
# Here we will individually scrape match logs for each team
    for teamUrl in teamUrls:
# Set the team names properly from the URL link
        teamName = teamUrl.split("/")[-1].replace("-Stats", "").replace("-", " ")
# Parse table
        data = requests.get(teamUrl)
        matches = pd.read_html(data.text, match="Scores & Fixtures")[0]
# Pull the shooting link
        soup = BeautifulSoup(data.text)
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]
# Convert to absolute URL and read in data
        data = requests.get(f"https://fbref.com{links[0]}")
        shootingStats
        
# Some teams don't have shooting data so the try method is used to skip them to avoid errors
        try:
            teamData
        except ValueError:
            continue
# Filter out data that is not in premier league
        teamData = teamData[teamData['Comp'] == "Premier League"]
# Add columns to represent the season and team
        teamData["Season"] = year
        teamData["Team"] = teamName
# Add teamData to the list of dataframes
        allMatches.append(teamData)
# Sleep to avoid being blocked by the website for scraping too quickly
        time.sleep(5)

In [61]:
# Combine all of the dataframes into 1
matchDF = pd.concat(allMatches)

In [64]:
# Delete any duplicate data
matchDF = matchDF.drop_duplicates(['Date', 'Time', 'Team', 'Season'])

In [69]:
# Shape shows the number of rows and columns are in the dataset
# Number of rows should equal the total number of games played for all teams
matchDF.shape

(1634, 27)

In [65]:
# Show the number of games each team has played
matchDF['Team'].value_counts()

Arsenal                     76
Leicester City              76
Brighton and Hove Albion    76
Watford                     76
Southampton                 76
Burnley                     76
Chelsea                     76
Everton                     76
West Ham United             76
Manchester City             76
Wolverhampton Wanderers     76
Crystal Palace              76
Bournemouth                 76
Tottenham Hotspur           76
Manchester United           76
Fulham                      76
Newcastle United            76
Liverpool                   76
West Bromwich Albion        38
Aston Villa                 38
Norwich City                38
Cardiff City                38
Leeds United                38
Sheffield United            38
Huddersfield Town           38
Name: Team, dtype: int64

In [70]:
# Reorder the columns to whatever is convenient
matchDF = matchDF[['Team', 'Opponent', 'Round', 'Season', 'Result', 'Date', 'Time', 'GF', 'GA', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt', 'xG', 'xGA', 'Poss', 'Captain', 'Referee', 'Attendance', 'Formation', 'Match Report', 'Notes', 'Day', 'Venue', 'Comp']]
matchDF

Unnamed: 0,Team,Opponent,Round,Season,Result,Date,Time,GF,GA,Sh,...,Poss,Captain,Referee,Attendance,Formation,Match Report,Notes,Day,Venue,Comp
1,Manchester City,Tottenham,Matchweek 1,2022,L,2021-08-15,16:30,0,1,18,...,64,Fernandinho,Anthony Taylor,58262.0,4-3-3,Match Report,,Sun,Away,Premier League
2,Manchester City,Norwich City,Matchweek 2,2022,W,2021-08-21,15:00,5,0,16,...,67,İlkay Gündoğan,Graham Scott,51437.0,4-3-3,Match Report,,Sat,Home,Premier League
3,Manchester City,Arsenal,Matchweek 3,2022,W,2021-08-28,12:30,5,0,25,...,80,İlkay Gündoğan,Martin Atkinson,52276.0,4-3-3,Match Report,,Sat,Home,Premier League
4,Manchester City,Leicester City,Matchweek 4,2022,W,2021-09-11,15:00,1,0,25,...,61,İlkay Gündoğan,Paul Tierney,32087.0,4-3-3,Match Report,,Sat,Away,Premier League
6,Manchester City,Southampton,Matchweek 5,2022,D,2021-09-18,15:00,0,0,16,...,63,Fernandinho,Jonathan Moss,52698.0,4-3-3,Match Report,,Sat,Home,Premier League
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,Huddersfield Town,Leeds United,Matchweek 35,2021,W,2022-04-30,17:30,4,0,19,...,59,İlkay Gündoğan,Paul Tierney,35771.0,4-2-3-1,Match Report,,Sat,Away,Premier League
54,Huddersfield Town,Newcastle Utd,Matchweek 36,2021,W,2022-05-08,16:30,5,0,21,...,71,İlkay Gündoğan,Stuart Attwell,53336.0,4-2-3-1,Match Report,,Sun,Home,Premier League
55,Huddersfield Town,Wolves,Matchweek 33,2021,W,2022-05-11,20:15,5,1,15,...,67,Fernandinho,Martin Atkinson,32000.0,4-2-3-1,Match Report,,Wed,Away,Premier League
56,Huddersfield Town,West Ham,Matchweek 37,2021,D,2022-05-15,14:00,2,2,30,...,78,Fernandinho,Anthony Taylor,59972.0,4-3-3,Match Report,,Sun,Away,Premier League


In [68]:
# Write to a csv file
matchDF.to_csv("matches2.csv")