# Webscraping Premier League Stat Data 

## Scraping our first webpage

In [1]:
import requests

In [2]:
standings_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'

In [3]:
data = requests.get(standings_url) #This generates the HTML string we need 

## Parsing HTML Links with BeautifulSoup

In [4]:
from bs4 import BeautifulSoup

In [5]:
soup = BeautifulSoup(data.text) #Intialize the soup object

In [6]:
standings_table = soup.select('table.stats_table')[0] #Returns standings table HTML

In [7]:
links = standings_table.find_all('a')
links = [l.get('href') for l in links]
links = [l for l in links if '/squads' in l] 
#Used two list comprehensions to filter for 'href' property, then filter again for 'a' tags that have 'squad' to extract teams

In [8]:
team_urls = [f'https://fbref.com{l}.com' for l in links] #Used format string to turn the links into absolute links

## Extracting match stats using pandas and requests

Lets start off with the first URL to build the logic we need

In [9]:
team1 = team_urls[0] #Manchester City Scores & Fixtures Page
team1

'https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats.com'

In [10]:
data = requests.get(team1)

In [11]:
import pandas as pd

In [12]:
matches = pd.read_html(data.text, match='Scores & Fixtures')

In [13]:
matches[0].head() #Used read_html to extract first match list element 

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2021-08-07,17:15,Community Shield,FA Community Shield,Sat,Neutral,L,0,1,Leicester City,,,57,,Fernandinho,4-3-3,Paul Tierney,Match Report,
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,1.9,1.3,64,58262.0,Fernandinho,4-3-3,Anthony Taylor,Match Report,
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,2.7,0.1,67,51437.0,İlkay Gündoğan,4-3-3,Graham Scott,Match Report,
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,3.8,0.1,80,52276.0,İlkay Gündoğan,4-3-3,Martin Atkinson,Match Report,
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,2.9,0.8,61,32087.0,İlkay Gündoğan,4-3-3,Paul Tierney,Match Report,


## Extracting match shooting stats with requests and pandas

In [14]:
soup = BeautifulSoup(data.text)
links = soup.find_all('a')
links = [l.get('href') for l in links] # Used to get the absolute URl 
links = [l for l in links if l and 'all_comps/shooting/' in l] #Closes in on the specific link
links #Same link is duplicated four times on the page

['/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions']

In [15]:
data = requests.get(f'https://fbref.com{links[0]}')
shooting = pd.read_html(data.text, match='Shooting')[0]

In [16]:
shooting.head()

Unnamed: 0_level_0,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,...,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2021-08-07,17:15,Community Shield,FA Community Shield,Sat,Neutral,L,0,1,Leicester City,...,,,0,0,,,,,,Match Report
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,16.9,1.0,0,0,1.9,1.9,0.11,-1.9,-1.9,Match Report
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,17.3,1.0,0,0,2.7,2.7,0.17,1.3,1.3,Match Report
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,14.3,0.0,0,0,3.8,3.8,0.15,1.2,1.2,Match Report
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,14.0,0.0,0,0,2.9,2.9,0.12,-1.9,-1.9,Match Report


# Cleaning and merging scraped data with pandas

The dataframe is a multindex so lets drop by one level

In [17]:
shooting.columns = shooting.columns.droplevel()

In [18]:
shooting.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2021-08-07,17:15,Community Shield,FA Community Shield,Sat,Neutral,L,0,1,Leicester City,...,,,0,0,,,,,,Match Report
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,16.9,1.0,0,0,1.9,1.9,0.11,-1.9,-1.9,Match Report
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,17.3,1.0,0,0,2.7,2.7,0.17,1.3,1.3,Match Report
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,14.3,0.0,0,0,3.8,3.8,0.15,1.2,1.2,Match Report
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,14.0,0.0,0,0,2.9,2.9,0.12,-1.9,-1.9,Match Report


We now have two dataframes. Our `matches` dataframe as well as our `shooting` dataframe. Lets proceed to combine the two dataframes based on the  using the `pd.merge()` function to merge the two dataframes. We'll merge the `shooting` dataframe on the `Date`,`Sh`,`SoT`,`Dist`,`FK`,`PK` and `PKatt` columns

In [19]:
team_data = matches[0].merge(shooting[['Date', 'Sh', 'Dist', 'FK', 'PK', 'PKatt']], on='Date')

In [20]:
team_data.shape

(58, 24)

In [21]:
shooting.shape

(59, 26)

In [22]:
team_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Captain,Formation,Referee,Match Report,Notes,Sh,Dist,FK,PK,PKatt
0,2021-08-07,17:15,Community Shield,FA Community Shield,Sat,Neutral,L,0,1,Leicester City,...,Fernandinho,4-3-3,Paul Tierney,Match Report,,12,,,0,0
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,Fernandinho,4-3-3,Anthony Taylor,Match Report,,18,16.9,1.0,0,0
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,İlkay Gündoğan,4-3-3,Graham Scott,Match Report,,16,17.3,1.0,0,0
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,İlkay Gündoğan,4-3-3,Martin Atkinson,Match Report,,25,14.3,0.0,0,0
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,İlkay Gündoğan,4-3-3,Paul Tierney,Match Report,,25,14.0,0.0,0,0


# Scraping data for multiple season and teams with a loop

In [23]:
years = list(range(2022, 2020, -1)) #years we want to scrape for

In [24]:
years

[2022, 2021]

In [25]:
standings_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'

In [26]:
all_matches=[]

In [27]:
import time
for year in years:
    data = requests.get(standings_url)
    soup = BeautifulSoup(data.text)
    standings_table = soup.select('table.stats_table')[0]

    links = [l.get("href") for l in standings_table.find_all('a')]
    links = [l for l in links if '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]
    
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"
    
    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        data = requests.get(team_url)
        matches = pd.read_html(data.text, match="Scores & Fixtures")[0]
        soup = BeautifulSoup(data.text)
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]
        data = requests.get(f"https://fbref.com{links[0]}")
        shooting = pd.read_html(data.text, match="Shooting")[0]
        shooting.columns = shooting.columns.droplevel()
        try:
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except ValueError:
            continue
        team_data = team_data[team_data["Comp"] == "Premier League"]
        
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)
        time.sleep(10)

In [28]:
match_df = pd.concat(all_matches)

In [29]:
match_df.columns = [c.lower() for c in match_df.columns]

In [30]:
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0,4,Tottenham,...,Match Report,,8.0,1.0,17.4,0.0,0.0,0.0,2021,Sheffield United
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0,2,Crystal Palace,...,Match Report,,7.0,0.0,11.4,1.0,0.0,0.0,2021,Sheffield United
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1,0,Everton,...,Match Report,,10.0,3.0,17.0,0.0,0.0,0.0,2021,Sheffield United
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0,1,Newcastle Utd,...,Match Report,,11.0,1.0,16.0,1.0,0.0,0.0,2021,Sheffield United


In [31]:
match_df.to_csv('matches.csv')