## Initial Preprocessing

In [233]:
import requests as re

In [234]:
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [235]:
data = re.get(standings_url)

In [236]:
#print(data.text)

In [237]:
#parse html using bsoup

In [238]:
from bs4 import BeautifulSoup

In [239]:
soup = BeautifulSoup(data.text)

In [240]:
#initializing the object with bs RENDERS THE SAME AS CELL 9
#soup

In [241]:
#create selector to scrape only the stats table in the page to scrape
#take only the table class 
standings_table = soup.select('table.stats_table')[0]
#find all 'a' tags, the hyperlinks
links = standings_table.find_all('a')

In [242]:
#get the href(the URLs) property for each link, using a list comprehension
links = [l.get('href') for l in links]

In [243]:
#get links for the squads
links = [l for l in links if '/squads/' in l]
links

['/en/squads/b8fd03ef/Manchester-City-Stats',
 '/en/squads/822bd0ba/Liverpool-Stats',
 '/en/squads/cff3d9bb/Chelsea-Stats',
 '/en/squads/18bb7c10/Arsenal-Stats',
 '/en/squads/361ca564/Tottenham-Hotspur-Stats',
 '/en/squads/19538871/Manchester-United-Stats',
 '/en/squads/7c21e445/West-Ham-United-Stats',
 '/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
 '/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 '/en/squads/b2b47a98/Newcastle-United-Stats',
 '/en/squads/a2d435b3/Leicester-City-Stats',
 '/en/squads/47c64c55/Crystal-Palace-Stats',
 '/en/squads/8602292d/Aston-Villa-Stats',
 '/en/squads/cd051869/Brentford-Stats',
 '/en/squads/33c895d4/Southampton-Stats',
 '/en/squads/943e8050/Burnley-Stats',
 '/en/squads/5bfb9659/Leeds-United-Stats',
 '/en/squads/d3fd31cc/Everton-Stats',
 '/en/squads/2abfe087/Watford-Stats',
 '/en/squads/1c781004/Norwich-City-Stats']

In [244]:
# turn links into full URLs (their absolute links)
team_urls = [f'https://fbref.com/{l}' for l in links]
team_urls

['https://fbref.com//en/squads/b8fd03ef/Manchester-City-Stats',
 'https://fbref.com//en/squads/822bd0ba/Liverpool-Stats',
 'https://fbref.com//en/squads/cff3d9bb/Chelsea-Stats',
 'https://fbref.com//en/squads/18bb7c10/Arsenal-Stats',
 'https://fbref.com//en/squads/361ca564/Tottenham-Hotspur-Stats',
 'https://fbref.com//en/squads/19538871/Manchester-United-Stats',
 'https://fbref.com//en/squads/7c21e445/West-Ham-United-Stats',
 'https://fbref.com//en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
 'https://fbref.com//en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 'https://fbref.com//en/squads/b2b47a98/Newcastle-United-Stats',
 'https://fbref.com//en/squads/a2d435b3/Leicester-City-Stats',
 'https://fbref.com//en/squads/47c64c55/Crystal-Palace-Stats',
 'https://fbref.com//en/squads/8602292d/Aston-Villa-Stats',
 'https://fbref.com//en/squads/cd051869/Brentford-Stats',
 'https://fbref.com//en/squads/33c895d4/Southampton-Stats',
 'https://fbref.com//en/squads/943e8050/Burnley-Stats',


## Extract Match Stats with Pandas and Requests

In [245]:
# get scores and fixtures table from web
import pandas as pd

In [246]:
# get only from the html site, the scores and matches table for man City in this case
data = re.get(team_urls[0])
matches = pd.read_html(data.text, match="Scores & Fixtures")[0]

## Get Match Shooting Stats

In [247]:
#same initial process as above
soup = BeautifulSoup(data.text)
links = soup.find_all('a')
links = [l.get('href') for l in links]

In [248]:
links = [l for l in links if l and 'all_comps/shooting' in l]
links

['/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions']

In [249]:
# add absolute url
shooting_url = [f'https://fbref.com/{l}' for l in links][0]
shooting_url

'https://fbref.com//en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions'

In [250]:
shooting = pd.read_html(shooting_url)
shooting = shooting[0] #multi level index, which is not that useful
shooting.columns = shooting.columns.droplevel() # If resulting index has only 1 level left, the result will be of Index type, not MultiIndex.

In [251]:
shooting.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2021-08-07,17:15,Community Shield,FA Community Shield,Sat,Neutral,L,0,1,Leicester City,...,,,0,0,,,,,,Match Report
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,16.9,1.0,0,0,1.9,1.9,0.11,-1.9,-1.9,Match Report
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,17.3,1.0,0,0,2.7,2.7,0.17,1.3,1.3,Match Report
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,14.3,0.0,0,0,3.8,3.8,0.15,1.2,1.2,Match Report
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,14.0,0.0,0,0,2.9,2.9,0.12,-1.9,-1.9,Match Report


## Combine Matches and Shooting DFs

In [252]:
# left merge of matches wrt shooting stats
team_data = matches.merge(shooting, how='left')
team_data.columns.duplicated(), team_data.shape

(array([False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False]),
 (58, 33))

# Scraping Data for Multiple Seasons and Teams

In [253]:
import time

In [254]:
#get a list of the years to scrape, like the last decade
# initialize an empty list 
years = list(range(2022,2011, -1))
all_matches = [] 

In [255]:
for year in years[:1]:
    data = re.get(standings_url)
    soup = BeautifulSoup(data.text)
    data = re.get(standings_url)
    standings_table = soup.select('table.stats_table')[0]

    links = [l.get("href") for l in standings_table.find_all('a')]
    links = [l for l in links if '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]
    
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com/{previous_season}"
    
    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        data = re.get(team_url)
        matches = pd.read_html(data.text, match="Scores & Fixtures")[0]
        soup = BeautifulSoup(data.text)
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]
        data = re.get(f"https://fbref.com{links[0]}")
        shooting = pd.read_html(data.text, match="Shooting")[0]
        shooting.columns = shooting.columns.droplevel()
        
        team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        team_data = team_data[team_data["Comp"] == "Premier League"]
        
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)
        matches_df = pd.concat(all_matches)

In [258]:
matches_df.columns = [c.lower() for c in matches_df.columns] #make the column names lowercase

In [259]:
matches_df.to_csv('Matches.csv')