# In [None]:
# Import necessary modules

import datetime
import io
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup


# URL of the Premier League standings
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

# Get the web page content. The timeout keeps the script from hanging forever
# on a stalled connection, and raise_for_status() surfaces HTTP errors here
# instead of as a confusing IndexError when the table lookup below finds
# nothing.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

# Parse the page with an explicitly named parser so the result does not
# depend on which optional parsers happen to be installed.
soup = BeautifulSoup(data.text, "html.parser")

# The first table with class "stats_table" is used as the standings table
standings_table = soup.select('table.stats_table')[0]

# Collect the href value of every link in the standings table
links = [anchor.get("href") for anchor in standings_table.find_all('a')]

# Keep only squad links. Anchors without an href yield None, which would
# raise a TypeError in the substring test, so they are filtered out first.
links = [href for href in links if href and '/squads/' in href]

# Construct complete URLs for the teams
team_urls = [f"https://fbref.com{href}" for href in links]

# Fetch the first team page (fail fast on HTTP errors, never hang forever)
data = requests.get(team_urls[0], timeout=30)
data.raise_for_status()

# Extract 'Scores & Fixtures' table. The HTML is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated in recent pandas.
matches = pd.read_html(io.StringIO(data.text), match="Scores & Fixtures")[0]

# Parse the team page content using BeautifulSoup with an explicit parser
soup = BeautifulSoup(data.text, "html.parser")

# Collect the href value of every link in the page
links = [anchor.get("href") for anchor in soup.find_all('a')]

# Keep only links that contain 'all_comps/shooting/' (skipping missing hrefs)
links = [href for href in links if href and 'all_comps/shooting/' in href]
if not links:
    # Give a descriptive error instead of a bare IndexError on links[0]
    raise RuntimeError(f"No shooting link found on {team_urls[0]}")

# Fetch the shooting data
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

# Extract 'Shooting' table
shooting = pd.read_html(io.StringIO(data.text), match="Shooting")[0]

# Drop the top level of the multi-level column names in the shooting DataFrame
shooting.columns = shooting.columns.droplevel()

# Merge the matches and shooting data on the 'Date' column
team_data = matches.merge(
    shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
    on="Date",
)

# Years for which data will be collected.
# NOTE: the season label is taken from this list by position; it is not read
# from the page. It is therefore only correct when standings_url initially
# points at the season that corresponds to years[0].
years = list(range(2023, 2020, -1))

# List to store all match data
all_matches = []

# Pause (in seconds) applied after every HTTP request, so the delay is never
# skipped, not even when a team is dropped further down.
request_delay_seconds = 1

# Loop through all years
for year in years:
    # Fetch the standings page for the season currently being processed
    data = requests.get(standings_url, timeout=30)
    data.raise_for_status()
    time.sleep(request_delay_seconds)
    soup = BeautifulSoup(data.text, "html.parser")
    standings_table = soup.select('table.stats_table')[0]

    # Extract squad links; anchors without an href (None) are skipped
    links = [anchor.get("href") for anchor in standings_table.find_all('a')]
    links = [href for href in links if href and '/squads/' in href]
    team_urls = [f"https://fbref.com{href}" for href in links]

    # Get the link for the previous season; it becomes the standings page
    # fetched by the next iteration of the outer loop
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    # Loop through all teams
    for team_url in team_urls:
        # Extract team name from URL
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        # Fetch the team page and extract the 'Scores & Fixtures' table.
        # StringIO is used because passing literal HTML to read_html is
        # deprecated in recent pandas.
        data = requests.get(team_url, timeout=30)
        data.raise_for_status()
        time.sleep(request_delay_seconds)
        matches = pd.read_html(io.StringIO(data.text), match="Scores & Fixtures")[0]

        # Locate the link to the team's shooting page
        soup = BeautifulSoup(data.text, "html.parser")
        links = [anchor.get("href") for anchor in soup.find_all('a')]
        links = [href for href in links if href and 'all_comps/shooting/' in href]
        if not links:
            # No shooting page to merge with; skip instead of an IndexError
            print(f"Skipping {team_name} ({year}): no shooting link found")
            continue

        # Fetch the shooting page and flatten its two-level column header
        data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
        data.raise_for_status()
        time.sleep(request_delay_seconds)
        shooting = pd.read_html(io.StringIO(data.text), match="Shooting")[0]
        shooting.columns = shooting.columns.droplevel()

        # Merge matches and shooting data. Selecting columns that are absent
        # raises KeyError, so it is handled together with ValueError.
        try:
            team_data = matches.merge(
                shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
                on="Date",
            )
        except (KeyError, ValueError) as err:
            print(f"Skipping {team_name} ({year}): {err!r}")
            continue

        # Keep Premier League matches only; copy() so that the column
        # assignments below operate on an independent DataFrame rather than
        # on a slice (avoids SettingWithCopyWarning).
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()

        # Add season and team info to the DataFrame
        team_data["Season"] = year
        team_data["Team"] = team_name

        # Append to the list of all matches
        all_matches.append(team_data)

# Concatenate all match data into a single DataFrame
match_df = pd.concat(all_matches)

# Make sure the 'output' directory exists. exist_ok=True replaces the
# check-then-create sequence, which could fail if the directory appeared
# between the check and the creation, and is a no-op when it already exists.
os.makedirs('output', exist_ok=True)

# Save the DataFrame to a CSV file in the 'output' folder
match_df.to_csv("output/matches.csv")
