In [None]:
import datetime
import os
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Standings page that the scrape starts from.
STANDINGS_URL = "https://fbref.com/en/comps/9/Premier-League-Stats"
# Season label attached to the first page scraped; main() counts down from
# here by one per iteration.
START_YEAR = 2023
# Exclusive lower bound of the countdown: with 2023 and 2020 the labels used
# are 2023, 2022 and 2021.
END_YEAR = 2020

def fetch_data(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without this a stalled
            connection would block the scraper indefinitely.

    Returns:
        The decoded body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx or 5xx status.
        requests.RequestException: For connection failures and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here instead of handing an error page to the HTML parsers, where
    # it would surface later as a confusing "no tables found" style error.
    response.raise_for_status()
    return response.text

def process_team_data(team_url, year):
    """Build one team's Premier League match table for a single season.

    Downloads the team page, reads its "Scores & Fixtures" table, follows the
    first link whose href contains ``all_comps/shooting/`` to read the
    "Shooting" table, and joins the two on the ``Date`` column.

    Args:
        team_url: Absolute URL of the team's page. The team name is derived
            from its last path segment.
        year: Value stored in the ``Season`` column of every returned row.

    Returns:
        A DataFrame restricted to rows whose ``Comp`` is "Premier League",
        with ``Season`` and ``Team`` columns added, or ``None`` when the page
        has no shooting link or the merge raises ``ValueError``.
    """
    page_html = fetch_data(team_url)
    # Passing literal HTML strings to read_html is deprecated in pandas, so
    # the text is wrapped in a file-like buffer.
    matches = pd.read_html(StringIO(page_html), match="Scores & Fixtures")[0]

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(page_html, "html.parser")
    # The substring test must run on the href string: `in` applied to a Tag
    # checks the tag's children, not its attributes.
    hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
    shooting_links = [
        href for href in hrefs if href and "all_comps/shooting/" in href
    ]
    if not shooting_links:
        # Without shooting stats there is nothing to merge; skip this team.
        return None

    shooting_html = fetch_data(f"https://fbref.com{shooting_links[0]}")
    shooting = pd.read_html(StringIO(shooting_html), match="Shooting")[0]
    # Drop the outer level of the two-level column header so the columns can
    # be selected by their plain names below.
    shooting.columns = shooting.columns.droplevel()

    try:
        team_data = matches.merge(
            shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
            on="Date",
        )
    except ValueError:
        return None

    # Copy the filtered rows so the column assignments below act on an
    # independent frame rather than on a slice of the merge result.
    team_data = team_data[team_data["Comp"] == "Premier League"].copy()
    team_data["Season"] = year
    team_data["Team"] = (
        team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
    )

    return team_data

def process_year_data(year, url):
    """Scrape every team linked from one season's standings page.

    Args:
        year: Season label forwarded to ``process_team_data`` for each team.
        url: Absolute URL of the standings page for that season.

    Returns:
        A tuple ``(all_matches, previous_url)``. ``all_matches`` is a list of
        the per-team DataFrames that ``process_team_data`` produced (teams
        for which it returned ``None`` are left out). ``previous_url`` is the
        absolute URL taken from the page's first ``a.prev`` link.

    Raises:
        IndexError: If the page has no ``table.stats_table`` element or no
            ``a.prev`` link.
    """
    page_html = fetch_data(url)
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(page_html, "html.parser")

    standings_table = soup.select("table.stats_table")[0]
    # The substring test must run on the href string: `in` applied to a Tag
    # checks the tag's children, not its attributes.
    hrefs = (anchor.get("href") for anchor in standings_table.find_all("a"))
    team_urls = [
        f"https://fbref.com{href}" for href in hrefs if href and "/squads/" in href
    ]

    all_matches = []
    for team_url in team_urls:
        team_data = process_team_data(team_url, year)
        if team_data is not None:
            all_matches.append(team_data)
        # Pause after every team, including skipped ones: the requests were
        # made either way, and the pause exists to avoid overwhelming the
        # server.
        time.sleep(1)

    previous_href = soup.select("a.prev")[0].get("href")
    return all_matches, f"https://fbref.com{previous_href}"

def main():
    """Scrape each configured season and write the combined rows to a CSV.

    Begins at ``STANDINGS_URL`` and counts season labels down from
    ``START_YEAR`` to ``END_YEAR`` (exclusive). Each call to
    ``process_year_data`` returns that season's frames together with the URL
    to use for the next iteration. The collected frames are concatenated and
    saved as ``matches.csv`` in the current working directory.
    """
    season_url = STANDINGS_URL
    collected_frames = []

    for season in range(START_YEAR, END_YEAR, -1):
        season_frames, season_url = process_year_data(season, season_url)
        collected_frames += season_frames

    pd.concat(collected_frames).to_csv("matches.csv")

# Start the scrape only when this file is executed as a script; importing it
# as a module defines the functions without running anything.
if __name__ == "__main__":
    main()
