In [1]:
# Cell 1: Understat API (xG Data) for last two seasons
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup

def fetch_understat_data(season_start_year: int):
    url = f"https://understat.com/league/EPL/{season_start_year}"
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')

    # Extract JSON embedded in the page scripts
    scripts = soup.find_all('script')
    data = None
    for script in scripts:
        if 'teamsData' in script.text:
            json_text = script.text.split("JSON.parse('")[1].split("')")[0]
            json_text = json_text.encode('utf8').decode('unicode_escape')
            data = json.loads(json_text)
            break

    if not data:
        print(f"No Understat data found for season {season_start_year}.")
        return pd.DataFrame()

    matches = []
    for team, tdata in data.items():
        for match in tdata['history']:
            matches.append({
                'date': pd.to_datetime(match['date']),
                'team': tdata['title'],
                'xG': float(match['xG']),
                'xGA': float(match['xGA']),
                'result': match['result'],
                'season': season_start_year
            })
    df = pd.DataFrame(matches)
    return df

# Fetch data for last two seasons and concatenate
seasons = [2023, 2024]
understat_dfs = []

for season in seasons:
    print(f"Fetching Understat data for season {season}...")
    df_season = fetch_understat_data(season)
    understat_dfs.append(df_season)

understat_df = pd.concat(understat_dfs, ignore_index=True)
understat_df.to_csv('understat_data_last_two_seasons.csv', index=False)
print(f"Understat data for seasons {seasons} saved with {len(understat_df)} rows.")


Fetching Understat data for season 2023...
Fetching Understat data for season 2024...
Understat data for seasons [2023, 2024] saved with 1520 rows.


In [None]:
import requests
import pandas as pd
from time import sleep
from dotenv import load_dotenv
import os

load_dotenv()


API_KEY = os.getenv('API_KEY')
HEADERS = {'X-Auth-Token': API_KEY}

def fetch_matches_for_season(season_year):
    url = f"https://api.football-data.org/v4/competitions/PL/matches?season={season_year}"
    response = requests.get(url, headers=HEADERS)
    if response.status_code != 200:
        print(f"Failed to fetch data for season {season_year}: {response.status_code}")
        return pd.DataFrame()
    data = response.json()
    
    match_list = []
    for match in data.get('matches', []):
        full_time_score = match.get('score', {}).get('fullTime', {})
        home_score = full_time_score.get('home')
        away_score = full_time_score.get('away')
        match_list.append({
            'date': match['utcDate'],
            'home_team': match['homeTeam']['name'],
            'away_team': match['awayTeam']['name'],
            'home_score': home_score,
            'away_score': away_score,
            'status': match['status'],
            'season': season_year
        })
    return pd.DataFrame(match_list)

all_seasons_df = pd.DataFrame()

start_year = 2023
end_year = 2024  # Adjust for current year or last completed season

for year in range(start_year, end_year + 1):
    print(f"Fetching data for season {year}...")
    season_df = fetch_matches_for_season(year)
    all_seasons_df = pd.concat([all_seasons_df, season_df], ignore_index=True)
    sleep(1)  # be polite with API rate limits

# Convert date column to datetime
all_seasons_df['date'] = pd.to_datetime(all_seasons_df['date'])

print(f"Collected data for {len(all_seasons_df)} matches spanning {start_year}-{end_year}.")
all_seasons_df.to_csv('football_data_last_10_years.csv', index=False)

Fetching data for season 2023...
Fetching data for season 2024...
Collected data for 760 matches spanning 2023-2024.


In [14]:
import pandas as pd

# Load both datasets
understat_df = pd.read_csv('understat_data_last_two_seasons.csv', parse_dates=['date'])
football_df = pd.read_csv('football_data_last_10_years.csv', parse_dates=['date'])

# Remove timezone info to make dates comparable
football_df['date'] = football_df['date'].dt.tz_localize(None)
understat_df['date'] = understat_df['date'].dt.tz_localize(None)

# Prepare Understat data for home team merge
understat_home = understat_df.rename(columns={
    'team': 'home_team',
    'xG': 'home_xG',
    'xGA': 'home_xGA',
    'result': 'home_result'
})

# Prepare Understat data for away team merge
understat_away = understat_df.rename(columns={
    'team': 'away_team',
    'xG': 'away_xG',
    'xGA': 'away_xGA',
    'result': 'away_result'
})

# Merge Football-Data with Understat home team data
merged_df = pd.merge(
    football_df,
    understat_home[['date', 'home_team', 'home_xG', 'home_xGA', 'home_result']],
    on=['date', 'home_team'],
    how='left'
)

# Merge the above with Understat away team data
merged_df = pd.merge(
    merged_df,
    understat_away[['date', 'away_team', 'away_xG', 'away_xGA', 'away_result']],
    on=['date', 'away_team'],
    how='left'
)

# Optional: Check for any missing values and clean if needed
print(merged_df.isnull().sum())

# Save merged dataframe
merged_df.to_csv('merged_premier_league_data.csv', index=False)
print(f"Merged dataset saved with {len(merged_df)} rows.")


date             0
home_team        0
away_team        0
home_score       0
away_score       0
status           0
season           0
venue          760
home_xG        760
home_xGA       760
home_result    760
away_xG        760
away_xGA       760
away_result    760
dtype: int64
Merged dataset saved with 760 rows.
