In [1]:
# Imports and Directory Setup

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Define base directory for the project.
# The default below is specific to one machine. To run the notebook elsewhere
# without editing it, set the PL_HOME_ADVANTAGE_DIR environment variable to the
# project root before starting the kernel. When the variable is unset or empty
# the default location is used.
default_project_dir = r"C:\Users\matth\OneDrive\Documents\data_science_project\premier-league-home-advantage"
project_dir = os.environ.get("PL_HOME_ADVANTAGE_DIR") or default_project_dir

# Define the directory where the processed (cleaned) data is stored
processed_data_dir = os.path.join(project_dir, "data", "processed_data")

# Define the path to the combined match data file
match_data_file = os.path.join(processed_data_dir, "all_seasons_match_data.csv")

# Define the directory paths for saving outputs
# (these are only path strings; nothing is created on disk here)
output_dir = os.path.join(project_dir, "output")
figures_dir = os.path.join(output_dir, "figures")
tables_dir = os.path.join(output_dir, "tables")

# Echo the resolved locations so a wrong project root is spotted immediately
print("Required libraries imported.")
print(f"Processed data directory: {processed_data_dir}")
print(f"Match data file path: {match_data_file}")
print(f"Figures output directory: {figures_dir}")
print(f"Tables output directory: {tables_dir}")

# Set a default style for plots: apply seaborn's "whitegrid" theme once here,
# in the setup cell, rather than styling each figure individually.
sns.set_theme(style="whitegrid")

Required libraries imported.
Processed data directory: C:\Users\matth\OneDrive\Documents\data_science_project\premier-league-home-advantage\data\processed_data
Match data file path: C:\Users\matth\OneDrive\Documents\data_science_project\premier-league-home-advantage\data\processed_data\all_seasons_match_data.csv
Figures output directory: C:\Users\matth\OneDrive\Documents\data_science_project\premier-league-home-advantage\output\figures
Tables output directory: C:\Users\matth\OneDrive\Documents\data_science_project\premier-league-home-advantage\output\tables


In [2]:
# Load Match Data
#
# Reads the combined match CSV into `match_df`. Any failure is reported on
# stdout and leaves `match_df` set to None so later cells can test for it.

print(f"\nLoading match data from: {match_data_file}")
try:
    match_df = pd.read_csv(match_data_file, encoding="utf-8")
    print(f" Successfully loaded match data. Shape: {match_df.shape}")
except FileNotFoundError:
    # The CSV is not where the setup cell said it would be
    match_df = None
    print(f"Error: File not found at {match_data_file}.")
    print(" Please ensure the file exists in the processed_data directory.")
except Exception as load_error:
    # Anything else (parse error, bad encoding, permissions, ...)
    match_df = None
    print(f"An error occurred loading the match data: {load_error}")


Loading match data from: C:\Users\matth\OneDrive\Documents\data_science_project\premier-league-home-advantage\data\processed_data\all_seasons_match_data.csv
 Successfully loaded match data. Shape: (2280, 12)


In [5]:
# Initial Data Inspection
#
# Shows the leading rows, the column/dtype report and summary statistics for
# `match_df`, or a single message when the load step left it as None.

if match_df is None:
    # Nothing to inspect: the load cell reported a failure
    print("\nCannot perform inspection because match data failed to load.")
else:
    # Preview of the first rows
    print("\n--- First 5 Rows (Head) ---")
    display(match_df.head())

    # .info() writes its own report to stdout, so it is called bare
    print("\n--- DataFrame Info ---")
    match_df.info()

    # include='all' adds the object/category columns to the numeric summary
    print("\n--- Summary Statistics ---")
    display(match_df.describe(include="all"))


--- First 5 Rows (Head) ---


Unnamed: 0,dayofweek,date,start_time,home_team,home_xg,score,away_xg,away_team,attendance,venue,referee,season
0,Fri,2018-08-10,20:00,Manchester Utd,1.5,Score: 2 - 1,1.8,Leicester City,74439.0,Old Trafford,Andre Marriner,2018-2019
1,Sat,2018-08-11,12:30,Newcastle Utd,1.0,Score: 1 - 2,2.0,Tottenham,51749.0,St. James' Park,Martin Atkinson,2018-2019
2,Sat,2018-08-11,15:00,Fulham,0.7,Score: 0 - 2,1.0,Crystal Palace,24821.0,Craven Cottage,Mike Dean,2018-2019
3,Sat,2018-08-11,15:00,Bournemouth,2.2,Score: 2 - 0,1.4,Cardiff City,10353.0,Vitality Stadium,Kevin Friend,2018-2019
4,Sat,2018-08-11,15:00,Huddersfield,0.3,Score: 0 - 3,1.9,Chelsea,24121.0,The John Smith's Stadium,Chris Kavanagh,2018-2019



--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2280 entries, 0 to 2279
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   dayofweek   2280 non-null   object 
 1   date        2280 non-null   object 
 2   start_time  2280 non-null   object 
 3   home_team   2280 non-null   object 
 4   home_xg     2280 non-null   float64
 5   score       2280 non-null   object 
 6   away_xg     2280 non-null   float64
 7   away_team   2280 non-null   object 
 8   attendance  2280 non-null   float64
 9   venue       2280 non-null   object 
 10  referee     2280 non-null   object 
 11  season      2280 non-null   object 
dtypes: float64(3), object(9)
memory usage: 213.9+ KB

--- Summary Statistics ---


Unnamed: 0,dayofweek,date,start_time,home_team,home_xg,score,away_xg,away_team,attendance,venue,referee,season
count,2280,2280,2280,2280,2280.0,2280,2280.0,2280,2280.0,2280,2280,2280
unique,7,718,24,28,,46,,28,,31,37,6
top,Sat,2024-05-19,15:00,Manchester Utd,,Score: 1 - 1,,Wolves,,Old Trafford,Anthony Taylor,2018-2019
freq,1065,10,712,114,,239,,114,,114,177,380
mean,,,,,1.518333,,1.254167,,31118.999123,,,
std,,,,,0.855207,,0.761324,,21580.506976,,,
min,,,,,0.0,,0.0,,0.0,,,
25%,,,,,0.9,,0.7,,16876.0,,,
50%,,,,,1.4,,1.1,,31062.0,,,
75%,,,,,2.1,,1.7,,52123.25,,,


In [8]:
# Feature Engineering from Score

# Matches the expected "Score: H - A" text and looser variants of it: any
# prefix ending in a colon (or no prefix), optional whitespace around the
# separator, and a hyphen, en dash (U+2013) or em dash (U+2014) between the two
# goal counts. The pattern is anchored so trailing junk yields no match at all
# instead of a half-parsed score.
SCORE_PATTERN = r'^(?:.*:)?\s*(\d+)\s*[-\u2013\u2014]\s*(\d+)\s*$'

# Columns derived from 'score', in the order they are created
SCORE_FEATURE_COLUMNS = ['home_goals', 'away_goals', 'goal_difference', 'home_points']


def add_score_features(df):
    """Return a copy of ``df`` with goals, goal difference and home points added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a text column named ``score`` (e.g. ``"Score: 2 - 1"``).

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. The added columns are
        ``home_goals``, ``away_goals``, ``goal_difference`` (home minus away)
        and ``home_points`` (3 for a home win, 1 for a draw, 0 for a home
        loss). A row whose score does not match ``SCORE_PATTERN`` gets NaN in
        all four columns.

    Raises
    ------
    KeyError
        If ``df`` has no ``score`` column.
    AttributeError
        If the ``score`` column is not string-like (raised by the ``.str``
        accessor).
    """
    featured = df.copy()

    # One regex pass gives a two-column frame: 0 = home goals, 1 = away goals.
    # Unlike chained str.split calls this always yields both columns, even
    # when no row matches.
    goals = featured['score'].str.extract(SCORE_PATTERN)

    # Convert to numeric, coercing anything unparsable to NaN
    featured['home_goals'] = pd.to_numeric(goals[0], errors='coerce')
    featured['away_goals'] = pd.to_numeric(goals[1], errors='coerce')

    # Calculate Goal Difference
    featured['goal_difference'] = featured['home_goals'] - featured['away_goals']

    # Calculate Home Points. Comparisons involving NaN are all False, so
    # unparsed rows fall through to the NaN default.
    conditions = [
        featured['home_goals'] > featured['away_goals'],   # Home Win
        featured['home_goals'] == featured['away_goals'],  # Draw
        featured['home_goals'] < featured['away_goals'],   # Home Loss
    ]
    point_values = [3, 1, 0]
    featured['home_points'] = np.select(conditions, point_values, default=np.nan)

    return featured


# Check if match_df exists (it is None when the load cell failed)
if 'match_df' in locals() and match_df is not None:
    print("\nPerforming feature engineering (Goals, Difference, Points)...")

    # Check if 'score' column exists
    if 'score' in match_df.columns:
        try:
            # The helper works on a copy, so a failure part-way through leaves
            # match_df without any half-built feature columns.
            match_df = add_score_features(match_df)

            # Check for parsing errors (NaNs)
            if match_df['home_goals'].isnull().any() or match_df['away_goals'].isnull().any():
                print("Warning: Some scores may not have been parsed correctly (resulting in NaN goals).")

            print("Feature engineering complete.")

            # Verification
            print("\n--- Verification of new columns (first 5 rows) ---")
            display(match_df[['score'] + SCORE_FEATURE_COLUMNS].head())

        except Exception as e:
            print(f"Error during feature engineering: {e}")
            # Ensure columns exist as NaN if error occurs, so later cells can
            # still reference them. assign() returns a new frame rather than
            # mutating the one loaded earlier.
            missing_columns = [col for col in SCORE_FEATURE_COLUMNS if col not in match_df.columns]
            match_df = match_df.assign(**{col: np.nan for col in missing_columns})

    else:
        print("Error: 'score' column not found in DataFrame.")

else:
    print("\nSkipping feature engineering because match_df DataFrame not available.")


Performing feature engineering (Goals, Difference, Points)...
Feature engineering complete.

--- Verification of new columns (first 5 rows) ---


Unnamed: 0,score,home_goals,away_goals,goal_difference,home_points
0,Score: 2 - 1,2,1,1,3.0
1,Score: 1 - 2,1,2,-1,0.0
2,Score: 0 - 2,0,2,-2,0.0
3,Score: 2 - 0,2,0,2,3.0
4,Score: 0 - 3,0,3,-3,0.0
