# Data Cleaning and Aggregation

**Objective**: Collapse `whl_2025.csv` from shift-level to game-level and engineer features for ranking models.

## 1. Init & Load

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Location of the raw shift-level export
file_path = 'd:/A/Warton/Data/whl_2025.csv'

# Pin NumPy's global RNG so any later random step is repeatable
np.random.seed(42)

# Read the whole CSV into memory
df = pd.read_csv(file_path)

print(f"Loaded dataset with {df.shape[0]} records.")
df.head()

## 2. Data Aggregation

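Group the rows by `game_id` and reduce each game to a single record: goals, xG and shots are summed, `went_ot` takes the per-game maximum, and the team names take the first value in each group.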

In [None]:
# Per-column reducers applied inside each game_id group.
# Goals, xG and shots are accumulated across a game's rows.
summed_columns = [
    'home_goals', 'away_goals',
    'home_xg', 'away_xg',
    'home_shots', 'away_shots',
]
agg_rules = dict.fromkeys(summed_columns, 'sum')
agg_rules.update({
    'went_ot': 'max',      # per-game maximum of the overtime flag
    'home_team': 'first',  # first value seen in the group
    'away_team': 'first',
})

# One output row per game, with game_id restored as a regular column
games_grouped = df.groupby('game_id')
df_games = games_grouped.agg(agg_rules).reset_index()
print(f"Collapsed to {len(df_games)} unique games.")
df_games.head()

## 3. Feature Engineering

- **Winner**: Determine match winner.
- **Points**: Calculate league points (3-2-1-0 system: 3 for a regulation win, 2 for an overtime win, 1 for an overtime loss, 0 for a regulation loss).
- **Differentials**: GD and xGD.

In [None]:
# Outcome masks, shared by the winner label and the points allocation
home_win = df_games['home_goals'] > df_games['away_goals']
away_win = df_games['away_goals'] > df_games['home_goals']
is_ot = df_games['went_ot'] == 1

# Determine Winner
# Both strict masks are used so that a level score is labelled 'Tie' instead
# of silently falling through to 'Away'.
df_games['winner'] = np.select(
    [home_win, away_win],
    ['Home', 'Away'],
    default='Tie',
)

# Calculate Points (3 for Reg Win, 2 for OT Win, 1 for OT Loss, 0 for Reg Loss)
# Start everyone at 0 (regulation loss) and overwrite the other outcomes.
# Rows where neither side out-scored the other keep 0 points on both sides.
df_games['home_points'] = 0
df_games['away_points'] = 0

# Home Points
df_games.loc[home_win & ~is_ot, 'home_points'] = 3
df_games.loc[home_win & is_ot, 'home_points'] = 2
df_games.loc[away_win & is_ot, 'home_points'] = 1

# Away Points (mirror image of the home allocation)
df_games.loc[away_win & ~is_ot, 'away_points'] = 3
df_games.loc[away_win & is_ot, 'away_points'] = 2
df_games.loc[home_win & is_ot, 'away_points'] = 1

# Differentials, both expressed from the home team's perspective
df_games['goal_diff'] = df_games['home_goals'] - df_games['away_goals']
df_games['xg_diff'] = df_games['home_xg'] - df_games['away_xg']

# Preview the columns needed to eyeball the points allocation
df_games[['game_id', 'home_team', 'away_team', 'home_goals', 'away_goals', 'home_points', 'away_points']].head()

## 4. Quality Control (QC)

Running assertions and statistical checks.

In [None]:
# 1. Uniqueness Check
# After the collapse there must be exactly one row per game.
assert df_games['game_id'].is_unique, "CRITICAL: Game IDs are not unique!"

# 2. Consistency Check (Points)
# Every game hands out exactly 3 points: 3+0 in regulation, 2+1 after overtime.
total_points = df_games['home_points'] + df_games['away_points']
assert total_points.eq(3).all(), "Point allocation error found!"

print("✅ Assertions Passed.")

# 3. Distribution Check
print("\nGoal Distribution:")
print(df_games[['home_goals', 'away_goals']].describe())

# Statistician's Note
print("\n--- Statistician's Commentary ---")
h_mean = df_games['home_goals'].mean()
a_mean = df_games['away_goals'].mean()
home_edge = h_mean - a_mean
# The '+' format flag emits the correct sign itself, so a negative edge
# prints as "-0.123" rather than "+-0.123".
print(f"Observed Home Advantage: {home_edge:+.3f} goals/game")
if home_edge > 0.2:
    print("Note: Home advantage seems significant, typical of professional leagues.")
else:
    print("Note: Home advantage is mild or negligible.")


## 5. Result Display

Displaying the final aggregated dataframe.

In [None]:
# Bare expression as the last statement of the cell, so the notebook
# renders the game-level frame as the cell's output.
df_games