In [1]:
import json
import pandas as pd
import numpy as np

# 1. File paths – adjust these to match your local setup
INPUT_PATH            = 'data/teams_points_2015-24.json'
RESIDUALS_OUTPUT_PATH = 'data/q2_residuals.json'
SUMMARY_OUTPUT_PATH   = 'data/q2_team_summary.json'

# 2. Load and flatten the JSON into a DataFrame
# The encoding is pinned to UTF-8 (the encoding JSON files are expected to use)
# so the read does not depend on the platform's locale default.
with open(INPUT_PATH, 'r', encoding='utf-8') as f:
    teams = json.load(f)

# One output row per entry of a team's 'values' list, tagged with that team's
# name. Row order follows the file: teams in order, then each team's seasons.
records = [
    {
        'team':   team['team'],
        'year':   rec['year'],
        'points': rec['points'],
        'gd':     rec['gd'],
    }
    for team in teams
    for rec in team['values']
]

df = pd.DataFrame(records)
df

Unnamed: 0,team,year,points,gd
0,Arsenal,2015,75,35
1,Arsenal,2016,71,29
2,Arsenal,2017,75,33
3,Arsenal,2018,63,23
4,Arsenal,2019,70,22
...,...,...,...,...
195,Wolves,2020,59,11
196,Wolves,2021,45,-16
197,Wolves,2022,51,-5
198,Wolves,2023,41,-27


In [2]:
# 3. Fit one global straight line (degree-1 least squares) of points on gd.
#    np.polyfit returns the coefficients highest power first: slope, intercept.
m, b = np.polyfit(df['gd'], df['points'], deg=1)
print(f"OLS regression: points = {m:.4f} * gd + {b:.2f}")

# 4. Fitted value per row, and residual = actual points minus fitted points
df['predicted'] = df['gd'].mul(m).add(b)
df['residual']  = df['points'].sub(df['predicted'])

# 5. Write the per-row table (including the two new columns) as JSON records
df.to_json(RESIDUALS_OUTPUT_PATH, orient='records', indent=2)
print(f"Exported per‐season residuals to {RESIDUALS_OUTPUT_PATH}")

# 6. Per-team summary. Each entry maps an output column to
#    (source column, aggregation); the two counts tally rows whose residual
#    lies strictly beyond +THRESHOLD / -THRESHOLD.
THRESHOLD = 3
team_stats = {
    'mean_residual':   ('residual', 'mean'),
    'std_residual':    ('residual', 'std'),
    'seasons_played':  ('residual', 'count'),
    'count_over_thr':  ('residual', lambda resid: resid.gt(THRESHOLD).sum()),
    'count_under_thr': ('residual', lambda resid: resid.lt(-THRESHOLD).sum()),
    'total_pts':       ('points',   'sum'),
}
agg = df.groupby('team').agg(**team_stats).reset_index()

# 7. Replace a missing standard deviation with 0, then flag teams whose
#    *mean* residual is strictly beyond the threshold in either direction.
agg['std_residual']     = agg['std_residual'].fillna(0)
agg['consistent_over']  = agg['mean_residual'].gt(THRESHOLD)
agg['consistent_under'] = agg['mean_residual'].lt(-THRESHOLD)

# 8. Write the team-level summary as JSON records
agg.to_json(SUMMARY_OUTPUT_PATH, orient='records', indent=2)
print(f"Exported team summary to {SUMMARY_OUTPUT_PATH}")

OLS regression: points = 0.6202 * gd + 52.51
Exported per‐season residuals to data/q2_residuals.json
Exported team summary to data/q2_team_summary.json


In [3]:
import json
import pandas as pd
import numpy as np

# --- paths ---
XG_CSV      = 'data/epl_xg.csv'
POINTS_JSON = 'data/teams_points_2015-24.json'
RESID_OUT   = 'data/q2_xg_residuals.json'
SUM_OUT     = 'data/q2_xg_team_summary.json'

# 1) Load & flatten the points JSON
# The encoding is pinned to UTF-8 (the encoding JSON files are expected to use)
# so the read does not depend on the platform's locale default.
with open(POINTS_JSON, 'r', encoding='utf-8') as f:
    teams_data = json.load(f)

records = []
for team_obj in teams_data:
    team_name = team_obj['team']
    for season_rec in team_obj['values']:
        # assuming each rec has keys 'year' and 'points'
        records.append({
            'season_end_year': season_rec['year'],
            'team':            team_name,
            'points':          season_rec['points']
        })

pts_df = pd.DataFrame(records)
# now pts_df has exactly the columns we need

# 2) Load your xG table
# NOTE(review): the rest of this cell expects the CSV to provide
# 'season_end_year', 'team' and 'xG' columns — confirm against the file.
xg_df = pd.read_csv(XG_CSV)

# 3) Merge on season_end_year & team
MERGE_KEYS = ['season_end_year', 'team']
df = xg_df.merge(
    pts_df,
    on=MERGE_KEYS,
    how='inner'
)

# An inner merge silently discards any team-season present in only one source
# (for instance when the two files spell a team name differently). Report such
# keys so a shrunken sample is visible; this is informational only and does
# not alter df.
key_match = xg_df[MERGE_KEYS].drop_duplicates().merge(
    pts_df[MERGE_KEYS].drop_duplicates(),
    on=MERGE_KEYS,
    how='outer',
    indicator=True
)
n_xg_only  = int((key_match['_merge'] == 'left_only').sum())
n_pts_only = int((key_match['_merge'] == 'right_only').sum())
if n_xg_only or n_pts_only:
    print(
        f"Warning: inner merge left {n_xg_only} xG-only and {n_pts_only} "
        f"points-only team-season key(s) unmatched; they are excluded from the fit"
    )

# 4) Fit global OLS on xG
# Degree-1 np.polyfit returns the coefficients as slope, then intercept.
m, b = np.polyfit(df['xG'], df['points'], 1)
print(f"OLS regression: points = {m:.4f} * xG + {b:.2f}")

# 5) Compute residuals (actual points minus fitted points)
df['predicted'] = m * df['xG'] + b
df['residual']  = df['points'] - df['predicted']

# 6) Export per‐season residuals
df.to_json(RESID_OUT, orient='records', indent=2)
print(f"Exported per‐season residuals to {RESID_OUT}")

# 7) Aggregate by team
# count_over_thr / count_under_thr tally rows whose residual lies strictly
# beyond +THRESHOLD / -THRESHOLD.
THRESHOLD = 3
agg = df.groupby('team').agg(
    mean_residual   = ('residual', 'mean'),
    std_residual    = ('residual', 'std'),
    seasons_played  = ('residual', 'count'),
    count_over_thr  = ('residual', lambda x: (x > THRESHOLD).sum()),
    count_under_thr = ('residual', lambda x: (x < -THRESHOLD).sum()),
    total_pts       = ('points',   'sum')
).reset_index()

# A missing standard deviation is replaced with 0; the flags compare the
# team's *mean* residual against the threshold.
agg['std_residual']     = agg['std_residual'].fillna(0)
agg['consistent_over']  = agg['mean_residual'] >  THRESHOLD
agg['consistent_under'] = agg['mean_residual'] < -THRESHOLD

# 8) Export team summary
agg.to_json(SUM_OUT, orient='records', indent=2)
print(f"Exported team summary to {SUM_OUT}")


OLS regression: points = 0.7909 * xG + 11.76
Exported per‐season residuals to data/q2_xg_residuals.json
Exported team summary to data/q2_xg_team_summary.json
