In [26]:
from dotenv import load_dotenv
import os
import pandas as pd
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import numpy as np
import re


In [27]:
load_dotenv()

db_user = os.getenv('DB_USER')
db_pass = os.getenv('DB_PASS')
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')
db_name = os.getenv('DB_NAME')

In [28]:
engine = create_engine(f"postgresql+psycopg2://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}")

In [29]:
# helper function to sort rounds
def get_round_order(row):
    # Handle missing/null values
    if not row['round'] or pd.isnull(row['round']):
        return None

    # Match regular rounds: 'R1', 'R23', etc.
    match = re.match(r'R(\d+)$', row['round'])
    if match:
        return int(match.group(1))
    # Finals 
    finals_order = {
        'REF': 100,  # Elimination Final (week 1)
        'RQF': 101,  # Qualifying Final (week 1)
        'RSF': 102,  # Semi Final (week 2)
        'RPF': 103,  # Preliminary Final (week 3)
        'RGF': 104,  # Grand Final (week 4)
    }
    code = row['round'][1:]  # Removes leading "R"
    return finals_order.get(code, 999)  

In [30]:
# Add in win/loss helper function
def get_team_win(row):
    if row['team_id'] == row['home_team_id']:
        return 1 if row['home_result'] == 'W' else 0
    elif row['team_id'] == row['away_team_id']:
        return 1 if row['away_result'] == 'W' else 0
    else:
        return None  

In [31]:
# Helper function to sort players into roles
h_cutoff = 195
def assign_position_group(row):
    pos = row['position'].lower().replace(' ', '') if row['position'] else ''
    if 'defender' in pos:
        if row['height_cm'] is not None and row['height_cm'] >= 195:
            return ['tall_defender']
        elif row['height_cm'] is not None and row['height_cm'] < 195:
            return ['small_defender']
        else:
            return ['small_defender']  # fallback if height unknown
    if pos == 'midfield,forward':
        return ['midfield,forward']
    return pos.split(',') if pos else []

In [32]:
# Load in the tables
players = pd.read_sql('SELECT id as player_id, full_name, height_cm, position FROM players', engine)
player_game_stats = pd.read_sql('SELECT * FROM player_game_stats', engine)
games = pd.read_sql('SELECT id AS game_id, season_year, round, game_date, home_team_id, away_team_id, home_result, away_result FROM games', engine)
games['round_number'] = games.apply(get_round_order, axis=1)

In [33]:
# Clean position data, with midfield,forward being it's own group
def custom_position_split(pos_str):
    if not pos_str or pd.isnull(pos_str):
        return []
    pos_str = pos_str.lower().replace(' ', '')
    if pos_str == 'midfield,forward':
        return ['midfield,forward']
    return pos_str.split(',')

players['position_group'] = players.apply(assign_position_group, axis=1)
players_exploded = players.explode('position_group')
players_exploded = players_exploded[players_exploded['position_group'] != '']

In [34]:
# Merge tables to get results
pgs = player_game_stats.merge(
    games[['game_id', 'home_team_id', 'away_team_id', 'home_result', 'away_result']],
    on='game_id', how='left'
)

# Add in the positions from the position data above
pgs = pgs.merge(
    players_exploded[['player_id', 'position_group']],
    on='player_id', how='left'
)

pgs['won'] = pgs.apply(get_team_win, axis=1)
pgs = pgs[pgs['won'].notnull()]

In [35]:
# Add in a couple ratios: % of disposals that are kicks, % of disposals that are contested
pgs['pct_contested'] = pgs['contested_possessions'] / pgs['disposals']
pgs['pct_kicks'] = pgs['kicks'] / pgs['disposals']

# Avoid any 0 disposal games
pgs.loc[pgs['disposals'] == 0, 'pct_contested'] = None
pgs.loc[pgs['disposals'] == 0, 'pct_kicks'] = None

In [36]:
# Create the stats column
stat_cols = [
    'kicks', 'marks', 'handballs', 'goals', 'behinds', 'hit_outs', 'tackles',
    'rebounds', 'inside_50', 'clearances', 'clangers', 'frees_for', 'frees_against',
    'contested_possessions', 'uncontested_possessions', 'contested_marks', 'marks_inside_50',
    'one_percenters', 'bounces', 'goal_assists', 'pct_contested', 'pct_kicks'
]

In [75]:
# Build out a dataframe
positions = pgs['position_group'].unique()
correlation_results = []

for pos in positions:
    pgs_pos = pgs[pgs['position_group'] == pos]
    if len(pgs_pos) < 30:
        continue  
    for stat in stat_cols:
        if stat in pgs_pos.columns:
            vals = pgs_pos[stat].dropna()
            if len(vals.unique()) > 1:
                corr = pgs_pos[stat].corr(pgs_pos['won'])
                correlation_results.append({'position_group': pos, 'stat': stat, 'correlation': corr, 'n_games': len(pgs_pos)})

cor_df = pd.DataFrame(correlation_results)
cor_df = cor_df.sort_values(['position_group', 'correlation'], ascending=[True, False])

# Keep if you want to look at all results
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
display(cor_df)
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

Unnamed: 0,position_group,stat,correlation,n_games
47,forward,goals,0.192528,37192
63,forward,goal_assists,0.15655,37192
60,forward,marks_inside_50,0.150516,37192
44,forward,kicks,0.137904,37192
45,forward,marks,0.130274,37192
52,forward,inside_50,0.102904,37192
58,forward,uncontested_possessions,0.102205,37192
57,forward,contested_possessions,0.092329,37192
48,forward,behinds,0.08563,37192
59,forward,contested_marks,0.068345,37192


In [38]:
# For each role conduct logistic regression
roles = ['forward', 'midfield', 'midfield,forward', 'ruck', 'tall_defender', 'small_defender']

# Dictionary to store coefficients for each role
role_coef_dict = {}

for role in roles:
    pgs_role = pgs[pgs['position_group'] == role].dropna(subset=stat_cols + ['won'])

    if len(pgs_role) < 50:
        print(f"Skipping {role} (not enough samples: {len(pgs_role)})")
        continue

    X = pgs_role[stat_cols].fillna(0)
    y = pgs_role['won']

    # Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Logistic regression
    model = LogisticRegression(max_iter=1000)
    model.fit(X_scaled, y)

    coefs = pd.Series(model.coef_[0], index=stat_cols)
    coefs = coefs.sort_values(key=np.abs, ascending=False)
  
    top = coefs.head(25)  
    role_coef_dict[role] = top

    print(f"\n{role.upper()} TOP COEFFICIENTS:\n")

    display(top)


FORWARD TOP COEFFICIENTS:



goals                      0.347901
goal_assists               0.243662
kicks                      0.210102
clangers                  -0.135133
hit_outs                   0.104088
pct_kicks                 -0.094317
rebounds                  -0.086666
one_percenters             0.082434
tackles                    0.075196
marks                      0.071431
frees_against              0.067163
contested_marks           -0.064314
handballs                  0.053409
contested_possessions     -0.043774
inside_50                  0.038728
behinds                    0.038466
frees_for                 -0.037641
bounces                    0.035718
marks_inside_50            0.034736
pct_contested              0.028714
uncontested_possessions   -0.014579
clearances                -0.012778
dtype: float64


MIDFIELD TOP COEFFICIENTS:



goal_assists               0.231588
kicks                      0.219833
clangers                  -0.205788
goals                      0.172308
rebounds                  -0.149507
handballs                  0.144405
frees_against              0.079802
marks                      0.074292
hit_outs                  -0.065315
one_percenters             0.054485
pct_kicks                  0.052715
marks_inside_50            0.048549
inside_50                  0.042139
contested_possessions     -0.041258
uncontested_possessions   -0.033528
contested_marks           -0.031725
frees_for                 -0.029882
tackles                    0.021074
bounces                    0.020144
pct_contested             -0.017252
behinds                    0.004137
clearances                -0.000296
dtype: float64


MIDFIELD,FORWARD TOP COEFFICIENTS:



kicks                      0.308666
goals                      0.229733
goal_assists               0.204094
clangers                  -0.176739
rebounds                  -0.166119
marks                      0.157714
uncontested_possessions   -0.124452
frees_against              0.104067
tackles                    0.099372
handballs                  0.094109
inside_50                  0.079391
pct_kicks                 -0.067313
one_percenters             0.064515
frees_for                 -0.051057
contested_marks           -0.042489
behinds                    0.041297
contested_possessions     -0.040445
bounces                   -0.029178
marks_inside_50            0.025667
clearances                -0.016799
pct_contested              0.007066
hit_outs                   0.005046
dtype: float64


RUCK TOP COEFFICIENTS:



clangers                  -0.205513
goals                      0.181325
frees_against              0.167900
goal_assists               0.166375
kicks                      0.109148
inside_50                  0.095953
marks_inside_50            0.090534
rebounds                  -0.089856
handballs                  0.089803
tackles                    0.078813
uncontested_possessions    0.067930
frees_for                 -0.061949
hit_outs                   0.045791
contested_marks            0.039146
marks                     -0.036270
contested_possessions     -0.033594
pct_kicks                 -0.023715
bounces                    0.022851
pct_contested              0.022254
clearances                 0.009097
behinds                    0.007468
one_percenters             0.000218
dtype: float64


TALL_DEFENDER TOP COEFFICIENTS:



clangers                  -0.221088
contested_possessions      0.217576
uncontested_possessions    0.154013
rebounds                  -0.133178
pct_contested             -0.118049
behinds                   -0.083222
frees_for                 -0.078578
pct_kicks                  0.063846
goal_assists               0.056581
inside_50                  0.054168
marks                      0.042912
handballs                 -0.041781
marks_inside_50            0.031637
one_percenters             0.030460
clearances                 0.030272
frees_against              0.029218
hit_outs                   0.019895
kicks                      0.016463
goals                     -0.009797
bounces                    0.004444
contested_marks           -0.004231
tackles                   -0.000437
dtype: float64


SMALL_DEFENDER TOP COEFFICIENTS:



clangers                  -0.238748
uncontested_possessions    0.230713
contested_possessions      0.164866
rebounds                  -0.161714
marks                      0.139264
goal_assists               0.137522
goals                      0.133785
handballs                 -0.125773
one_percenters             0.078544
clearances                -0.077347
frees_for                 -0.075558
frees_against              0.056599
marks_inside_50           -0.050515
pct_kicks                  0.050279
bounces                    0.041601
inside_50                  0.038532
contested_marks           -0.021458
hit_outs                  -0.014475
kicks                      0.014461
pct_contested              0.011044
tackles                   -0.002341
behinds                   -0.002283
dtype: float64

In [None]:
pgs_period = pgs.merge(
    games[['game_id', 'season_year', 'round_number', 'game_date']],
    on='game_id', how='left'
)


# Filter for the target period: 2022 to round 9, 2025
pgs_period = pgs_period[
    ((pgs_period['season_year'] > 2021) & 
    ((pgs_period['season_year'] < 2025) | 
     ((pgs_period['season_year'] == 2025) & (pgs_period['round_number'] <= 9))))
]

roles = ['forward', 'midfield', 'midfield,forward', 'ruck', 'tall_defender', 'small_defender']

top_players_per_role = {}
role_data_dict = {}
print(f"Number of games in dataset: {len(pgs_period)}")

for role in roles:
    coefs = role_coef_dict[role]
    stat_cols_role = list(coefs.index)

    # Subset for position and drop missing
    df_role = pgs_period[pgs_period['position_group'] == role].copy()
    df_role = df_role.dropna(subset=stat_cols_role)
    if len(df_role) < 10:
        print(f"Skipping {role} (not enough samples: {len(df_role)})")
        continue

    # Merge in full_name for later lookup
    df_role = df_role.merge(players[['player_id', 'full_name', 'height_cm']], on='player_id', how='left')

    # Standardize stats using mean and std from THIS PERIOD
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_role[stat_cols_role])
    df_role_std = pd.DataFrame(X_scaled, columns=stat_cols_role, index=df_role.index)

    # Calculate impact score for each game
    df_role['impact_score'] = df_role_std.mul(coefs, axis=1).sum(axis=1)

    # Aggregate by player (average impact_score, games played)
    player_scores = (
        df_role.groupby('player_id')
        .agg(
            impact_score=('impact_score', 'mean'),
            games_played=('impact_score', 'count')
        )
        .reset_index()
        .merge(players[['player_id', 'full_name', 'height_cm']], on='player_id', how='left')
        .sort_values('impact_score', ascending=False)
    )

    # Put in minimum games of 15
    player_scores = player_scores[player_scores['games_played'] > 15]
    player_scores = player_scores.sort_values('impact_score', ascending=False)

    print(f"\nTop 5 {role.upper()}S:")
    display(player_scores.head(5)[['full_name', 'games_played', 'impact_score']])
    players_to_filter = 5
    top_players_per_role[role] = player_scores.head(players_to_filter)  #
    role_data_dict[role] = df_role.copy()
"""
# Below is a way to check a player's games to see which stand out
player_name_to_check = "INSERT NAME"  
role_to_check = "INSERT ROLE"

df = role_data_dict[role_to_check]
coefs = role_coef_dict[role_to_check]
stat_cols_role = list(coefs.index)

player_rows = df[df['full_name'] == player_name_to_check]

if player_rows.empty:
    print(f"No games for {player_name_to_check} in role {role_to_check}")
else:
    cols_to_show = [col for col in stat_cols_role if col in player_rows.columns]
    extra_cols = ['impact_score', 'game_id', 'round_number', 'season_year', 'game_date', 'height_cm']
    extra_cols = [col for col in extra_cols if col in player_rows.columns]
    display(player_rows[cols_to_show + extra_cols])
"""

Number of games in dataset: 33051

Top 5 FORWARDS:


Unnamed: 0,full_name,games_played,impact_score
85,Jeremy Cameron,75,0.683134
39,Toby Greene,71,0.564521
21,Charlie Curnow,76,0.54945
123,Taylor Walker,66,0.52521
87,Tom Hawkins,57,0.478029


Number of games in dataset: 33051

Top 5 MIDFIELDS:


Unnamed: 0,full_name,games_played,impact_score
90,Marcus Bontempelli,72,0.695789
3,Christian Petracca,70,0.579013
61,Zach Merrett,71,0.418897
10,Chad Warner,78,0.4019
16,Hugh McCluggage,84,0.345121


Number of games in dataset: 33051

Top 5 MIDFIELD,FORWARDS:


Unnamed: 0,full_name,games_played,impact_score
21,Kyle Langford,57,0.572625
7,Shai Bolton,75,0.458497
3,Errol Gulden,75,0.4537
18,Dustin Martin,42,0.421031
89,Jamie Elliott,66,0.417258


Number of games in dataset: 33051

Top 5 RUCKS:


Unnamed: 0,full_name,games_played,impact_score
17,Tim English,70,0.38039
22,Luke Jackson,73,0.342759
2,Hayden McLean,63,0.293929
15,Rowan Marshall,76,0.260594
33,Sean Darcy,52,0.197766


Number of games in dataset: 33051

Top 5 TALL_DEFENDERS:


Unnamed: 0,full_name,games_played,impact_score
2,Harris Andrews,84,0.212034
20,Mark Blicavs,74,0.21195
25,Tom Barrass,59,0.158341
27,Brennan Cox,60,0.149575
33,Jacob Weitering,74,0.149393


Number of games in dataset: 33051

Top 5 SMALL_DEFENDERS:


Unnamed: 0,full_name,games_played,impact_score
81,Jason Johannisen,32,0.350372
68,Callum Wilkie,77,0.291034
85,Darcy Byrne-Jones,77,0.267349
66,Bradley Hill,73,0.253701
26,Jayden Short,69,0.239432


Number of games in dataset: 33051


'\n# Below is a way to check a player\'s games to see which stand out\nplayer_name_to_check = "INSERT NAME"  \nrole_to_check = "INSERT ROLE"\n\ndf = role_data_dict[role_to_check]\ncoefs = role_coef_dict[role_to_check]\nstat_cols_role = list(coefs.index)\n\nplayer_rows = df[df[\'full_name\'] == player_name_to_check]\n\nif player_rows.empty:\n    print(f"No games for {player_name_to_check} in role {role_to_check}")\nelse:\n    cols_to_show = [col for col in stat_cols_role if col in player_rows.columns]\n    extra_cols = [\'impact_score\', \'game_id\', \'round_number\', \'season_year\', \'game_date\', \'height_cm\']\n    extra_cols = [col for col in extra_cols if col in player_rows.columns]\n    display(player_rows[cols_to_show + extra_cols])\n'

In [40]:
# --- POLYNOMIAL REGRESSION ANALYSIS (not used in final team rating) ---

# Polynomial regression analysis with L1 regularisation
roles = ['forward', 'midfield', 'midfield,forward', 'ruck', 'tall_defender', 'small_defender']

poly_results_dict = {}

for role in roles:
    df = pgs_period[pgs_period['position_group'] == role].dropna(subset=stat_cols + ['won']).copy()
    if len(df) < 50:
        print(f"Skipping {role}: not enough samples ({len(df)})")
        continue

    X_raw = df[stat_cols]
    y = df['won']

    # Standardise
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_raw)

    # Polynomial features (degree=2 includes squares and pairwise interactions)
    poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
    X_poly = poly.fit_transform(X_scaled)
    poly_feature_names = poly.get_feature_names_out(stat_cols)

    # Fit logistic regression
    model = LogisticRegressionCV(
        Cs=10,
        penalty='l1',
        solver='saga',      
        max_iter=5000,
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    model.fit(X_poly, y)

    # Coefficients as series
    coefs = pd.Series(model.coef_[0], index=poly_feature_names)
    coefs = coefs[coefs != 0].sort_values(key=np.abs, ascending=False)
    poly_results_dict[role] = coefs

    print(f"\n{role.upper()} – TOP POLYNOMIAL COEFFICIENTS")
    display(coefs.head(20)) 

# --- END POLYNOMIAL REGRESSION ---


FORWARD – TOP POLYNOMIAL COEFFICIENTS


goals                            0.334683
goal_assists                     0.231253
clangers                        -0.067788
handballs                        0.063715
hit_outs                         0.062232
uncontested_possessions          0.058576
tackles                          0.055132
kicks                            0.049312
one_percenters                   0.044489
marks uncontested_possessions    0.043815
inside_50                        0.040811
handballs rebounds              -0.035239
rebounds                        -0.034399
behinds                          0.030854
rebounds clearances             -0.023333
tackles rebounds                -0.023150
goals uncontested_possessions    0.020610
marks_inside_50^2                0.018670
kicks marks                      0.018287
clearances bounces              -0.017653
dtype: float64


MIDFIELD – TOP POLYNOMIAL COEFFICIENTS


goal_assists               0.181146
goals                      0.165754
kicks                      0.122247
clangers                  -0.121756
rebounds                  -0.112623
uncontested_possessions    0.082957
inside_50                  0.052205
marks_inside_50            0.039336
marks^2                    0.037272
marks                      0.029332
one_percenters             0.023973
goal_assists^2             0.023064
tackles^2                  0.018552
clearances^2               0.018058
clangers pct_contested     0.012714
goals pct_kicks           -0.011911
handballs^2                0.010904
behinds bounces            0.009817
pct_kicks^2                0.009563
hit_outs                  -0.009298
dtype: float64


MIDFIELD,FORWARD – TOP POLYNOMIAL COEFFICIENTS


goals                               0.213675
goal_assists                        0.177952
rebounds                           -0.136174
clangers                           -0.132545
marks                               0.126077
kicks                               0.101396
inside_50                           0.092780
tackles                             0.080299
frees_against                       0.051929
tackles rebounds                   -0.046828
one_percenters                      0.038642
rebounds uncontested_possessions   -0.037359
marks bounces                       0.034929
bounces                            -0.034156
handballs rebounds                 -0.033478
contested_possessions^2             0.033198
behinds                             0.033062
clangers frees_against              0.032618
marks uncontested_possessions       0.032588
marks handballs                     0.031925
dtype: float64


RUCK – TOP POLYNOMIAL COEFFICIENTS


goal_assists                        0.174342
goals                               0.156079
clangers                           -0.133843
marks_inside_50                     0.114077
inside_50                           0.106943
rebounds                           -0.101528
tackles                             0.090398
hit_outs contested_possessions     -0.067115
handballs                           0.066664
rebounds one_percenters             0.060327
uncontested_possessions             0.058935
clangers frees_against              0.058635
frees_against                       0.058267
behinds bounces                     0.053974
handballs rebounds                 -0.053969
hit_outs                            0.047599
clearances frees_for               -0.045181
hit_outs tackles                    0.044156
kicks behinds                      -0.040616
clangers uncontested_possessions   -0.037899
dtype: float64


TALL_DEFENDER – TOP POLYNOMIAL COEFFICIENTS


clangers                                -0.194810
kicks                                    0.124370
uncontested_possessions                  0.093394
behinds                                 -0.077723
clearances one_percenters               -0.076865
inside_50                                0.068478
rebounds                                -0.064861
bounces pct_kicks                       -0.060919
frees_against contested_marks            0.057097
clearances                              -0.054256
inside_50 clangers                       0.052059
inside_50 bounces                        0.050424
clearances bounces                      -0.049639
marks                                    0.047705
one_percenters^2                        -0.046565
hit_outs marks_inside_50                 0.046276
contested_possessions marks_inside_50    0.046087
contested_marks pct_contested           -0.042541
one_percenters                           0.041194
clearances uncontested_possessions      -0.040564



SMALL_DEFENDER – TOP POLYNOMIAL COEFFICIENTS


clangers                  -0.202948
marks                      0.154267
rebounds                  -0.130001
kicks                      0.090703
uncontested_possessions    0.089549
goal_assists               0.086309
one_percenters             0.074818
pct_kicks                  0.064863
inside_50                  0.059688
hit_outs                  -0.056637
goals                      0.053613
clearances                -0.052260
contested_possessions      0.045059
frees_for                 -0.040788
kicks rebounds            -0.037522
marks_inside_50           -0.031150
frees_against              0.029558
goals goal_assists         0.027522
bounces                    0.027352
goals clangers             0.026069
dtype: float64

In [74]:
# --- POLYNOMIAL REGRESSION ANALYSIS ---

# Looking at top players to sense check model
top_players_per_role = {}

# Filter for the target period: 2022 to round 9, 2025
"""
pgs_period = pgs_period[
    ((pgs_period['season_year'] > 2021) & 
    ((pgs_period['season_year'] < 2025) | 
     ((pgs_period['season_year'] == 2025) & (pgs_period['round_number'] <= 9))))
]
"""


for role in roles:
    coefs = poly_results_dict.get(role)
    if coefs is None or len(coefs) == 0:
        print(f"Skipping {role}: no coefficients found")
        continue
    poly_feature_names = list(coefs.index)
    
    # Subset for the role and time period, drop NaNs on original stats
    df_role = pgs_period[pgs_period['position_group'] == role].dropna(subset=stat_cols).copy()
    if len(df_role) < 10:
        print(f"Skipping {role}: not enough samples ({len(df_role)})")
        continue

    # Standardize and create polynomial features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_role[stat_cols])
    poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
    X_poly = poly.fit_transform(X_scaled)
    poly_feature_names_all = poly.get_feature_names_out(stat_cols)
    X_poly_df = pd.DataFrame(X_poly, columns=poly_feature_names_all, index=df_role.index)

    # Only keep the features that survived L1 regularization
    X_poly_selected = X_poly_df[poly_feature_names]

    # Calculate impact score
    impact_score = X_poly_selected.mul(coefs, axis=1).sum(axis=1)
    df_role['impact_score'] = impact_score

    # Merge in player info
    df_role = df_role.merge(players[['player_id', 'full_name', 'height_cm']], on='player_id', how='left')

    # Group by player and get top 10 (with >= 10 games)
    player_scores = (
        df_role.groupby('player_id')
        .agg(
            impact_score=('impact_score', 'mean'),
            games_played=('impact_score', 'count')
        )
        .reset_index()
        .merge(players[['player_id', 'full_name', 'height_cm']], on='player_id', how='left')
        .sort_values('impact_score', ascending=False)
    )
    player_scores = player_scores[player_scores['games_played'] >= 10]
    # print(f"Number of games in dataset: {len(pgs_period)}")
    top_players_per_role[role] = player_scores.head(10)
    print(f"\nTop 10 {role.upper()}S:")
    display(player_scores.head(5)[['full_name', 'games_played', 'impact_score']])

# --- END POLYNOMIAL REGRESSION ---


Top 10 FORWARDS:


Unnamed: 0,full_name,games_played,impact_score
85,Jeremy Cameron,75,0.795362
39,Toby Greene,71,0.651204
123,Taylor Walker,66,0.581902
21,Charlie Curnow,76,0.570294
266,Tom Lynch,23,0.525674



Top 10 MIDFIELDS:


Unnamed: 0,full_name,games_played,impact_score
90,Marcus Bontempelli,72,0.769842
3,Christian Petracca,70,0.684768
61,Zach Merrett,71,0.529695
10,Chad Warner,78,0.496455
16,Hugh McCluggage,84,0.433423



Top 10 MIDFIELD,FORWARDS:


Unnamed: 0,full_name,games_played,impact_score
21,Kyle Langford,57,0.69006
7,Shai Bolton,75,0.622008
3,Errol Gulden,75,0.563588
6,Josh Dunkley,82,0.554034
34,Patrick Dangerfield,60,0.553518



Top 10 RUCKS:


Unnamed: 0,full_name,games_played,impact_score
17,Tim English,70,0.447355
22,Luke Jackson,73,0.365542
15,Rowan Marshall,76,0.227793
33,Sean Darcy,52,0.191768
2,Hayden McLean,63,0.178117



Top 10 TALL_DEFENDERS:


Unnamed: 0,full_name,games_played,impact_score
20,Mark Blicavs,74,0.570801
10,Kieren Briggs,52,0.429148
2,Harris Andrews,84,0.224022
27,Brennan Cox,60,0.172461
25,Tom Barrass,59,0.16529



Top 10 SMALL_DEFENDERS:


Unnamed: 0,full_name,games_played,impact_score
81,Jason Johannisen,32,0.35053
66,Bradley Hill,73,0.33377
216,Sam Reid,18,0.326004
85,Darcy Byrne-Jones,77,0.319055
26,Jayden Short,69,0.273209


In [68]:
# add in impact depending on the model type
model_type = "logistic"  # choose logistic or poly

impact_rows = []

for role in roles:
    if model_type == "logistic":
        coefs = role_coef_dict.get(role)
        if coefs is None or len(coefs) == 0:
            continue
        stat_cols_role = list(coefs.index)

        df_role = pgs_period[pgs_period['position_group'] == role].copy()
        df_role = df_role.dropna(subset=stat_cols_role)
        if df_role.empty:
            continue

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df_role[stat_cols_role])
        df_role_std = pd.DataFrame(X_scaled, columns=stat_cols_role, index=df_role.index)
        df_role['impact_score'] = df_role_std.mul(coefs, axis=1).sum(axis=1)

    elif model_type == "poly":
        coefs = poly_results_dict.get(role)
        if coefs is None or len(coefs) == 0:
            continue
        stat_cols_poly = list(coefs.index)

        # Use original stat_cols for polynomial expansion
        df_role = pgs_period[pgs_period['position_group'] == role].copy()
        df_role = df_role.dropna(subset=stat_cols)
        if df_role.empty:
            continue

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df_role[stat_cols])

        poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
        X_poly = poly.fit_transform(X_scaled)
        poly_feature_names = poly.get_feature_names_out(stat_cols)
        X_poly_df = pd.DataFrame(X_poly, columns=poly_feature_names, index=df_role.index)

        # Only use features selected by L1 regularization (nonzero coefficients)
        X_poly_selected = X_poly_df[stat_cols_poly]
        df_role['impact_score'] = X_poly_selected.mul(coefs, axis=1).sum(axis=1)

    else:
        raise ValueError(f"Unknown model_type: {model_type}")

    impact_rows.append(df_role)

# Concatenate all roles back into a single DataFrame
pgs_period_with_score = pd.concat(impact_rows).sort_values(['player_id', 'game_date'])
# print(f"Number of games in dataset: {len(pgs_period_with_score)}")

In [None]:
# Start to build out team model
pgs_period_with_score = pgs_period_with_score.sort_values(['player_id', 'game_date'])

# Get the rolling game average of players with a minimum period of games
rolling_games = 30
minimum_games = 5
pgs_period_with_score['impact_rolling'] = (
    pgs_period_with_score.groupby('player_id')['impact_score']
    .transform(lambda x: x.shift(1).rolling(window=rolling_games, min_periods=minimum_games).mean())
)

# Find the position average for when players don't meet 15 games with a 10% penalty so the player is treated as slightly below average
replacement_factor = 0.9 
pos_avg = (
    pgs_period_with_score.groupby('position_group')['impact_score']
    .mean()
    .mul(replacement_factor)
    .rename('pos_avg_score')
    .reset_index()
)

# Drop in case of a rerun (solves a bug)
if 'pos_avg_score' in pgs_period_with_score.columns:
    pgs_period_with_score = pgs_period_with_score.drop(columns=['pos_avg_score'])

# Merge position scores into pgs_period
pgs_period_with_score = pgs_period_with_score.merge(pos_avg, on='position_group', how='left')
pgs_period_with_score['impact_for_model'] = pgs_period_with_score['impact_rolling'].fillna(pgs_period_with_score['pos_avg_score'])

# Merge into teams
pgs_period_merge = pgs_period_with_score[['player_id', 'game_id', 'team_id', 'impact_for_model']]
team_game_impacts = (
    pgs_period_merge.groupby(['game_id', 'team_id'])['impact_for_model']
    .mean()  # or sum(), depending on your philosophy
    .reset_index()
)

# Merge team impacts into games model
games_model = games.merge(
    team_game_impacts.rename(columns={'team_id': 'home_team_id', 'impact_for_model': 'home_team_impact'}),
    on=['game_id', 'home_team_id'], how='left'
).merge(
    team_game_impacts.rename(columns={'team_id': 'away_team_id', 'impact_for_model': 'away_team_impact'}),
    on=['game_id', 'away_team_id'], how='left'
)
games_model['impact_diff'] = games_model['home_team_impact'] - games_model['away_team_impact']
games_model['winner'] = (games_model['home_result'] == 'W').astype(int)

In [70]:
# Build out the model to predict winners
df = games_model.dropna(subset=['home_team_impact', 'away_team_impact', 'impact_diff', 'winner']).copy()

X = df[['impact_diff']]
y = df['winner']

# split out training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

player_impact_model = LogisticRegression(max_iter=1000)
player_impact_model.fit(X_train, y_train)

y_pred = player_impact_model.predict(X_test)
y_prob = player_impact_model.predict_proba(X_test)[:, 1]

print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.3f}")
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.637
ROC AUC: 0.708
Confusion matrix:
[[121 166]
 [ 60 276]]


In [67]:
role_counts = (
    pgs_period_with_score
    .groupby(['team_id', 'game_id', 'position_group'])
    .size()
    .reset_index(name='count')
)

# Pivot so each role is a column (easier to see all at once)
role_counts_pivot = (
    role_counts
    .pivot_table(
        index=['team_id', 'game_id'],
        columns='position_group',
        values='count',
        fill_value=0
    )
    .reset_index()
)

# Now, calculate the *average* number of each role per team per game across all games
team_role_averages = (
    role_counts_pivot
    .groupby('team_id')
    .mean()
    .reset_index()
)

# Optional: Show as integers, rounded to 2 decimals for easy reading
team_role_averages = team_role_averages.round(2)

display(team_role_averages)

position_group,team_id,game_id,forward,midfield,"midfield,forward",ruck,small_defender,tall_defender
0,5,11955.59,5.53,6.96,1.07,1.27,6.25,1.62
1,19881,11965.12,6.11,4.9,1.55,1.65,6.8,1.41
2,19882,11928.66,4.98,5.98,1.83,1.67,7.51,0.48
3,19974,11939.69,5.37,5.7,0.95,1.45,6.75,2.06
4,20065,11954.65,5.33,5.6,2.72,1.49,6.26,1.13
5,20066,11838.07,6.38,5.44,1.71,1.3,6.2,1.3
6,20157,11935.28,6.37,5.96,2.12,1.2,4.88,2.03
7,20158,11966.3,6.11,5.23,2.73,0.92,5.92,1.27
8,20433,11908.44,6.31,4.11,1.47,1.16,8.67,1.09
9,20434,11942.93,5.94,4.72,1.93,1.57,6.81,1.84
