In [331]:
import pandas as pd
from tqdm import tqdm
from copy import copy
import statsmodels.api as sm

from sklearn.feature_selection import SelectKBest, f_classif


In [428]:
data = pd.read_parquet('../../data/curated/clean_stats_13-22.parquet')

In [429]:
data.columns[18:28]

Index(['is_captain', 'player_position_defender', 'player_position_rover',
       'player_position_key_defender', 'player_position_key_forward',
       'player_position_forward', 'player_position_INT',
       'player_position_ruck', 'player_position_sub', 'player_position_wing'],
      dtype='object')

In [434]:
data.columns[78:131]

Index(['rating_points', 'winning_margin', 'won_match', '30_and_2',
       'high_goal_scorer', 'top_disposal_getter', 'top_clearance_getter',
       'midfielder_goals', 'kicks_proportion', 'marks_proportion',
       'handballs_proportion', 'disposals_proportion',
       'effective_disposals_proportion', 'goals_proportion',
       'afl_fantasy_score_proportion', 'SC_proportion', 'behinds_proportion',
       'hitouts_proportion', 'tackles_proportion', 'rebounds_proportion',
       'inside_fifties_proportion', 'clearances_proportion',
       'clangers_proportion', 'free_kicks_for_proportion',
       'free_kicks_against_proportion', 'contested_possessions_proportion',
       'uncontested_possessions_proportion', 'contested_marks_proportion',
       'marks_inside_fifty_proportion', 'one_percenters_proportion',
       'bounces_proportion', 'goal_assists_proportion',
       'centre_clearances_proportion', 'stoppage_clearances_proportion',
       'score_involvements_proportion', 'metres_gained_

In [436]:
data.columns[176:]

Index(['coaches_votes', 'average_votes_prev', 'brownlow_votes'], dtype='object')

# Feature Selection

In [437]:
potential_features = [
    'is_captain', 'player_position_defender', 'player_position_rover',
    'player_position_key_defender', 'player_position_key_forward',
    'player_position_forward', 'player_position_INT',
    'player_position_ruck', 'player_position_sub', 'player_position_wing',
    'rating_points', 'winning_margin', 'won_match', '30_and_2',
    'high_goal_scorer','top_disposal_getter', 'top_clearance_getter',
    'midfielder_goals',
    'kicks_proportion', 'marks_proportion',
    'handballs_proportion', 'disposals_proportion',
    'effective_disposals_proportion', 'goals_proportion',
    'afl_fantasy_score_proportion', 'SC_proportion', 'behinds_proportion',
    'hitouts_proportion', 'tackles_proportion', 'rebounds_proportion',
    'inside_fifties_proportion', 'clearances_proportion',
    'clangers_proportion', 'free_kicks_for_proportion',
    'free_kicks_against_proportion', 'contested_possessions_proportion',
    'uncontested_possessions_proportion', 'contested_marks_proportion',
    'marks_inside_fifty_proportion', 'one_percenters_proportion',
    'bounces_proportion', 'goal_assists_proportion',
    'centre_clearances_proportion', 'stoppage_clearances_proportion',
    'score_involvements_proportion', 'metres_gained_proportion',
    'turnovers_proportion', 'intercepts_proportion',
    'tackles_inside_fifty_proportion', 'contest_def_losses_proportion',
    'contest_def_one_on_ones_proportion',
    'contest_off_one_on_ones_proportion', 'contest_off_wins_proportion',
    'def_half_pressure_acts_proportion', 'effective_kicks_proportion',
    'f50_ground_ball_gets_proportion', 'ground_ball_gets_proportion',
    'hitouts_to_advantage_proportion', 'intercept_marks_proportion',
    'marks_on_lead_proportion', 'score_launches_proportion',
    'shots_at_goal_proportion', 'spoils_proportion',
    'coaches_votes', 'average_votes_prev'
    ]
target = 'brownlow_votes'

# Stepwise Selection

Define functions forwards and backward that conduct forward selection and backward selection respectively.

Then iterate through a stepwise selection process until the model AIC no longer decreases.

In [438]:
def forward(current_features, X, y_train, min_aic=10_000_000):
    
    next_feature = None
        
    for f in tqdm(potential_features, desc="Forward Selection"):

        # do not want to add same feature multiple times
        if f not in current_features:
            
            curr_features = list(current_features) + [f]


            # train model with new feature added
            X_train = X[curr_features]
            
            model = sm.MNLogit(y_train, sm.add_constant(X_train)).fit(disp=0)

            # calc aic
            local_aic = model.aic


            if local_aic < min_aic:

                next_feature = f

                min_aic = local_aic  
    
    return (min_aic, next_feature)

In [439]:
def backward(current_features, X, y_train, min_aic=10_000_000):
    
    next_feature = None
        
    for f in tqdm(potential_features, desc="Backward Selection"):

        # do not want to add same feature multiple times
        if f in current_features:
            
            curr_features = list(current_features)
            curr_features.remove(f)


            # train model with new feature added
            X_train = X[curr_features]
            
            model = sm.MNLogit(y_train, sm.add_constant(X_train)).fit(disp=0)

            # calc aic
            local_aic = model.aic


            if local_aic < min_aic:

                next_feature = f

                min_aic = local_aic  
    
    return (min_aic, next_feature)

In [440]:
# # since there are many features, this takes a long time to run.
# # features are saved to features.txt for loading 

# current_features = []
# previous_aic = float('inf')  # Start with a large AIC
# X = data.query('season < 2022')[potential_features]
# y_train = data.query('season < 2022')[target]

# while True:
#     # Forward selection
#     forward_aic, forward_feature = forward(current_features, X, y_train, previous_aic)
    
#     # Backward elimination
#     backward_aic, backward_feature = backward(current_features, X, y_train, previous_aic)
    
#     # Check which method (forward or backward) improves the model more
#     if forward_aic < backward_aic and forward_aic < previous_aic:
#         current_features.append(forward_feature)
#         print(f'added: {forward_feature}\n')
#         previous_aic = forward_aic
#     elif backward_aic < previous_aic:
#         current_features.remove(backward_feature)
#         print(f'removed: {backward_feature}\n')
#         previous_aic = backward_aic
#     else:
#         # If no improvement is made, break out of the loop
#         print('stepwise selection finished')
#         break

# # write features to text file to avoid stepwise selection when making changes
# with open('stepwise_selection_features.txt', 'w') as f:
#     for feature in current_features:
#         f.write(f'{feature}\n')

# SelectKBest feature selection

In [441]:
X_train, y_train = data.query('season < 2022')[potential_features], data.query('season < 2022')[target]

In [442]:
k_best = SelectKBest(score_func=f_classif, k=30)  # Select 10 best features

k_best.fit_transform(X_train, y_train)
current_features = list(k_best.get_feature_names_out())

In [443]:
current_features

['player_position_rover',
 'rating_points',
 '30_and_2',
 'high_goal_scorer',
 'top_disposal_getter',
 'top_clearance_getter',
 'midfielder_goals',
 'kicks_proportion',
 'marks_proportion',
 'handballs_proportion',
 'disposals_proportion',
 'effective_disposals_proportion',
 'goals_proportion',
 'afl_fantasy_score_proportion',
 'SC_proportion',
 'inside_fifties_proportion',
 'clearances_proportion',
 'contested_possessions_proportion',
 'uncontested_possessions_proportion',
 'centre_clearances_proportion',
 'stoppage_clearances_proportion',
 'score_involvements_proportion',
 'metres_gained_proportion',
 'turnovers_proportion',
 'effective_kicks_proportion',
 'ground_ball_gets_proportion',
 'score_launches_proportion',
 'shots_at_goal_proportion',
 'coaches_votes',
 'average_votes_prev']

In [444]:
# features that have been dropped during stepwise selection
dropped_features = set(potential_features) - set(current_features)
dropped_features

{'behinds_proportion',
 'bounces_proportion',
 'clangers_proportion',
 'contest_def_losses_proportion',
 'contest_def_one_on_ones_proportion',
 'contest_off_one_on_ones_proportion',
 'contest_off_wins_proportion',
 'contested_marks_proportion',
 'def_half_pressure_acts_proportion',
 'f50_ground_ball_gets_proportion',
 'free_kicks_against_proportion',
 'free_kicks_for_proportion',
 'goal_assists_proportion',
 'hitouts_proportion',
 'hitouts_to_advantage_proportion',
 'intercept_marks_proportion',
 'intercepts_proportion',
 'is_captain',
 'marks_inside_fifty_proportion',
 'marks_on_lead_proportion',
 'one_percenters_proportion',
 'player_position_INT',
 'player_position_defender',
 'player_position_forward',
 'player_position_key_defender',
 'player_position_key_forward',
 'player_position_ruck',
 'player_position_sub',
 'player_position_wing',
 'rebounds_proportion',
 'spoils_proportion',
 'tackles_inside_fifty_proportion',
 'tackles_proportion',
 'winning_margin',
 'won_match'}

In [500]:
# add and remove features based on domain knowledge statistical significance
# explore whether features improve model performance

feat = list(set(current_features) - set([
   'effective_kicks_proportion',
   'shots_at_goal_proportion',
   'clearances_proportion',
   'effective_disposals_proportion',
   'player_position_rover',
   'rating_points',
   'disposals_proportion',
   'marks_proportion',
   'score_launches_proportion',
   'turnovers_proportion',
   'stoppage_clearances_proportion',
   'inside_fifties_proportion',
   'score_involvements_proportion',
   'ground_ball_gets_proportion',
   'top_clearance_getter',
   'uncontested_possessions_proportion',
   '30_and_2'
   
])) 

feat += [
    'winning_margin',
    'is_captain',
    'hitouts_to_advantage_proportion',
    'intercept_marks_proportion',
    'spoils_proportion',
    'contested_marks_proportion',

    # 'bounces_proportion',
]

In [501]:
len(feat)

19

In [502]:
X_train, y_train = data.query('season < 2022')[feat], data.query('season < 2022')[target]

In [503]:
model2 = sm.MNLogit(y_train, sm.add_constant(X_train)).fit(disp=0, maxiter=10_000)
model2.aic

25179.633420511233

In [504]:
model2.summary()

0,1,2,3
Dep. Variable:,brownlow_votes,No. Observations:,76187.0
Model:,MNLogit,Df Residuals:,76127.0
Method:,MLE,Df Model:,57.0
Date:,"Sun, 01 Oct 2023",Pseudo R-squ.:,0.4911
Time:,23:07:13,Log-Likelihood:,-12530.0
converged:,True,LL-Null:,-24620.0
Covariance Type:,nonrobust,LLR p-value:,0.0

brownlow_votes=1,coef,std err,z,P>|z|,[0.025,0.975]
const,-10.2140,0.179,-57.130,0.000,-10.564,-9.864
handballs_proportion,28.5828,3.225,8.862,0.000,22.261,34.904
midfielder_goals,-0.0526,0.054,-0.981,0.326,-0.158,0.052
coaches_votes,0.2455,0.010,23.523,0.000,0.225,0.266
high_goal_scorer,1.3202,0.195,6.763,0.000,0.938,1.703
kicks_proportion,34.9365,6.194,5.641,0.000,22.797,47.076
metres_gained_proportion,10.2772,3.495,2.940,0.003,3.426,17.128
afl_fantasy_score_proportion,91.7896,8.579,10.700,0.000,74.976,108.603
top_disposal_getter,0.3764,0.100,3.757,0.000,0.180,0.573
average_votes_prev,0.5055,0.096,5.291,0.000,0.318,0.693


In [505]:
# final features are:
feat

['handballs_proportion',
 'midfielder_goals',
 'coaches_votes',
 'high_goal_scorer',
 'kicks_proportion',
 'metres_gained_proportion',
 'afl_fantasy_score_proportion',
 'top_disposal_getter',
 'average_votes_prev',
 'SC_proportion',
 'contested_possessions_proportion',
 'centre_clearances_proportion',
 'goals_proportion',
 'winning_margin',
 'is_captain',
 'hitouts_to_advantage_proportion',
 'intercept_marks_proportion',
 'spoils_proportion',
 'contested_marks_proportion']

In [506]:
# write features to text file so they can be opened in next notebook
with open('final_features.txt', 'w') as f:
    for feature in feat:
        f.write(f'{feature}\n')