In [1]:
import pandas as pd
import numpy as np 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
df = pd.read_parquet('../../data/curated/clean_stats_13-22.parquet')

In [3]:
df.set_index('player_id', inplace=True)

In [4]:
features = []
with open('final_features.txt', 'r') as f:
    for line in f.readlines():
        features.append(line.strip())

target = 'brownlow_votes'

In [5]:
features

['winning_margin',
 'coaches_votes',
 'contested_possessions_proportion',
 'goals_proportion',
 'average_votes_prev',
 'marks_inside_fifty_proportion',
 'metres_gained_proportion',
 '30_and_2',
 'handballs_proportion',
 'intercept_marks_proportion',
 'is_captain',
 'high_goal_scorer',
 'spoils_proportion',
 'centre_clearances_proportion',
 'uncontested_possessions_proportion',
 'hitouts_to_advantage_proportion',
 'kicks_proportion',
 'player_position_rover',
 'player_position_key_defender',
 'player_position_key_forward',
 'player_position_forward',
 'player_position_ruck',
 'player_position_wing',
 'free_kicks_for']

In [7]:
X_train, y_train = df.query('season < 2022')[features], df.query('season < 2022')[target]
X_test, y_test = df.query('season == 2022')[features], df.query('season == 2022')[target]

In [8]:
# if assigned every performance a 0, would maintain an accuracy of 0.935 (benchmark)
list(y_test).count(0) / y_test.count()

0.9334975369458128

In [9]:
LR = LogisticRegression(max_iter=10_000, multi_class='multinomial', solver='saga', penalty='l1')

In [10]:
LR.fit(X_train, y_train)

In [11]:
predictions = LR.predict(X_test)

In [12]:
print(classification_report(y_pred=predictions, y_true=y_test))

              precision    recall  f1-score   support

         0.0       0.96      0.99      0.98      8338
         1.0       0.25      0.01      0.01       198
         2.0       0.41      0.06      0.10       198
         3.0       0.49      0.66      0.56       198

    accuracy                           0.94      8932
   macro avg       0.53      0.43      0.41      8932
weighted avg       0.92      0.94      0.93      8932



In [13]:
for i, col in enumerate(np.transpose(LR.coef_)):
    rounded_col = [round(e, 3) for e in col]
    print(f'{features[i]:>35}: {rounded_col}')

                     winning_margin: [-0.008, -0.001, 0.0, 0.002]
                      coaches_votes: [-0.394, -0.036, 0.082, 0.276]
   contested_possessions_proportion: [-6.032, 2.094, 1.325, 1.173]
                   goals_proportion: [-7.253, 2.246, 2.189, 1.363]
                 average_votes_prev: [-0.94, -0.13, 0.391, 0.634]
      marks_inside_fifty_proportion: [-3.293, 0.0, 0.0, 2.247]
           metres_gained_proportion: [-8.003, 3.0, 2.08, 1.49]
                           30_and_2: [-0.868, -0.2, 0.307, 0.761]
               handballs_proportion: [-8.114, 2.496, 2.335, 1.846]
         intercept_marks_proportion: [-3.153, 1.823, 0.783, 0.0]
                         is_captain: [-0.124, -0.031, 0.104, 0.042]
                   high_goal_scorer: [-1.364, -0.002, 0.078, 0.404]
                  spoils_proportion: [0.695, 0.0, -0.181, 0.0]
       centre_clearances_proportion: [-2.218, 0.634, -0.061, 1.411]
 uncontested_possessions_proportion: [-9.319, 3.104, 2.798, 1.984]
    hito

In [15]:
df_2022 = df.query('season == 2022')
df_2022['predicted_votes'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022['predicted_votes'] = predictions


In [16]:
df_2022.groupby(['player_first_name', 'player_last_name'])['predicted_votes'].sum().sort_values(ascending=False)[:10]

player_first_name  player_last_name
Touk               Miller              30.0
Clayton            Oliver              29.0
Lachie             Neale               27.0
Christian          Petracca            26.0
Jeremy             Cameron             22.0
Andrew             Brayshaw            21.0
Patrick            Cripps              20.0
Connor             Rozee               18.0
Darcy              Parish              15.0
Callum             Mills               15.0
Name: predicted_votes, dtype: float64

In [17]:
cols = [
    'match_id', 'match_round', 'player_first_name', 'player_last_name', 'player_team',
]
df_2022 = df_2022[cols]

In [18]:
df_2022.head()

Unnamed: 0_level_0,match_id,match_round,player_first_name,player_last_name,player_team
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11904,16117,1,Tom,Liberatore,Western Bulldogs
11945,16117,1,Steven,May,Melbourne
11972,16117,1,Max,Gawn,Melbourne
12015,16117,1,Tom,McDonald,Melbourne
12034,16117,1,Adam,Tomlinson,Melbourne


In [19]:
prob_2022 = pd.DataFrame(LR.predict_proba(X_test))

In [20]:
prob_2022['player_id'] = y_test.index

In [21]:
prob_2022.set_index('player_id', inplace=True)
prob_2022['predictions'] = predictions

In [22]:
prob_2022.head()

Unnamed: 0_level_0,0,1,2,3,predictions
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11904,0.995934,0.002583,0.001251,0.000232,0.0
11945,0.992242,0.005301,0.002085,0.000373,0.0
11972,0.929986,0.032486,0.030978,0.006549,0.0
12015,0.99208,0.00439,0.003016,0.000514,0.0
12034,0.995563,0.00295,0.001256,0.000231,0.0


In [23]:
final_df = pd.concat([df_2022, prob_2022], axis=1)

In [24]:
final_df.head()

Unnamed: 0_level_0,match_id,match_round,player_first_name,player_last_name,player_team,0,1,2,3,predictions
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11904,16117,1,Tom,Liberatore,Western Bulldogs,0.995934,0.002583,0.001251,0.000232,0.0
11945,16117,1,Steven,May,Melbourne,0.992242,0.005301,0.002085,0.000373,0.0
11972,16117,1,Max,Gawn,Melbourne,0.929986,0.032486,0.030978,0.006549,0.0
12015,16117,1,Tom,McDonald,Melbourne,0.99208,0.00439,0.003016,0.000514,0.0
12034,16117,1,Adam,Tomlinson,Melbourne,0.995563,0.00295,0.001256,0.000231,0.0


In [25]:
final_df.to_parquet('../../data/curated/2022_vote_probs.parquet')

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
