# Model Training: Centers

The task here is to train a model that predicts the value of NHL players who play the Center position. The choice of features is based on the EDA done in 01_C_EDA.ipynb (folder 4_ExploratoryDataAnalysis).

## Import our data

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [7]:
# Load in our data
filepath = '../../Data/entitiesResolved/merged_data_clean.csv'
data = pd.read_csv(filepath)

# Minimum games played for a player-season to be kept in the training set
MIN_GAMES_PLAYED = 60

# Keep rows where 'POSITION' is 'c', 'c, l' or 'c, r' ...
is_center = data['POSITION'].isin(['c', 'c, l', 'c, r'])

# ... and only players with MIN_GAMES_PLAYED or more games played
has_enough_games = data['GP'] >= MIN_GAMES_PLAYED

# .copy() gives `centers` its own data, so later cells can add columns to it
# without pandas raising SettingWithCopyWarning (it would otherwise be a
# filtered slice of `data`).
centers = data[is_center & has_enough_games].copy()
centers.shape

(1940, 116)

## Features to try:

Recall that the EDA done revealed that we should try to use the following features:

1. TOI/GP
2. XGF/60 - and possibly combining it with SCF/60, FF/60, HDCF/60, CF/60, SF/60, and MDCF/60 to create a new feature.
3. GF/60
4. TOTAL ASSISTS/60 - But possibly using FIRST ASSISTS/60 and SECOND ASSISTS/60 instead.
5. GOALS/60
6. Handedness

In [8]:
# Select the features we want to use
numerical_features = ['XGF/60', 'GF/60', 'FIRST ASSISTS/60', 'SECOND ASSISTS/60', 'GOALS/60'] # Removed TOI/GP
categorical_features = ['HANDED']

# Scale the numeric columns and one-hot encode the categorical ones.
# Columns not listed here are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category that was not present in the
        # training split is encoded as all zeros at predict time instead of
        # raising an error.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', LinearRegression())])

# The full frame is passed as X; the preprocessor selects the feature columns
# by name, so the target column never reaches the regressor.
X = centers
y = centers['Y_SALARY_CAP_PERCENTAGE']

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# The target is a fraction of the salary cap, so the MSE is in (fraction)^2 units
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.0007722088528824966


In [9]:
# Predict the salary-cap percentage for every center. The pipeline's
# ColumnTransformer picks the feature columns it needs out of the full frame.
pred_cap_share = pipeline.predict(centers)

# Convert the predicted percentage into dollars using that row's salary cap,
# rounded to the nearest dollar (a plain int cast would truncate toward zero).
pred_aav = (pred_cap_share * centers['Y_SALARY_CAP']).round()

# assign() returns a new frame, so this cell neither writes into a slice of
# `data` nor depends on how many times it has been run.
centers = centers.assign(
    PRED_SALARY_PERCENTAGE=pred_cap_share,
    # Stored as a display string such as "$2,224,030" (not numeric)
    PRED_SALARY_CAP_AAV=pred_aav.map('${:,.0f}'.format),
)

# head(-5) returns every row except the last five; the notebook truncates the display
centers.head(-5)

Unnamed: 0,POSITION,PLAYER,TEAM,TOI,GP,TOI/GP,GOALS/60,TOTAL ASSISTS/60,FIRST ASSISTS/60,SECOND ASSISTS/60,...,SALARY,BASE SALARY,S.BONUS,P.BONUS,SEASON,Y_SALARY_CAP,Y_SALARY_CAP_PERCENTAGE,DECEASED,PRED_SALARY_PERCENTAGE,PRED_SALARY_CAP_AAV
19,"c, l",andrew cogliano,col,1120.283333,82,13.661992,0.96,1.45,0.75,0.70,...,850000,850000,0,350000,2007-08,50300000,0.016899,0,0.044215,"$2,224,030"
28,c,antoine vermette,-,1423.616667,81,17.575514,1.01,1.22,0.72,0.51,...,1075000,1075000,0,0,2007-08,50300000,0.019881,0,0.040049,"$2,014,466"
30,c,anze kopitar,lak,1696.800000,82,20.692683,1.13,1.59,0.81,0.78,...,850000,765000,85000,134200,2007-08,50300000,0.016335,0,0.066530,"$3,346,472"
40,c,boyd gordon,-,1054.166667,67,15.733831,0.40,0.51,0.23,0.28,...,650000,650000,0,0,2007-08,50300000,0.012922,0,0.012267,"$617,029"
43,c,brad richards,-,1736.433333,74,23.465315,0.69,1.45,0.86,0.59,...,7800000,7800000,0,0,2007-08,50300000,0.155070,0,0.051155,"$2,573,113"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12878,c,tim stützle,ott,1438.666667,68,21.156863,0.71,1.96,0.75,1.21,...,5000000,5000000,0,0,2023-24,83500000,0.100000,0,0.080291,"$6,704,266"
12891,c,trent frederic,bos,979.833333,71,13.800469,1.04,1.29,0.86,0.43,...,2300000,2050000,250000,0,2023-24,83500000,0.027545,0,0.048963,"$4,088,427"
12892,c,trevor lewis,lak,785.083333,69,11.378019,0.61,0.46,0.31,0.15,...,775000,775000,0,0,2023-24,83500000,0.009281,0,0.020212,"$1,687,676"
12910,"c, r",tyler toffoli,wpg,1158.666667,67,17.293532,1.55,1.04,0.62,0.41,...,3500000,3500000,0,0,2023-24,83500000,0.050898,0,0.070882,"$5,918,658"


In [62]:
# Select the top 15 centers by predicted salary-cap percentage.
# Column names match the ones created in the prediction cell (PRED_*).
top_centers = centers.sort_values(by='PRED_SALARY_PERCENTAGE', ascending=False).head(15)
top_centers[['PLAYER', 'SEASON'] + numerical_features + categorical_features + ['Y_SALARY_CAP_PERCENTAGE', 'PRED_SALARY_PERCENTAGE', 'PRED_SALARY_CAP_AAV']]

Unnamed: 0,PLAYER,SEASON,XGF/60,GF/60,FIRST ASSISTS/60,SECOND ASSISTS/60,GOALS/60,HANDED,Y_SALARY_CAP_PERCENTAGE,PREDICTED_SALARY_PERCENTAGE,PREDICTED_SALARY_CAP_AAV
12328,connor mcdavid,2023-24,5.48,6.12,2.31,1.37,1.11,Left,0.149701,0.135799,"$11,339,186"
11482,connor mcdavid,2022-23,5.21,6.21,1.96,0.95,2.09,Left,0.151515,0.123299,"$10,172,135"
9730,connor mcdavid,2020-21,4.57,6.05,2.42,1.06,1.6,Left,0.153374,0.118978,"$9,696,711"
10590,connor mcdavid,2021-22,5.12,5.13,1.67,1.02,1.5,Left,0.153374,0.115533,"$9,415,925"
11774,leon draisaitl,2022-23,4.98,6.25,2.07,0.55,1.79,Left,0.10303,0.111221,"$9,175,722"
10815,jonathan huberdeau,2021-22,4.37,5.75,2.05,1.24,1.16,Left,0.072393,0.109352,"$8,912,222"
12590,leon draisaitl,2023-24,5.08,5.51,1.38,0.99,1.51,Left,0.101796,0.108184,"$9,033,388"
12698,nathan mackinnon,2023-24,4.72,6.04,1.48,1.37,1.6,Right,0.150898,0.106267,"$8,873,280"
9006,evgeni malkin,2019-20,4.49,4.92,1.51,1.23,1.4,Left,0.116564,0.104792,"$8,540,543"
11567,evgeni malkin,2022-23,5.03,4.33,1.42,0.79,1.06,Left,0.073939,0.103291,"$8,521,487"


In [63]:
# Select the 15 centers whose predicted salary percentage exceeds their actual
# salary percentage by the most (positive diff = model predicts more than they
# were paid). The sort is on the signed difference, so players predicted far
# *below* their actual pay are not shown here.
centers = centers.assign(
    SALARY_PERCENTAGE_DIFF=centers['PRED_SALARY_PERCENTAGE'] - centers['Y_SALARY_CAP_PERCENTAGE']
)
largest_gap_centers = centers.sort_values(by='SALARY_PERCENTAGE_DIFF', ascending=False).head(15)
largest_gap_centers[['PLAYER', 'SEASON'] + numerical_features + categorical_features + ['Y_SALARY_CAP_PERCENTAGE', 'PRED_SALARY_PERCENTAGE']]

Unnamed: 0,PLAYER,SEASON,XGF/60,GF/60,FIRST ASSISTS/60,SECOND ASSISTS/60,GOALS/60,HANDED,Y_SALARY_CAP_PERCENTAGE,PREDICTED_SALARY_PERCENTAGE
396,sidney crosby,2007-08,4.28,4.89,1.79,0.81,1.3,Left,0.016899,0.097426
7261,connor mcdavid,2017-18,4.17,4.14,1.32,0.95,1.39,Left,0.012333,0.09158
8488,mitchell marner,2018-19,4.29,4.91,1.92,0.59,0.96,Right,0.011247,0.089244
6437,connor mcdavid,2016-17,4.03,4.57,1.52,0.9,1.04,Left,0.012671,0.088357
8625,sebastian aho,2018-19,4.36,4.28,1.05,0.87,1.09,Left,0.011635,0.086077
7974,auston matthews,2018-19,4.3,4.37,1.09,0.62,1.76,Left,0.011635,0.085615
5587,connor mcdavid,2015-16,3.88,4.17,1.13,1.13,1.13,Left,0.012955,0.084276
636,evgeni malkin,2008-09,3.86,4.65,1.53,1.01,1.14,Left,0.017358,0.088107
7583,mathew barzal,2017-18,3.78,4.82,1.44,1.15,0.91,Right,0.011511,0.081605
12060,tage thompson,2022-23,4.29,5.26,1.12,0.83,1.94,Right,0.01697,0.085472


In [39]:
# Summary statistics of the predicted salary-cap percentage
# (the column created in the prediction cell is PRED_SALARY_PERCENTAGE)
centers['PRED_SALARY_PERCENTAGE'].describe()

count    2633.000000
mean        0.042474
std         0.021812
min        -0.009111
25%         0.025756
50%         0.041855
75%         0.057185
max         0.139332
Name: PREDICTED_SALARY, dtype: float64

In [40]:
# Summary statistics of the actual salary-cap percentage, for comparison
# with the distribution of the predictions
actual_cap_share = centers['Y_SALARY_CAP_PERCENTAGE']
actual_cap_share.describe()

count    2633.000000
mean        0.042197
std         0.034639
min         0.007862
25%         0.012512
50%         0.028931
75%         0.065826
max         0.157233
Name: Y_SALARY_CAP_PERCENTAGE, dtype: float64

In [30]:
# Find all centers with a predicted salary percentage that is negative.
# An unconstrained linear regression can predict below zero; these rows show
# where that happens. Column name matches the prediction cell (PRED_*).
negative_centers = centers[centers['PRED_SALARY_PERCENTAGE'] < 0]
negative_centers[['PLAYER', 'SEASON'] + numerical_features + categorical_features + ['Y_SALARY_CAP_PERCENTAGE', 'PRED_SALARY_PERCENTAGE']]

Unnamed: 0,PLAYER,SEASON,XGF/60,GF/60,TOTAL ASSISTS/60,GOALS/60,HANDED,Y_SALARY_CAP_PERCENTAGE,PREDICTED_SALARY
1900,jay beagle,2010-11,1.43,1.29,0.18,0.37,Right,0.008628,-0.002876
2603,jerred smithson,2011-12,1.46,1.56,0.37,0.07,Right,0.012442,-0.000736
2624,john madden,2011-12,1.26,0.65,0.0,0.48,Left,0.009331,-0.004848
3140,colton gillies,2012-13,1.06,1.07,0.27,0.27,Left,0.010417,-0.00475
3166,darroll powe,2012-13,1.33,0.95,0.0,0.0,Left,0.017778,-0.006673
3314,jerred smithson,2012-13,1.33,1.13,0.38,0.38,Right,0.013333,-0.000507
3602,ryan white,2012-13,1.32,1.23,0.0,0.25,Right,0.011458,-0.008739
4196,marc-andre cliche,2013-14,1.27,0.82,0.45,0.07,Right,0.008359,-0.001512
4722,cody mccormick,2014-15,1.2,0.93,0.47,0.16,Right,0.021739,-0.001951
5029,manny malhotra,2014-15,1.02,0.67,0.29,0.1,Left,0.012319,-0.00549


# Preliminary Conclusions
It seems like we need to make some tweaks. I can see two ways to think about this:
1. We're missing something in our feature set that predicts how valuable players like Connor McDavid and Nathan MacKinnon really are.
2. Our model thinks that the highest paid players are significantly overpaid considering how well they produce on the ice.