# Predicting Salary Using Regression Model
---

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.ensemble import AdaBoostRegressor, VotingRegressor

import pickle

In [2]:
# Read the historical salary data, then discard the 'Unnamed: 0' column.
df = pd.read_csv('../data/past_salaries.csv')
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,playerID,yearID,teamID_x,lgID_x,G,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS,salary
0,abreujo02,2015,CHA,AL,154,613,88,178,34,3,30,101.0,39,140.0,0.29,0.347,0.502,0.849,8666000
1,ackledu01,2015,SEA,AL,85,186,22,40,8,1,6,19.0,14,38.0,0.215,0.27,0.366,0.636,2600000
2,ackledu01,2015,NYA,AL,23,52,6,15,3,2,4,11.0,4,7.0,0.288,0.333,0.654,0.987,2600000
3,adamsma01,2015,SLN,NL,60,175,14,42,9,0,5,24.0,10,41.0,0.24,0.28,0.377,0.657,534000
4,ahmedni01,2015,ARI,NL,134,421,49,95,17,6,9,34.0,29,81.0,0.226,0.275,0.359,0.634,508500


In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(878, 19)

## Create X and y variables for Salary

In [4]:
# Target is the salary column; features are whatever remains once the
# identifier/context columns and the target itself are removed.
y = df['salary']
X = df.drop(
    columns=[
        'playerID',
        'salary',
        'teamID_x',
        'lgID_x',
        'yearID',
        'G',
    ]
)
X.head()

Unnamed: 0,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS
0,613,88,178,34,3,30,101.0,39,140.0,0.29,0.347,0.502,0.849
1,186,22,40,8,1,6,19.0,14,38.0,0.215,0.27,0.366,0.636
2,52,6,15,3,2,4,11.0,4,7.0,0.288,0.333,0.654,0.987
3,175,14,42,9,0,5,24.0,10,41.0,0.24,0.28,0.377,0.657
4,421,49,95,17,6,9,34.0,29,81.0,0.226,0.275,0.359,0.634


## Train Test Split

In [5]:
# Hold out 30% of the rows for evaluation and fit on the remaining 70%.
# The earlier `train_size = 0.3` fitted on only 30% of the data and evaluated
# on 70%, which looks like a mix-up with `test_size`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Standard Scaler

In [6]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [7]:
# Fit PCA on the scaled training split only and project both splits with it.
# No n_components is passed, so the library's default choice applies.
pca = PCA(random_state=42)
X_train_p = pca.fit_transform(X_train_sc)
X_test_p = pca.transform(X_test_sc)

In [8]:
def reg_metrics(x, y, model):
    """Print the score, MSE and RMSE of a fitted model on the given data.

    Parameters
    ----------
    x : features passed to ``model.predict`` and ``model.score``.
    y : true target values corresponding to ``x``.
    model : fitted estimator exposing ``predict`` and ``score``.

    Returns
    -------
    None. The model repr and the three metrics are printed.
    """
    y_pred = model.predict(x)

    mse = metrics.mean_squared_error(y, y_pred)
    # Take the square root directly instead of passing ``squared=False``:
    # that keyword is deprecated in recent scikit-learn releases. For a
    # single target column the two forms give the same value.
    rmse = np.sqrt(mse)
    score = model.score(x, y)

    print(model)
    print("R2: ", score)
    print("MSE: ", mse)
    print("RMSE: ", rmse)

In [9]:
def fit_models(x, y, x_test, y_test, model):
    """Fit ``model`` on the training data and print metrics for both splits.

    Parameters
    ----------
    x, y : training features and targets; the model is fitted on these.
    x_test, y_test : held-out features and targets, used only for reporting.
    model : estimator exposing ``fit`` (and whatever ``reg_metrics`` needs).

    Returns
    -------
    None. ``model`` is fitted in place and the metrics are printed.
    """
    model.fit(x, y)

    # Same report for each split: blank line, label, then the metrics.
    splits = (
        ('Train', x, y),
        ('Test', x_test, y_test),
    )
    for label, features, target in splits:
        print()
        print(label)
        reg_metrics(features, target, model)

## Voting Model

In [10]:
# Candidate hyperparameter values for the cross-validated linear models.
# Ridge: 220 log-spaced alphas from 10**0 to 10**5.
r_alphas = np.logspace(start=0, stop=5, num=220)
# Lasso: 100 log-spaced alphas from 10**-3 to 10**1.
l_alphas = np.logspace(start=-3, stop=1, num=100)
# Elastic net: 100 evenly spaced alphas from 0.01 to 1, with a fixed l1 ratio.
e_alphas = np.linspace(start=0.01, stop=1, num=100)
enet_ratio = 0.05

In [11]:
# Combine an AdaBoost regressor with three cross-validated linear models
# (ridge, lasso, elastic net) in a single VotingRegressor.
vote = VotingRegressor(
    estimators=[
        ('ad', AdaBoostRegressor(random_state=42)),
        ('rid', RidgeCV(alphas=r_alphas, scoring='r2', cv=5)),
        (
            'las',
            LassoCV(alphas=l_alphas, cv=5, max_iter=50_000, n_jobs=-1),
        ),
        (
            'enet_model',
            ElasticNetCV(
                alphas=e_alphas,
                l1_ratio=enet_ratio,
                cv=5,
                max_iter=3000,
                n_jobs=-1,
            ),
        ),
    ]
)

In [12]:
# Fit the ensemble on the PCA-transformed training split and print metrics
# for the training and test splits.
fit_models(X_train_p, y_train, X_test_p, y_test, vote)


Train
VotingRegressor(estimators=[('ad', AdaBoostRegressor(random_state=42)),
                            ('rid',
                             RidgeCV(alphas=array([1.00000000e+00, 1.05397680e+00, 1.11086709e+00, 1.17082814e+00,
       1.23402569e+00, 1.30063444e+00, 1.37083853e+00, 1.44483200e+00,
       1.52281940e+00, 1.60501632e+00, 1.69164996e+00, 1.78295980e+00,
       1.87919826e+00, 1.98063137e+00, 2.08753951e+00, 2.20021820e+00,
       2.31897894e...
       0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44,
       0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54, 0.55,
       0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66,
       0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77,
       0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88,
       0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99,
       1.  ]),
                                          cv=5, l1_ratio=0.05, max_iter=3000,
    

In [13]:
# Persist the fitted ensemble.
# NOTE(review): only `vote` is saved; the fitted scaler (`ss`) and PCA (`pca`)
# that prepare its inputs are not stored here — confirm that consumers of this
# pickle can reproduce that preprocessing.
with open('../model/salary_model.pkl', 'wb') as pickle_out:
    # pickle.dump returns None, so its result is not assigned back to the handle.
    pickle.dump(vote, pickle_out)

In [14]:
# Read the player forecasts, then discard the 'Unnamed: 0' column.
# Note: this rebinds `df`, replacing the salary table loaded earlier.
df = pd.read_csv('../data/player_forecasts.csv')
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,name,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS
0,Jose Abreu,300.0,73.0,75.0,0.0,0.0,0.0,75.0,0.0,75.0,0.264,0.326,0.485,0.812
1,Willy Adames,197.0,0.0,65.0,0.0,0.0,0.0,1.0,0.0,65.0,0.24,0.303,0.391,0.694
2,Ehire Adrianza,113.0,0.0,53.0,0.0,0.0,0.0,0.0,0.0,53.0,0.233,0.3,0.363,0.663
3,Jesus Aguilar,210.0,0.0,70.0,0.0,0.0,0.0,70.0,0.0,70.0,0.261,0.341,0.464,0.805
4,Nick Ahmed,225.0,0.0,75.0,0.0,0.0,0.0,0.0,0.0,75.0,0.226,0.282,0.362,0.644


In [15]:
# Forecast features: every column except the player name.
X = df.drop(columns='name')

In [16]:
# Apply the scaler and PCA fitted on the training data to the forecast features.
# The transformed arrays get their own names instead of overwriting `X`, so
# re-running this cell cannot scale/project data that was already transformed.
X_forecast_sc = ss.transform(X)
X_forecast_p = pca.transform(X_forecast_sc)

preds = vote.predict(X_forecast_p)

# One predicted salary per player, indexed by player name.
pred_df = pd.DataFrame(preds, columns=['salary'], index=df['name'])

In [17]:
# Write the predicted salaries (indexed by player name) to CSV.
pred_df.to_csv('../data/pred_salary.csv')

In [18]:
# Also persist the prediction frame as a pickle.
with open('../pickles/pred_salary.pkl', 'wb') as pickle_out:
    # pickle.dump returns None, so its result is not assigned back to the handle.
    pickle.dump(pred_df, pickle_out)