In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor

In [62]:
# Load the Hitters data set; the first CSV column (the player name) becomes the index.
df = pd.read_csv(
    '../datasets/Hitters.csv',
    index_col=0,
)

In [63]:
# Preview the raw table; pandas elides the middle rows of a long frame.
df

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
-Andy Allanson,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
-Alan Ashby,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
-Alvin Davis,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
-Andre Dawson,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
-Andres Galarraga,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
-Willie McGee,497,127,7,65,48,37,5,2703,806,32,379,311,138,N,E,325,9,3,700.0,N
-Willie Randolph,492,136,5,76,50,94,12,5511,1511,39,897,451,875,A,E,313,381,20,875.0,A
-Wayne Tolleson,475,126,3,61,43,52,6,1700,433,7,217,93,146,A,W,37,113,7,385.0,A
-Willie Upshaw,573,144,9,85,60,78,8,3198,857,97,470,420,332,A,E,1314,131,12,960.0,A


In [64]:
# AIM: Predict a player's salary from the remaining features (the model below is fit on the log of the salary)

In [65]:
# Remove every record whose 'Salary' (the prediction target) is missing.
# `subset` is passed as a list: a bare string is only accepted by pandas >= 1.5,
# whereas a list of labels works on every pandas version.
df = df.dropna(axis=0, subset=['Salary'])

In [66]:
# One-hot encode the three categorical columns. drop_first=True keeps a single
# indicator per two-level column (League_N, Division_W, NewLeague_N).
df = pd.get_dummies(
    data=df,
    columns=['League', 'Division', 'NewLeague'],
    drop_first=True,
)

In [67]:
# Inspect the frame after dropping missing salaries and one-hot encoding.
df

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary,League_N,Division_W,NewLeague_N
-Alan Ashby,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0,True,True,True
-Alvin Davis,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0,False,True,False
-Andre Dawson,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,500.0,True,False,True
-Andres Galarraga,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,91.5,True,False,True
-Alfredo Griffin,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,750.0,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
-Willie McGee,497,127,7,65,48,37,5,2703,806,32,379,311,138,325,9,3,700.0,True,False,True
-Willie Randolph,492,136,5,76,50,94,12,5511,1511,39,897,451,875,313,381,20,875.0,False,False,False
-Wayne Tolleson,475,126,3,61,43,52,6,1700,433,7,217,93,146,37,113,7,385.0,False,True,False
-Willie Upshaw,573,144,9,85,60,78,8,3198,857,97,470,420,332,1314,131,12,960.0,False,False,False


In [68]:
# making a new column for 'Log Salary', which we will predict instead of 'Salary'

In [69]:
# Add the natural log of 'Salary' as a new column; it is the regression target.
df['Log Salary'] = df['Salary'].pipe(np.log)

In [70]:
# Features: every column except the raw and the log-transformed salary.
X = df.drop(columns=['Salary', 'Log Salary'])
# Target: the salary on the log scale.
y = df['Log Salary']

In [71]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [72]:
# Base model: a regression tree whose size is controlled through cost-complexity
# pruning by tuning the `ccp_alpha` parameter.
# A fixed random_state makes the tie-breaking between equally good splits
# repeatable, so the selected alpha and the test metrics can be reproduced.
# (The name `clf` is kept because later cells refer to it; the estimator is a regressor.)
clf = DecisionTreeRegressor(random_state=42)

In [86]:
# Candidate pruning strengths: 100 log-spaced values from 1e-4 to 1.
param_grid = dict(ccp_alpha=np.logspace(start=-4, stop=0, num=100))

In [87]:
# Cross-validated search over ccp_alpha. The negated MSE is used as the score
# because scikit-learn scorers follow a higher-is-better convention.
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    verbose=1,
)

In [88]:
# Run the search on the training split only; the test split stays untouched.
grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [89]:
# Cost-complexity penalty selected by the cross-validated search.
grid.best_params_

{'ccp_alpha': 0.0054622772176843425}

In [90]:
# Predict log salaries for the held-out rows using the fitted search object.
y_pred = grid.predict(X=X_test)

In [91]:
# Mean absolute error on the held-out set, in log-salary units (y is 'Log Salary').
mean_absolute_error(y_test, y_pred)

0.4401589857380733

In [92]:
# Root mean squared error on the held-out set, in log-salary units (y is 'Log Salary').
np.sqrt(mean_squared_error(y_test, y_pred))

0.5761668761539788