In [459]:
# Import dependencies (all imports live in this one cell).
from collections import Counter

import pandas as pd
from path import Path
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [460]:
# Read the cleaned pitcher salary CSV into a DataFrame and display it.
file_path = Path('pitcher_salaries_cleaned.csv')
df = pd.read_csv(filepath_or_buffer=file_path)
df

Unnamed: 0,Year,Full Name,Age,Salary,ERA,Hits,Earned Runs,Strike Outs,Home Runs,Wins,Losses,Outs Pitched,Batters Faced by Pitcher,Games Finished,Weight,Height,League,Team,Games Started
0,1990,AbbottJim,23.0,185000.0,4.51,246,106,105,16,10,14,635,925.0,0,200.0,75.0,AL,CAL,33
1,1990,AbbottPaul,23.0,100000.0,5.97,37,23,25,0,0,5,104,162.0,0,185.0,75.0,AL,MIN,7
2,1990,AldredScott,22.0,100000.0,3.77,13,6,7,0,1,2,43,63.0,0,195.0,76.0,AL,DET,3
3,1990,AndersonAllan,26.0,300000.0,4.53,214,95,82,20,7,18,566,797.0,0,178.0,71.0,AL,MIN,31
4,1990,AppierKevin,23.0,100000.0,2.76,179,57,127,13,12,8,557,784.0,1,180.0,74.0,AL,KCA,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4932,2016,WorleyVance,29.0,2600000.0,3.53,84,34,56,11,2,2,260,365.0,13,240.0,74.0,AL,BAL,4
4933,2016,WrightMike,26.0,510500.0,5.79,81,48,50,12,3,4,224,328.0,5,240.0,78.0,AL,BAL,12
4934,2016,WrightSteven,32.0,514500.0,3.33,138,58,127,12,13,6,470,656.0,0,215.0,74.0,AL,BOS,24
4935,2016,YoungChris,37.0,4250000.0,6.19,104,61,94,28,3,9,266,406.0,7,255.0,82.0,AL,KCA,13


In [461]:
# Clean data: drop the player name, team and season year columns so they
# are not carried into the feature matrix.
# `columns=` replaces the positional axis argument (`df.drop(labels, 1)`),
# which pandas deprecated in 1.3 and no longer accepts in 2.0+.
df = df.drop(columns=["Full Name", "Team", "Year"])
df

Unnamed: 0,Age,Salary,ERA,Hits,Earned Runs,Strike Outs,Home Runs,Wins,Losses,Outs Pitched,Batters Faced by Pitcher,Games Finished,Weight,Height,League,Games Started
0,23.0,185000.0,4.51,246,106,105,16,10,14,635,925.0,0,200.0,75.0,AL,33
1,23.0,100000.0,5.97,37,23,25,0,0,5,104,162.0,0,185.0,75.0,AL,7
2,22.0,100000.0,3.77,13,6,7,0,1,2,43,63.0,0,195.0,76.0,AL,3
3,26.0,300000.0,4.53,214,95,82,20,7,18,566,797.0,0,178.0,71.0,AL,31
4,23.0,100000.0,2.76,179,57,127,13,12,8,557,784.0,1,180.0,74.0,AL,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4932,29.0,2600000.0,3.53,84,34,56,11,2,2,260,365.0,13,240.0,74.0,AL,4
4933,26.0,510500.0,5.79,81,48,50,12,3,4,224,328.0,5,240.0,78.0,AL,12
4934,32.0,514500.0,3.33,138,58,127,12,13,6,470,656.0,0,215.0,74.0,AL,24
4935,37.0,4250000.0,6.19,104,61,94,28,3,9,266,406.0,7,255.0,82.0,AL,13


In [462]:
# One-hot encode the League column into League_* indicator columns.
df = pd.get_dummies(
    df,
    prefix="League",
    columns=["League"],
)
df

Unnamed: 0,Age,Salary,ERA,Hits,Earned Runs,Strike Outs,Home Runs,Wins,Losses,Outs Pitched,Batters Faced by Pitcher,Games Finished,Weight,Height,Games Started,League_AL,League_NL
0,23.0,185000.0,4.51,246,106,105,16,10,14,635,925.0,0,200.0,75.0,33,1,0
1,23.0,100000.0,5.97,37,23,25,0,0,5,104,162.0,0,185.0,75.0,7,1,0
2,22.0,100000.0,3.77,13,6,7,0,1,2,43,63.0,0,195.0,76.0,3,1,0
3,26.0,300000.0,4.53,214,95,82,20,7,18,566,797.0,0,178.0,71.0,31,1,0
4,23.0,100000.0,2.76,179,57,127,13,12,8,557,784.0,1,180.0,74.0,24,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4932,29.0,2600000.0,3.53,84,34,56,11,2,2,260,365.0,13,240.0,74.0,4,1,0
4933,26.0,510500.0,5.79,81,48,50,12,3,4,224,328.0,5,240.0,78.0,12,1,0
4934,32.0,514500.0,3.33,138,58,127,12,13,6,470,656.0,0,215.0,74.0,24,1,0
4935,37.0,4250000.0,6.19,104,61,94,28,3,9,266,406.0,7,255.0,82.0,13,1,0


In [463]:
# Summary statistics for the Salary column (used to judge how to bin it).
df.loc[:, "Salary"].describe()

count    4.937000e+03
mean     3.011304e+06
std      4.265619e+06
min      1.000000e+05
25%      3.270000e+05
50%      9.800000e+05
75%      4.000000e+06
max      3.300000e+07
Name: Salary, dtype: float64

In [464]:
# Cut Salary into three quantile-based bins (roughly equal row counts each).
sal_bins = pd.qcut(x=df["Salary"], q=3)

In [465]:
# See how many rows fall in each salary bin; with quantile bins the counts
# should be close to equal.
sal_bins.value_counts()

(420000.0, 2900000.0]      1649
(99999.999, 420000.0]      1647
(2900000.0, 33000000.0]    1641
Name: Salary, dtype: int64

In [466]:
# Add a "Salary-bin" column: three quantile bins of Salary, labelled
# low / med / high in ascending salary order.
df["Salary-bin"] = pd.qcut(df["Salary"], q=3, labels=['low', 'med', 'high'])
df.head()

Unnamed: 0,Age,Salary,ERA,Hits,Earned Runs,Strike Outs,Home Runs,Wins,Losses,Outs Pitched,Batters Faced by Pitcher,Games Finished,Weight,Height,Games Started,League_AL,League_NL,Salary-bin
0,23.0,185000.0,4.51,246,106,105,16,10,14,635,925.0,0,200.0,75.0,33,1,0,low
1,23.0,100000.0,5.97,37,23,25,0,0,5,104,162.0,0,185.0,75.0,7,1,0,low
2,22.0,100000.0,3.77,13,6,7,0,1,2,43,63.0,0,195.0,76.0,3,1,0,low
3,26.0,300000.0,4.53,214,95,82,20,7,18,566,797.0,0,178.0,71.0,31,1,0,low
4,23.0,100000.0,2.76,179,57,127,13,12,8,557,784.0,1,180.0,74.0,24,1,0,low


In [467]:
# Create our features: every column except the raw salary and its binned
# label, so the target cannot leak into the feature matrix.
x_cols = [col for col in df.columns if col not in ("Salary", "Salary-bin")]
X = df[x_cols]


# Create our target: the salary-bin label of each row.
# Series.ravel() is deprecated (pandas 2.2+). For this categorical column,
# `.array` returns the same 1-D Categorical that ravel() used to return.
y = df["Salary-bin"].array

In [468]:
# Inspect the feature matrix.
X

Unnamed: 0,Age,ERA,Hits,Earned Runs,Strike Outs,Home Runs,Wins,Losses,Outs Pitched,Batters Faced by Pitcher,Games Finished,Weight,Height,Games Started,League_AL,League_NL
0,23.0,4.51,246,106,105,16,10,14,635,925.0,0,200.0,75.0,33,1,0
1,23.0,5.97,37,23,25,0,0,5,104,162.0,0,185.0,75.0,7,1,0
2,22.0,3.77,13,6,7,0,1,2,43,63.0,0,195.0,76.0,3,1,0
3,26.0,4.53,214,95,82,20,7,18,566,797.0,0,178.0,71.0,31,1,0
4,23.0,2.76,179,57,127,13,12,8,557,784.0,1,180.0,74.0,24,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4932,29.0,3.53,84,34,56,11,2,2,260,365.0,13,240.0,74.0,4,1,0
4933,26.0,5.79,81,48,50,12,3,4,224,328.0,5,240.0,78.0,12,1,0
4934,32.0,3.33,138,58,127,12,13,6,470,656.0,0,215.0,74.0,24,1,0
4935,37.0,6.19,104,61,94,28,3,9,266,406.0,7,255.0,82.0,13,1,0


In [469]:
# Inspect the target labels.
y

['low', 'low', 'low', 'low', 'low', ..., 'med', 'med', 'med', 'high', 'high']
Length: 4937
Categories (3, object): ['low' < 'med' < 'high']

In [470]:
# Split features and target into train and test sets; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [471]:
# Build a StandardScaler and fit it on the training features only, so no
# test-set statistics are used when scaling.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [472]:
# Instantiate the random forest classifier: 128 trees, seeded so results
# are reproducible.
rf_model = RandomForestClassifier(random_state=1, n_estimators=128)

In [473]:
# Train the forest on the scaled training features and their salary bins.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [474]:
# Predict a salary bin for every row of the scaled test features.
predictions = rf_model.predict(X=X_test_scaled)
predictions

array(['high', 'low', 'high', ..., 'low', 'high', 'low'], dtype=object)

In [475]:
# Balanced accuracy of the test-set predictions against the true salary bins.
balanced_accuracy_score(y_test, predictions)

0.6265550469435165

In [476]:
# Plain accuracy: the fraction of test rows whose predicted bin matches.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.6267206477732794

In [477]:
# Calculate feature importance in the Random Forest model.
# Read from the fitted rf_model; the values are later zipped against
# X.columns, so there is one importance per feature column.
importances = rf_model.feature_importances_
importances

array([0.20716093, 0.07144947, 0.06520874, 0.05616128, 0.09630727,
       0.0529714 , 0.04643378, 0.04459879, 0.07027419, 0.06682082,
       0.03650506, 0.06574285, 0.04422516, 0.05620225, 0.00996923,
       0.00996879])

In [478]:
# We can sort the features by their importance.
# zip pairs each importance with its column name; reverse=True lists the
# largest importance first.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.2071609271293363, 'Age'),
 (0.0963072711000387, 'Strike Outs'),
 (0.071449468954414, 'ERA'),
 (0.07027419044952074, 'Outs Pitched'),
 (0.06682082286920621, 'Batters Faced by Pitcher'),
 (0.06574285474048933, 'Weight'),
 (0.06520873741533009, 'Hits'),
 (0.056202247709517435, 'Games Started'),
 (0.056161278139384097, 'Earned Runs'),
 (0.05297140118020422, 'Home Runs'),
 (0.0464337771855511, 'Wins'),
 (0.04459878886996277, 'Losses'),
 (0.0442251631018302, 'Height'),
 (0.03650505873778614, 'Games Finished'),
 (0.009969225302458572, 'League_AL'),
 (0.009968787114970123, 'League_NL')]

In [479]:
# Keep the (importance, column name) pairs, ordered from most to least
# important, for building a table.
feature_rank = sorted(
    zip(rf_model.feature_importances_, X.columns),
    reverse=True,
)

In [480]:
# Tabulate the ranked importances: one row per feature, importance first.
feature_df = pd.DataFrame(
    data=feature_rank,
    columns=['Feature Value', 'Feature Abbreviation'],
)
feature_df

Unnamed: 0,Feature Value,Feature Abbreviation
0,0.207161,Age
1,0.096307,Strike Outs
2,0.071449,ERA
3,0.070274,Outs Pitched
4,0.066821,Batters Faced by Pitcher
5,0.065743,Weight
6,0.065209,Hits
7,0.056202,Games Started
8,0.056161,Earned Runs
9,0.052971,Home Runs
