In [84]:
# import dependencies
# Initial imports.
from collections import Counter

import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from path import Path
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [72]:
# Load the encoded pitcher dataset from the CSV file named below.
file_path = Path("encoded_pitcher_stat_sal_dem.csv")
df = pd.read_csv(filepath_or_buffer=file_path)
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,R,SH,SF,GIDP,salary,birthYear,birthMonth,weight,height,throws
0,abadfe01,2016,1,16,0,1,4,39,0,0,...,11,0.0,1.0,6.0,1250000.0,1985.0,12.0,235.0,74.0,0
1,alberma01,2016,1,4,0,2,6,58,1,0,...,44,3.0,2.0,4.0,2000000.0,1983.0,1.0,225.0,73.0,1
2,allenco01,2016,1,7,0,3,5,67,0,0,...,23,3.0,2.0,7.0,4150000.0,1988.0,11.0,210.0,73.0,1
3,alvarjo02,2016,1,12,0,1,3,64,0,0,...,29,1.0,1.0,5.0,507500.0,1989.0,5.0,195.0,71.0,0
4,anderbr04,2016,1,13,1,1,2,4,3,0,...,15,1.0,1.0,0.0,15800000.0,1988.0,2.0,230.0,76.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,yateski01,2016,1,17,0,2,1,41,0,0,...,24,1.0,1.0,1.0,511900.0,1987.0,3.0,205.0,70.0,1
412,youngch03,2016,1,11,0,3,9,34,13,0,...,63,0.0,4.0,3.0,4250000.0,1979.0,5.0,255.0,82.0,1
413,zieglbr01,2016,1,0,1,2,3,36,0,0,...,13,1.0,1.0,10.0,5500000.0,1979.0,10.0,220.0,76.0,1
414,zimmejo02,2016,1,9,0,9,7,19,18,0,...,63,1.0,5.0,8.0,18000000.0,1986.0,5.0,225.0,74.0,1


In [73]:
# Feature matrix: every column of `df` except the target (`salary`) and
# the player identifier (`playerID`).
x_cols = [col for col in df.columns if col not in ('salary', 'playerID')]
X = df[x_cols]

# Target vector as a 1-D NumPy array.
# `Series.ravel()` is deprecated in recent pandas releases; `to_numpy()`
# is the supported way to obtain the same array.
y = df['salary'].to_numpy()

In [74]:
# Hold out part of the rows for evaluation; the fixed seed keeps the
# partition reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [75]:
# Standardise the features: the scaler is fitted on the training rows
# only, and that same fitted transform is applied to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)

In [76]:
# Random forest of 128 trees; the seed makes tree construction repeatable.
# NOTE(review): the target `y` is the raw `salary` column, so every distinct
# salary value is treated as its own class. That probably explains the very
# low accuracy reported further down; a regressor or binned salary bands may
# be what is wanted -- confirm with the author.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=1,
)

In [77]:
# Train the forest on the standardised training features.
# The result of `fit` is bound back to the same name.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [78]:
# Predict a salary label for every row of the scaled test partition,
# then show the resulting array as the cell output.
predictions = rf_model.predict(X=X_test_scaled)
predictions

array([  516000.,   507500.,   507500.,  8000000.,   539000.,  4000000.,
         519300.,   850000.,   507500.,  9650000.,   514500.,   507500.,
        5500000.,  2500000.,  2150000.,   548000.,  8000000.,   507500.,
         511000.,  9000000.,   507500.,   507500.,   509500.,   525500.,
         507500.,   512500.,   587500.,   550000.,   530000.,   514400.,
        4000000.,  1490314.,  2500000.,   509500.,   512500.,  7000000.,
         507500.,  4500000.,   509500., 11500000.,   507500.,   507500.,
        8000000.,   512500.,  4225000.,   516000.,   507500.,   512500.,
       12673102.,   512500.,   512500., 15750000.,   507500.,  2750000.,
        7000000.,   900000.,   520000.,  4000000.,   509500., 10000000.,
         512500.,   511900.,  1500000.,  2500000.,   507500.,   532000.,
       28000000.,   509500., 10650000.,   512500.,  4000000.,  4100000.,
        7000000.,  2000000.,  2000000.,  5250000.,   539000.,   508500.,
        9625000.,  6000000.,  6000000.,   507500., 

In [79]:
# Fraction of test rows whose predicted salary equals the true salary exactly.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.019230769230769232

In [82]:
# Balanced accuracy of the current `predictions` against the test labels.
# `balanced_accuracy_score` is imported in the notebook's import cell rather
# than mid-notebook, so a fresh "Restart & Run All" has every dependency
# declared up front.
balanced_accuracy_score(y_true=y_test, y_pred=predictions)



0.004705882352941177

### Rank Importance of Features

In [62]:
# Read the per-feature importance weights from the fitted forest and
# display them. The next cell zips this array with `X.columns`, i.e. the
# values are in feature-column order.
importances = rf_model.feature_importances_
importances

array([0.        , 0.        , 0.0485441 , 0.0129507 , 0.02906778,
       0.03076667, 0.04231217, 0.01901763, 0.00643337, 0.00400047,
       0.01950961, 0.03771211, 0.03733892, 0.03382374, 0.03607496,
       0.03605419, 0.04386256, 0.04455204, 0.05244631, 0.02481086,
       0.03047966, 0.02903542, 0.01311769, 0.0373138 , 0.0321866 ,
       0.03525188, 0.02479887, 0.02922353, 0.03247402, 0.0469652 ,
       0.0398387 , 0.04317126, 0.03698016, 0.00988501])

In [63]:
# Pair each importance with its column name and list the pairs from the
# most to the least important feature.
importance_by_feature = zip(rf_model.feature_importances_, X.columns)
sorted(importance_by_feature, reverse=True)

[(0.0524463148000548, 'ERA'),
 (0.04854410391380591, 'teamID'),
 (0.04696519908730298, 'birthYear'),
 (0.04455203561947979, 'BAOpp'),
 (0.0438625637697622, 'SO'),
 (0.04317125954477632, 'weight'),
 (0.04231216940014685, 'G'),
 (0.039838704238600434, 'birthMonth'),
 (0.037712109327823075, 'IPouts'),
 (0.03733891971555806, 'H'),
 (0.03731379783123367, 'BFP'),
 (0.03698015856070048, 'height'),
 (0.0360749563767137, 'HR'),
 (0.036054186605901525, 'BB'),
 (0.03525187811943732, 'R'),
 (0.03382374087317166, 'ER'),
 (0.03247401853930726, 'GIDP'),
 (0.03218660292717729, 'GF'),
 (0.030766668947502646, 'L'),
 (0.030479655885312415, 'WP'),
 (0.029223534605139094, 'SF'),
 (0.029067779284114612, 'W'),
 (0.02903542435159126, 'HBP'),
 (0.024810861444285515, 'IBB'),
 (0.02479887128915833, 'SH'),
 (0.019509614851432672, 'SV'),
 (0.019017632230973386, 'GS'),
 (0.013117685912411229, 'BK'),
 (0.01295070410869283, 'lgID'),
 (0.009885007170794349, 'throws'),
 (0.006433369991662188, 'CG'),
 (0.004000470675975

# MinMaxScaler
---

In [54]:
# Try a different scaler: MinMaxScaler instead of StandardScaler.
# (`preprocessing` is imported in the notebook's import cell.)
scaler = preprocessing.MinMaxScaler()
# Fit the min-max scaler on the training data only.
X_scaler = scaler.fit(X_train)

# Scale both partitions with the fitted scaler. This rebinds
# X_train_scaled / X_test_scaled, replacing any previously scaled arrays.
# NOTE(review): the feature importances printed for this run are identical
# to the StandardScaler run, so the choice of scaler does not appear to
# change this model.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [55]:
# Fresh 128-tree random forest (same seed as before) for the min-max run.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=1,
)

In [56]:
# Train the forest on the min-max scaled training features.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [57]:
# Predict on the min-max scaled test partition and display the result.
predictions = rf_model.predict(X=X_test_scaled)
predictions

array([  516000.,   507500.,   507500.,  8000000.,   539000.,  4000000.,
         519300.,   850000.,   507500.,  9650000.,   514500.,   507500.,
        5500000.,  2500000.,  2150000.,   548000.,  8000000.,   507500.,
         511000.,  9000000.,   507500.,   507500.,   509500.,   525500.,
         507500.,   512500.,   587500.,   550000.,   530000.,   514875.,
        4000000.,  1490314.,  2500000.,   509500.,   512500.,  7000000.,
         507500.,  4500000.,   509500., 11500000.,   507500.,   507500.,
        8000000.,   516000.,  4225000.,   516000.,   507500.,   512500.,
       12673102.,   512500.,   512500.,  1400000.,   507500.,  2750000.,
        7000000.,   900000.,   520000.,  4000000.,   509500., 10000000.,
         512500.,   511900.,  1500000.,  2500000.,   507500.,   532000.,
       28000000.,   509500., 10650000.,   532000.,  4000000.,  4100000.,
        7000000.,  2000000.,   507500.,  5250000.,   539000.,   508500.,
        9625000.,  6000000.,  6000000.,   507500., 

In [58]:
# Read the per-feature importance weights from the forest fitted on the
# min-max scaled data and display them (values follow feature-column order,
# as the next cell zips them with `X.columns`).
importances = rf_model.feature_importances_
importances

array([0.        , 0.        , 0.0485441 , 0.0129507 , 0.02906778,
       0.03076667, 0.04231217, 0.01901763, 0.00643337, 0.00400047,
       0.01950961, 0.03771211, 0.03733892, 0.03382374, 0.03607496,
       0.03605419, 0.04386256, 0.04455204, 0.05244631, 0.02481086,
       0.03047966, 0.02903542, 0.01311769, 0.0373138 , 0.0321866 ,
       0.03525188, 0.02479887, 0.02922353, 0.03247402, 0.0469652 ,
       0.0398387 , 0.04317126, 0.03698016, 0.00988501])

In [59]:
# Pair each importance with its column name and list the pairs from the
# most to the least important feature.
importance_by_feature = zip(rf_model.feature_importances_, X.columns)
sorted(importance_by_feature, reverse=True)

[(0.0524463148000548, 'ERA'),
 (0.04854410391380591, 'teamID'),
 (0.04696519908730298, 'birthYear'),
 (0.04455203561947979, 'BAOpp'),
 (0.0438625637697622, 'SO'),
 (0.04317125954477632, 'weight'),
 (0.04231216940014685, 'G'),
 (0.039838704238600434, 'birthMonth'),
 (0.037712109327823075, 'IPouts'),
 (0.03733891971555806, 'H'),
 (0.03731379783123367, 'BFP'),
 (0.03698015856070048, 'height'),
 (0.0360749563767137, 'HR'),
 (0.036054186605901525, 'BB'),
 (0.03525187811943732, 'R'),
 (0.03382374087317166, 'ER'),
 (0.03247401853930726, 'GIDP'),
 (0.03218660292717729, 'GF'),
 (0.030766668947502646, 'L'),
 (0.030479655885312415, 'WP'),
 (0.029223534605139094, 'SF'),
 (0.029067779284114612, 'W'),
 (0.02903542435159126, 'HBP'),
 (0.024810861444285515, 'IBB'),
 (0.02479887128915833, 'SH'),
 (0.019509614851432672, 'SV'),
 (0.019017632230973386, 'GS'),
 (0.013117685912411229, 'BK'),
 (0.01295070410869283, 'lgID'),
 (0.009885007170794349, 'throws'),
 (0.006433369991662188, 'CG'),
 (0.004000470675975

In [60]:
# Exact-match accuracy of the min-max run on the test partition.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.019230769230769232

# Robust Scaler
---

In [38]:
# Try a different scaler: RobustScaler instead of StandardScaler.
# (`preprocessing` is imported in the notebook's import cell.)
scaler = preprocessing.RobustScaler()
# Fit the robust scaler on the training data only.
X_scaler = scaler.fit(X_train)

# Scale both partitions with the fitted scaler. This rebinds
# X_train_scaled / X_test_scaled, replacing any previously scaled arrays.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [39]:
# Fresh 128-tree random forest (same seed as before) for the robust-scaler run.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=1,
)

In [40]:
# Train the forest on the robust-scaled training features.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [41]:
# Predict on the robust-scaled test partition and display the result.
predictions = rf_model.predict(X=X_test_scaled)
predictions

array([  516000.,   507500.,   507500.,  8000000.,   539000.,  4000000.,
         519300.,   850000.,   507500.,  9650000.,   514500.,   507500.,
        5500000.,  2500000.,  2150000.,   548000.,  8000000.,   507500.,
         511000.,  9000000.,   507500.,   507500.,   509500.,   525500.,
         507500.,   512500.,   587500.,   550000.,   530000.,   514400.,
        4000000.,  1490314.,  2500000.,   509500.,   512500.,  7000000.,
       12000000.,  4500000.,   509500., 11500000.,   507500.,   507500.,
        8000000.,   516000.,  7500000.,   516000.,   507500.,   512500.,
         524100.,   512500.,   512500.,   512500.,   507500.,  2750000.,
        7000000.,   900000.,   520000.,  4000000.,   509500., 10000000.,
         512500.,   511900.,  1500000.,  2500000.,   507500.,   532000.,
       28000000.,   509500., 10650000.,   532000.,  4000000.,  4100000.,
        7000000.,  2000000.,  2000000.,  5250000.,   539000.,   508500.,
        9625000., 20833333.,  6000000.,   507500., 

In [42]:
# Read the per-feature importance weights from the forest fitted on the
# robust-scaled data and display them (values follow feature-column order,
# as the next cell zips them with `X.columns`).
importances = rf_model.feature_importances_
importances

array([0.        , 0.        , 0.0485441 , 0.0129507 , 0.02906778,
       0.03076667, 0.04231217, 0.01901763, 0.00643337, 0.00400047,
       0.01950961, 0.03771211, 0.03733892, 0.03382374, 0.03607496,
       0.03605419, 0.04386256, 0.04455204, 0.05244631, 0.02481086,
       0.03047966, 0.02903542, 0.01311769, 0.0373138 , 0.0321866 ,
       0.03525188, 0.02479887, 0.02922353, 0.03247402, 0.0469652 ,
       0.0398387 , 0.04317126, 0.03698016, 0.00988501])

In [43]:
# Pair each importance with its column name and list the pairs from the
# most to the least important feature.
importance_by_feature = zip(rf_model.feature_importances_, X.columns)
sorted(importance_by_feature, reverse=True)

[(0.0524463148000548, 'ERA'),
 (0.04854410391380591, 'teamID'),
 (0.04696519908730298, 'birthYear'),
 (0.04455203561947979, 'BAOpp'),
 (0.0438625637697622, 'SO'),
 (0.04317125954477632, 'weight'),
 (0.04231216940014685, 'G'),
 (0.039838704238600434, 'birthMonth'),
 (0.037712109327823075, 'IPouts'),
 (0.03733891971555806, 'H'),
 (0.03731379783123367, 'BFP'),
 (0.03698015856070048, 'height'),
 (0.0360749563767137, 'HR'),
 (0.036054186605901525, 'BB'),
 (0.03525187811943732, 'R'),
 (0.03382374087317166, 'ER'),
 (0.03247401853930726, 'GIDP'),
 (0.03218660292717729, 'GF'),
 (0.030766668947502646, 'L'),
 (0.030479655885312415, 'WP'),
 (0.029223534605139094, 'SF'),
 (0.029067779284114612, 'W'),
 (0.02903542435159126, 'HBP'),
 (0.024810861444285515, 'IBB'),
 (0.02479887128915833, 'SH'),
 (0.019509614851432672, 'SV'),
 (0.019017632230973386, 'GS'),
 (0.013117685912411229, 'BK'),
 (0.01295070410869283, 'lgID'),
 (0.009885007170794349, 'throws'),
 (0.006433369991662188, 'CG'),
 (0.004000470675975

# Try different sampling techniques

### oversampling
---

In [103]:
# Random oversampling of the (unscaled) training split.
# `RandomOverSampler` is imported in the notebook's import cell so that all
# dependencies are declared in one place.
ros = RandomOverSampler(random_state=1)
X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)

# Number of rows per salary value after resampling.
Counter(y_oversampled)

Counter({1600000.0: 13,
         5854712.0: 13,
         520200.0: 13,
         539000.0: 13,
         12673102.0: 13,
         507500.0: 13,
         1400000.0: 13,
         5300000.0: 13,
         1250000.0: 13,
         11000000.0: 13,
         540000.0: 13,
         509500.0: 13,
         2375000.0: 13,
         511200.0: 13,
         5000000.0: 13,
         2950000.0: 13,
         509675.0: 13,
         513300.0: 13,
         3150000.0: 13,
         22000000.0: 13,
         8500000.0: 13,
         4350000.0: 13,
         895000.0: 13,
         7250000.0: 13,
         2000000.0: 13,
         3450000.0: 13,
         5500000.0: 13,
         4200000.0: 13,
         528200.0: 13,
         1000000.0: 13,
         532000.0: 13,
         18000000.0: 13,
         511000.0: 13,
         9200000.0: 13,
         1500000.0: 13,
         8000000.0: 13,
         523500.0: 13,
         3275000.0: 13,
         4225000.0: 13,
         587500.0: 13,
         536200.0: 13,
         1050000.0: 13,
   

In [104]:
# Fresh 128-tree random forest (same seed as before) for the oversampling run.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=1,
)

In [105]:
# Fit on the oversampled training data produced by the RandomOverSampler cell.
# The previous call used X_resampled / y_resampled, which no visible cell
# defines: it only ran thanks to stale kernel state and would raise
# NameError after "Restart & Run All".
rf_model = rf_model.fit(X_oversampled, y_oversampled)

In [106]:
# Predict on the unscaled test partition (this model is fitted on unscaled,
# resampled training data) and display the result.
predictions = rf_model.predict(X=X_test)
predictions

array([  516000.,   509500.,  2525000.,  9650000.,   525500.,  4000000.,
         532900., 14000000.,  6100000.,  4200000.,   514500.,  6100000.,
         507500.,  1525000.,   512000.,   516000.,  9650000.,  6100000.,
        2500000., 10000000.,   513000.,   521700.,   510500.,   525500.,
         511000.,   541000.,  5000000.,   550000.,  1500000.,  7150000.,
        2750000.,  2400000.,  6100000.,   509500., 28000000., 12100000.,
         524100.,  9000000.,  5600000.,   536200.,  9200000.,   520000.,
        8000000.,  1475000., 17500000.,  1200000.,   514400.,   660000.,
       12673102.,  1200000., 17500000., 15750000.,   507500.,  2750000.,
        7250000.,   508500.,  2175000.,  1000000.,   535000., 22000000.,
         587500.,   511900., 14000000., 11000000.,   508200., 10800000.,
       17500000.,   516650.,  4150000.,  5100000., 10936574.,  5300000.,
       11500000.,   507500.,  2000000.,  3800000.,  4300000.,   508500.,
        9625000.,  2000000.,  4500000.,   510500., 

In [107]:
# Read the per-feature importance weights from the forest fitted for the
# oversampling experiment and display them (values follow feature-column
# order, as the next cell zips them with `X.columns`).
importances = rf_model.feature_importances_
importances

array([0.        , 0.        , 0.04735774, 0.00896171, 0.02828519,
       0.02987929, 0.04383719, 0.01912182, 0.00782427, 0.00564391,
       0.02149805, 0.03938353, 0.03853051, 0.0368674 , 0.03254731,
       0.04048899, 0.04366157, 0.04743841, 0.04392252, 0.02224241,
       0.02897662, 0.03026694, 0.01218389, 0.03882998, 0.03562828,
       0.03772485, 0.02455291, 0.03066159, 0.03193023, 0.04489157,
       0.03631763, 0.04704516, 0.03392187, 0.00957667])

In [108]:
# Pair each importance with its column name and list the pairs from the
# most to the least important feature.
importance_by_feature = zip(rf_model.feature_importances_, X.columns)
sorted(importance_by_feature, reverse=True)

[(0.04743841495450869, 'BAOpp'),
 (0.04735774155381508, 'teamID'),
 (0.04704516405801614, 'weight'),
 (0.04489157483670073, 'birthYear'),
 (0.043922520356712305, 'ERA'),
 (0.04383718656611716, 'G'),
 (0.04366156621811154, 'SO'),
 (0.04048898696834817, 'BB'),
 (0.039383530575323525, 'IPouts'),
 (0.03882997623945582, 'BFP'),
 (0.03853050666489274, 'H'),
 (0.03772484721996261, 'R'),
 (0.03686739869376188, 'ER'),
 (0.036317627053426844, 'birthMonth'),
 (0.0356282764901647, 'GF'),
 (0.03392187058269459, 'height'),
 (0.03254731389072257, 'HR'),
 (0.03193022677105912, 'GIDP'),
 (0.030661592130533863, 'SF'),
 (0.030266939066964593, 'HBP'),
 (0.029879294409103235, 'L'),
 (0.02897662151614642, 'WP'),
 (0.028285188669537126, 'W'),
 (0.024552905633004028, 'SH'),
 (0.02224240893460424, 'IBB'),
 (0.021498045620722973, 'SV'),
 (0.01912181696773603, 'GS'),
 (0.012183893062781198, 'BK'),
 (0.009576674682239604, 'throws'),
 (0.008961708266298982, 'lgID'),
 (0.007824270095103765, 'CG'),
 (0.0056439112514

### undersampling
---

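- Won't work here: random undersampling shrinks every class to the size of the rarest one, and the rarest salary values occur only once, so each salary ends up with a single sample (see the counts below).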

In [102]:
# Random undersampling of the (unscaled) training split.
# `RandomUnderSampler` is imported in the notebook's import cell so that all
# dependencies are declared in one place.
# NOTE(review): the sampler is bound to `ros`, the same name the oversampling
# cell uses, even though it holds an undersampler. The name is kept so any
# cell that relies on it keeps working; consider renaming it to `rus`.
ros = RandomUnderSampler(random_state=1)
X_undersampled, y_undersampled = ros.fit_resample(X_train, y_train)
# Number of rows per salary value after resampling.
Counter(y_undersampled)

Counter({507500.0: 1,
         508200.0: 1,
         508500.0: 1,
         509300.0: 1,
         509500.0: 1,
         509675.0: 1,
         510500.0: 1,
         511000.0: 1,
         511200.0: 1,
         511250.0: 1,
         511500.0: 1,
         511900.0: 1,
         512000.0: 1,
         512100.0: 1,
         512500.0: 1,
         513000.0: 1,
         513300.0: 1,
         513900.0: 1,
         514000.0: 1,
         514200.0: 1,
         514400.0: 1,
         514500.0: 1,
         514875.0: 1,
         515000.0: 1,
         515900.0: 1,
         516000.0: 1,
         516100.0: 1,
         516500.0: 1,
         516650.0: 1,
         517000.0: 1,
         517500.0: 1,
         517800.0: 1,
         518000.0: 1,
         518500.0: 1,
         519000.0: 1,
         519200.0: 1,
         519300.0: 1,
         519700.0: 1,
         520000.0: 1,
         520200.0: 1,
         520500.0: 1,
         521000.0: 1,
         521200.0: 1,
         521300.0: 1,
         521700.0: 1,
         5