In [1]:
# import dependencies
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from collections import Counter

In [3]:
# Loading data
file_path = Path("salaries-binned.csv")
df = pd.read_csv(file_path)
df

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,SH,SF,GIDP,salary,birthYear,birthMonth,weight,height,throws,salary-bin
0,abadfe01,2016,1,16,0,1,4,39,0,0,...,0.0,1.0,6.0,1250000.0,1985.0,12.0,235.0,74.0,0,mid
1,alberma01,2016,1,4,0,2,6,58,1,0,...,3.0,2.0,4.0,2000000.0,1983.0,1.0,225.0,73.0,1,mid
2,allenco01,2016,1,7,0,3,5,67,0,0,...,3.0,2.0,7.0,4150000.0,1988.0,11.0,210.0,73.0,1,high
3,alvarjo02,2016,1,12,0,1,3,64,0,0,...,1.0,1.0,5.0,507500.0,1989.0,5.0,195.0,71.0,0,low
4,anderbr04,2016,1,13,1,1,2,4,3,0,...,1.0,1.0,0.0,15800000.0,1988.0,2.0,230.0,76.0,0,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,yateski01,2016,1,17,0,2,1,41,0,0,...,1.0,1.0,1.0,511900.0,1987.0,3.0,205.0,70.0,1,low
412,youngch03,2016,1,11,0,3,9,34,13,0,...,0.0,4.0,3.0,4250000.0,1979.0,5.0,255.0,82.0,1,high
413,zieglbr01,2016,1,0,1,2,3,36,0,0,...,1.0,1.0,10.0,5500000.0,1979.0,10.0,220.0,76.0,1,high
414,zimmejo02,2016,1,9,0,9,7,19,18,0,...,1.0,5.0,8.0,18000000.0,1986.0,5.0,225.0,74.0,1,high


In [7]:
# Create our features
x_cols=[i for i in df.columns if i not in ('salary','playerID', 'salary-bin')]
X = df[x_cols]


# Create our target
y = df['salary-bin'].ravel()

In [8]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=1) 

In [11]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [12]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array(['mid', 'low', 'mid', 'high', 'mid', 'mid', 'low', 'mid', 'low',
       'high', 'low', 'low', 'mid', 'mid', 'low', 'low', 'high', 'low',
       'low', 'high', 'low', 'low', 'low', 'mid', 'low', 'high', 'high',
       'mid', 'mid', 'low', 'mid', 'mid', 'mid', 'low', 'low', 'high',
       'high', 'high', 'mid', 'high', 'low', 'mid', 'mid', 'high', 'high',
       'low', 'low', 'mid', 'low', 'high', 'high', 'low', 'low', 'mid',
       'high', 'mid', 'low', 'mid', 'low', 'high', 'low', 'low', 'mid',
       'high', 'low', 'high', 'high', 'low', 'high', 'low', 'mid', 'low',
       'high', 'low', 'high', 'mid', 'low', 'low', 'mid', 'high', 'high',
       'low', 'mid', 'high', 'high', 'low', 'high', 'mid', 'low', 'mid',
       'low', 'high', 'low', 'mid', 'mid', 'low', 'low', 'low', 'high',
       'mid', 'high', 'mid', 'high', 'mid'], dtype=object)

In [13]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.6442307692307693

In [14]:
# balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, predictions)

0.6318155025051576

In [15]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.        , 0.        , 0.03358292, 0.00603005, 0.02605039,
       0.02822168, 0.03594557, 0.02608003, 0.00377961, 0.00197911,
       0.02677958, 0.03531951, 0.03967346, 0.03955292, 0.0325292 ,
       0.0295293 , 0.04081251, 0.04358803, 0.04316096, 0.02126189,
       0.03039889, 0.01963267, 0.00807495, 0.03465792, 0.04043896,
       0.03579205, 0.01922835, 0.01385861, 0.02510801, 0.14535119,
       0.0318463 , 0.04453299, 0.03204062, 0.00516176])

In [17]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.14535118631697233, 'birthYear'),
 (0.04453298588602458, 'weight'),
 (0.043588027452677196, 'BAOpp'),
 (0.0431609569868534, 'ERA'),
 (0.04081251144482249, 'SO'),
 (0.04043896105267377, 'GF'),
 (0.03967345802858526, 'H'),
 (0.039552921564781, 'ER'),
 (0.03594557173489704, 'G'),
 (0.03579204613366499, 'R'),
 (0.03531951387451324, 'IPouts'),
 (0.03465792262019832, 'BFP'),
 (0.033582924282034396, 'teamID'),
 (0.032529204621335364, 'HR'),
 (0.032040617681870213, 'height'),
 (0.031846298541353166, 'birthMonth'),
 (0.030398894940087504, 'WP'),
 (0.02952929648302648, 'BB'),
 (0.028221684907537344, 'L'),
 (0.026779580056518117, 'SV'),
 (0.026080025789570936, 'GS'),
 (0.026050390424475423, 'W'),
 (0.025108005446624476, 'GIDP'),
 (0.021261891204814, 'IBB'),
 (0.01963267425615192, 'HBP'),
 (0.019228353420288943, 'SH'),
 (0.013858608344986903, 'SF'),
 (0.008074953518590004, 'BK'),
 (0.006030054775529795, 'lgID'),
 (0.005161760781603341, 'throws'),
 (0.0037796099697049836, 'CG'),
 (0.001979107457

## Oversampling with binned salaries
---
Now that salaries are divided into only 3 classes, oversampling could potentially help the model.

In [18]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)

Counter(y_oversampled)

Counter({'mid': 109, 'high': 109, 'low': 109})

In [19]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=1) 

In [20]:
# Fitting the model
rf_model = rf_model.fit(X_oversampled, y_oversampled)

In [21]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test)
predictions

array(['mid', 'low', 'low', 'high', 'low', 'mid', 'low', 'mid', 'low',
       'high', 'low', 'low', 'mid', 'mid', 'low', 'low', 'high', 'low',
       'low', 'high', 'low', 'low', 'low', 'mid', 'low', 'high', 'high',
       'mid', 'mid', 'low', 'low', 'mid', 'mid', 'low', 'low', 'high',
       'high', 'high', 'mid', 'high', 'low', 'mid', 'mid', 'high', 'high',
       'high', 'low', 'high', 'high', 'high', 'high', 'high', 'low',
       'mid', 'high', 'low', 'low', 'mid', 'low', 'high', 'mid', 'low',
       'mid', 'high', 'low', 'high', 'high', 'low', 'high', 'low', 'high',
       'low', 'high', 'low', 'high', 'mid', 'low', 'low', 'mid', 'high',
       'high', 'low', 'high', 'high', 'high', 'low', 'mid', 'mid', 'low',
       'mid', 'low', 'high', 'low', 'mid', 'mid', 'low', 'low', 'low',
       'high', 'mid', 'high', 'mid', 'high', 'mid'], dtype=object)

In [22]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.        , 0.        , 0.03417573, 0.00661458, 0.02555534,
       0.0313016 , 0.03807886, 0.02743232, 0.0030256 , 0.00134316,
       0.02525904, 0.03941245, 0.03796542, 0.03195501, 0.03061926,
       0.03786374, 0.04001885, 0.04068751, 0.04439927, 0.02269359,
       0.02779145, 0.0186967 , 0.00701789, 0.03607818, 0.03578234,
       0.03605916, 0.0206946 , 0.01347133, 0.02650963, 0.14768693,
       0.03047778, 0.04435291, 0.03011841, 0.00686136])

In [23]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.1476869252319295, 'birthYear'),
 (0.04439926766689964, 'ERA'),
 (0.044352906700076584, 'weight'),
 (0.04068751208375484, 'BAOpp'),
 (0.04001884896678965, 'SO'),
 (0.03941245198246071, 'IPouts'),
 (0.0380788578927964, 'G'),
 (0.037965415359413494, 'H'),
 (0.03786373844995162, 'BB'),
 (0.036078175143984724, 'BFP'),
 (0.036059163959516936, 'R'),
 (0.03578233888874391, 'GF'),
 (0.03417572505578022, 'teamID'),
 (0.031955011431021065, 'ER'),
 (0.031301597686362946, 'L'),
 (0.03061926332869309, 'HR'),
 (0.03047778461324782, 'birthMonth'),
 (0.030118408579489893, 'height'),
 (0.027791454675991638, 'WP'),
 (0.02743231776299934, 'GS'),
 (0.02650962758918431, 'GIDP'),
 (0.025555342927458976, 'W'),
 (0.02525904471619333, 'SV'),
 (0.02269359368373997, 'IBB'),
 (0.020694601155839796, 'SH'),
 (0.018696703835330532, 'HBP'),
 (0.013471330483389768, 'SF'),
 (0.007017893444965129, 'BK'),
 (0.0068613564864704, 'throws'),
 (0.00661458075850791, 'lgID'),
 (0.0030255966150827597, 'CG'),
 (0.0013431628439

## Increasing n_estimators to 200 (using oversampled sets)
---
see if that helps the feature correlations

In [25]:
# Create a random forest classifier.
rf_model_200 = RandomForestClassifier(n_estimators=200, random_state=1) 

In [26]:
# Fitting the model
rf_model_200 = rf_model.fit(X_oversampled, y_oversampled)

In [27]:
# Making predictions using the testing data.
predictions = rf_model_200.predict(X_test)
predictions

array(['mid', 'low', 'low', 'high', 'low', 'mid', 'low', 'mid', 'low',
       'high', 'low', 'low', 'mid', 'mid', 'low', 'low', 'high', 'low',
       'low', 'high', 'low', 'low', 'low', 'mid', 'low', 'high', 'high',
       'mid', 'mid', 'low', 'low', 'mid', 'mid', 'low', 'low', 'high',
       'high', 'high', 'mid', 'high', 'low', 'mid', 'mid', 'high', 'high',
       'high', 'low', 'high', 'high', 'high', 'high', 'high', 'low',
       'mid', 'high', 'low', 'low', 'mid', 'low', 'high', 'mid', 'low',
       'mid', 'high', 'low', 'high', 'high', 'low', 'high', 'low', 'high',
       'low', 'high', 'low', 'high', 'mid', 'low', 'low', 'mid', 'high',
       'high', 'low', 'high', 'high', 'high', 'low', 'mid', 'mid', 'low',
       'mid', 'low', 'high', 'low', 'mid', 'mid', 'low', 'low', 'low',
       'high', 'mid', 'high', 'mid', 'high', 'mid'], dtype=object)

In [28]:
# Calculate feature importance in the Random Forest model.
importances = rf_model_200.feature_importances_
importances

array([0.        , 0.        , 0.03417573, 0.00661458, 0.02555534,
       0.0313016 , 0.03807886, 0.02743232, 0.0030256 , 0.00134316,
       0.02525904, 0.03941245, 0.03796542, 0.03195501, 0.03061926,
       0.03786374, 0.04001885, 0.04068751, 0.04439927, 0.02269359,
       0.02779145, 0.0186967 , 0.00701789, 0.03607818, 0.03578234,
       0.03605916, 0.0206946 , 0.01347133, 0.02650963, 0.14768693,
       0.03047778, 0.04435291, 0.03011841, 0.00686136])

In [29]:
# We can sort the features by their importance.
sorted(zip(rf_model_200.feature_importances_, X.columns), reverse=True)

[(0.1476869252319295, 'birthYear'),
 (0.04439926766689964, 'ERA'),
 (0.044352906700076584, 'weight'),
 (0.04068751208375484, 'BAOpp'),
 (0.04001884896678965, 'SO'),
 (0.03941245198246071, 'IPouts'),
 (0.0380788578927964, 'G'),
 (0.037965415359413494, 'H'),
 (0.03786373844995162, 'BB'),
 (0.036078175143984724, 'BFP'),
 (0.036059163959516936, 'R'),
 (0.03578233888874391, 'GF'),
 (0.03417572505578022, 'teamID'),
 (0.031955011431021065, 'ER'),
 (0.031301597686362946, 'L'),
 (0.03061926332869309, 'HR'),
 (0.03047778461324782, 'birthMonth'),
 (0.030118408579489893, 'height'),
 (0.027791454675991638, 'WP'),
 (0.02743231776299934, 'GS'),
 (0.02650962758918431, 'GIDP'),
 (0.025555342927458976, 'W'),
 (0.02525904471619333, 'SV'),
 (0.02269359368373997, 'IBB'),
 (0.020694601155839796, 'SH'),
 (0.018696703835330532, 'HBP'),
 (0.013471330483389768, 'SF'),
 (0.007017893444965129, 'BK'),
 (0.0068613564864704, 'throws'),
 (0.00661458075850791, 'lgID'),
 (0.0030255966150827597, 'CG'),
 (0.0013431628439

## Decreasing n_estimators to 50 using oversampled set
---

In [40]:
# Create a random forest classifier.
rf_model_50 = RandomForestClassifier(n_estimators=50, random_state=1) 

In [41]:
# Fitting the model
rf_model_50 = rf_model_50.fit(X_oversampled, y_oversampled)

In [42]:
# Making predictions using the testing data.
predictions = rf_model_50.predict(X_test)
predictions

array(['mid', 'low', 'low', 'high', 'low', 'mid', 'low', 'mid', 'low',
       'high', 'low', 'low', 'mid', 'mid', 'low', 'low', 'high', 'low',
       'high', 'high', 'low', 'low', 'low', 'mid', 'low', 'high', 'high',
       'mid', 'mid', 'low', 'low', 'mid', 'mid', 'low', 'low', 'high',
       'high', 'high', 'mid', 'high', 'low', 'low', 'mid', 'high', 'high',
       'high', 'low', 'high', 'high', 'high', 'high', 'high', 'low',
       'mid', 'high', 'low', 'low', 'mid', 'low', 'high', 'mid', 'low',
       'mid', 'high', 'low', 'high', 'high', 'low', 'high', 'high',
       'high', 'mid', 'low', 'low', 'high', 'mid', 'low', 'low', 'high',
       'high', 'high', 'low', 'high', 'high', 'high', 'low', 'mid', 'mid',
       'low', 'mid', 'low', 'high', 'low', 'mid', 'mid', 'low', 'low',
       'low', 'high', 'mid', 'high', 'low', 'high', 'mid'], dtype=object)

In [43]:
# Calculate feature importance in the Random Forest model.
importances = rf_model_50.feature_importances_
importances

array([0.        , 0.        , 0.03316326, 0.00547121, 0.02615675,
       0.03229328, 0.03824714, 0.02602629, 0.00290005, 0.00172491,
       0.02493481, 0.04145405, 0.03701826, 0.03130505, 0.03263084,
       0.03664471, 0.04398512, 0.04004908, 0.05042771, 0.01988313,
       0.02661504, 0.0175798 , 0.00765964, 0.03696804, 0.03578439,
       0.03691812, 0.02075954, 0.01114957, 0.02263598, 0.14589024,
       0.032931  , 0.0469068 , 0.02724809, 0.00663811])

In [44]:
# We can sort the features by their importance.
sorted(zip(rf_model_50.feature_importances_, X.columns), reverse=True)

[(0.1458902357536968, 'birthYear'),
 (0.050427705340922745, 'ERA'),
 (0.046906797988424606, 'weight'),
 (0.04398511843596445, 'SO'),
 (0.04145405407240443, 'IPouts'),
 (0.040049082232284094, 'BAOpp'),
 (0.03824713898227751, 'G'),
 (0.037018255275695154, 'H'),
 (0.03696804003969633, 'BFP'),
 (0.036918124645763634, 'R'),
 (0.036644708911380885, 'BB'),
 (0.0357843919684012, 'GF'),
 (0.03316326027684191, 'teamID'),
 (0.03293100046731905, 'birthMonth'),
 (0.03263083506562404, 'HR'),
 (0.03229327868405183, 'L'),
 (0.03130505251302233, 'ER'),
 (0.02724808597238115, 'height'),
 (0.026615037893543275, 'WP'),
 (0.026156753824574698, 'W'),
 (0.02602629319099061, 'GS'),
 (0.024934810599119003, 'SV'),
 (0.022635975192949565, 'GIDP'),
 (0.020759539869626687, 'SH'),
 (0.01988312601719528, 'IBB'),
 (0.017579804931404323, 'HBP'),
 (0.01114956670214255, 'SF'),
 (0.0076596411893194395, 'BK'),
 (0.006638114385125813, 'throws'),
 (0.0054712081138191615, 'lgID'),
 (0.002900053660838086, 'CG'),
 (0.001724907