In [74]:
#import dependencies
import warnings
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

warnings.filterwarnings('ignore')

In [75]:
df = pd.read_csv('../combined_data.csv')
df = df[["class",
         "field_goal_percentage",
         "free_throw_attempt_rate",
         "height",
         "three_point_percentage",
         "true_shooting_percentage",
         "win_shares_per_40_minutes"]]

df.head()

Unnamed: 0,class,field_goal_percentage,free_throw_attempt_rate,height,three_point_percentage,true_shooting_percentage,win_shares_per_40_minutes
0,Player,0.586,0.494,81,0.222,0.592,0.346
1,Player,0.455,0.249,73,0.414,0.572,0.145
2,Player,0.542,0.555,83,0.329162,0.573,0.237
3,Bust,0.416,0.257,70,0.355,0.527,0.158
4,Bust,0.428,0.357,80,0.318,0.551,0.165


In [76]:
X = df.drop(["class"], axis=1)
y = df["class"]

In [77]:
from sklearn.preprocessing import LabelEncoder
labels = LabelEncoder()
labels.fit(y)
encoded_y = labels.transform(y)

In [78]:
df["code"]=encoded_y
df.head()

Unnamed: 0,class,field_goal_percentage,free_throw_attempt_rate,height,three_point_percentage,true_shooting_percentage,win_shares_per_40_minutes,code
0,Player,0.586,0.494,81,0.222,0.592,0.346,1
1,Player,0.455,0.249,73,0.414,0.572,0.145,1
2,Player,0.542,0.555,83,0.329162,0.573,0.237,1
3,Bust,0.416,0.257,70,0.355,0.527,0.158,0
4,Bust,0.428,0.357,80,0.318,0.551,0.165,0


In [79]:
labels_df = pd.DataFrame({"class": y, "code": encoded_y}).drop_duplicates(subset=None, keep="first")
labels_df

Unnamed: 0,class,code
0,Player,1
3,Bust,0


In [80]:
encoded_y =encoded_y.reshape(-1, 1)

In [81]:
# X = df.drop(["class","player_id","name","college",'assists_per_40',
#             'turnovers','points','blocks_per_40','turnover_percentage',
#             'blocks','steals','assists','three_pointers_per_40','three_pointers',
#             'total_rebounds', 'free_throw_percentage', 
#              'games_played','minutes_played', 'points_per_40','steals_per_40',
#             'total_rebounds_per_40','turnovers_per_40', "code", "effective_field_goal_percentage"], axis=1)
feature_names = X.columns
# X.head()

In [107]:
print(f"Shape of X Variables: {X.shape}")
print(f"Shape of Y Variables: {encoded_y.shape}")

Shape of X Variables: (51, 6)
Shape of Y Variables: (853, 1)


In [83]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, random_state=42)

In [84]:
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X)
# y_scaler = MinMaxScaler().fit(y_train)

X_scaled = X_scaler.transform(X)
# y_train_scaled = y_scaler.transform(y_train)
# y_test_scaled = y_scaler.transform(y_test)


In [85]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200)
# rf = rf.fit(X_train_scaled, y_train)
# rf.score(X_test_scaled, y_test)

In [106]:
from sklearn.model_selection import KFold

kf_scores = []
kf = KFold(n_splits=5, random_state=1, shuffle=True)   # Define the split - into 10 folds 
kf.get_n_splits(df)   # returns the number of splitting iterations in the cross-validator

for train_index, test_index in kf.split(df):
    X_train, X_test, y_train, y_test = X_scaled[train_index], X_scaled[test_index], encoded_y[train_index], encoded_y[test_index]
    rf = rf.fit(X_train, y_train)
    kf_scores.append(rf.score(X_test, y_test))

print(f"Scores for the K-5 Tests: {kf_scores}")

Scores for the K-5 Tests: [0.6783625730994152, 0.7426900584795322, 0.7719298245614035, 0.7176470588235294, 0.6882352941176471]


In [87]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances

array([0.20566696, 0.16208242, 0.10463387, 0.17101832, 0.17576651,
       0.18083193])

In [88]:
# We can sort the features by their importance
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.2056669567751204, 'field_goal_percentage'),
 (0.18083193060598085, 'win_shares_per_40_minutes'),
 (0.1757665054717665, 'true_shooting_percentage'),
 (0.1710183151339564, 'three_point_percentage'),
 (0.16208242057072078, 'free_throw_attempt_rate'),
 (0.10463387144245502, 'height')]

In [89]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {"n_estimators": [200]}
grid = GridSearchCV(rf, param_grid, verbose=3)

In [90]:
# Fit the model using the grid search estimator. 
grid.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....... n_estimators=200, score=0.7149122807017544, total=   0.1s
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] ....... n_estimators=200, score=0.7412280701754386, total=   0.1s
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] ....... n_estimators=200, score=0.7268722466960352, total=   0.1s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.6s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [200]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=3)

In [91]:
# List the best parameters for this dataset
print(grid.best_params_)

{'n_estimators': 200}


In [109]:
# List the best score
print(f"Best Score after Grid Search Hypertuning: {grid.best_score_}")

Best Score after Grid Search Hypertuning: 0.7276720351390923


In [93]:
model = grid.best_estimator_
model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [94]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)

In [95]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.39      0.15      0.22        47
           1       0.74      0.91      0.81       123

   micro avg       0.70      0.70      0.70       170
   macro avg       0.56      0.53      0.51       170
weighted avg       0.64      0.70      0.65       170



In [96]:
import pickle

filename = 'random_forest_kfold.pkl'
pickle.dump(model, open(filename, 'wb'))

In [97]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.7


In [98]:
# load draft data csv
draft_data = pd.read_csv('../Draft_data.csv')
draft_data.head()

Unnamed: 0,rank,player_id,name,college,year,assists,blocks,effective_field_goal_percentage,field_goal_percentage,free_throw_attempt_rate,...,turnover_percentage,turnovers,win_shares_per_40_minutes,assists_per_40,blocks_per_40,points_per_40,steals_per_40,three_pointers_per_40,total_rebounds_per_40,turnovers_per_40
0,1,anthony-edwards-2,Anthony Edwards,missouri state,Fr.,68,12,0.485,0.411,0.328,...,12.1,59,0.163,3.726027,0.657534,24.767123,1.808219,3.013699,6.958904,3.232877
1,2,james-wiseman-1,James Wiseman,memphis,Fr.,1,9,0.769,0.769,1.038,...,7.2,3,0.443,0.57971,5.217391,34.202899,0.57971,0.0,18.550725,1.73913
2,4,obadiah-toppin-1,Obi Toppin,dayton,So.,111,60,0.674,0.646,0.306,...,14.6,116,0.244,2.792453,1.509434,23.295597,1.157233,0.830189,9.18239,2.918239
3,5,jaden-mcdaniels-1,Jaden McDaniels,washington,Fr.,49,35,0.453,0.396,0.347,...,21.2,77,0.087,2.673943,1.909959,15.552524,1.091405,1.527967,7.366985,4.20191
4,6,nico-mannion-1,Nico Mannion,arizona,Fr.,126,0,0.477,0.402,0.349,...,16.2,59,0.164,7.039106,0.0,18.044693,1.396648,2.178771,3.407821,3.296089


In [99]:
X = draft_data[[
             "field_goal_percentage",
             "free_throw_attempt_rate",
             "height",
             "three_point_percentage",
             "true_shooting_percentage",
             "win_shares_per_40_minutes"
              ]]
X.head()

Unnamed: 0,field_goal_percentage,free_throw_attempt_rate,height,three_point_percentage,true_shooting_percentage,win_shares_per_40_minutes
0,0.411,0.328,77,0.318,0.526,0.163
1,0.769,1.038,85,0.0,0.76,0.443
2,0.646,0.306,81,0.398,0.684,0.244
3,0.396,0.347,81,0.318,0.499,0.087
4,0.402,0.349,75,0.339,0.531,0.164


In [100]:
# load the model from disk
filename = 'random_forest_kfold.pkl'
loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, y_test)
print(loaded_model)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [101]:
# make a prediction
ynew = loaded_model.predict(X)
ynew = labels.inverse_transform(ynew)

ynew

array(['Bust', 'Bust', 'Bust', 'Bust', 'Bust', 'Bust', 'Bust', 'Bust',
       'Bust', 'Player', 'Bust', 'Bust', 'Bust', 'Bust', 'Player', 'Bust',
       'Bust', 'Bust', 'Bust', 'Bust', 'Bust', 'Bust', 'Bust', 'Bust',
       'Player', 'Bust', 'Bust', 'Bust', 'Bust', 'Bust', 'Bust', 'Bust',
       'Bust', 'Bust', 'Bust', 'Bust', 'Bust', 'Player', 'Bust', 'Bust',
       'Bust', 'Bust', 'Bust', 'Bust', 'Bust', 'Player', 'Bust', 'Bust',
       'Bust', 'Bust', 'Bust'], dtype=object)

In [102]:
draft_data["Prediction"]=ynew
draft_data[["name", "Prediction"]].head(10)

Unnamed: 0,name,Prediction
0,Anthony Edwards,Bust
1,James Wiseman,Bust
2,Obi Toppin,Bust
3,Jaden McDaniels,Bust
4,Nico Mannion,Bust
5,Daniel Oturu,Bust
6,Cole Anthony,Bust
7,Vernon Carey Jr.,Bust
8,Tyrese Haliburton,Bust
9,Onyeka Okongwu,Player
