In [45]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
import tensorflow as tf
import hvplot.pandas
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [46]:
# Load the college players dataset (Resources/college/players.csv) into a
# pandas DataFrame
draft_data_df = pd.read_csv(Path("./Resources") / "college" / "players.csv")

# Preview the first five rows to sanity-check the load
draft_data_df.head()

Unnamed: 0,ID,active_from,active_to,birth_date,college,height (in),name,position,weight,NBA__3ptapg,...,NCAA_ppg,NBA Games Played,fg%,ppg,effective pct,height/weight,3pt %,Success1,In HoF,Success2
0,1569,2001,2003,12-Feb-78,Indiana University,73,A.J. Guyton,G,180,2.4,...,16.4,0.005682,0.0754,0.054817,0.061333,0.081111,0.0378,0.316144,Not in HoF,N
1,1610,2017,2017,27-Aug-92,Purdue University,85,A.J. Hammons,C,260,0.5,...,12.1,0.001563,0.081,0.021927,0.061867,0.065385,0.05,0.281741,Not in HoF,N
2,3280,2010,2015,7-Oct-86,University of Connecticut,74,A.J. Price,G,181,2.6,...,13.0,0.018537,0.076,0.057807,0.0608,0.081768,0.0316,0.326512,Not in HoF,N
3,4537,1991,1991,11-Sep-64,"University of Massachusetts Amherst, Fairfield...",74,A.J. Wynder,G,180,0.2,...,11.1,0.000426,0.05,0.019934,0.033333,0.082222,0.0,0.185915,Not in HoF,N
4,475,2008,2018,14-Jan-85,University of Oregon,73,Aaron Brooks,G,161,3.4,...,13.1,0.045739,0.0826,0.096678,0.065067,0.090683,0.037,0.417766,Not in HoF,N


In [47]:
# Remove the columns that are not used as model inputs.
# The frame is rebound instead of mutated in place, and errors="ignore" keeps
# the cell safe to re-run: the previous inplace drop raised KeyError on a
# second execution because the columns were already gone.
# NOTE(review): "ID" is kept and therefore ends up as a model feature —
# confirm that is intended.
draft_data_df = draft_data_df.drop(
    columns=["name", "active_from", "active_to", "birth_date", "Success1", "In HoF"],
    errors="ignore",
)
draft_data_df.head()

Unnamed: 0,ID,college,height (in),position,weight,NBA__3ptapg,NBA__3ptpct,NBA__3ptpg,NBA_efgpct,NBA_fg%,...,NCAA_ftpg,NCAA_games,NCAA_ppg,NBA Games Played,fg%,ppg,effective pct,height/weight,3pt %,Success2
0,1569,Indiana University,73,G,180,2.4,0.378,0.9,0.46,0.377,...,2.5,128,16.4,0.005682,0.0754,0.054817,0.061333,0.081111,0.0378,N
1,1610,Purdue University,85,C,260,0.5,0.5,0.2,0.464,0.405,...,2.7,132,12.1,0.001563,0.081,0.021927,0.061867,0.065385,0.05,N
2,3280,University of Connecticut,74,G,181,2.6,0.316,0.8,0.456,0.38,...,2.7,99,13.0,0.018537,0.076,0.057807,0.0608,0.081768,0.0316,N
3,4537,"University of Massachusetts Amherst, Fairfield...",74,G,180,0.2,0.0,0.0,0.25,0.25,...,2.8,118,11.1,0.000426,0.05,0.019934,0.033333,0.082222,0.0,N
4,475,University of Oregon,73,G,161,3.4,0.37,1.3,0.488,0.413,...,2.7,115,13.1,0.045739,0.0826,0.096678,0.065067,0.090683,0.037,N


In [48]:
# Inspect each column's dtype to spot the non-numeric (object) columns that
# need to be encoded before modelling
draft_data_df.dtypes

ID                    int64
college              object
height (in)           int64
position             object
weight                int64
NBA__3ptapg         float64
NBA__3ptpct         float64
NBA__3ptpg          float64
NBA_efgpct          float64
NBA_fg%             float64
NBA_fg_per_game     float64
NBA_fga_per_game    float64
NBA_ft%             float64
NBA_ft_per_g        float64
NBA_fta_p_g         float64
NBA_g_played          int64
NBA_ppg             float64
NCAA__3ptapg        float64
NCAA__3ptpct        float64
NCAA__3ptpg         float64
NCAA_efgpct         float64
NCAA_fgapg          float64
NCAA_fgpct          float64
NCAA_fgpg           float64
NCAA_ft             float64
NCAA_ftapg          float64
NCAA_ftpg           float64
NCAA_games            int64
NCAA_ppg            float64
NBA Games Played    float64
fg%                 float64
ppg                 float64
effective pct       float64
height/weight       float64
3pt %               float64
Success2             object
dtype: object

In [49]:
# Names of the non-numeric columns that will be one-hot encoded
categorical_variables = [
    "college",
    "position",
    "Success2",
]

# Show the list for confirmation
categorical_variables

['college', 'position', 'Success2']

In [50]:
# Create a OneHotEncoder that returns a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the current spelling first and fall back to
# the legacy one on older releases (which reject the new keyword with
# TypeError).
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

In [51]:
# Fit the encoder on the categorical columns and one-hot encode them in a
# single pass
encoded_data = enc.fit_transform(draft_data_df.loc[:, categorical_variables])

In [52]:
# Create a DataFrame with the encoded variables.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; use whichever this installation provides.
if hasattr(enc, "get_feature_names_out"):
    encoded_columns = enc.get_feature_names_out(categorical_variables)
else:
    encoded_columns = enc.get_feature_names(categorical_variables)

encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoded_columns,
    # Reuse the source frame's index so the column-wise concat with the
    # numeric columns lines up row-for-row even if draft_data_df does not
    # carry a default 0..n-1 index.
    index=draft_data_df.index,
)

# Review the DataFrame
encoded_df

Unnamed: 0,"college_Alcorn State University, Jackson State University",college_Arizona State University,college_Auburn University,"college_Auburn University, Florida State University","college_Auburn University, Georgia Institute of Technology",college_Austin Peay State University,college_Ball State University,"college_Ball State University, Oakland University",college_Baylor University,"college_Baylor University, Oklahoma State University",...,college_Xavier University,position_C,position_C-F,position_F,position_F-C,position_F-G,position_G,position_G-F,Success2_N,Success2_Y
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [53]:
# Success2 was encoded into two complementary indicator columns
# (Success2_N / Success2_Y); keep only Success2_Y.
# Rebinding with errors="ignore" (instead of an inplace drop) lets this cell be
# re-run without raising KeyError once the column is already gone.
encoded_df = encoded_df.drop(columns=["Success2_N"], errors="ignore")

In [54]:
# Numeric columns are everything in the original frame that was not one-hot
# encoded
numerical_values_df = draft_data_df.drop(columns=categorical_variables)

# Append the numeric columns to the one-hot frame. Any numeric columns already
# present from an earlier run of this cell are dropped first, so re-running
# the cell does not duplicate them.
encoded_df = pd.concat(
    [
        encoded_df.drop(columns=numerical_values_df.columns, errors="ignore"),
        numerical_values_df,
    ],
    axis=1,
)

# Review the DataFrame
encoded_df

Unnamed: 0,"college_Alcorn State University, Jackson State University",college_Arizona State University,college_Auburn University,"college_Auburn University, Florida State University","college_Auburn University, Georgia Institute of Technology",college_Austin Peay State University,college_Ball State University,"college_Ball State University, Oakland University",college_Baylor University,"college_Baylor University, Oklahoma State University",...,NCAA_ftapg,NCAA_ftpg,NCAA_games,NCAA_ppg,NBA Games Played,fg%,ppg,effective pct,height/weight,3pt %
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.2,2.5,128,16.4,0.005682,0.0754,0.054817,0.061333,0.081111,0.0378
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,2.7,132,12.1,0.001563,0.0810,0.021927,0.061867,0.065385,0.0500
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.7,2.7,99,13.0,0.018537,0.0760,0.057807,0.060800,0.081768,0.0316
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.3,2.8,118,11.1,0.000426,0.0500,0.019934,0.033333,0.082222,0.0000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.2,2.7,115,13.1,0.045739,0.0826,0.096678,0.065067,0.090683,0.0370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.9,3.8,45,15.5,0.013281,0.0714,0.034884,0.060000,0.070222,0.0301
1350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.9,3.2,137,14.5,0.009020,0.0838,0.101661,0.067867,0.081111,0.0377
1351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.6,2.7,122,14.6,0.001705,0.0816,0.049834,0.056000,0.078534,0.0300
1352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.8,2.8,39,10.0,0.004545,0.0802,0.044850,0.061467,0.073276,0.0312


In [55]:
# Fill NBA_ft% null values with league average.
# The filled column is assigned back explicitly: calling
# fillna(inplace=True) on a column selection is chained assignment, which
# pandas warns about and which does not update the parent frame under
# copy-on-write.
encoded_df["NBA_ft%"] = encoded_df["NBA_ft%"].fillna(0.7326)

In [56]:
# Target vector y: the one-hot "Success2_Y" indicator column
y = encoded_df.loc[:, "Success2_Y"]

# Peek at the first five labels
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Success2_Y, dtype: float64

In [57]:
# Feature matrix X: every column except the "Success2_Y" target
X = encoded_df.drop("Success2_Y", axis=1)

# Peek at the first five feature rows
X.head()

Unnamed: 0,"college_Alcorn State University, Jackson State University",college_Arizona State University,college_Auburn University,"college_Auburn University, Florida State University","college_Auburn University, Georgia Institute of Technology",college_Austin Peay State University,college_Ball State University,"college_Ball State University, Oakland University",college_Baylor University,"college_Baylor University, Oklahoma State University",...,NCAA_ftapg,NCAA_ftpg,NCAA_games,NCAA_ppg,NBA Games Played,fg%,ppg,effective pct,height/weight,3pt %
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.2,2.5,128,16.4,0.005682,0.0754,0.054817,0.061333,0.081111,0.0378
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,2.7,132,12.1,0.001563,0.081,0.021927,0.061867,0.065385,0.05
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.7,2.7,99,13.0,0.018537,0.076,0.057807,0.0608,0.081768,0.0316
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.3,2.8,118,11.1,0.000426,0.05,0.019934,0.033333,0.082222,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.2,2.7,115,13.1,0.045739,0.0826,0.096678,0.065067,0.090683,0.037


In [58]:
# Split the preprocessed data into training and testing datasets.
# No test_size/train_size is passed, so scikit-learn's default split fraction
# applies; random_state=1 fixes the shuffle so the split is repeatable.
# NOTE(review): no `stratify` argument is given, so the class ratio may differ
# between the two splits — consider stratify=y if the labels are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [59]:
# Standardize the features: the scaler is fitted on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [60]:
# Define the number of inputs (features) to the model.
# shape[1] reads the column count directly; unlike len(X_train.iloc[0]) it
# does not build a row Series and does not raise IndexError when the frame
# has no rows.
number_input_features = X_train.shape[1]

# Review the number of features
number_input_features

369

In [61]:
# Define the number of neurons in the output layer: a single unit, used below
# as the width of the final Dense layer
number_output_neurons = 1

In [62]:
# Define the number of hidden nodes for the first hidden layer: one node per
# input feature
hidden_nodes_layer1 =  number_input_features

# Review the number of hidden nodes in the first layer
hidden_nodes_layer1

369

In [63]:
# Second hidden layer width: half of the first layer's width, rounded up
hidden_nodes_layer2 = hidden_nodes_layer1 // 2 + hidden_nodes_layer1 % 2

# Show the resulting node count
hidden_nodes_layer2

185

In [64]:
# Create the Sequential model instance; it starts empty and the layers are
# appended one at a time with nn.add(...)
nn = Sequential()

In [65]:
# First hidden layer: ReLU units, with input_dim declaring how many features
# each sample carries
nn.add(
    Dense(
        units=hidden_nodes_layer1,
        input_dim=number_input_features,
        activation="relu",
    )
)

In [66]:
# Add the second hidden layer (ReLU activation, hidden_nodes_layer2 units)
nn.add(Dense(units=hidden_nodes_layer2, activation="relu"))

In [67]:
# Add the output layer to the model: number_output_neurons units with a
# sigmoid activation, so the output lies between 0 and 1
nn.add(Dense(units=number_output_neurons, activation="sigmoid"))

In [68]:
# Compile the Sequential model with binary cross-entropy loss, the Adam
# optimizer, and accuracy tracked as the reported metric.
# NOTE(review): the key under which accuracy appears in the training history
# ("acc" in the recorded output below) may depend on the installed Keras
# version — verify before plotting it.
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [69]:
# Fit the model using 50 epochs and the scaled training data; the returned
# object's .history is used later to plot the per-epoch metrics
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

Train on 1015 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [70]:
# Evaluate the model on the scaled test data; the result is unpacked into the
# loss value and the accuracy metric configured in compile()
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)

# Display the evaluation results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

339/339 - 0s - loss: 0.2307 - acc: 0.9646
Loss: 0.23067362336054037, Accuracy: 0.9646017551422119


In [73]:
# Collect the per-epoch training metrics in a DataFrame and number the rows
# by epoch, starting at 1
df = pd.DataFrame(fit_model.history)
df.index = range(1, len(df) + 1)

# Plot the training loss per epoch
df.hvplot(y="loss", title="Classifier Loss")

In [72]:
# Plot the training accuracy per epoch.
# Keras stores the metric as "acc" in older releases and "accuracy" in newer
# ones, so use whichever column the history DataFrame actually contains
# instead of hard-coding "acc".
df.hvplot(
    y="accuracy" if "accuracy" in df.columns else "acc",
    title="Classifier Accuracy",
)