In [1]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [2]:
# Load the 2008-2018 draft records from the Resources folder into a Pandas DataFrame
draft_csv_path = Path("./Resources/college/draft_2008-2018.csv")
draft_data_df = pd.read_csv(draft_csv_path)

# Preview the first rows
draft_data_df.head()

Unnamed: 0,Pk,Tm,Player,College,Yrs,G,FG%,3P%,FT%,MP2,PTS3,TRB4,AST5,WS,WS/48,BPM,VORP,Year
0,1,CHI,Derrick Rose,Memphis,12.0,646.0,0.457,0.311,0.829,32.0,18.5,3.3,5.5,42.8,0.099,1.3,17.6,2008
1,1,LAC,Blake Griffin,Oklahoma,11.0,668.0,0.495,0.334,0.696,34.2,20.9,8.6,4.3,77.2,0.162,3.6,32.1,2009
2,1,WAS,John Wall,Kentucky,10.0,613.0,0.431,0.323,0.779,35.6,19.1,4.3,9.1,44.1,0.097,2.4,24.1,2010
3,1,CLE,Kyrie Irving,Duke,10.0,582.0,0.47,0.391,0.881,33.9,22.8,3.8,5.7,68.7,0.167,4.6,32.7,2011
4,1,NOH,Anthony Davis,Kentucky,9.0,564.0,0.514,0.312,0.799,34.4,23.9,10.2,2.3,86.8,0.215,6.2,40.0,2012


In [3]:
# Placeholder target: one uniform random value in [0, 1) per row.
# The generator is seeded so the column is identical on every Restart & Run All,
# and its length follows the DataFrame instead of a hard-coded row count
# (a length mismatch makes the column assignment raise ValueError).
success_rng = np.random.default_rng(1)
draft_data_df["Success"] = success_rng.random(len(draft_data_df))
draft_data_df.head()

Unnamed: 0,Pk,Tm,Player,College,Yrs,G,FG%,3P%,FT%,MP2,PTS3,TRB4,AST5,WS,WS/48,BPM,VORP,Year,Success
0,1,CHI,Derrick Rose,Memphis,12.0,646.0,0.457,0.311,0.829,32.0,18.5,3.3,5.5,42.8,0.099,1.3,17.6,2008,0.419922
1,1,LAC,Blake Griffin,Oklahoma,11.0,668.0,0.495,0.334,0.696,34.2,20.9,8.6,4.3,77.2,0.162,3.6,32.1,2009,0.495736
2,1,WAS,John Wall,Kentucky,10.0,613.0,0.431,0.323,0.779,35.6,19.1,4.3,9.1,44.1,0.097,2.4,24.1,2010,0.855461
3,1,CLE,Kyrie Irving,Duke,10.0,582.0,0.47,0.391,0.881,33.9,22.8,3.8,5.7,68.7,0.167,4.6,32.7,2011,0.962301
4,1,NOH,Anthony Davis,Kentucky,9.0,564.0,0.514,0.312,0.799,34.4,23.9,10.2,2.3,86.8,0.215,6.2,40.0,2012,0.843752


In [4]:
# Remove the player-name column; every other column is kept unchanged
draft_data_df = draft_data_df.drop("Player", axis="columns")
draft_data_df.head()

Unnamed: 0,Pk,Tm,College,Yrs,G,FG%,3P%,FT%,MP2,PTS3,TRB4,AST5,WS,WS/48,BPM,VORP,Year,Success
0,1,CHI,Memphis,12.0,646.0,0.457,0.311,0.829,32.0,18.5,3.3,5.5,42.8,0.099,1.3,17.6,2008,0.419922
1,1,LAC,Oklahoma,11.0,668.0,0.495,0.334,0.696,34.2,20.9,8.6,4.3,77.2,0.162,3.6,32.1,2009,0.495736
2,1,WAS,Kentucky,10.0,613.0,0.431,0.323,0.779,35.6,19.1,4.3,9.1,44.1,0.097,2.4,24.1,2010,0.855461
3,1,CLE,Duke,10.0,582.0,0.47,0.391,0.881,33.9,22.8,3.8,5.7,68.7,0.167,4.6,32.7,2011,0.962301
4,1,NOH,Kentucky,9.0,564.0,0.514,0.312,0.799,34.4,23.9,10.2,2.3,86.8,0.215,6.2,40.0,2012,0.843752


In [5]:
# Inspect the column dtypes to see which columns are text (object) and which are numeric
draft_data_df.dtypes

Pk           int64
Tm          object
College     object
Yrs        float64
G          float64
FG%        float64
3P%        float64
FT%        float64
MP2        float64
PTS3       float64
TRB4       float64
AST5       float64
WS         float64
WS/48      float64
BPM        float64
VORP       float64
Year         int64
Success    float64
dtype: object

In [6]:
# Cast the draft pick number (Pk) from int64 to float64
draft_data_df = draft_data_df.astype({"Pk": float})
draft_data_df.head()

Unnamed: 0,Pk,Tm,College,Yrs,G,FG%,3P%,FT%,MP2,PTS3,TRB4,AST5,WS,WS/48,BPM,VORP,Year,Success
0,1.0,CHI,Memphis,12.0,646.0,0.457,0.311,0.829,32.0,18.5,3.3,5.5,42.8,0.099,1.3,17.6,2008,0.419922
1,1.0,LAC,Oklahoma,11.0,668.0,0.495,0.334,0.696,34.2,20.9,8.6,4.3,77.2,0.162,3.6,32.1,2009,0.495736
2,1.0,WAS,Kentucky,10.0,613.0,0.431,0.323,0.779,35.6,19.1,4.3,9.1,44.1,0.097,2.4,24.1,2010,0.855461
3,1.0,CLE,Duke,10.0,582.0,0.47,0.391,0.881,33.9,22.8,3.8,5.7,68.7,0.167,4.6,32.7,2011,0.962301
4,1.0,NOH,Kentucky,9.0,564.0,0.514,0.312,0.799,34.4,23.9,10.2,2.3,86.8,0.215,6.2,40.0,2012,0.843752


In [7]:
# Create a list of categorical variables
# (the two object-dtype columns, which are one-hot encoded further down)
categorical_variables = ["Tm", "College"]

# Display the categorical variables list
categorical_variables


['Tm', 'College']

In [8]:
# Create a OneHotEncoder instance that returns a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the current spelling first and fall back to the
# legacy one on older installations (which reject the new keyword with TypeError).
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

In [9]:
# One-hot encode the categorical columns: fit the encoder and transform in a single call
categorical_df = draft_data_df[categorical_variables]
encoded_data = enc.fit_transform(categorical_df)

In [10]:
# Create a DataFrame with the encoded variables.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use the new method when present, otherwise fall back.
feature_name_getter = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names
encoded_df = pd.DataFrame(
    encoded_data,
    columns=feature_name_getter(categorical_variables),
    # Reuse the source index so a later axis=1 concat with the numeric columns
    # lines up row for row
    index=draft_data_df.index,
)

# Review the DataFrame
encoded_df

Unnamed: 0,Tm_ATL,Tm_BOS,Tm_BRK,Tm_CHA,Tm_CHH,Tm_CHI,Tm_CHO,Tm_CLE,Tm_DAL,Tm_DEN,...,College_Washington State,College_Weber State,College_West Virginia,College_Western Kentucky,College_Wichita State,College_William & Mary,College_Wisconsin,College_Wyoming,College_Xavier,College_nan
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
656,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# Add the numerical variables from the original DataFrame to the one-hot encoding DataFrame
numerical_values_df = draft_data_df.drop(columns=categorical_variables)

# Keep only the one-hot columns before concatenating. On the first run this selects
# every column; on a re-run (where encoded_df already holds the numeric columns) it
# stops them from being appended a second time as duplicate column labels.
one_hot_columns = [
    column for column in encoded_df.columns
    if column not in numerical_values_df.columns
]
encoded_df = pd.concat([encoded_df[one_hot_columns], numerical_values_df], axis=1)

# Review the DataFrame
encoded_df

Unnamed: 0,Tm_ATL,Tm_BOS,Tm_BRK,Tm_CHA,Tm_CHH,Tm_CHI,Tm_CHO,Tm_CLE,Tm_DAL,Tm_DEN,...,MP2,PTS3,TRB4,AST5,WS,WS/48,BPM,VORP,Year,Success
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,32.0,18.5,3.3,5.5,42.8,0.099,1.3,17.6,2008,0.419922
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,34.2,20.9,8.6,4.3,77.2,0.162,3.6,32.1,2009,0.495736
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,35.6,19.1,4.3,9.1,44.1,0.097,2.4,24.1,2010,0.855461
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,33.9,22.8,3.8,5.7,68.7,0.167,4.6,32.7,2011,0.962301
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,34.4,23.9,10.2,2.3,86.8,0.215,6.2,40.0,2012,0.843752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,3.5,2.8,0.3,0.9,0.071,-3.3,-0.2,2014,0.019674
654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,2015,0.142906
655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,15.5,5.2,2.2,1.2,0.2,0.006,-5.1,-1.3,2016,0.606472
656,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,2017,0.017022


In [12]:
# Target vector y: the "Success" column
y = encoded_df.loc[:, "Success"]

# Show the first five target values
y.head()

0    0.419922
1    0.495736
2    0.855461
3    0.962301
4    0.843752
Name: Success, dtype: float64

In [13]:
# Feature matrix X: every column except the "Success" target
X = encoded_df.drop("Success", axis="columns")

# Show the first five feature rows
X.head()

Unnamed: 0,Tm_ATL,Tm_BOS,Tm_BRK,Tm_CHA,Tm_CHH,Tm_CHI,Tm_CHO,Tm_CLE,Tm_DAL,Tm_DEN,...,FT%,MP2,PTS3,TRB4,AST5,WS,WS/48,BPM,VORP,Year
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.829,32.0,18.5,3.3,5.5,42.8,0.099,1.3,17.6,2008
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.696,34.2,20.9,8.6,4.3,77.2,0.162,3.6,32.1,2009
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.779,35.6,19.1,4.3,9.1,44.1,0.097,2.4,24.1,2010
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.881,33.9,22.8,3.8,5.7,68.7,0.167,4.6,32.7,2011
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.799,34.4,23.9,10.2,2.3,86.8,0.215,6.2,40.0,2012


In [14]:
# Split the features and target into training and testing sets.
# random_state=1 fixes the shuffle so the split is repeatable.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [15]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler on the training features only, so test-set statistics do not leak into it
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and testing features
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Some rows have missing statistics (NaN). StandardScaler ignores NaN when fitting and
# passes it through unchanged when transforming, and NaN inputs make the network's
# loss NaN. After standardisation each column is centred on 0, so replacing NaN with
# 0.0 is equivalent to imputing the training mean.
X_train_scaled = np.where(np.isnan(X_train_scaled), 0.0, X_train_scaled)
X_test_scaled = np.where(np.isnan(X_test_scaled), 0.0, X_test_scaled)

In [16]:
# Define the number of inputs (features) to the model.
# shape[1] is the column count; unlike len(X_train.iloc[0]) it does not build a
# row Series and does not fail when the frame has no rows.
number_input_features = X_train.shape[1]

# Review the number of features
number_input_features

183

In [17]:
# Define the number of neurons in the output layer
# (one unit: the model predicts a single value per row, the "Success" target)
number_output_neurons = 1

In [18]:
# Define the number of hidden nodes for the first hidden layer
# (set equal to the number of input features)
hidden_nodes_layer1 =  number_input_features

# Review the number of hidden nodes in the first layer
hidden_nodes_layer1

183

In [19]:
# Define the number of hidden nodes for the second hidden layer
# (half of the first layer, rounded up: adding 1 before the floor division
# turns it into a ceiling division)
hidden_nodes_layer2 =  (hidden_nodes_layer1 + 1) // 2

# Review the number of hidden nodes in the second layer
hidden_nodes_layer2

92

In [20]:
# Create the Sequential model instance (layers are appended to it one at a time afterwards)
nn = Sequential()

In [21]:
# First hidden layer: ReLU units; input_dim declares the width of each input vector
first_hidden_layer = Dense(
    units=hidden_nodes_layer1,
    input_dim=number_input_features,
    activation="relu",
)
nn.add(first_hidden_layer)

In [22]:
# Second hidden layer: ReLU units
second_hidden_layer = Dense(units=hidden_nodes_layer2, activation="relu")
nn.add(second_hidden_layer)

In [23]:
# Output layer: number_output_neurons units with a linear activation
output_layer = Dense(units=number_output_neurons, activation="linear")
nn.add(output_layer)

In [24]:
# Compile the Sequential model: Adam optimizer, mean-squared-error loss,
# with MSE also reported as a metric
nn.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mse"],
)

In [25]:
# Train on the scaled training data for 50 epochs; keep the object returned by fit()
training_epochs = 50
fit_model = nn.fit(X_train_scaled, y_train, epochs=training_epochs)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
