# Smart_Stock_ML: Neural Network

## Setup and Explore the data

In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import hvplot.pandas
import joblib

#  Import completed dataset
sp500_company_ml_df = pd.read_csv("data/sp500_company_ml.csv")
sp500_company_ml_df.head()

Unnamed: 0,Year,Outperformed?,Interest Rate (Federal Funds Effective Rate),CPI,UnemploymentRate,GDPGrowthRate,GICS Sector,GICS Sub-Industry,Founded,Gender,CEO Transition,Tenure Bucket,HQ_US_State_or_Country,Salary_Bucket
0,2015,1,0.11,0.1,5.3,2.9,Health Care,Life Sciences Tools & Services,1999,M,1.0,<= 2 years,California,"(15000000, 20000000]"
1,2016,1,0.34,1.3,4.9,1.8,Health Care,Life Sciences Tools & Services,1999,M,0.0,<= 2 years,California,"(15000000, 20000000]"
2,2017,1,0.65,2.1,4.4,2.5,Health Care,Life Sciences Tools & Services,1999,M,0.0,<= 2 years,California,"(15000000, 20000000]"
3,2018,1,1.41,2.4,3.9,3.0,Health Care,Life Sciences Tools & Services,1999,M,0.0,2 to 5 years,California,"(15000000, 20000000]"
4,2019,0,2.4,1.8,3.7,2.5,Health Care,Life Sciences Tools & Services,1999,M,0.0,2 to 5 years,California,"(15000000, 20000000]"


In [2]:
# View data info
sp500_company_ml_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4366 entries, 0 to 4365
Data columns (total 14 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Year                                          4366 non-null   int64  
 1   Outperformed?                                 4366 non-null   int64  
 2   Interest Rate (Federal Funds Effective Rate)  4366 non-null   float64
 3   CPI                                           4366 non-null   float64
 4   UnemploymentRate                              4366 non-null   float64
 5   GDPGrowthRate                                 4366 non-null   float64
 6   GICS Sector                                   4366 non-null   object 
 7   GICS Sub-Industry                             4366 non-null   object 
 8   Founded                                       4366 non-null   object 
 9   Gender                                        4366 non-null   o

In [3]:
# Determine the number of unique values in each column.
sp500_company_ml_df.nunique()

Year                                              9
Outperformed?                                     2
Interest Rate (Federal Funds Effective Rate)      9
CPI                                               9
UnemploymentRate                                  9
GDPGrowthRate                                     7
GICS Sector                                      11
GICS Sub-Industry                               124
Founded                                         161
Gender                                            2
CEO Transition                                    2
Tenure Bucket                                     4
HQ_US_State_or_Country                           49
Salary_Bucket                                     7
dtype: int64

In [4]:
# View data columns
sp500_company_ml_df.columns

Index(['Year', 'Outperformed?', 'Interest Rate (Federal Funds Effective Rate)',
       'CPI', 'UnemploymentRate', 'GDPGrowthRate', 'GICS Sector',
       'GICS Sub-Industry', 'Founded', 'Gender', 'CEO Transition',
       'Tenure Bucket', 'HQ_US_State_or_Country', 'Salary_Bucket'],
      dtype='object')

In [5]:
# Remove some features to test impact on model
sp500_company_ml_df = sp500_company_ml_df.drop(columns=['Founded', 'GICS Sub-Industry'], axis=1)

In [6]:
sp500_company_ml_df.nunique()

Year                                             9
Outperformed?                                    2
Interest Rate (Federal Funds Effective Rate)     9
CPI                                              9
UnemploymentRate                                 9
GDPGrowthRate                                    7
GICS Sector                                     11
Gender                                           2
CEO Transition                                   2
Tenure Bucket                                    4
HQ_US_State_or_Country                          49
Salary_Bucket                                    7
dtype: int64

In [7]:
# Convert categorical data to numeric with `pd.get_dummies`
sp500_numeric_df = pd.get_dummies(sp500_company_ml_df, dtype=int).drop('Gender_M', axis=1)
sp500_numeric_df

Unnamed: 0,Year,Outperformed?,Interest Rate (Federal Funds Effective Rate),CPI,UnemploymentRate,GDPGrowthRate,CEO Transition,GICS Sector_Communication Services,GICS Sector_Consumer Discretionary,GICS Sector_Consumer Staples,...,HQ_US_State_or_Country_Virginia,HQ_US_State_or_Country_Washington,HQ_US_State_or_Country_Wisconsin,"Salary_Bucket_(0, 5000000]","Salary_Bucket_(10000000, 15000000]","Salary_Bucket_(15000000, 20000000]","Salary_Bucket_(20000000, 25000000]","Salary_Bucket_(25000000, 50000000]","Salary_Bucket_(5000000, 10000000]","Salary_Bucket_(50000000, 300000000]"
0,2015,1,0.11,0.1,5.3,2.9,1.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2016,1,0.34,1.3,4.9,1.8,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,2017,1,0.65,2.1,4.4,2.5,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,2018,1,1.41,2.4,3.9,3.0,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2019,0,2.40,1.8,3.7,2.5,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4361,2019,1,2.40,1.8,3.7,2.5,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4362,2020,1,1.55,1.2,8.1,-2.2,1.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4363,2021,1,0.09,4.7,5.4,5.8,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4364,2022,0,0.08,8.0,3.6,1.9,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
# Split the preprocessed data into features and target arrays
y = sp500_numeric_df['Outperformed?'].values
X = sp500_numeric_df.drop(columns=['Outperformed?'], axis=1).values
X_columns = sp500_numeric_df.drop(columns=['Outperformed?'], axis=1).columns

In [9]:
# View data shape
X.shape

(4366, 78)

In [10]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [11]:
# View train data shape
X_train.shape

(3274, 78)

In [12]:
# View test data shape
X_test.shape

(1092, 78)

In [13]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler instance
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Save the fitted scaler
joblib.dump(scaler, "scaler.save")

['scaler.save']

## Neural Network Modelling

### Test # 1

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  10
hidden_nodes_layer2 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

### Test # 2

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  128
hidden_nodes_layer2 = 128

nn2 = tf.keras.models.Sequential()

# First hidden layer
nn2.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn2.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn2.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn2.summary()

In [None]:
# Compile the model
nn2.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model
fit_model = nn2.fit(X_train_scaled,y_train,epochs=100)

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn2.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

### Test # 3

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 128
hidden_nodes_layer2 = 128
hidden_nodes_layer3 = 128

nn3 = tf.keras.models.Sequential()

# First hidden layer
nn3.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn3.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn3.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn3.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn3.summary()

In [None]:
# Compile the model
nn3.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model
fit_model = nn3.fit(X_train_scaled,y_train,epochs=100)

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn3.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

### Test # 4

In [None]:
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping


# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 256
hidden_nodes_layer2 = 256
hidden_nodes_layer3 = 256
hidden_nodes_layer4 = 256

nn4 = tf.keras.models.Sequential()

# First hidden layer
nn4.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu", kernel_regularizer=l2(0.01)))
nn4.add(Dropout(0.5))
nn4.add(BatchNormalization())

# Second hidden layer
nn4.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
nn4.add(Dropout(0.5))
nn4.add(BatchNormalization())

# Third hidden layer
nn4.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))
nn4.add(Dropout(0.5))
nn4.add(BatchNormalization())

# Fourth hidden layer
nn4.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="relu"))
nn4.add(Dropout(0.5))
nn4.add(BatchNormalization())

# Output layer
nn4.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn4.summary()

In [None]:
# Compile the model
nn4.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

#  Add early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [None]:
# Train the model
fit_model = nn4.fit(X_train_scaled,y_train,epochs=50, callbacks=[early_stopping])

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn4.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

### Test 5

In [None]:
# Import additional keras libraries to help reduce overfitting
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping


# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 16
hidden_nodes_layer2 = 115
hidden_nodes_layer3 = 15
hidden_nodes_layer4 = 55
hidden_nodes_layer5 = 30
hidden_nodes_layer6 = 40
hidden_nodes_layer7 = 125

nn5 = tf.keras.models.Sequential()

# First hidden layer
nn5.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu", kernel_regularizer=l2(0.01)))
nn5.add(Dropout(0.5))
nn5.add(BatchNormalization())

# Second hidden layer
nn5.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
nn5.add(Dropout(0.5))
nn5.add(BatchNormalization())

# Third hidden layer
nn5.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))
nn5.add(Dropout(0.5))
nn5.add(BatchNormalization())

# Fourth hidden layer
nn5.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="relu"))
nn5.add(Dropout(0.5))
nn5.add(BatchNormalization())

# Fifth hidden layer
nn5.add(tf.keras.layers.Dense(units=hidden_nodes_layer5, activation="relu"))
nn5.add(Dropout(0.5))
nn5.add(BatchNormalization())

# Sixth hidden layer
nn5.add(tf.keras.layers.Dense(units=hidden_nodes_layer6, activation="relu"))
nn5.add(Dropout(0.5))
nn5.add(BatchNormalization())

# Seventh hidden layer
nn5.add(tf.keras.layers.Dense(units=hidden_nodes_layer7, activation="relu"))
nn5.add(Dropout(0.5))
nn5.add(BatchNormalization())

# Output layer
nn5.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn5.summary()

In [None]:
# Compile the model
nn5.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [None]:
# Train the model
fit_model = nn5.fit(X_train_scaled,y_train,epochs=100, callbacks=[early_stopping])

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn5.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

## Optimize the model

In [14]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh'])

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=30,
        step=5), activation=activation, input_dim=len(X_train[0])))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 3, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=10,
            max_value=128,
            step=5),
            activation=activation))

    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [15]:
# Import the kerastuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 60 Complete [00h 00m 04s]
val_accuracy: 0.5906593203544617

Best val_accuracy So Far: 0.6135531067848206
Total elapsed time: 00h 02m 28s


In [17]:
from pprint import pprint
# Get top 3 model hyperparameters and print the values
top_hyper = tuner.get_best_hyperparameters(3)
for index, param in enumerate(top_hyper):
    print('------------------------')
    print(f'Model rank: {index + 1}')
    print('Model hyperparametrers')
    pprint(param.values)

------------------------
Model rank: 1
Model hyperparametrers
{'activation': 'relu',
 'first_units': 11,
 'num_layers': 4,
 'tuner/bracket': 1,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 7,
 'tuner/round': 1,
 'tuner/trial_id': '0053',
 'units_0': 45,
 'units_1': 105,
 'units_2': 10,
 'units_3': 70,
 'units_4': 45,
 'units_5': 105}
------------------------
Model rank: 2
Model hyperparametrers
{'activation': 'tanh',
 'first_units': 26,
 'num_layers': 5,
 'tuner/bracket': 2,
 'tuner/epochs': 7,
 'tuner/initial_epoch': 3,
 'tuner/round': 1,
 'tuner/trial_id': '0011',
 'units_0': 120,
 'units_1': 30,
 'units_2': 95,
 'units_3': 20,
 'units_4': 70,
 'units_5': 60}
------------------------
Model rank: 3
Model hyperparametrers
{'activation': 'relu',
 'first_units': 6,
 'num_layers': 3,
 'tuner/bracket': 0,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 0,
 'tuner/round': 0,
 'units_0': 25,
 'units_1': 15,
 'units_2': 65,
 'units_3': 15,
 'units_4': 110,
 'units_5': 100}


In [18]:
# Evaluate the top 3 models against the test dataset
top_model = tuner.get_best_models(3)
for index, model in enumerate(top_model):
    print(f'Model rank: {index + 1}')
    model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss:.3f}, Accuracy: {model_accuracy:.3f}")

Model rank: 1
35/35 - 0s - 5ms/step - accuracy: 0.6136 - loss: 0.6945
Loss: 0.694, Accuracy: 0.614
Model rank: 2
35/35 - 0s - 4ms/step - accuracy: 0.6117 - loss: 0.6805
Loss: 0.681, Accuracy: 0.612
Model rank: 3
35/35 - 0s - 4ms/step - accuracy: 0.6108 - loss: 0.6730
Loss: 0.673, Accuracy: 0.611


In [20]:
# Retrieve the best model
best_model = tuner.get_best_models(num_models=1)[0]

# Export the best model to a keras file
from tensorflow.keras.saving import save_model
save_model(best_model, 'saved_nn_models/best_optimized_model.keras')