# Smart_Stock_ML: Neural Network

## Setup and Explore the data

In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import hvplot.pandas

#  Import completed dataset
sp500_company_ml_df = pd.read_csv("data/sp500_company_ml.csv")
sp500_company_ml_df.head()

Unnamed: 0,Year,Outperformed?,Interest Rate (Federal Funds Effective Rate),CPI,UnemploymentRate,GDPGrowthRate,GICS Sector,Gender,Headquarters State,Salary_Bins
0,2015,1,0.11,0.1,5.3,2.9,Health Care,M,California,"(15000000, 20000000]"
1,2016,1,0.34,1.3,4.9,1.8,Health Care,M,California,"(15000000, 20000000]"
2,2017,1,0.65,2.1,4.4,2.5,Health Care,M,California,"(15000000, 20000000]"
3,2018,1,1.41,2.4,3.9,3.0,Health Care,M,California,"(15000000, 20000000]"
4,2019,0,2.4,1.8,3.7,2.5,Health Care,M,California,"(15000000, 20000000]"


In [2]:
# View data info
sp500_company_ml_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4370 entries, 0 to 4369
Data columns (total 10 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Year                                          4370 non-null   int64  
 1   Outperformed?                                 4370 non-null   int64  
 2   Interest Rate (Federal Funds Effective Rate)  4370 non-null   float64
 3   CPI                                           4370 non-null   float64
 4   UnemploymentRate                              4370 non-null   float64
 5   GDPGrowthRate                                 4370 non-null   float64
 6   GICS Sector                                   4370 non-null   object 
 7   Gender                                        4370 non-null   object 
 8   Headquarters State                            4370 non-null   object 
 9   Salary_Bins                                   4361 non-null   o

In [3]:
# Determine the number of unique values in each column.
sp500_company_ml_df.nunique()

Year                                             9
Outperformed?                                    2
Interest Rate (Federal Funds Effective Rate)     9
CPI                                              9
UnemploymentRate                                 9
GDPGrowthRate                                    7
GICS Sector                                     11
Gender                                           2
Headquarters State                              49
Salary_Bins                                      7
dtype: int64

In [4]:
# Convert categorical data to numeric with `pd.get_dummies`
sp500_numeric_df = pd.get_dummies(sp500_company_ml_df, dtype=int)
sp500_numeric_df

Unnamed: 0,Year,Outperformed?,Interest Rate (Federal Funds Effective Rate),CPI,UnemploymentRate,GDPGrowthRate,GICS Sector_Communication Services,GICS Sector_Consumer Discretionary,GICS Sector_Consumer Staples,GICS Sector_Energy,...,Headquarters State_Virginia,Headquarters State_Washington,Headquarters State_Wisconsin,"Salary_Bins_(0, 5000000]","Salary_Bins_(10000000, 15000000]","Salary_Bins_(15000000, 20000000]","Salary_Bins_(20000000, 25000000]","Salary_Bins_(25000000, 50000000]","Salary_Bins_(5000000, 10000000]","Salary_Bins_(50000000, 300000000]"
0,2015,1,0.11,0.1,5.3,2.9,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2016,1,0.34,1.3,4.9,1.8,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,2017,1,0.65,2.1,4.4,2.5,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,2018,1,1.41,2.4,3.9,3.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2019,0,2.40,1.8,3.7,2.5,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4365,2019,1,2.40,1.8,3.7,2.5,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4366,2020,1,1.55,1.2,8.1,-2.2,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4367,2021,1,0.09,4.7,5.4,5.8,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4368,2022,0,0.08,8.0,3.6,1.9,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [5]:
# Split the preprocessed data into features and target arrays
y = sp500_numeric_df['Outperformed?'].values
X = sp500_numeric_df.drop(columns=['Outperformed?'], axis=1).values
X_columns = sp500_numeric_df.drop(columns=['Outperformed?'], axis=1).columns

In [6]:
X_columns

Index(['Year', 'Interest Rate (Federal Funds Effective Rate)', 'CPI',
       'UnemploymentRate', 'GDPGrowthRate',
       'GICS Sector_Communication Services',
       'GICS Sector_Consumer Discretionary', 'GICS Sector_Consumer Staples',
       'GICS Sector_Energy', 'GICS Sector_Financials',
       'GICS Sector_Health Care', 'GICS Sector_Industrials',
       'GICS Sector_Information Technology', 'GICS Sector_Materials',
       'GICS Sector_Real Estate', 'GICS Sector_Utilities', 'Gender_F',
       'Gender_M', 'Headquarters State_Alabama', 'Headquarters State_Arizona',
       'Headquarters State_Arkansas', 'Headquarters State_Bermuda',
       'Headquarters State_Bristol', 'Headquarters State_California',
       'Headquarters State_Canada', 'Headquarters State_Colorado',
       'Headquarters State_Connecticut', 'Headquarters State_D.C.',
       'Headquarters State_Delaware', 'Headquarters State_Florida',
       'Headquarters State_Georgia', 'Headquarters State_Idaho',
       'Headquarters S

In [7]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler instance
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Neural Network Modelling

### Test # 1

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  128
hidden_nodes_layer2 = 128

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Export the model to HDF5 file
nn.save("SP500_model.h5")

In [None]:
# Export the model to keras file
from tensorflow.keras.saving import save_model

save_model(nn, 'SP500_model.keras')

### Test # 2

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 6

nn2 = tf.keras.models.Sequential()

# First hidden layer
nn2.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn2.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn2.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn2.summary()

In [None]:
# Compile the model
nn2.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model
fit_model = nn2.fit(X_train_scaled,y_train,epochs=200)

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn2.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

### Test # 3

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 50
hidden_nodes_layer2 = 50
hidden_nodes_layer3 = 50

nn3 = tf.keras.models.Sequential()

# First hidden layer
nn3.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn3.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn3.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn3.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn3.summary()

In [None]:
# Compile the model
nn3.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model
fit_model = nn3.fit(X_train_scaled,y_train,epochs=100)

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn3.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

### Test # 4

In [9]:
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping


# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 256
hidden_nodes_layer2 = 256
hidden_nodes_layer3 = 256
hidden_nodes_layer4 = 256

nn4 = tf.keras.models.Sequential()

# First hidden layer
nn4.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu", kernel_regularizer=l2(0.01)))
nn4.add(Dropout(0.5))
nn4.add(BatchNormalization())

# Second hidden layer
nn4.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
nn4.add(Dropout(0.5))
nn4.add(BatchNormalization())

# Third hidden layer
nn4.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))
nn4.add(Dropout(0.5))
nn4.add(BatchNormalization())

# Fourth hidden layer
nn4.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="relu"))
nn4.add(Dropout(0.5))
nn4.add(BatchNormalization())

# Output layer
nn4.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn4.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
# Compile the model
nn4.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)


In [11]:
# Train the model
fit_model = nn4.fit(X_train_scaled,y_train,epochs=100, callbacks=[early_stopping])

Epoch 1/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.4961 - loss: 1.8679
Epoch 2/100
[1m 54/103[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.4802 - loss: 1.5453

  current = self.get_monitor_value(logs)


[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4830 - loss: 1.5113
Epoch 3/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5278 - loss: 1.3193
Epoch 4/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5069 - loss: 1.2208
Epoch 5/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5177 - loss: 1.1272
Epoch 6/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5402 - loss: 1.0495
Epoch 7/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5336 - loss: 0.9972
Epoch 8/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5567 - loss: 0.9372
Epoch 9/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5750 - loss: 0.8859
Epoch 10/100
[1m103/103[0m [32m━━━━━━━━━━

[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6568 - loss: 0.6810
Epoch 70/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6537 - loss: 0.6877
Epoch 71/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6544 - loss: 0.6796
Epoch 72/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6614 - loss: 0.6846
Epoch 73/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6661 - loss: 0.6816
Epoch 74/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6649 - loss: 0.6825
Epoch 75/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6467 - loss: 0.6987
Epoch 76/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6616 - loss: 0.6696
Epoch 77/100
[1m103/103[0m [32m━━━

In [12]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn4.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

35/35 - 0s - 4ms/step - accuracy: 0.5947 - loss: 0.7586
Loss: 0.7585774660110474, Accuracy: 0.5946934819221497


## Optimize the model