In [1]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from pandas.tseries.offsets import DateOffset
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from imblearn.metrics import classification_report_imbalanced
import keras.metrics
import keras.optimizers
from tensorflow import keras

In [2]:
# Import the custom_new csv as a DataFrame
signals_df = pd.read_csv(
    Path("./Resources/custom_new.csv"),
    index_col = 'Date',
    infer_datetime_format=True, 
    parse_dates=True
)

# Drop features that aren't useful for machine learning
signals_df.drop(columns=['value','Portfolio Cumulative Returns','Portfolio Period Returns','Portfolio Total','Portfolio Cash','Portfolio Holdings'], inplace=True)

# Review the DataFrame
signals_df.head()

Unnamed: 0_level_0,Close,Open,High,Volume,Low,STOCHk_14_3_3,STOCHd_14_3_3,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,k-d,hist_adj,Stoch_Signal,MACD_Signal,Signal,Entry/Exit,Position,Entry/Exit Position
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-04-19 15:30:00-04:00,415.23999,414.325012,415.339996,14176944,414.140015,25.930343,22.103159,0.621316,-0.520959,1.142274,3.827184,0.071674,1.0,1.0,1,1.0,100,100.0
2021-04-20 09:30:00-04:00,413.920013,413.910004,414.679993,11437142,413.660004,18.099063,21.674862,0.463053,-0.543377,1.00643,-3.575799,-0.022418,0.0,0.0,0,-1.0,0,-100.0
2021-04-20 10:30:00-04:00,411.575012,413.92099,413.929993,17431474,411.119995,16.004308,20.011238,0.146716,-0.687771,0.834487,-4.00693,-0.144395,0.0,0.0,0,0.0,0,0.0
2021-04-20 11:30:00-04:00,412.144989,411.575012,412.399994,10103321,410.619995,11.246037,15.116469,-0.05733,-0.713454,0.656124,-3.870433,-0.025683,0.0,0.0,0,0.0,0,0.0
2021-04-20 12:30:00-04:00,411.290009,412.149994,412.220001,5748608,411.25,12.27036,13.173568,-0.284746,-0.752696,0.46795,-0.903208,-0.039242,0.0,0.0,0,0.0,0,0.0


In [3]:
X=signals_df.drop(columns='Entry/Exit')
y=signals_df[['Entry/Exit']]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

# Select the start of the training period
training_begin = X.index.min() + DateOffset(hours=1)

# Select the ending period for the training data with an offset of 3 months
training_end = X.index.min() + DateOffset(months=3)
    # Keep training less than 50% of total DataFrame

# Generate the X_train and y_train DataFrames
X_train = X.loc[training_begin:training_end]
y_train = y.loc[training_begin:training_end]

# Review the X_train DataFrame
X_train.head()

# Generate the X_test and y_test DataFrames
X_test = X.loc[training_end+DateOffset(hours=1):]
y_test = y.loc[training_end+DateOffset(hours=1):]

# Display the training being and end
print(training_begin)
print(training_end)

# Review the X_train DataFrame
display(X_train.head())

# Review the X_test DataFrame
display(X_test.head())

2021-04-19 16:30:00-04:00
2021-07-19 15:30:00-04:00


Unnamed: 0_level_0,Close,Open,High,Volume,Low,STOCHk_14_3_3,STOCHd_14_3_3,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,k-d,hist_adj,Stoch_Signal,MACD_Signal,Signal,Position,Entry/Exit Position
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-04-20 09:30:00-04:00,413.920013,413.910004,414.679993,11437142,413.660004,18.099063,21.674862,0.463053,-0.543377,1.00643,-3.575799,-0.022418,0.0,0.0,0,0,-100.0
2021-04-20 10:30:00-04:00,411.575012,413.92099,413.929993,17431474,411.119995,16.004308,20.011238,0.146716,-0.687771,0.834487,-4.00693,-0.144395,0.0,0.0,0,0,0.0
2021-04-20 11:30:00-04:00,412.144989,411.575012,412.399994,10103321,410.619995,11.246037,15.116469,-0.05733,-0.713454,0.656124,-3.870433,-0.025683,0.0,0.0,0,0,0.0
2021-04-20 12:30:00-04:00,411.290009,412.149994,412.220001,5748608,411.25,12.27036,13.173568,-0.284746,-0.752696,0.46795,-0.903208,-0.039242,0.0,0.0,0,0,0.0
2021-04-20 13:30:00-04:00,411.535004,411.309692,412.100006,5744918,411.200012,14.220449,12.578948,-0.440132,-0.726465,0.286333,1.641501,0.02623,1.0,1.0,1,100,100.0


Unnamed: 0_level_0,Close,Open,High,Volume,Low,STOCHk_14_3_3,STOCHd_14_3_3,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,k-d,hist_adj,Stoch_Signal,MACD_Signal,Signal,Position,Entry/Exit Position
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-07-20 09:30:00-04:00,429.019989,425.679993,429.470001,22332446,424.829987,27.075173,14.110055,-2.727512,-0.308362,-2.41915,12.965119,0.506535,1.0,1.0,1,100,0.0
2021-07-20 10:30:00-04:00,429.920013,429.019989,430.559998,11728727,428.809998,47.861584,28.632651,-2.288172,0.104782,-2.392954,19.228933,0.413144,1.0,1.0,1,100,0.0
2021-07-20 11:30:00-04:00,431.220001,429.920013,431.440002,9536033,429.839996,66.275627,47.070795,-1.814182,0.463018,-2.2772,19.204832,0.358236,1.0,1.0,1,100,0.0
2021-07-20 12:30:00-04:00,430.959991,431.220001,432.079987,9371889,430.839996,72.896502,62.344571,-1.442889,0.667449,-2.110338,10.551931,0.204431,1.0,1.0,1,100,0.0
2021-07-20 13:30:00-04:00,431.799988,430.959991,431.829987,7831322,430.73999,81.425578,73.532569,-1.068539,0.833439,-1.901978,7.893009,0.165991,1.0,1.0,1,100,0.0


In [5]:
# Scale the features DataFrames
# Create a StandardScaler instance
scaler = StandardScaler()

# Apply the scaler model to fit the X-train data
X_scaler = scaler.fit(X_train)

## Transform the X_train and X_test DataFrames using the X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [6]:
# From SVM, instantiate SVC classifier model instance
signals_tree = RandomForestClassifier(random_state=1)
 
# Fit the model to the data using the training data
signals_tree.fit(X_train_scaled,y_train.values.ravel())
 
# Use the testing data to make the model predictions
y_signals_tree_pred = signals_tree.predict(X_test_scaled)

signals_tree_class = classification_report(y_test,y_signals_tree_pred)
signals_tree_matrix = confusion_matrix(y_test,y_signals_tree_pred)

In [7]:
# Instantiate the random oversampler model

random_sampler = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = random_sampler.fit_resample(X_train,y_train)

In [8]:
# Stoch RandomForestClassifier Oversampled
signals_tree_os = RandomForestClassifier(random_state=1)
 
# Fit the model to the data using the training data
signals_tree_os.fit(X_resampled,y_resampled.values.ravel())
 
# Use the testing data to make the model predictions
y_signals_tree_pred_os = signals_tree_os.predict(X_test)

signals_tree_class_os = classification_report(y_test,y_signals_tree_pred_os)
signals_tree_matrix_os = confusion_matrix(y_test,y_signals_tree_pred_os)

In [9]:
# # From LogisticRegression, instantiate LogisticRegression classifier model instance
signals_log = LogisticRegression(random_state=1)
 
 # Fit the model to the data using the training data
signals_log.fit(X_train_scaled,y_train.values.ravel())
 
# # Use the testing data to make the model predictions
y_signals_log_pred = signals_log.predict(X_test_scaled)

# # Create and save confusion matrix and classification report to a variable name
signals_log_matrix = confusion_matrix(y_test,y_signals_log_pred)
signals_log_class = classification_report(y_test,y_signals_log_pred)

In [10]:
print(signals_log_matrix)
print(signals_log_class)

[[ 129    0    0]
 [   0 1042    0]
 [   0    0  129]]
              precision    recall  f1-score   support

        -1.0       1.00      1.00      1.00       129
         0.0       1.00      1.00      1.00      1042
         1.0       1.00      1.00      1.00       129

    accuracy                           1.00      1300
   macro avg       1.00      1.00      1.00      1300
weighted avg       1.00      1.00      1.00      1300



In [11]:
# Define the the number of inputs (features) to the model
number_input_features = len(X_train.iloc[0])
    # input features should equal the number of features in X

# Define the number of neurons in the output layer
number_output_neurons = len(y.columns)
    # we use 3 output neurons because we have three targets, Entry/Exit_-1.0, Entry/Exit_0.0, and Entry/Exit_1.0.
    # should be equal to the number of target columns (len(y.columns)) we are trying to predict.

# Define the number of hidden nodes for all hidden layers
hidden_nodes_layer1 = (number_input_features + number_output_neurons) //2
hidden_nodes_layer2 = (hidden_nodes_layer1 + number_output_neurons) //2
hidden_nodes_layer3 = (hidden_nodes_layer2 + number_output_neurons) //2
hidden_nodes_layer4 = (hidden_nodes_layer3 + number_output_neurons) //2
hidden_nodes_layer5 = (hidden_nodes_layer4 + number_output_neurons) //2
hidden_nodes_layer6 = (hidden_nodes_layer5 + number_output_neurons) //2
hidden_nodes_layer7 = (hidden_nodes_layer6 + number_output_neurons) //2
# 9 5 3 2
n='\n'

# Review the number of input features, output neurons, and hidden nodes
print(f'Number of input features:{n}{number_input_features}{n}Number of output neurons:{n}{number_output_neurons}{n}' 
    f'Hidden layer Neuron #:{n}First Layer: {hidden_nodes_layer1}{n}Second Layer: {hidden_nodes_layer2}{n}Third Layer: {hidden_nodes_layer3}{n}Fourth Layer: {hidden_nodes_layer4}{n}'
    f'{hidden_nodes_layer5}{n}{hidden_nodes_layer6}')

Number of input features:
17
Number of output neurons:
1
Hidden layer Neuron #:
First Layer: 9
Second Layer: 5
Third Layer: 3
Fourth Layer: 2
1
1


In [12]:
# Create the Sequential model instance
nn = Sequential()

# Add the first hidden layer
nn.add(Dense(units=hidden_nodes_layer1,input_dim=number_input_features,activation='relu'))

# Add the second hidden layer
nn.add(Dense(units=hidden_nodes_layer2,activation='relu'))

# Add the third hidden layer
nn.add(Dense(units=hidden_nodes_layer3,activation='relu'))

# Add the fourth hidden layer
nn.add(Dense(units=hidden_nodes_layer4,activation='relu'))

# Add the fifth hidden layer
nn.add(Dense(units=hidden_nodes_layer5,activation='relu'))

# Add the sixth hidden layer
nn.add(Dense(units=hidden_nodes_layer6,activation='relu'))

# Add the output layer to the model specifying the number of output neurons and activation function
nn.add(Dense(units=number_output_neurons,activation='sigmoid'))
    # choice is between softmax and sigmoid

# Display the Sequential model summary
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 9)                 162       
                                                                 
 dense_1 (Dense)             (None, 5)                 50        
                                                                 
 dense_2 (Dense)             (None, 3)                 18        
                                                                 
 dense_3 (Dense)             (None, 2)                 8         
                                                                 
 dense_4 (Dense)             (None, 1)                 3         
                                                                 
 dense_5 (Dense)             (None, 1)                 2         
                                                                 
 dense_6 (Dense)             (None, 1)                 2

In [13]:
# Compile the Sequential model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics='accuracy')
    # metric exploration
    
# Fit the model using 100 epochs and the training data
nn.fit(X_train_scaled,y_train,epochs=500)
    # make sure to use X_train_scaled rather than X_train
    # verbose=3, reduces the graphics displayed per epoch. in turn this increases the overall speed of the epochs.
    

#from_logits=True

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x142932ef5c8>

In [14]:
# Evaluate the model's keras.metrics.CategoricalAccuracy() metrics using the evaluate method and the test data
nn_custom_loss, nn_custom_accuracy = nn.evaluate(X_test_scaled,y_test)
print(f'Loss:{nn_custom_loss},Acc:{nn_custom_accuracy}')

Loss:0.025539394468069077,Acc:0.8015384674072266


In [15]:
print('NN')
print(f'Loss:{nn_custom_loss},Acc:{nn_custom_accuracy}')
print('Random Forest')
print(signals_tree_class)
print(signals_tree_matrix)
print('OS -- Random Forest')
print(signals_tree_class_os)
print(signals_tree_matrix_os)
print('log Reg')
print(signals_log_matrix)
print(signals_log_class)

NN
Loss:0.025539394468069077,Acc:0.8015384674072266
Random Forest
              precision    recall  f1-score   support

        -1.0       1.00      1.00      1.00       129
         0.0       1.00      1.00      1.00      1042
         1.0       1.00      1.00      1.00       129

    accuracy                           1.00      1300
   macro avg       1.00      1.00      1.00      1300
weighted avg       1.00      1.00      1.00      1300

[[ 129    0    0]
 [   0 1042    0]
 [   0    0  129]]
OS -- Random Forest
              precision    recall  f1-score   support

        -1.0       1.00      1.00      1.00       129
         0.0       1.00      1.00      1.00      1042
         1.0       1.00      1.00      1.00       129

    accuracy                           1.00      1300
   macro avg       1.00      1.00      1.00      1300
weighted avg       1.00      1.00      1.00      1300

[[ 129    0    0]
 [   0 1042    0]
 [   0    0  129]]
log Reg
[[ 129    0    0]
 [   0 1042    0