In [1]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from pandas.tseries.offsets import DateOffset
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.metrics import classification_report_imbalanced
import keras.metrics
import keras.optimizers
from tensorflow import keras
from sklearn.decomposition import PCA

In [2]:
# Import the BITCOIN csv into the notebook
signals_df = pd.read_csv(
    Path("./Resources/custom_new.csv"),
    index_col = 'Date',
    infer_datetime_format=True, 
    parse_dates=True
)

# Drop features that aren't useful for machine learning
signals_df.drop(columns=['value','Portfolio Cumulative Returns','Portfolio Period Returns','Portfolio Total','Portfolio Cash','Portfolio Holdings'], inplace=True)

# Review the DataFrame
signals_df.head()

Unnamed: 0_level_0,Close,Open,High,Volume,Low,STOCHk_14_3_3,STOCHd_14_3_3,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,k-d,hist_adj,Stoch_Signal,MACD_Signal,Signal,Entry/Exit,Position,Entry/Exit Position
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-04-19 15:30:00-04:00,415.23999,414.325012,415.339996,14176944,414.140015,25.930343,22.103159,0.621316,-0.520959,1.142274,3.827184,0.071674,1.0,1.0,1,1.0,100,100.0
2021-04-20 09:30:00-04:00,413.920013,413.910004,414.679993,11437142,413.660004,18.099063,21.674862,0.463053,-0.543377,1.00643,-3.575799,-0.022418,0.0,0.0,0,-1.0,0,-100.0
2021-04-20 10:30:00-04:00,411.575012,413.92099,413.929993,17431474,411.119995,16.004308,20.011238,0.146716,-0.687771,0.834487,-4.00693,-0.144395,0.0,0.0,0,0.0,0,0.0
2021-04-20 11:30:00-04:00,412.144989,411.575012,412.399994,10103321,410.619995,11.246037,15.116469,-0.05733,-0.713454,0.656124,-3.870433,-0.025683,0.0,0.0,0,0.0,0,0.0
2021-04-20 12:30:00-04:00,411.290009,412.149994,412.220001,5748608,411.25,12.27036,13.173568,-0.284746,-0.752696,0.46795,-0.903208,-0.039242,0.0,0.0,0,0.0,0,0.0


In [3]:
# use PCA to reduce dimension from 18 to 15

# Stoch_Signal, MACD_Signal, Entry/Exit, Entry/Exit Position > one feature

In [4]:
signals_data_df = signals_df[['hist_adj','Stoch_Signal','MACD_Signal','Entry/Exit','Position','Entry/Exit Position']]

In [5]:
scaled_data = StandardScaler().fit_transform(signals_data_df)

In [6]:
# Create a DataFrame with the scaled data
signals_data_df_scaled = pd.DataFrame(
    scaled_data,
    columns=signals_data_df.columns
)
    # creates DataFrame out of scaled_data and sets column names equal to df_market_data column names 
    # the DataFrame is then assigned to a variable called df_market_data_scaled

# Copy the crypto names from the original data

signals_data_df_scaled["Date"] = signals_data_df.index

# Set the coinid column as index

signals_data_df_scaled = signals_data_df_scaled.set_index("Date")

# Display sample data

signals_data_df_scaled.head()

Unnamed: 0_level_0,hist_adj,Stoch_Signal,MACD_Signal,Entry/Exit,Position,Entry/Exit Position
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-04-19 15:30:00-04:00,0.492587,0.981796,1.071432,2.195439,1.261849,2.195439
2021-04-20 09:30:00-04:00,-0.155464,-1.018542,-0.933331,-2.197961,-0.792488,-2.197961
2021-04-20 10:30:00-04:00,-0.995562,-1.018542,-0.933331,-0.001261,-0.792488,-0.001261
2021-04-20 11:30:00-04:00,-0.177949,-1.018542,-0.933331,-0.001261,-0.792488,-0.001261
2021-04-20 12:30:00-04:00,-0.271334,-1.018542,-0.933331,-0.001261,-0.792488,-0.001261


In [7]:
# Create a PCA model instance and set `n_components=3`.

pca = PCA(n_components=1)

In [8]:
# Use the PCA model with `fit_transform` to reduce to 
# three principal components.

market_data_pca = pca.fit_transform(signals_data_df_scaled)

# View the first five rows of the DataFrame. 

market_data_pca[:5]

array([[ 3.25824065],
       [-2.85782779],
       [-1.58659328],
       [-1.27767772],
       [-1.31296099]])

In [9]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.

display(pca.explained_variance_ratio_)

display(sum(list(pca.explained_variance_ratio_)))
    # calculates the total explained variance
    
    # Stoch_Signal, MACD_Signal, Entry/Exit, Entry/Exit Position > one feature

array([0.60559955])

0.6055995480506612

In [10]:
market_data_pca = pd.DataFrame(market_data_pca)
market_data_pca.set_index(signals_df.index, inplace=True)
market_data_pca

Unnamed: 0_level_0,0
Date,Unnamed: 1_level_1
2021-04-19 15:30:00-04:00,3.258241
2021-04-20 09:30:00-04:00,-2.857828
2021-04-20 10:30:00-04:00,-1.586593
2021-04-20 11:30:00-04:00,-1.277678
2021-04-20 12:30:00-04:00,-1.312961
...,...
2022-04-12 12:30:00-04:00,-1.839730
2022-04-12 13:30:00-04:00,-1.647146
2022-04-12 14:30:00-04:00,-1.647329
2022-04-12 15:30:00-04:00,-0.282353


In [11]:
# Set the numerical variable DataFrame
side_numeric = signals_df.drop(columns=['hist_adj', 'Stoch_Signal','MACD_Signal','Entry/Exit','Position','Entry/Exit Position'])

# Add the numerical variables from the original DataFrame to the one-hot encoding DataFrame
signals_pca_df = pd.concat([market_data_pca,side_numeric],axis=1)

# Review the DataFrame
display(signals_pca_df.head())

Unnamed: 0_level_0,0,Close,Open,High,Volume,Low,STOCHk_14_3_3,STOCHd_14_3_3,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,k-d,Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-04-19 15:30:00-04:00,3.258241,415.23999,414.325012,415.339996,14176944,414.140015,25.930343,22.103159,0.621316,-0.520959,1.142274,3.827184,1
2021-04-20 09:30:00-04:00,-2.857828,413.920013,413.910004,414.679993,11437142,413.660004,18.099063,21.674862,0.463053,-0.543377,1.00643,-3.575799,0
2021-04-20 10:30:00-04:00,-1.586593,411.575012,413.92099,413.929993,17431474,411.119995,16.004308,20.011238,0.146716,-0.687771,0.834487,-4.00693,0
2021-04-20 11:30:00-04:00,-1.277678,412.144989,411.575012,412.399994,10103321,410.619995,11.246037,15.116469,-0.05733,-0.713454,0.656124,-3.870433,0
2021-04-20 12:30:00-04:00,-1.312961,411.290009,412.149994,412.220001,5748608,411.25,12.27036,13.173568,-0.284746,-0.752696,0.46795,-0.903208,0


In [12]:
len(signals_pca_df.columns)

13

In [13]:
X=signals_pca_df.drop(columns='Signal')
y=signals_pca_df[['Signal']]

    # signals_pca_df

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
    # test_size=x
len(X_test)

436

In [15]:
y_train.value_counts()

Signal
0         803
1         503
dtype: int64

In [16]:
# Select the start of the training period
training_begin = X.index.min() + DateOffset(hours=1)

# Display the training begin date
print(training_begin)

2021-04-19 16:30:00-04:00


In [17]:
# Select the ending period for the training data with an offset of 3 months
training_end = X.index.min() + DateOffset(months=3)
    # Keep training less than 50% of total DataFrame

# Display the training end date
print(training_end)

2021-07-19 15:30:00-04:00


In [18]:
# Generate the X_train and y_train DataFrames
X_train = X.loc[training_begin:training_end]
y_train = y.loc[training_begin:training_end]

# Review the X_train DataFrame
X_train.head()

Unnamed: 0_level_0,0,Close,Open,High,Volume,Low,STOCHk_14_3_3,STOCHd_14_3_3,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,k-d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-04-20 09:30:00-04:00,-2.857828,413.920013,413.910004,414.679993,11437142,413.660004,18.099063,21.674862,0.463053,-0.543377,1.00643,-3.575799
2021-04-20 10:30:00-04:00,-1.586593,411.575012,413.92099,413.929993,17431474,411.119995,16.004308,20.011238,0.146716,-0.687771,0.834487,-4.00693
2021-04-20 11:30:00-04:00,-1.277678,412.144989,411.575012,412.399994,10103321,410.619995,11.246037,15.116469,-0.05733,-0.713454,0.656124,-3.870433
2021-04-20 12:30:00-04:00,-1.312961,411.290009,412.149994,412.220001,5748608,411.25,12.27036,13.173568,-0.284746,-0.752696,0.46795,-0.903208
2021-04-20 13:30:00-04:00,3.139985,411.535004,411.309692,412.100006,5744918,411.200012,14.220449,12.578948,-0.440132,-0.726465,0.286333,1.641501


In [19]:
# Generate the X_test and y_test DataFrames
X_test = X.loc[training_end+DateOffset(hours=1):]
y_test = y.loc[training_end+DateOffset(hours=1):]

# Review the X_test DataFrame
display(X_test.head())
display(X_test.tail())
    # NOT SURE IF DATEOFFSET IS NECESSARY FOR X/Y TEST

Unnamed: 0_level_0,0,Close,Open,High,Volume,Low,STOCHk_14_3_3,STOCHd_14_3_3,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,k-d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-07-20 09:30:00-04:00,2.801206,429.019989,425.679993,429.470001,22332446,424.829987,27.075173,14.110055,-2.727512,-0.308362,-2.41915,12.965119
2021-07-20 10:30:00-04:00,2.558179,429.920013,429.019989,430.559998,11728727,428.809998,47.861584,28.632651,-2.288172,0.104782,-2.392954,19.228933
2021-07-20 11:30:00-04:00,2.415296,431.220001,429.920013,431.440002,9536033,429.839996,66.275627,47.070795,-1.814182,0.463018,-2.2772,19.204832
2021-07-20 12:30:00-04:00,2.015059,430.959991,431.220001,432.079987,9371889,430.839996,72.896502,62.344571,-1.442889,0.667449,-2.110338,10.551931
2021-07-20 13:30:00-04:00,1.915028,431.799988,430.959991,431.829987,7831322,430.73999,81.425578,73.532569,-1.068539,0.833439,-1.901978,7.893009


Unnamed: 0_level_0,0,Close,Open,High,Volume,Low,STOCHk_14_3_3,STOCHd_14_3_3,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,k-d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2022-04-12 12:30:00-04:00,-1.83973,441.640015,441.75,442.700012,7124676,441.170013,28.591089,30.039237,-1.904712,-0.04556,-1.859152,-1.448148
2022-04-12 13:30:00-04:00,-1.647146,438.48999,441.649994,441.940002,10061518,438.420013,14.9548,25.692469,-2.125682,-0.213224,-1.912458,-10.737669
2022-04-12 14:30:00-04:00,-1.647329,437.059998,438.480011,439.290009,14347270,436.650085,8.822045,17.455978,-2.388655,-0.380958,-2.007697,-8.633933
2022-04-12 15:30:00-04:00,-0.282353,438.269989,437.059906,438.779999,12496573,436.679993,7.240768,10.339204,-2.470944,-0.370598,-2.100347,-3.098436
2022-04-12 16:00:00-04:00,3.192334,438.290009,438.290009,438.290009,0,438.290009,13.034849,9.699221,-2.50566,-0.324251,-2.181409,3.335629


In [20]:
# Scale the features DataFrames
# Create a StandardScaler instance
scaler = StandardScaler()

# Apply the scaler model to fit the X-train data
X_scaler = scaler.fit(X_train)

## Transform the X_train and X_test DataFrames using the X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



In [21]:
# From SVM, instantiate SVC classifier model instance
signals_tree = RandomForestClassifier(random_state=1)
 
# Fit the model to the data using the training data
signals_tree.fit(X_train_scaled,y_train)
 
# Use the testing data to make the model predictions
y_signals_tree_pred = signals_tree.predict(X_test_scaled)

signals_tree_class = classification_report(y_test,y_signals_tree_pred)
signals_tree_matrix = confusion_matrix(y_test,y_signals_tree_pred)

  """


In [22]:
# Instantiate the random oversampler model

random_sampler = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = random_sampler.fit_resample(X_train,y_train)


y_resampled.value_counts()

# Do we have to create this before 



Signal
0         277
1         277
dtype: int64

In [23]:
# Stoch RandomForestClassifier Oversampled
signals_tree_os = RandomForestClassifier(random_state=1)
 
# Fit the model to the data using the training data
signals_tree_os.fit(X_resampled,y_resampled)
 
# Use the testing data to make the model predictions
y_signals_tree_pred_os = signals_tree_os.predict(X_test)

signals_tree_class_os = classification_report(y_test,y_signals_tree_pred_os)
signals_tree_matrix_os = confusion_matrix(y_test,y_signals_tree_pred_os)

  """


In [24]:
# # From LogisticRegression, instantiate LogisticRegression classifier model instance
signals_log = LogisticRegression(random_state=1)
 
 # Fit the model to the data using the training data
signals_log.fit(X_train_scaled,y_train)
 
# # Use the testing data to make the model predictions
y_signals_log_pred = signals_log.predict(X_test_scaled)

# # Create and save confusion matrix and classification report to a variable name
signals_log_matrix = confusion_matrix(y_test,y_signals_log_pred)
signals_log_class = classification_report(y_test,y_signals_log_pred)

  y = column_or_1d(y, warn=True)


In [25]:
print(signals_log_matrix)
print(signals_log_class)

[[791   2]
 [  0 507]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       793
           1       1.00      1.00      1.00       507

    accuracy                           1.00      1300
   macro avg       1.00      1.00      1.00      1300
weighted avg       1.00      1.00      1.00      1300



In [26]:
# From LogisticRegression, instantiate LogisticRegression classifier model instance
signals_log_os = LogisticRegression(random_state=1)
 
# Fit the model to the data using the training data
signals_log_os.fit(X_resampled,y_resampled)
 
# # Use the testing data to make the model predictions
y_signals_log_pred_os = signals_log.predict(X_test)

# Create and save confusion matrix and classification report to a variable name
signals_log_matrix_os = confusion_matrix(y_test,y_signals_log_pred_os)
signals_log_class_os = classification_report(y_test,y_signals_log_pred_os)

  y = column_or_1d(y, warn=True)


In [27]:
# Define the the number of inputs (features) to the model
number_input_features = len(X_train.iloc[0])
    # input features should equal the number of features in X

# Define the number of neurons in the output layer
number_output_neurons = len(y.columns)
    # we use 3 output neurons because we have three targets, Entry/Exit_-1.0, Entry/Exit_0.0, and Entry/Exit_1.0.
    # should be equal to the number of target columns (len(y.columns)) we are trying to predict.

# Define the number of hidden nodes for all hidden layers
hidden_nodes_layer1 = (number_input_features + number_output_neurons) //2
hidden_nodes_layer2 = (hidden_nodes_layer1 + number_output_neurons) //2
hidden_nodes_layer3 = 600
hidden_nodes_layer4 = 400
hidden_nodes_layer5 = 300
hidden_nodes_layer6 = 200
hidden_nodes_layer7 = 150
hidden_nodes_layer8 = 120
hidden_nodes_layer9 = 100
hidden_nodes_layer10 = 60

n='\n'

# Review the number of input features, output neurons, and hidden nodes
print(f'Number of input features:{n}{number_input_features}{n}Number of output neurons:{n}{number_output_neurons}{n}' 
    f'Hidden layer Neuron #:{n}First Layer: {hidden_nodes_layer1}{n}Second Layer: {hidden_nodes_layer2}{n}Third Layer: {hidden_nodes_layer3}{n}Fourth Layer: {hidden_nodes_layer4}{n}'
    f'Fifth Layer: {hidden_nodes_layer5}')

Number of input features:
12
Number of output neurons:
1
Hidden layer Neuron #:
First Layer: 6
Second Layer: 3
Third Layer: 600
Fourth Layer: 400
Fifth Layer: 300


In [28]:
# Create the Sequential model instance
nn = Sequential()

# Add the first hidden layer
nn.add(Dense(units=hidden_nodes_layer1,input_dim=number_input_features,activation='relu'))

# Add the second hidden layer
nn.add(Dense(units=hidden_nodes_layer2,activation='relu'))

# Add the third hidden layer
#nn.add(Dense(units=hidden_nodes_layer3,activation='relu'))

# Add the fourth hidden layer
#nn.add(Dense(units=hidden_nodes_layer4,activation='relu'))

# Add the fifth hidden layer
#nn.add(Dense(units=hidden_nodes_layer5,activation='relu'))

# Add the sixth hidden layer
#nn.add(Dense(units=hidden_nodes_layer6,activation='relu'))

# Add the seventh hidden layer
#nn.add(Dense(units=hidden_nodes_layer7,activation='relu'))

# Add the eigth hidden layer
#nn.add(Dense(units=hidden_nodes_layer8,activation='relu'))

# Add the ninth hidden layer
#nn.add(Dense(units=hidden_nodes_layer9,activation='relu'))

# Add the tenth hidden layer
#nn.add(Dense(units=hidden_nodes_layer10,activation='relu'))

# Add the output layer to the model specifying the number of output neurons and activation function
nn.add(Dense(units=number_output_neurons,activation='softmax'))
    # choice is between softmax and sigmoid

# Display the Sequential model summary
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 6)                 78        
                                                                 
 dense_1 (Dense)             (None, 3)                 21        
                                                                 
 dense_2 (Dense)             (None, 1)                 4         
                                                                 
Total params: 103
Trainable params: 103
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Compile the Sequential model
nn.compile(loss=keras.losses.BinaryCrossentropy(), optimizer=keras.optimizers.Adam(), metrics=[keras.metrics.CategoricalAccuracy()])
    # metric exploration
    
# loss=keras.losses.CategoricalCrossentropy()
# keras.losses.BinaryCrossentropy()
# Fit the model using 100 epochs and the training data
nn.fit(X_train_scaled,y_train,epochs=500)
    # make sure to use X_train_scaled rather than X_train
    # verbose=3, reduces the graphics displayed per epoch. in turn this increases the overall speed of the epochs.
    
# keras.optimizers.Adam()

#from_logits=True

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x1e24ce9d408>

In [30]:
# Evaluate the model's keras.metrics.CategoricalAccuracy() metrics using the evaluate method and the test data
nn_custom = nn.evaluate(X_test_scaled,y_test)
    # is there a way to see the metrics of each output neuron individually?
    
# catagory
# loss: 0.0088 - categorical_accuracy: 0.9962


# Binary
# loss: 0.0039 - categorical_accuracy: 0.9992
# Loss: 0.000084905 - categorical_accuracy: 1.0000
# loss: 0.0045 - categorical_accuracy: 0.9992



In [31]:
# Import the BTC dataset into a Pandas Dataframe
btc_df = pd.read_csv(
    Path("./Resources/bitcoin_new.csv"),
    index_col = 'Date',
    infer_datetime_format=True, 
    parse_dates=True
)

# Drop columns that are irrelevant to machine learning
btc_df.drop(columns=['value','Portfolio Cumulative Returns','Portfolio Period Returns','Portfolio Total','Portfolio Cash','Portfolio Holdings'], inplace=True)


# Create a list of categorical variables
# We are replicating the encoding of the macd_df DataFrame used for the Neural Network
categorical_variables = list(btc_df[['Entry/Exit']])

# Review the DataFrame
display(btc_df.head())

print(n)

# Check the DataFrame's dtypes
display(btc_df.dtypes)

print(n)

# Display the categorical variables list
display(categorical_variables[0:6])

Unnamed: 0_level_0,Close,Open,High,Volume,Low,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,hist_adj,Signal,Entry/Exit,Position,Entry/Exit Position
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-04-16 03:00:00+00:00,63060.675781,62897.472656,63242.539062,645509120,62867.0,76.206577,3.677745,72.528833,-14.900307,0.0,0.0,0.0,0.0
2021-04-16 04:00:00+00:00,61900.785156,63024.421875,63024.421875,137613312,61900.785156,-28.142434,-80.537014,52.394579,-84.214758,0.0,0.0,0.0,0.0
2021-04-16 05:00:00+00:00,61948.710938,61817.359375,62318.277344,1772748800,61695.523438,-105.753508,-126.51847,20.764962,-45.981456,0.0,0.0,0.0,0.0
2021-04-16 06:00:00+00:00,61562.84375,61943.515625,61943.515625,1777618944,61289.355469,-196.136164,-173.520901,-22.615263,-47.002431,0.0,0.0,0.0,0.0
2021-04-16 07:00:00+00:00,61626.21875,61558.09375,61724.355469,550719488,61278.59375,-259.658061,-189.634238,-70.023823,-16.113337,0.0,0.0,0.0,0.0






Close                  float64
Open                   float64
High                   float64
Volume                   int64
Low                    float64
MACD_12_26_9           float64
MACDh_12_26_9          float64
MACDs_12_26_9          float64
hist_adj               float64
Signal                 float64
Entry/Exit             float64
Position               float64
Entry/Exit Position    float64
dtype: object





['Entry/Exit']

In [32]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)
    # sparse = False, results in an array
    # sparse = True (default), results in a sparse matrix
    
# Encode the categorcal variables using OneHotEncoder
encoded_data = enc.fit_transform(btc_df[categorical_variables])

# Create a DataFrame with the encoded variables
encoded_df = pd.DataFrame(
    encoded_data,
    columns = enc.get_feature_names(categorical_variables)
        # function gathers column names and assigns them to the new DataFrame
)

# Set encoded_df's index to btc_df's index
encoded_df.set_index(btc_df.index, inplace=True)

# Review encoded_data array
display(encoded_data[0:5])

# Review encoded_df DataFrame
display(encoded_df.head())



array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

Unnamed: 0_level_0,Entry/Exit_-1.0,Entry/Exit_0.0,Entry/Exit_1.0
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-04-16 03:00:00+00:00,0.0,1.0,0.0
2021-04-16 04:00:00+00:00,0.0,1.0,0.0
2021-04-16 05:00:00+00:00,0.0,1.0,0.0
2021-04-16 06:00:00+00:00,0.0,1.0,0.0
2021-04-16 07:00:00+00:00,0.0,1.0,0.0


In [33]:
# Set the numerical variable DataFrame
side_numeric = btc_df.drop(columns=['Entry/Exit'])

# Add the numerical variables from the original DataFrame to the one-hot encoding DataFrame
btc_ohe_df = pd.concat([encoded_df,side_numeric],axis=1)

# Review the data
side_numeric.head()

# Review the DataFrame
btc_ohe_df.head()

Unnamed: 0_level_0,Entry/Exit_-1.0,Entry/Exit_0.0,Entry/Exit_1.0,Close,Open,High,Volume,Low,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,hist_adj,Signal,Position,Entry/Exit Position
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-04-16 03:00:00+00:00,0.0,1.0,0.0,63060.675781,62897.472656,63242.539062,645509120,62867.0,76.206577,3.677745,72.528833,-14.900307,0.0,0.0,0.0
2021-04-16 04:00:00+00:00,0.0,1.0,0.0,61900.785156,63024.421875,63024.421875,137613312,61900.785156,-28.142434,-80.537014,52.394579,-84.214758,0.0,0.0,0.0
2021-04-16 05:00:00+00:00,0.0,1.0,0.0,61948.710938,61817.359375,62318.277344,1772748800,61695.523438,-105.753508,-126.51847,20.764962,-45.981456,0.0,0.0,0.0
2021-04-16 06:00:00+00:00,0.0,1.0,0.0,61562.84375,61943.515625,61943.515625,1777618944,61289.355469,-196.136164,-173.520901,-22.615263,-47.002431,0.0,0.0,0.0
2021-04-16 07:00:00+00:00,0.0,1.0,0.0,61626.21875,61558.09375,61724.355469,550719488,61278.59375,-259.658061,-189.634238,-70.023823,-16.113337,0.0,0.0,0.0


In [34]:
len(signals_pca_df.columns)

13

In [35]:
signals_pca_df.head()

Unnamed: 0_level_0,0,Close,Open,High,Volume,Low,STOCHk_14_3_3,STOCHd_14_3_3,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,k-d,Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-04-19 15:30:00-04:00,3.258241,415.23999,414.325012,415.339996,14176944,414.140015,25.930343,22.103159,0.621316,-0.520959,1.142274,3.827184,1
2021-04-20 09:30:00-04:00,-2.857828,413.920013,413.910004,414.679993,11437142,413.660004,18.099063,21.674862,0.463053,-0.543377,1.00643,-3.575799,0
2021-04-20 10:30:00-04:00,-1.586593,411.575012,413.92099,413.929993,17431474,411.119995,16.004308,20.011238,0.146716,-0.687771,0.834487,-4.00693,0
2021-04-20 11:30:00-04:00,-1.277678,412.144989,411.575012,412.399994,10103321,410.619995,11.246037,15.116469,-0.05733,-0.713454,0.656124,-3.870433,0
2021-04-20 12:30:00-04:00,-1.312961,411.290009,412.149994,412.220001,5748608,411.25,12.27036,13.173568,-0.284746,-0.752696,0.46795,-0.903208,0


In [36]:
len(btc_ohe_df.columns)

15

In [37]:
btc_df.head()

Unnamed: 0_level_0,Close,Open,High,Volume,Low,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,hist_adj,Signal,Entry/Exit,Position,Entry/Exit Position
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-04-16 03:00:00+00:00,63060.675781,62897.472656,63242.539062,645509120,62867.0,76.206577,3.677745,72.528833,-14.900307,0.0,0.0,0.0,0.0
2021-04-16 04:00:00+00:00,61900.785156,63024.421875,63024.421875,137613312,61900.785156,-28.142434,-80.537014,52.394579,-84.214758,0.0,0.0,0.0,0.0
2021-04-16 05:00:00+00:00,61948.710938,61817.359375,62318.277344,1772748800,61695.523438,-105.753508,-126.51847,20.764962,-45.981456,0.0,0.0,0.0,0.0
2021-04-16 06:00:00+00:00,61562.84375,61943.515625,61943.515625,1777618944,61289.355469,-196.136164,-173.520901,-22.615263,-47.002431,0.0,0.0,0.0,0.0
2021-04-16 07:00:00+00:00,61626.21875,61558.09375,61724.355469,550719488,61278.59375,-259.658061,-189.634238,-70.023823,-16.113337,0.0,0.0,0.0,0.0


In [38]:
len(btc_df.columns)

13

In [39]:
# Create the features set selecting all features besides the target set and assign it to X
X = btc_df.drop(columns=['Entry/Exit'])

# Create the target set selecting the Signal column and assiging it to y
# We have three targets due to multi-class classification and OneHotEncoder 
y = btc_df[['Entry/Exit']]

# Set X and y equal to X_test and y_test
X_test = X
y_test = y

In [40]:
# Scale X_test
# Apply the scaler model to fit_transform the X-test data
# Transform the X_test DataFrame using the X_scaler
X_scaled = X_scaler.fit_transform(X_test)

In [41]:
# Evaluate the btc neural network's keras.metrics.CategoricalAccuracy() metrics using the evaluate method and the test data
nn_custom_btc = nn.evaluate(X_scaled,y_test)



In [42]:
#y_btc_pred = nn.predict(X_scaled)

#y_btc_df = pd.DataFrame(y_btc_pred,columns=['Sell','Hold','Buy'])
#y_btc_df.to_csv('Resources/btc_nn_signal.csv')

    # btc df has 15 columns while custom has 18

In [43]:
# Evaluate the btc neural network's keras.metrics.CategoricalAccuracy() metrics using the evaluate method and the test data
nn_macd_btc = nn.evaluate(X_scaled,y_test)



In [44]:
print('NN')
print('Loss: -- Categorical Accuracy:')
print(nn_custom)

print('NN BTC')
print('Loss: -- Categorical Accuracy:')
print(nn_custom_btc)


print('Random Forest')
print(signals_tree_class)
print(signals_tree_matrix)
print('OS -- Random Forest')
print(signals_tree_class_os)
print(signals_tree_matrix_os)
print('log Reg')
print(signals_log_matrix_os)
print(signals_log_class_os)
print('OS -- Log Reg')
print(signals_log_matrix_os)
print(signals_log_class_os)

NN
Loss: -- Categorical Accuracy:
[0.01582847535610199, 1.0]
NN BTC
Loss: -- Categorical Accuracy:
[1.6150100231170654, 1.0]
Random Forest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       793
           1       1.00      1.00      1.00       507

    accuracy                           1.00      1300
   macro avg       1.00      1.00      1.00      1300
weighted avg       1.00      1.00      1.00      1300

[[792   1]
 [  0 507]]
OS -- Random Forest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       793
           1       1.00      1.00      1.00       507

    accuracy                           1.00      1300
   macro avg       1.00      1.00      1.00      1300
weighted avg       1.00      1.00      1.00      1300

[[792   1]
 [  0 507]]
log Reg
[[792   1]
 [506   1]]
              precision    recall  f1-score   support

           0       0.61      1.00      0.76       793
 

In [45]:
#(None, 14), found shape=(None, 12)