In [None]:
# Import the required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from finta import TA
from pandas.tseries.offsets import DateOffset
import os
import requests
from dotenv import load_dotenv
import alpaca_trade_api as tradeapi
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, LSTM
from sklearn.metrics import classification_report

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Read the prepared CSV into a DataFrame.
# NOTE(review): the file name suggests 15-minute AAPL bars with indicator columns — confirm.
csv_path = Path("../algotrader2/resources/aapl_15min_indc_df.csv")
df = pd.read_csv(csv_path)

# Preview the first rows
df.head()

In [None]:
# Parse the 'timestamp' column into datetimes and promote it to the index.
# (The column is consumed by set_index, so this cell can only run once per load.)
df = (
    df
    .assign(timestamp=pd.to_datetime(df['timestamp']))
    .set_index('timestamp')
)

In [None]:
# Pairwise correlation between all columns (features and target alike)
correlation_matrix = df.corr()
correlation_matrix

In [None]:

# Feature matrix: every column except the target ('new_signal') and 'pct_returns'.
# (Original note: variables were dropped here while chasing a negative loss.)
feature_columns = df.drop(columns=['new_signal', 'pct_returns'])

# Shift the features down by one row so that each timestamp carries the values
# of the PREVIOUS bar, i.e. the model only sees information available before the
# bar it is predicting. The shift leaves the first row empty, so drop any row
# that contains NaN.
X = feature_columns.shift().dropna()

X.head()

In [None]:
# y is the NEW signal column (the classification target).
#
# The network ends in a single sigmoid unit and is trained with
# binary_crossentropy, and predictions are later thresholded into 0/1 labels.
# Both require the targets to be encoded as 0/1: a -1 target drives the loss
# negative and can never match a thresholded prediction. Map -1 -> 0 here;
# this is a no-op if the column is already 0/1.
# NOTE(review): assumes new_signal only takes the values -1/1 (or 0/1) — confirm.
y = df["new_signal"].replace(-1, 0)

# Set start of training period (first timestamp that survived shift/dropna)
training_begin = X.index.min()

# We will train on 9 months and then test with the rest
training_end = training_begin + DateOffset(months=9)

In [None]:
# Chronological train/test split at `training_end`.
#
# Label slices with .loc are inclusive on BOTH ends, so slicing train as
# [training_begin:training_end] and test as [training_end:] would put a bar
# stamped exactly at `training_end` into both sets. Boolean masks make the
# boundary explicit: train is <= training_end, test is strictly after it.
in_training = (X.index >= training_begin) & (X.index <= training_end)
in_testing = X.index > training_end

# Generate the X_train and X_test DataFrames
X_train = X.loc[in_training]
X_test = X.loc[in_testing]

# Take the targets by the features' own timestamps rather than by date range:
# X lost rows in shift().dropna(), so a date-range slice of y could contain
# timestamps that no longer exist in X and leave the two misaligned.
# NOTE(review): assumes timestamps are unique — confirm.
y_train = y.loc[X_train.index]
y_test = y.loc[X_test.index]

# Guard against silent misalignment before anything is fed to the model
assert len(X_train) == len(y_train), "X_train / y_train length mismatch"
assert len(X_test) == len(y_test), "X_test / y_test length mismatch"

In [None]:
# Standardise the features. The scaler is fitted on the training set only so
# that no statistics from the test period leak into training.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Show (rows, features) for each split
display(X_train_scaled.shape, X_test_scaled.shape)

In [None]:
# Width of the input layer: one input per feature column
num_predictors = X.shape[1]

# Binary problem (two possible positions), modelled with ONE sigmoid output
# unit, so this is the number of output units rather than the number of classes.
num_classes = 1

num_predictors

In [None]:
# Start from an empty Sequential model; the cells below append layers to it.
# Re-run this cell before re-running the layer cells, otherwise the same
# layers are appended to the existing model a second time.
nn_model = Sequential()

In [None]:

# Hidden stack of fully connected ReLU layers: widen 32 -> 256, then narrow
# back down to 32.
hidden_units = [32, 64, 128, 256, 128, 64, 32]

# The first layer also declares the input width (one input per predictor)
nn_model.add(Dense(units=hidden_units[0], input_dim=num_predictors, activation='relu'))

# Remaining hidden layers infer their input size from the previous layer
for width in hidden_units[1:]:
    nn_model.add(Dense(units=width, activation='relu'))

In [None]:
# Drop-out layer(s): randomly zero 20% of the previous layer's activations
# during training to reduce overfitting.
# No input_shape is passed: this is not the first layer of the model, so its
# input is inferred from the preceding Dense(32) layer. The previous
# input_shape=(10,) did not match that layer and was misleading.
nn_model.add(Dropout(0.2))

# # Add dense layer, add Regularization
# # (requires `from keras.regularizers import l2`; the keyword arguments are
# #  kernel_regularizer / bias_regularizer)
# nn_model.add(Dense(5, activation='relu', kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))

In [None]:
# Output layer: `num_classes` sigmoid unit(s), each emitting a value in (0, 1)
output_layer = Dense(units=num_classes, activation="sigmoid")
nn_model.add(output_layer)

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss (matches the
# sigmoid output), and accuracy reported alongside the loss.
nn_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['accuracy'],
)

# Print the layer-by-layer summary of the assembled model
nn_model.summary()

In [None]:
# Fit model
num_epochs = 100

# validation_split holds back 20% of the training data; the model is scored on
# it after every epoch but never trained on it, which helps spot overfitting.
fit_options = {
    "epochs": num_epochs,
    "batch_size": 32,
    "validation_split": 0.2,
    "shuffle": True,
}

nn_model.fit(X_train_scaled, y_train, **fit_options)

In [None]:
# Score the trained model on the held-out test split.
# evaluate() returns the loss followed by each compiled metric (accuracy here).
test_metrics = nn_model.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = test_metrics

# Display the evaluation results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Back test: run the model over both splits to get the raw sigmoid outputs
# (test split first, then the training split).
nn_test_predictions, nn_train_predictions = (
    nn_model.predict(features) for features in (X_test_scaled, X_train_scaled)
)

In [None]:
# Turn the sigmoid outputs into hard class labels: anything above the
# threshold becomes 1, everything else 0.
decision_threshold = 0.5
nn_test_predictions_labels = (nn_test_predictions > decision_threshold).astype(int)
nn_train_predictions_labels = (nn_train_predictions > decision_threshold).astype(int)

# Precision / recall / F1 on the data the model was trained on
train_class_report = classification_report(y_train, nn_train_predictions_labels)
print(train_class_report)

# The same metrics on the unseen test period
test_class_report = classification_report(y_test, nn_test_predictions_labels)
print(test_class_report)

### Interesting how our accuracy dropped by so much when we ran the model on the testing data.
The next iterations will begin to address the following:

### FIRST: Let's get this algorithm running on the real market data; Paper Trading.
- We'll have to find out how to link our model's predictions to our given paper trading brokerage account API. 

At this point, I believe it is crucial that we deploy our model in live market sessions before tuning it any further.

### After the baseline deployment is observed:

If it is clearly off during deployment, look deep into each step of the process to see if we are making a major mistake.

If it seems to be making some sort of sense, then we can start tuning:
- Train on more than 1 year's worth of OHLC data.
- Think of using 3-second interval data gathered from TOS.
- Use the SPY and NQ, and indicators such as VWAP and SMAs on the SPY and NQ, as variables to train our data on. 
    - According to popular belief, a majority of stocks "follow" the movement of these large indices. AAPL is an interesting case because it is the largest stock in both indices. So, we will also test this algorithm on other stocks and observe the numbers.
- We can also possibly combine the timeframes and incorporate new ones: 30min, 1hr, 4hr, 1d. We can feed all timeframes, and the indicators derived from the OHLC data of each timeframe, into the model to predict the pivot points on the 15-minute timeframe.
- Tune our actual neural network algorithm (add more layers, drop out, etc.)
- Try other ML algorithms such as SVM or Random Forests, then combine all the ML algorithms' predictions to create a final prediction
- Use CLOUD services to run EVERYTHING. 
- Automate our datamining process. 
- Implement sentiment analysis (TTVL chat, X, Reddit, emails)
- Add options movement (Put/Call ratios, IV of puts/calls, other important variables)
- Tune our indicators, add the indicators that we have not implemented yet that gave the error ("cannot add multiple output columns to single column in df")


### The goal is consistent profitability. 

In [None]:
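# FOR FUTURE USE
# NOTE: this LSTM sketch will not run as written if simply uncommented:
#   - `num_samples` and `num_timesteps` are used in the reshape calls before
#     the lines that assign them;
#   - the model has no output layer (e.g. a final Dense(num_classes, activation="sigmoid")).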

# num_features = X_train_scaled.shape[1]
# # Reshape the data to include the time steps dimension
# # Here, we're using 1 time step as an example
# X_train_reshaped = X_train_scaled.reshape(-1, 1, num_features)
# X_test_reshaped = X_test_scaled.reshape(-1, 1, num_features)
# # Reshape X_train_scaled and X_test_scaled
# X_train_reshaped = X_train_scaled.reshape((num_samples, num_timesteps, num_features))
# X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], num_timesteps, num_features))
# display(X_train_reshaped)
# display(X_test_reshaped)

# num_timesteps = 1  # This is an example; you might need a different value
# Change time step from 1 to more.
# This can be calculated by dividing the total number of elements 
# in X_train_scaled by the number of features and then by the desired number of time steps.
# Calculate the total number of samples that can be formed with 100 time steps
# num_samples = X_train_scaled.shape[0] // 1000


# # We have 2 possible outcomes, and we are trying to predict the stock/indicators to be in position -1 or 1
# num_classes = 1
# nn_model = Sequential()
# # Add LSTM layer
# nn_model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
# nn_model.add(Dropout(0.2))
# # Add another LSTM layer
# nn_model.add(LSTM(units=50, return_sequences=False))
# nn_model.add(Dropout(0.2))

# # Add dense layers
# nn_model.add(Dense(units=64, activation='relu'))
# nn_model.add(Dropout(0.2))

# # Compile model
# nn_model.compile(loss="binary_crossentropy",
#               optimizer="adam",
#               metrics=['accuracy'])

# # Summarize model
# nn_model.summary()


# # Fit model
# num_epochs = 50
# nn_model.fit(X_train_reshaped, y_train, epochs=num_epochs, batch_size=32, validation_split=0.2, shuffle=True)

In [None]:
# FOR FUTURE USE

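# # Save model history for further manipulation
# # (the History object of the most recent fit() call is kept on the model;
# #  its .history attribute is the dict of per-epoch metrics)
# model_history = nn_model.history.history

# # Now we can plot the accuracy for training and validation
# # (the model is compiled with metrics=['accuracy'], so the expected keys are
# #  'accuracy' and 'val_accuracy' — verify with model_history.keys())

# training_results = pd.DataFrame(index=range(1, num_epochs+1))
# training_results['Training'] = model_history['accuracy']
# training_results['Validation'] = model_history['val_accuracy']
# training_results.plot(title = 'Training and Validation Performance')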