In [31]:
!pip install matplotlib
!pip install numpy
!pip install pandas
!pip install scikit-learn



In [32]:
from datetime import datetime, timedelta
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import sklearn
#Function to produce training and targets using full dataset tables

#GLOBAL PATH to data
#DATA_PATH = '~/data/finance'

def produceXYDataSets(ticker, corp, ns_back): 
    df = pd.read_csv('~/data/finance'+'/optionchaindata/all/'+ticker+'_alldata_'+corp+'.csv.zip', parse_dates=['quoteDate','expiryDate'])
    print("Dataframe shape from file",df.shape)
    
    #basic data cleaning, remove lines where the strike price is more than 50 from the stockprice
    df_good = df[ df['strikeDelta'] > -50 ]
    print("After removing deltastrike bigger than -50", df_good.shape)

    df_good = df_good[ df_good['strikeDelta'] < 50 ]
    print("After removing deltastrike less than 50", df_good.shape)

    #All contract names to read through
    contracts = df_good['contractSymbol'].unique()

    x_train = []
    y_train = []

    #Only look at contracts that have at least 1.5 time as many entries as the look back time
    nquotes_min = int(1.5*ns_back)
    good_contracts = []

    for contract in contracts:
        data = df_good[df_good['contractSymbol'] == contract]
        
        if data.shape[0] < nquotes_min:
            continue
        
        good_contracts.append(contract)
        
        data['deltaDays'] = data['quoteDate'].diff()
        data['weekday'] = data['quoteDate'].dt.dayofweek

        ndays = data.shape[0]

        for iday in range(ns_back+1, ndays):
            #X:
            stockPrices = data['stockClose'][iday-ns_back:iday].values
            strike = data['strike'].values[iday]
            openInterest = data['openInterest'].values[iday]
            daysToExpiry = int(data['daysToExpiry'].values[iday].split('days')[0])
            deltaDays = data['deltaDays'].values[iday]/ np.timedelta64(1, 'D')
            weekday = data['weekday'].values[iday]

            features =np.concatenate( [[strike, openInterest, daysToExpiry, deltaDays, weekday], stockPrices] )

            #y:
            ask = data['ask'].values[iday]
            bid = data['bid'].values[iday]

            targets = np.array([bid, ask])

            #print(weekday,ask,bid,daysToExpiry,deltaDays, strike,stock_prices)
            x_train.append(features)
            y_train.append(targets)
    

    x_train = np.array(x_train)
    y_train = np.array(y_train)
    
    print("Used",len(good_contracts),"contracts total")
    print("Done, made data set with",x_train.shape[0],"samples")
    
    xydata = np.concatenate([x_train,y_train], axis=1)
    DATA_PATH = os.path.expanduser('~/data/')  # Expands the ~ to the full home directory path
    directory = os.path.join(DATA_PATH, 'save/')

    if not os.path.exists(directory):
        os.makedirs(directory)
        
    np.save(directory+ticker+'_'+corp+'_XY.npy',xydata)
    
    return x_train, y_train

   


In [33]:
# train on AAPLE Data
ticker = 'AAPL'
x_train, y_train = produceXYDataSets('AAPL',"C",20)
print(x_train.shape, y_train.shape)



Dataframe shape from file (187867, 30)
After removing deltastrike bigger than -50 (132526, 30)
After removing deltastrike less than 50 (94202, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weekday'] = data['quoteDate'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.

Used 814 contracts total
Done, made data set with 48809 samples
(48809, 25) (48809, 2)


In [34]:
x_train[0,5:]

array([172.78999329, 172.55000305, 168.88000488, 167.30000305,
       167.30000305, 164.32000732, 160.07000732, 162.74000549,
       164.8500061 , 165.11999512, 163.19999695, 166.55999756,
       166.22999573, 163.16999817, 159.30000305, 157.44000244,
       162.94999695, 158.52000427, 154.72999573, 150.61999512])

In [35]:
x_data = x_train[:,:5]
x_stockdata=x_train[:,5:]

print(x_data)
print(x_stockdata)

[[ 1.400e+02  3.320e+02  1.700e+01  1.000e+00  1.000e+00]
 [ 1.400e+02  4.240e+02  1.600e+01  1.000e+00  2.000e+00]
 [ 1.400e+02  6.690e+02  1.500e+01  1.000e+00  3.000e+00]
 ...
 [ 1.850e+02  1.386e+03  1.640e+02 -6.000e+00  1.000e+00]
 [ 1.850e+02  1.379e+03  1.630e+02  1.000e+00  2.000e+00]
 [ 1.850e+02  1.385e+03  1.620e+02  1.000e+00  3.000e+00]]
[[172.78999329 172.55000305 168.88000488 ... 158.52000427 154.72999573
  150.61999512]
 [172.55000305 168.88000488 167.30000305 ... 154.72999573 150.61999512
  155.08999634]
 [168.88000488 167.30000305 167.30000305 ... 150.61999512 155.08999634
  159.58999634]
 ...
 [149.69999695 146.86999512 150.72000122 ... 146.63000488 142.16000366
  144.49000549]
 [146.86999512 150.72000122 148.27999878 ... 142.16000366 144.49000549
  142.91000366]
 [150.72000122 148.27999878 150.03999329 ... 144.49000549 142.91000366
  140.94000244]]


In [36]:
from sklearn import preprocessing
min_max_scaler_stockdata = preprocessing.MinMaxScaler()
min_max_scaler_data = preprocessing.MinMaxScaler()


x_stockdata_scaled = min_max_scaler_stockdata.fit_transform(x_stockdata)
print(x_stockdata_scaled)

x_data_scaled = min_max_scaler_data.fit_transform(x_data)
print(x_data_scaled)

[[0.87382388 0.86891611 0.79386503 ... 0.58200412 0.50449885 0.42044977]
 [0.86891611 0.79386503 0.76155416 ... 0.50449885 0.42044977 0.51186082]
 [0.79386503 0.76155416 0.76155416 ... 0.42044977 0.51186082 0.60388534]
 ...
 [0.4016359  0.34376267 0.42249488 ... 0.33885489 0.24744384 0.29509213]
 [0.34376267 0.42249488 0.37259709 ... 0.24744384 0.29509213 0.26278126]
 [0.42249488 0.37259709 0.40858879 ... 0.29509213 0.26278126 0.22249495]]
[[0.375      0.002956   0.02048193 0.10784314 0.25      ]
 [0.375      0.00377513 0.01927711 0.10784314 0.5       ]
 [0.375      0.00595651 0.01807229 0.10784314 0.75      ]
 ...
 [0.75       0.0123404  0.19759036 0.03921569 0.25      ]
 [0.75       0.01227808 0.19638554 0.10784314 0.5       ]
 [0.75       0.0123315  0.19518072 0.10784314 0.75      ]]


In [37]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam

# Assuming x_train, y_train, x_data_scaled, and x_stockdata_scaled are loaded from previous steps

# Define the RNN model with LSTM
def build_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=input_shape, recurrent_dropout=0.1))
    model.add(LSTM(50, return_sequences=False, recurrent_dropout=0.1))
    model.add(Dropout(0.2))
    model.add(Dense(25, activation='relu'))
    model.add(Dense(2, activation='linear'))  # Predicting two values: bid and ask prices

    model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
    return model

# Prepare the data for LSTM
# LSTM requires input shape of the form [samples, time steps, features]
# Here, we assume that x_stockdata_scaled is already shaped as [samples, time steps]
# and x_data_scaled are additional features that we will append to each time step

# Reshape x_stockdata_scaled to [samples, time steps, 1] if it's not already in 3D
if len(x_stockdata_scaled.shape) == 2:
    x_stockdata_scaled = np.expand_dims(x_stockdata_scaled, axis=2)

# Concatenate the additional features to the stock data along the last dimension
x_lstm = np.concatenate((x_stockdata_scaled, np.repeat(np.expand_dims(x_data_scaled, 1), x_stockdata_scaled.shape[1], axis=1)), axis=2)

# Build the model
model = build_lstm_model(input_shape=(x_lstm.shape[1], x_lstm.shape[2]))

# Train the model
history = model.fit(x_lstm, y_train, epochs=10, batch_size=64, validation_split=0.2)

# Evaluate the model
loss = model.evaluate(x_lstm, y_train)

# Predictions
predictions = model.predict(x_lstm)

# You can now use the trained model to predict bid and ask prices from your options data
model.save(ticker+'_model.h5')

2023-11-09 23:20:40.942039: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-09 23:20:44.262234: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: forward compatibility was attempted on non supported HW
2023-11-09 23:20:44.262329: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: neutrino
2023-11-09 23:20:44.262338: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: neutrino
2023-11-09 23:20:44.262544: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 470.223.2
2023-11-09 23:20:44.262579: I tensorflow/compiler/xla/s

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
