In [1]:
import numpy as np
import os
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam

# Function to produce training and targets using full dataset tables
def produceXYDataSets(ticker, corp, ns_back): 
    df = pd.read_csv('~/data/finance'+'/optionchaindata/all/'+ticker+'_alldata_'+corp+'.csv.zip', parse_dates=['quoteDate','expiryDate'])
    print("Dataframe shape from file",df.shape)
    
    #basic data cleaning, remove lines where the strike price is more than 50 from the stockprice
    df_good = df[ df['strikeDelta'] > -50 ]
    print("After removing deltastrike bigger than -50", df_good.shape)

    df_good = df_good[ df_good['strikeDelta'] < 50 ]
    print("After removing deltastrike less than 50", df_good.shape)

    #All contract names to read through
    contracts = df_good['contractSymbol'].unique()

    x_train = []
    y_train = []

    #Only look at contracts that have at least 1.5 time as many entries as the look back time
    nquotes_min = int(1.5*ns_back)
    good_contracts = []

    for contract in contracts:
        data = df_good[df_good['contractSymbol'] == contract]
        
        if data.shape[0] < nquotes_min:
            continue
        
        good_contracts.append(contract)
        
        data['deltaDays'] = data['quoteDate'].diff()
        data['weekday'] = data['quoteDate'].dt.dayofweek

        ndays = data.shape[0]

        for iday in range(ns_back+1, ndays):
            #X:
            stockPrices = data['stockClose'][iday-ns_back:iday].values
            strike = data['strike'].values[iday]
            openInterest = data['openInterest'].values[iday]
            daysToExpiry = int(data['daysToExpiry'].values[iday].split('days')[0])
            deltaDays = data['deltaDays'].values[iday]/ np.timedelta64(1, 'D')
            weekday = data['weekday'].values[iday]

            features =np.concatenate( [[strike, openInterest, daysToExpiry, deltaDays, weekday], stockPrices] )

            #y:
            ask = data['ask'].values[iday]
            bid = data['bid'].values[iday]

            targets = np.array([bid, ask])

            #print(weekday,ask,bid,daysToExpiry,deltaDays, strike,stock_prices)
            x_train.append(features)
            y_train.append(targets)
    

    x_train = np.array(x_train)
    y_train = np.array(y_train)
    
    print("Used",len(good_contracts),"contracts total")
    print("Done, made data set with",x_train.shape[0],"samples")
    
    xydata = np.concatenate([x_train,y_train], axis=1)
    DATA_PATH = os.path.expanduser('~/data/')  # Expands the ~ to the full home directory path
    directory = os.path.join(DATA_PATH, 'save/')

    if not os.path.exists(directory):
        os.makedirs(directory)
        
    np.save(directory+ticker+'_'+corp+'_XY.npy',xydata)
    
    return x_train, y_train
   

# Initialize lists to collect the training data
x_train_all = []
y_train_all = []

# List of NASDAQ tickers that are also in the dataset
tickers = [
    'AAPL', 'META', 'NFLX', 'SHOP', 'NVDA', 'AMD', 'GOOGL',
    'AMZN', 'INTC', 'FB', 'MSFT', 'PYPL', 'TSLA'
]

# Loop over each ticker symbol
for ticker in tickers:
    try:
        x_train_ticker, y_train_ticker = produceXYDataSets(ticker, "C", 20)
        x_train_all.append(x_train_ticker)
        y_train_all.append(y_train_ticker)
    except Exception as e:
        print(f"Failed to process ticker {ticker}: {e}")

# Concatenate the lists into numpy arrays
x_train_all = np.concatenate(x_train_all, axis=0)
y_train_all = np.concatenate(y_train_all, axis=0)

# Split the data into training and testing sets
x_train_all, x_test, y_train_all, y_test = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)


# Scale the data
scaler_x = preprocessing.MinMaxScaler()
scaler_y = preprocessing.MinMaxScaler()

x_train_all_scaled = scaler_x.fit_transform(x_train_all)
y_train_all_scaled = scaler_y.fit_transform(y_train_all)

x_test_scaled = scaler_x.transform(x_test)
y_test_scaled = scaler_y.transform(y_test)



# Define the RNN model with LSTM
def build_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=input_shape, recurrent_dropout=0.1))
    model.add(LSTM(50, return_sequences=False, recurrent_dropout=0.1))
    model.add(Dropout(0.2))
    model.add(Dense(25, activation='relu'))
    model.add(Dense(2, activation='linear'))  # Predicting two values: bid and ask prices
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
    return model

# Prepare the data for LSTM
if len(x_train_all_scaled.shape) == 2:
    x_train_all_scaled = np.expand_dims(x_train_all_scaled, axis=2)

if len(x_test_scaled.shape) == 2:
    x_test_scaled = np.expand_dims(x_test_scaled, axis=2)

# Build the model
model = build_lstm_model(input_shape=(x_train_all_scaled.shape[1], x_train_all_scaled.shape[2]))

# Train the model
history = model.fit(x_train_all_scaled, y_train_all_scaled, epochs=10, batch_size=64, validation_split=0.2)


# Evaluate the model on the test set
test_loss = model.evaluate(x_test_scaled, y_test_scaled)
print(f"Test loss: {test_loss}")

# Make predictions on the test set
predictions = model.predict(x_test_scaled)

# Inverse transform the scaled predictions to original scale
predictions_original = scaler_y.inverse_transform(predictions)
y_test_original = scaler_y.inverse_transform(y_test_scaled)

# Save the trained model
model.save('combined_model.h5')

print("Model training complete and saved. Model evaluated on test data.")


  from pandas.core.computation.check import NUMEXPR_INSTALLED
2023-11-10 00:19:48.378829: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Dataframe shape from file (187867, 30)
After removing deltastrike bigger than -50 (132526, 30)
After removing deltastrike less than 50 (94202, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weekday'] = data['quoteDate'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.

Used 814 contracts total
Done, made data set with 48809 samples
Dataframe shape from file (28618, 30)
After removing deltastrike bigger than -50 (15138, 30)
After removing deltastrike less than 50 (12353, 30)
Used 0 contracts total
Done, made data set with 0 samples
Failed to process ticker META: axis 1 is out of bounds for array of dimension 1
Dataframe shape from file (209372, 30)
After removing deltastrike bigger than -50 (93059, 30)
After removing deltastrike less than 50 (45502, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weekday'] = data['quoteDate'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.

Used 436 contracts total
Done, made data set with 14777 samples
Dataframe shape from file (352329, 30)
After removing deltastrike bigger than -50 (81133, 30)
After removing deltastrike less than 50 (81133, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weekday'] = data['quoteDate'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.

Used 830 contracts total
Done, made data set with 46360 samples
Dataframe shape from file (303666, 30)
After removing deltastrike bigger than -50 (145394, 30)
After removing deltastrike less than 50 (91977, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weekday'] = data['quoteDate'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.

Used 819 contracts total
Done, made data set with 42952 samples
Dataframe shape from file (137380, 30)
After removing deltastrike bigger than -50 (99193, 30)
After removing deltastrike less than 50 (94259, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weekday'] = data['quoteDate'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.

Used 695 contracts total
Done, made data set with 38286 samples
Dataframe shape from file (457413, 30)
After removing deltastrike bigger than -50 (126594, 30)
After removing deltastrike less than 50 (119589, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weekday'] = data['quoteDate'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.

Used 1190 contracts total
Done, made data set with 64692 samples
Dataframe shape from file (639900, 30)
After removing deltastrike bigger than -50 (198205, 30)
After removing deltastrike less than 50 (176159, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weekday'] = data['quoteDate'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.

Used 1749 contracts total
Done, made data set with 99200 samples
Dataframe shape from file (52182, 30)
After removing deltastrike bigger than -50 (51354, 30)
After removing deltastrike less than 50 (51354, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weekday'] = data['quoteDate'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.

Used 451 contracts total
Done, made data set with 26255 samples
Dataframe shape from file (101507, 26)
After removing deltastrike bigger than -50 (47970, 26)
After removing deltastrike less than 50 (29437, 26)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weekday'] = data['quoteDate'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Function to calculate and print error metrics
def evaluate_predictions(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")

# Evaluate predictions
evaluate_predictions(y_test_original, predictions_original)

# Plotting the real vs predicted values
def plot_predictions(y_true, y_pred, ticker):
    plt.figure(figsize=(14, 7))
    plt.plot(y_true[:, 0], label='Real Bid', color='blue', marker='o')
    plt.plot(y_pred[:, 0], label='Predicted Bid', color='red', linestyle='--', marker='x')
    plt.plot(y_true[:, 1], label='Real Ask', color='green', marker='o')
    plt.plot(y_pred[:, 1], label='Predicted Ask', color='orange', linestyle='--', marker='x')
    plt.title(f'Real vs Predicted Bid/Ask Prices for {ticker}')
    plt.xlabel('Sample')
    plt.ylabel('Price')
    plt.legend()
    plt.show()

# Assuming you have a way to map back the test samples to their corresponding tickers
# For example, if you have a list `test_tickers` that maps each sample in `x_test` to its ticker
test_tickers = tickers
# You would need to create this mapping before splitting the data into training and testing sets

# Plot the real vs predicted values for each ticker
unique_tickers = np.unique(test_tickers)
for ticker in unique_tickers:
    ticker_indices = [i for i, t in enumerate(test_tickers) if t == ticker]
    plot_predictions(y_test_original[ticker_indices], predictions_original[ticker_indices], ticker)
