In [1]:
from datetime import datetime, timedelta
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

#Function to produce training and targets using full dataset tables

def produceXYDataSets(ticker, corp, ns_back): 
    df = pd.read_csv('./optionchaindata/all/'+ticker+'_alldata_'+corp+'.csv.zip', parse_dates=['quoteDate','expiryDate'])
    print("Dataframe shape from file",df.shape)
    
    #basic data cleaning, remove lines where the strike price is more than 50 from the stockprice
    df_good = df[ df['strikeDelta'] > -50 ]
    print("After removing deltastrike bigger than -50", df_good.shape)

    df_good = df_good[ df_good['strikeDelta'] < 50 ]
    print("After removing deltastrike less than 50", df_good.shape)

    #All contract names to read through
    contracts = df_good['contractSymbol'].unique()

    x_train = []
    y_train = []

    #Only look at contracts that have at least 1.5 time as many entries as the look back time
    nquotes_min = int(1.5*ns_back)
    good_contracts = []

    for contract in contracts:
        data = df_good[df_good['contractSymbol'] == contract]
        
        if data.shape[0] < nquotes_min:
            continue
        
        good_contracts.append(contract)
        
        data['deltaDays'] = data['quoteDate'].diff()
        data['weekday'] = data['quoteDate'].dt.dayofweek

        ndays = data.shape[0]

        for iday in range(ns_back+1, ndays):
            #X:
            stockPrices = data['stockClose'][iday-ns_back:iday].values
            strike = data['strike'].values[iday]
            openInterest = data['openInterest'].values[iday]
            daysToExpiry = int(data['daysToExpiry'].values[iday].split('days')[0])
            deltaDays = data['deltaDays'].values[iday]/ np.timedelta64(1, 'D')
            weekday = data['weekday'].values[iday]

            features =np.concatenate( [[strike, openInterest, daysToExpiry, deltaDays, weekday], stockPrices] )

            #y:
            ask = data['ask'].values[iday]
            bid = data['bid'].values[iday]

            targets = np.array([bid, ask])

            #print(weekday,ask,bid,daysToExpiry,deltaDays, strike,stock_prices)
            x_train.append(features)
            y_train.append(targets)
    

    x_train = np.array(x_train)
    y_train = np.array(y_train)
    
    print("Used",len(good_contracts),"contracts total")
    print("Done, made data set with",x_train.shape[0],"samples")
    
    xydata = np.concatenate([x_train,y_train], axis=1)
    np.save('./optionchaindata/all/'+ticker+'_'+corp+'_XY.npy',xydata)
    
    return x_train, y_train

   


In [2]:
x_train, y_train = produceXYDataSets('AAPL',"C",20)
print(x_train.shape, y_train.shape)



Dataframe shape from file (166437, 26)
After removing deltastrike bigger than -50 (117353, 26)
After removing deltastrike less than 50 (82393, 26)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deltaDays'] = data['quoteDate'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weekday'] = data['quoteDate'].dt.dayofweek


Used 704 contracts total
Done, made data set with 42179 samples
(42179, 25) (42179, 2)


In [13]:
x_train[0,5:]

array([172.78999329, 172.55000305, 168.88000488, 167.30000305,
       167.30000305, 164.32000732, 160.07000732, 162.74000549,
       164.8500061 , 165.11999512, 163.19999695, 166.55999756,
       166.22999573, 163.16999817, 159.30000305, 157.44000244,
       162.94999695, 158.52000427, 154.72999573, 150.61999512])

In [14]:
x_data = x_train[:,:5]
x_stockdata=x_train[:,5:]

print(x_data)
print(x_stockdata)

[[140. 332.  17.   1.   1.]
 [140. 424.  16.   1.   2.]
 [140. 669.  15.   1.   3.]
 ...
 [105.  53. 813.   1.   3.]
 [105.  62. 802.  11.   0.]
 [105.  63. 801.   1.   1.]]
[[172.78999329 172.55000305 168.88000488 ... 158.52000427 154.72999573
  150.61999512]
 [172.55000305 168.88000488 167.30000305 ... 154.72999573 150.61999512
  155.08999634]
 [168.88000488 167.30000305 167.30000305 ... 150.61999512 155.08999634
  159.58999634]
 ...
 [142.47999573 138.33999634 142.99000549 ... 150.6499939  152.33999634
  149.3500061 ]
 [138.33999634 142.99000549 140.08999634 ... 152.33999634 149.3500061
  144.80000305]
 [142.99000549 140.08999634 140.41999817 ... 149.3500061  144.80000305
  138.91999817]]


In [17]:
from sklearn import preprocessing
min_max_scaler_stockdata = preprocessing.MinMaxScaler()
min_max_scaler_data = preprocessing.MinMaxScaler()


x_stockdata_scaled = min_max_scaler_stockdata.fit_transform(x_stockdata)
print(x_stockdata_scaled)

x_data_scaled = min_max_scaler_data.fit_transform(x_data)
print(x_data_scaled)

[[0.87382388 0.86891611 0.79386503 ... 0.58200412 0.50449885 0.42044977]
 [0.86891611 0.79386503 0.76155416 ... 0.50449885 0.42044977 0.51186082]
 [0.79386503 0.76155416 0.76155416 ... 0.42044977 0.51186082 0.60388534]
 ...
 [0.25398765 0.1693251  0.26441729 ... 0.42106324 0.45562361 0.39447863]
 [0.1693251  0.26441729 0.20511241 ... 0.45562361 0.39447863 0.30143155]
 [0.26441729 0.20511241 0.21186091 ... 0.39447863 0.30143155 0.18118607]]
[[3.75000000e-01 2.95599836e-03 1.93003619e-02 1.07843137e-01
  2.50000000e-01]
 [3.75000000e-01 3.77513044e-03 1.80940893e-02 1.07843137e-01
  5.00000000e-01]
 [3.75000000e-01 5.95651477e-03 1.68878166e-02 1.07843137e-01
  7.50000000e-01]
 ...
 [8.33333333e-02 4.71891305e-04 9.79493366e-01 1.07843137e-01
  7.50000000e-01]
 [8.33333333e-02 5.52023790e-04 9.66224367e-01 2.05882353e-01
  0.00000000e+00]
 [8.33333333e-02 5.60927400e-04 9.65018094e-01 1.07843137e-01
  2.50000000e-01]]
