In [1]:
from Utils.GBM_Utils import get_MLE_params, sample_GBM

import os

import numpy as np
import tqdm
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Calendar date used by later cells to separate rows: rows strictly before it
# are treated as training data, the rest as test data.
SPLIT_DATE = pd.to_datetime('2019-06-01', format='%Y-%m-%d')

# Directory that is listed and read file-by-file when loading the price data.
DATA_DIR = './Data/stocks'

In [3]:
# Window geometry (in rows of the price table): each saved window spans
# T_PAST + T_FUT consecutive rows, and window starts advance by T_PAST rows.
# Presumably the first T_PAST rows are the history and the last T_FUT rows the
# forecast horizon -- confirm against the consumer of the saved windows.
T_PAST, T_FUT = 60, 20

In [4]:
# Output locations; the window geometry is embedded in the directory names so
# runs with different T_PAST / T_FUT do not overwrite each other.
WINDOW_SAVE_DIR = f'./Data/windowed_data_{T_PAST}_{T_FUT}'
WINDOW_DT_SAVE_DIR = f'./Data/windowed_dt_data_{T_PAST}_{T_FUT}'

# Create every output directory up front (no error if it already exists).
for out_dir in (WINDOW_SAVE_DIR, WINDOW_DT_SAVE_DIR, 'Figures'):
  os.makedirs(out_dir, exist_ok=True)

In [5]:
# Set to True to load only the first file for a quick dry run. Later cells
# also read this flag, so it is defined unconditionally here rather than as a
# side effect inside the loop (which left it undefined when DATA_DIR was empty).
DEBUG = False

# Maps a ticker name (file name up to its first '.') to that ticker's DataFrame.
data_dict = {}

for f_name in sorted(os.listdir(DATA_DIR)):
  f_dir = os.path.join(DATA_DIR, f_name)
  # read only the date (0th) and adjusted close price (5th) cols
  timeseries_df = pd.read_csv(f_dir, usecols=[0,5], parse_dates=['Date'])
  # 'time': whole days elapsed since the earliest date in this file
  timeseries_df['time'] = (timeseries_df['Date']-timeseries_df['Date'].min()).dt.days.values
  # 'dt': day gap to the previous row; the first row has no predecessor, so it is NaN
  timeseries_df['dt'] = timeseries_df['time'].diff()
  # NOTE(review): a file name containing extra dots (e.g. 'BRK.B.csv') is
  # truncated at the first dot and could collide with another ticker -- confirm
  # that no such files exist in DATA_DIR.
  data_dict[f_name.split('.')[0]] = timeseries_df

  if DEBUG:
    break

In [6]:
# Debug-only sanity plot for a single ticker: fit GBM parameters on the rows
# before SPLIT_DATE and overlay sampled paths on the real prices.
if DEBUG:
  fig, ax = plt.subplots(1, 1, figsize=(15, 6))
  data_df = data_dict['AAPL']

  # rows strictly before SPLIT_DATE are training data, everything else is test data
  is_train = data_df['Date'] < SPLIT_DATE
  train_df = data_df[is_train]
  test_df = data_df[~is_train]

  mu_hat, sigma_sq_hat = get_MLE_params(np.log(train_df['Adj Close'].values), train_df['dt'].values)
  # A bare expression inside an `if` block is never displayed by the notebook,
  # so print the estimates explicitly. The third value is mu - sigma^2/2
  # (presumably the drift of the log-price -- confirm against get_MLE_params).
  print(mu_hat, sigma_sq_hat, mu_hat-0.5*sigma_sq_hat)

  ax.plot(train_df['Date'], train_df['Adj Close'], 'bo', label='true train data', alpha=0.2, markersize=2)
  ax.plot(test_df['Date'], test_df['Adj Close'], 'bx', label='true test data', alpha=0.2, markersize=2)

  # Sampled paths over the test period, each started from the last training price.
  # The final positional flag is True here and False below; judging by the
  # 'no BM' label it presumably toggles the random term -- confirm in sample_GBM.
  N_SAMPLES = 10
  for _ in range(N_SAMPLES):
    sample_bs = sample_GBM(mu_hat, sigma_sq_hat, train_df['Adj Close'].iloc[-1], test_df['dt'].iloc[1:].values, True)
    ax.plot(test_df['Date'], sample_bs, 'g--', alpha=0.5)

  # Single path over the whole history, started from the very first price.
  sample_bs = sample_GBM(mu_hat, sigma_sq_hat, data_df['Adj Close'].iloc[0], data_df['dt'].iloc[1:].values, False)
  ax.plot(data_df['Date'], sample_bs, 'r-', label='sampled data (no BM)')

  ax.legend()

In [7]:
# Debug-only EDA: for every loaded ticker save a two-panel figure
# (raw price trend | train/test data with GBM samples) to ./Figures.
if DEBUG:
  N_SAMPLES = 10  # number of sampled forecast paths drawn per ticker

  for stock_ticker in data_dict:
    data_df = data_dict[stock_ticker]

    # set up train & test data *before* creating the figure: tickers with fewer
    # than 2 training rows are skipped, and skipping after plt.subplots() would
    # leave an open figure behind on every skipped ticker
    is_train = data_df['Date'] < SPLIT_DATE
    train_df = data_df[is_train]
    if len(train_df) < 2:
      continue
    test_df = data_df[~is_train]

    fig, ax = plt.subplots(1, 2, figsize=(20, 6))

    # plot the trend in the 1st subplot
    ax[0].plot(data_df['Date'], data_df['Adj Close'])
    ax[0].set_title(f'Stock Price vs time for {stock_ticker}')
    ax[0].set_xlabel('Date')
    ax[0].set_ylabel('Share Price')

    mu_hat, sigma_sq_hat = get_MLE_params(np.log(train_df['Adj Close'].values), train_df['dt'].values)

    ax[1].plot(train_df['Date'], train_df['Adj Close'], 'ko', label='true train data')
    ax[1].plot(test_df['Date'], test_df['Adj Close'], 'kx', label='true test data')

    # single path over the whole history, started from the very first price
    sample_bs = sample_GBM(mu_hat, sigma_sq_hat, data_df['Adj Close'].iloc[0], data_df['dt'].iloc[1:].values, False)
    ax[1].plot(data_df['Date'], sample_bs, 'r-', label='sampled data (no BM)')

    # Sampled paths over the test period only, each started from the last
    # training price. Sampling the whole history instead looks cool but is not
    # how we would use GBM in practice.
    for _ in range(N_SAMPLES):
      sample_bs = sample_GBM(mu_hat, sigma_sq_hat, train_df['Adj Close'].iloc[-1], test_df['dt'].iloc[1:].values, True)
      ax[1].plot(test_df['Date'], sample_bs, 'b--', alpha=0.5)

    ax[1].legend()

    fig.tight_layout()
    fig.savefig(f'./Figures/{stock_ticker}_EDA.png')
    # close explicitly so figures do not accumulate in memory across tickers
    plt.close(fig)

In [8]:
# by idx train/test window split
#
# For every ticker, cut the price series ('Adj Close') and the matching day-gap
# series ('dt') into fixed-length windows of T_PAST + T_FUT rows and save them
# as two 2-D arrays (one row per window) under WINDOW_SAVE_DIR and
# WINDOW_DT_SAVE_DIR.
# (Exploratory per-window MLE plotting code that used to live here as
# commented-out lines was removed; it called helpers that are no longer imported.)

WINDOW_LEN = T_PAST + T_FUT
MAX_VALID_PRICE = 1e6  # windows containing a price above this are discarded

for stock_ticker in tqdm.tqdm(data_dict):
  data_df = data_dict[stock_ticker]
  price_windows = []
  dt_windows = []

  # Window starts advance by T_PAST rows. The last candidate start is dropped
  # ([:-1]) so that every retained window has the full WINDOW_LEN rows; this can
  # also discard a final window that would have fit exactly.
  for st_idx in np.arange(0, len(data_df)-T_FUT, T_PAST)[:-1]:
    rows = slice(st_idx, st_idx + WINDOW_LEN)
    close_prices = data_df['Adj Close'].iloc[rows].values
    # Skip windows with implausible prices (too large or negative).
    # NOTE(review): NaN prices compare False on both tests and are therefore
    # kept -- confirm whether that is intended.
    if close_prices.max() > MAX_VALID_PRICE or close_prices.min() < 0:
      continue
    price_windows.append(close_prices)
    dt_windows.append(data_df['dt'].iloc[rows].values)

  if price_windows:
    # stack once at the end instead of re-concatenating on every iteration
    data_mat = np.stack(price_windows, axis=0)
    data_mat_dt = np.stack(dt_windows, axis=0)
  else:
    # No usable window (series too short or everything filtered out): save an
    # empty (0, WINDOW_LEN) array. Saving None would write a pickled object
    # array that np.load refuses to read with its default allow_pickle=False.
    data_mat = np.empty((0, WINDOW_LEN))
    data_mat_dt = np.empty((0, WINDOW_LEN))

  np.save(os.path.join(WINDOW_SAVE_DIR, f'{stock_ticker}_windows.npy'), data_mat)
  np.save(os.path.join(WINDOW_DT_SAVE_DIR, f'{stock_ticker}_windows_dt.npy'), data_mat_dt)

100%|██████████| 121/121 [00:01<00:00, 64.84it/s]


In [9]:
# Sanity check: reload the windows saved for AAPL and display the array shape
# as (number of windows, rows per window).
aapl_windows_path = os.path.join(WINDOW_SAVE_DIR, 'AAPL_windows.npy')
AAPL_windows = np.load(aapl_windows_path)
AAPL_windows.shape

(182, 80)