# data process

In [1]:
import numpy as np
import pandas as pd
import dill as pickle
import matplotlib.pyplot as plt
from tqdm import tqdm
import copy
from matplotlib import cm
import seaborn as sns
import random
import datetime
import math
from sklearn.linear_model import LinearRegression
import torch
import imageio

from FDA_Projection import FDA
from NeuralSDE import NeuralSDE

In [6]:
raw_data = pd.read_csv("G:\gitCode\BU_programming\\finnal_project\FuNVol\\self.test_data\\volatility surface.csv")
np.unique(raw_data['ticker'])

array(['AMZN', 'IBM', 'INTC', 'TSLA'], dtype=object)

In [7]:
raw_data = raw_data[['date', 'days', 'delta', 'impl_volatility', 'ticker']]
raw_data = raw_data.fillna(0)

In [8]:
tickers = np.unique(raw_data['ticker'])

In [11]:
data_path = "G:\gitCode\BU_programming\\finnal_project\FuNVol\\self.test_data\\"

def daily_IV_matrix(df:pd.DataFrame, date:pd.DatetimeTZDtype):
    return pd.pivot_table(df, values='impl_volatility', index='days', columns='delta').values
    
for ticker in tickers:

    data_by_ticker = copy.deepcopy(raw_data[raw_data['ticker']==ticker])
    data_by_ticker['date'] = pd.to_datetime(data_by_ticker['date'])
    start = data_by_ticker['date'].min()
    end = data_by_ticker['date'].max()
    data_by_ticker.sort_values(['date', 'days', 'delta'])

    price_df = pd.read_csv(data_path+"{}.csv".format(ticker))
    price_df['date'] = pd.to_datetime(price_df['Date'])
    price_df.sort_values('date', inplace=True)

    print("ticker name: {}, start time {}, end time:{}".format(ticker, start, end))
    iv_by_day = np.array([daily_IV_matrix(df, date) for date, df in data_by_ticker.groupby(['date'])])
    print("IV data shape {}".format(iv_by_day.shape))
    # iv_by_day.shape = (2893, 11, 17)
    dates = np.sort(np.array(np.unique(data_by_ticker['date'])))
    tau = np.sort(np.array(np.unique(data_by_ticker['days']))) / 365
    Delta = np.sort(np.array(np.unique(data_by_ticker['delta']))) / 100
    result_dict = {
        'dates': dates,
        'tau': tau,
        'Delta': Delta,
        'IV': iv_by_day,
        'prices': price_df['Adj Close'].values
    }
    with open(data_path+"raw_{}.pickle".format(ticker), 'wb') as handle:
        pickle.dump(result_dict, handle)


ticker name: AMZN, start time 2022-01-03 00:00:00, end time:2023-02-28 00:00:00
IV data shape (290, 11, 17)
ticker name: IBM, start time 2022-01-03 00:00:00, end time:2023-02-28 00:00:00
IV data shape (290, 11, 17)
ticker name: INTC, start time 2022-01-03 00:00:00, end time:2023-02-28 00:00:00
IV data shape (290, 11, 17)
ticker name: TSLA, start time 2022-01-03 00:00:00, end time:2023-02-28 00:00:00
IV data shape (290, 11, 17)


In [1]:
import numpy as np
import pandas as pd
import dill as pickle
import matplotlib.pyplot as plt
from tqdm import tqdm
import copy
from matplotlib import cm
import seaborn as sns
import random
import datetime
import math
from sklearn.linear_model import LinearRegression
import torch
import imageio
class Prediction:

    def __init__(self, data_dir, tickers, model, K=3) -> None:

        #load all data
        self.raw_data = []
        self.raw_price_data = []
        self.data = []
        self.current_ticker=None
        self.tickers = np.array(tickers)
        self.trained_model = model
        
        


        print("loading data for various tickers...")
        # load the data and normalize individually        
        for ticker in tickers:
            
            # the raw data files cannot be provided due to licensing issues
            with open(data_dir + "raw_" + ticker + '.pickle', 'rb') as handle:
                self.raw_data.append(pickle.load(handle))

        print("normalizing IVs...")
        # normalizing coefficients
        self.nrm = []
        self.reg = []

        for data in copy.deepcopy(self.raw_data):
            
            a_, b_, IV = model.Normalise(np.log(np.exp(data['IV'])-1))
            data['IV'] = np.where(np.isneginf(IV), np.NaN, IV)   # !!!!
            
            c_, d_, prices = model.Normalise(np.log(data['prices']))
            data['prices'] = prices

            self.nrm.append({'IV':[a_,b_], 'price' : [c_,d_]})
            self.data.append(data)
            
        print("truncating data to common set of dates...")
        # only keep data on dates where all tickers have data and
        # collate the data into a single source
        self.data_all = {'dates' : [],
                         'tau' : self.data[0]['tau'],
                         'Delta' : self.data[0]['Delta'],
                         'IV' : [],
                         'prices' : [],
                         'ticker_idx' : []}
        
        self.delta = self.data[0]['Delta']
        self.tau = self.data[0]['tau']

        for i, ticker in enumerate(tickers):
            
            # detrend the time serise data
            alpha_, beta_, prices_d = model.Detrend(self.data[i]['prices'])
            self.reg.append([alpha_,beta_])
            self.data[i]['prices'] = prices_d
            
            if i ==0 :
                self.data_all['IV'] = self.data[i]['IV']
                self.data_all['dates'] = self.data[i]['dates']
                self.data_all['ticker_idx'] = np.zeros(len(self.data[i]['dates']), int)
                self.data_all['prices'] = self.data[i]['prices']
                
            else:
                self.data_all['IV'] = np.concatenate((self.data_all['IV'], self.data[i]['IV']))
                self.data_all['dates'] = np.concatenate((self.data_all['dates'], self.data[i]['dates']))
                self.data_all['prices'] = np.concatenate((self.data_all['prices'], self.data[i]['prices']))
                self.data_all['ticker_idx'] = np.concatenate((self.data_all['ticker_idx'], i + np.zeros(len(self.data[i]['dates']), int)))

        self.rng_IV = np.nanquantile(self.data_all["IV"].flatten(), [0.025, 0.975])
        print("Project onto trained common FPCs")
        self.K = K        
        self.b = []
        self.b_all = np.zeros((0, K))
        self.b_sim = []
        self.b_price = []
        
        for data in self.data:
            self.b.append(model.Perform_FPC_Projection(data['IV'], K))
            self.b_all = np.concatenate((self.b_all, self.b[-1]))
            self.b_price.append(np.concatenate((model.Perform_FPC_Projection(data['IV'], K),
                                data['prices'].reshape(-1,1)), axis=1))  
            
            
    
        N = (self.b[0].shape[1]+1)   
        self.current_ticker = self.tickers
        # concatenate coefficients of FDA with prices across all tickers
        for i in range(len(self.tickers)):
            if i == 0:
                data = np.concatenate((self.b[i][:,:N-1].reshape(-1,N-1), self.data[i]['prices'].reshape(-1,1)), axis=1)
                dates = pd.to_datetime(self.data[i]['dates'], format='%Y%m%d')
                T = (dates-dates[0])/ np.timedelta64(1, 'D')
                T = T.values.reshape(-1,1)/365
            else:
                data = np.concatenate((data, self.b[i][:,:N-1].reshape(-1,N-1), self.data[i]['prices'].reshape(-1,1)), axis=1)
        
        self.n_lags = self.trained_model.neural_sde.n_lags
        self.test_data = torch.from_numpy(self.trained_model.neural_sde.Normalize_Data(data)).float()
            
        simulated_IV_list = []
        pred_days = 3
        with tqdm(total=(self.test_data.shape[0]-9)*pred_days, desc='simulating IV........') as pdar:
            for i in range(9, self.test_data.shape[0]):
                    for j in range(1, pred_days+1):
                        simulated_IV_list.append(self.simulate_Time_Series(dT=j/365,start=i))
                        pdar.update(1)
        self.simulated_df = pd.concat(simulated_IV_list, axis=0)
            

    def simulate_Time_Series(self, dT=1/365, start=9):
        input_data = self.test_data[start-self.n_lags+1:start+1, :].unsqueeze(0).to('cuda:0')
        predi_data = self.trained_model.neural_sde.pi_drift.forward(input_data).view(-1).to('cpu')
        nu_pred = torch.add(input_data[0, -1, :].to('cpu'),(predi_data*dT)).to('cpu')

        def Data_per_Ticker(b_sim, ticker_idx):
            price = self.trained_model.reg[ticker_idx][0] + self.trained_model.reg[ticker_idx][1][0] * (self.trained_model.train_size+start+1) + b_sim[-1]
            price = np.exp(self.trained_model.UnNormalise(self.trained_model.nrm[ticker_idx]['price'][0], self.trained_model.nrm[ticker_idx]['price'][1], price.detach().numpy())).reshape(-1)
            b = np.array([[b_sim[:-1].detach().numpy()]])
            delta_grid_t, tau_grid_t, IV_t = self.trained_model.fda_model.Generate_IV_Grid(b=b, delta = self.trained_model.x, tau = self.trained_model.y)
            IV = np.log(1+ np.exp(self.trained_model.UnNormalise(self.trained_model.nrm[ticker_idx]['IV'][0], self.trained_model.nrm[ticker_idx]['IV'][1], IV_t)))
            IV = IV.transpose(1,0,2,3)
            
            return delta_grid_t, tau_grid_t, IV, price
        
        seq_length = int(len(nu_pred)/len(self.tickers))
        df_list = []
        for i in range(len(self.tickers)):
            delta_grid_t_i, tau_grid_t_i, IV_i, price_i = Data_per_Ticker(nu_pred[i*seq_length:(i+1)*seq_length], i)
            IV_df = pd.DataFrame(data=np.array(IV_i)[0,0,:,:], columns=self.trained_model.data[0]['Delta']*100, index=self.trained_model.data[0]['tau']*365)
            df_unpivoted = IV_df.stack().reset_index()
            df_unpivoted.columns = ['days', 'delta', 'predicted_IV']
            df_unpivoted['date'] = self.data[0]['dates'][start]
            df_unpivoted['ticker'] = self.trained_model.tickers[i]
            df_unpivoted['price'] = price_i[0]
            df_unpivoted['pred_days'] = dT * 365
            df_list.append(df_unpivoted)
                    
        return pd.concat(df_list, axis=0)

        pass

# FuNVol

In [2]:
# perform FPC projection
with open("G:\gitCode\BU_programming\\finnal_project\FuNVol\data\dynamicIV.pkl", "rb") as md:
    trained_model = pickle.load(md)
model = Prediction('self.test_data/', ['AMZN', 'IBM', 'INTC', 'TSLA'], model=trained_model, K=8)

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

# Model Training

In [None]:
#%%
# Generate data for future scenarios
# Can provide the number of independent scenarios and time steps as arguments to the function

with torch.no_grad():
    delta_grid_t, tau_grid_t, IV, price, b_sim = model.Generate_Data(nsims=10, nsteps=31)
    
""" delta_grid_t and tau_grid_t are lists containing the transformed delta-tau meshgrid for each equity
By default, IV values are calculated on the same grid as that for which data was available
IV is a list containing the implied vol values on the above grid for each equity
price is a list containing price paths for each equity
b_sim contains the simulated time series of FPCCs and equity prices """

""" b_sim is nsims x nsteps x 36 (4 equities times 8 FPCCs plus 1 price) dimensional
The first index corresponds to the generated (independent) scenarios
The second index corresponds to the sequence of days where day 0 is the last day of training (observed) and the remaining 29 days are generated coefficients
Hence day 0 will give the FPC coefficients (FPCCs) for the IV surface that is observed on the last day of training and is not a synthetic generated surface
The last index corresponds to the different assets' FPCCs and transformed equity prices, details for third index below:
0-7 give FPCCs for AMZN, 8 gives transformed equity price for AMZN
9-16 give FPCCs for IBM, 17 gives transformed equity price for IBM
18-25 give FPCCs for INTC, 26 gives transformed equity price for INTC
27-34 give FPCCs for TSLA, 35 gives transformed equity price for TSLA """

""" Each element of price is nsims x nsteps dimensional """

"""Each element in the list IV is nsims x nsteps x len(tau) x len(delta) dimensional
Each element of delta_grid_t and tau_grid_t is len(tau) x len(delta) containing the transformed
values of delta and tau on a grid at which IV is calculated"""

# to plot surfaces and get IV values at a different grid, refer to generated_data.ipynb
with open('data/sim_data_coeffs.npy', 'wb') as f:
    np.save(f, b_sim)
    
with open('data/sim_data_prices.npy', 'wb') as f:
    np.save(f, np.array(price).transpose(1,2,0))