In [None]:
# Treating data
# Assuming data is daily
# Loading libraries. 
import math 
import os
import numpy as np
import pandas as pd

# Getting price data
# Read the OHLC file once and keep each price column as its own Series.
data = pd.read_csv("vol/data.csv")
# NOTE: `open` rebinds the built-in of the same name; the name is kept
# because later statements in this cell refer to it.
open, close, high, low = (
  data[column] for column in ("Open", "Close", "High", "Low")
)

# Generating variable values
# 1-day realized vols for daily data
# Entry k holds log(close[k+1]) - log(close[k]), the close-to-close log
# return of row k+1, so the list has len(close) - 1 entries.
# NOTE(review): the value is signed (no absolute value / square root of a
# square is taken) - confirm that a signed return is what is wanted here.
realized_1 = [
  math.log(close[i]) - math.log(close[i - 1])
  for i in range(1, len(close))
]

def _trailing_realized_vol(closes, window):
  """Return the trailing realized volatility of a close-price series.

  For every row i in range(window + 1, len(closes)) the result holds
  sqrt(sum of squared log returns of rows i-window .. i-1); the return of
  row i itself is not part of its own window.

  Args:
    closes: close prices, indexable by position 0 .. len(closes) - 1.
    window: number of daily log returns in each window.

  Returns:
    A list with len(closes) - window - 1 entries (empty when the series is
    too short), entry k belonging to row k + window + 1.
  """
  # Take each logarithm once instead of twice per window position.
  log_close = [math.log(price) for price in closes]
  vols = []
  for i in range(window + 1, len(log_close)):
    squared_total = 0.0
    for j in range(i - window, i):
      # `**` is exponentiation; `^` would be bitwise XOR.
      squared_total += (log_close[j] - log_close[j - 1]) ** 2
    vols.append(math.sqrt(squared_total))
  return vols


# 5-day realized vols for daily data
realized_5 = _trailing_realized_vol(close, 5)

# 22-day realized vols for daily data
realized_22 = _trailing_realized_vol(close, 22)

# True Range
# For every row after the first: the largest of the day's high-low span and
# the absolute distances from the previous close to the day's high and low.
# Entry k belongs to row k+1, so the list has len(close) - 1 entries.
true_range = [
  max(
    high[i] - low[i],
    abs(high[i] - close[i - 1]),
    abs(low[i] - close[i - 1]),
  )
  for i in range(1, len(close))
]

# Average True Range
# Expanding mean of the true range: avg_tr[k] is the mean of
# true_range[0..k], so both lists have the same length. A running total is
# kept so that every observation, including the first one, carries equal
# weight; an empty true_range simply yields an empty avg_tr.
avg_tr = []
tr_total = 0.0
for count, tr_value in enumerate(true_range, start=1):
  tr_total += tr_value
  avg_tr.append(tr_total / count)

# Close Relative To Daily Range
# (close - low) / (high - low) for every row: 0 when the close sits at the
# day's low, 1 when it sits at the day's high.
crtdr = []
for i in range(len(close)):
  day_range = high[i] - low[i]
  if day_range == 0:
    # High equals low, so the position inside the range is undefined.
    crtdr.append(float("nan"))
  else:
    crtdr.append((close[i] - low[i]) / day_range)

def _trailing_rsi(closes, window):
  """Return a 0..1 relative-strength index over trailing windows of closes.

  For every row i in range(window + 1, len(closes)) the close-to-close
  changes of rows i-window .. i-1 are split into gains (change > 0) and
  losses (everything else, stored as non-negative amounts). With
  RS = mean(gains) / mean(losses) the index is 1 - 1 / (1 + RS), evaluated
  here in the algebraically equal form mean_gain / (mean_gain + mean_loss)
  so that a window without gains gives 0.0 and one without losses gives 1.0
  instead of dividing by zero.

  Each mean is taken over the number of gain (respectively loss) days in the
  window, not over the window length.

  Args:
    closes: close prices, indexable by position 0 .. len(closes) - 1.
    window: number of daily changes in each window.

  Returns:
    A list with len(closes) - window - 1 entries, entry k belonging to row
    k + window + 1. An entry is NaN when no price moved inside its window.
  """
  values = []
  for i in range(window + 1, len(closes)):
    gain_total = 0.0
    loss_total = 0.0
    gain_days = 0
    loss_days = 0
    for j in range(i - window, i):
      change = closes[j] - closes[j - 1]
      if change > 0:
        gain_total += change
        gain_days += 1
      else:
        loss_total -= change
        loss_days += 1
    mean_gain = gain_total / gain_days if gain_days else 0.0
    mean_loss = loss_total / loss_days if loss_days else 0.0
    movement = mean_gain + mean_loss
    values.append(mean_gain / movement if movement else float("nan"))
  return values


# Relative Strength Index for realized volatility - 5 days
# NOTE(review): the index is computed from close prices, not from the
# realized-volatility series - confirm that this is the intended input.
rsirv_5 = _trailing_rsi(close, 5)

# Relative Strength Index for realized volatility - 22 days
rsirv_22 = _trailing_rsi(close, 22)

# Arrays into pandas dataframe
# The feature series have different lengths because each one starts at a
# different row, but every one of them ends on the last row of the price
# data. Keeping only the trailing `n_rows` values of each therefore lines
# them up row by row.
feature_names = ['open', 'high', 'low', 'close', 'realized_1', 'realized_5', 'realized_22', 'crtdr']
feature_series = [open, high, low, close, realized_1, realized_5, realized_22, crtdr]
n_rows = min(len(series) for series in feature_series)
# `len(series) - n_rows` is used rather than `-n_rows` so that n_rows == 0
# yields an empty slice instead of the whole series.
# The stacked array is feature-major; transpose it so that each feature
# becomes one DataFrame column.
arr = np.array([
  np.asarray(series)[len(series) - n_rows:] for series in feature_series
]).T
dataset_df = pd.DataFrame(arr, columns=feature_names)

# Splitting dataset into training and testing datasets.
def split_dataset(dataset, test_ratio=0.10, seed=None):
  """Randomly split a pandas dataframe into training and testing parts.

  Every row is sent to the testing part independently with probability
  `test_ratio`, so the size of the testing part is only approximately
  `test_ratio * len(dataset)`.

  Args:
    dataset: dataframe to split; it is not modified.
    test_ratio: probability for a row to land in the testing part.
    seed: optional seed for a reproducible split. When None, the draws come
      from NumPy's global random state.

  Returns:
    A `(train, test)` tuple of dataframes that together contain every row of
    `dataset` exactly once.
  """
  if seed is None:
    draws = np.random.rand(len(dataset))
  else:
    draws = np.random.default_rng(seed).random(len(dataset))
  test_indices = draws < test_ratio
  return dataset[~test_indices], dataset[test_indices]


# Split the assembled dataframe and report how many rows landed in each part.
train_ds_pd, test_ds_pd = split_dataset(dataset_df)
print(f"{len(train_ds_pd)} - training; {len(test_ds_pd)} - testing.")


In [None]:
# Training and evaluating the model. 
# Install TensorFlow's decision forest library. 
pip install tensorflow tensorflow_decision_forests

# Load other necessary libraries. 
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_decision_forests as tfdf

# Load the training and testing datasets into pandas dataframes. The data
# has been randomly split beforehand, with 10% in testing.
train_df = pd.read_csv("vol/train.csv")
test_df = pd.read_csv("vol/test.csv")

# Convert loaded datasets into TensorFlow datasets.
# NOTE(review): the label is assumed to be a continuous value, hence the
# regression task; switch to tfdf.keras.Task.CLASSIFICATION if the column
# actually holds class ids.
label = "realized_volatility"
task = tfdf.keras.Task.REGRESSION
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label=label, task=task)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df, label=label, task=task)

# Train the Random Forest model.
model = tfdf.keras.RandomForestModel(task=task, verbose=1)
model.fit(train_ds)

# Evaluation.
# Metrics have to be attached with compile() before evaluate() reports them.
model.compile(metrics=["mse"])
evaluation = model.evaluate(test_ds, return_dict=True)
print(evaluation)

# Plotting the model.
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0, max_depth=4)