## Preprocessing and Computing different indicators

In [1]:
import pandas as pd
import numpy as np
import talib as ta
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
class TechnicalIndicators:
    def __init__(self, data):
        self.data = data

    def add_momentum_indicators(self):
        self.data['RSI'] = ta.RSI(self.data['Close'], timeperiod=14)
        self.data['MACD'], self.data['MACD_signal'], self.data['MACD_hist'] = ta.MACD(self.data['Close'], fastperiod=12, slowperiod=26, signalperiod=9)
        self.data['Stoch_k'], self.data['Stoch_d'] = ta.STOCH(self.data['High'], self.data['Low'], self.data['Close'],
                                                              fastk_period=14, slowk_period=3, slowd_period=3)

    def add_volume_indicators(self):
        self.data['OBV'] = ta.OBV(self.data['Close'], self.data['Volume'])

    def add_volatility_indicators(self):
        self.data['Upper_BB'], self.data['Middle_BB'], self.data['Lower_BB'] = ta.BBANDS(self.data['Close'], timeperiod=20)
        self.data['ATR_1'] = ta.ATR(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=1)
        self.data['ATR_2'] = ta.ATR(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=2)
        self.data['ATR_5'] = ta.ATR(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=5)
        self.data['ATR_10'] = ta.ATR(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=10)
        self.data['ATR_20'] = ta.ATR(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=20)

    def add_trend_indicators(self):
        self.data['ADX'] = ta.ADX(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=14)
        self.data['+DI'] = ta.PLUS_DI(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=14)
        self.data['-DI'] = ta.MINUS_DI(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=14)
        self.data['CCI'] = ta.CCI(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=5)

    def add_other_indicators(self):
        self.data['DLR'] = np.log(self.data['Close'] / self.data['Close'].shift(1))
        self.data['TWAP'] = self.data['Close'].expanding().mean()
        self.data['VWAP'] = (self.data['Volume'] * (self.data['High'] + self.data['Low']) / 2).cumsum() / self.data['Volume'].cumsum()

    def add_all_indicators(self):
        self.add_momentum_indicators()
        self.add_volume_indicators()
        self.add_volatility_indicators()
        self.add_trend_indicators()
        self.add_other_indicators()
        return self.data

In [2]:
data = pd.read_csv('xnas-itch-20230703.tbbo.csv')
# Preprocessing to create necessary columns
data['price']=data['price']/1e9
data['bid_px_00']=data['bid_px_00']/1e9
data['ask_px_00']=data['ask_px_00']/1e9

data['Close'] = data['price']
data['Volume'] = data['size']
data['High'] = data[['bid_px_00', 'ask_px_00']].max(axis=1)
data['Low'] = data[['bid_px_00', 'ask_px_00']].min(axis=1)
data['Open'] = data['Close'].shift(1).fillna(data['Close'])


ti = TechnicalIndicators(data)
df_with_indicators = ti.add_all_indicators()
market_features_df = df_with_indicators[35:]

In [39]:
market_features_df['Volume_scaled'] = market_features_df[['Volume']].apply(lambda x: x/10**5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  market_features_df['Volume_scaled'] = market_features_df[['Volume']].apply(lambda x: x/10**5)


In [40]:
features_list = ['Close', 'Volume_scaled', 'High', 'Low', 'Open', 'RSI']#, 'MACD', 'MACD_signal']#, 'Stoch_k', 'Stoch_d']
features = [np.array(market_features_df[col].array) for col in features_list]

In [41]:
dfeatures = market_features_df[features_list]

## Create prediction pipelines
* Comaring 2 pipelines
    * Original Chronos which performs well with zero shot tasks
    * Chronos model trained from scratch on only the given data
    * Base model was chose as t5-tiny llm keeping computational costs in mind.

In [6]:
from chronos import ChronosPipeline

pipeline = ChronosPipeline.from_pretrained("/workspace/checkpoint/",  device_map="cuda",
                                              torch_dtype=torch.bfloat16)
pipeline2 = ChronosPipeline.from_pretrained(
  "amazon/chronos-t5-tiny",
  device_map="cuda",
  torch_dtype=torch.bfloat16,
)

config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

## Generate Labels/indicators for labels

In [66]:
class generate_label:
  def __init__(self,df,pipeline,history = 100,nsteps = 5, buy_threshold = 0.01,sell_threshold = -0.01):
    self.df = df
    self.pipeline = pipeline
    self.nsteps = nsteps
    self.buy_threshold = buy_threshold
    self.sell_threshold = sell_threshold
    self.history = history
    self.features_list = ['Close', 'Volume_scaled', 'High', 'Low', 'Open', 'RSI']
  def get_label(self,row):
      '''
      use the model to and past_data to predict, future data on the price(close price) of the stock
      have at least 100 samples to predict
      '''
      if row < self.history:
        return 0,0
      cur_close = self.df.iloc[row]['Close']
      
      context = [torch.tensor(self.df[col][row-self.history:row].array) for col in self.features_list]
      forecast = self.pipeline.predict(context,self.nsteps)
      median_close = torch.quantile(forecast,0.5,axis = 1)[0]
      max_future_price = torch.max(median_close)
      min_future_price = torch.min(median_close)
      max_pct_change = (max_future_price - cur_close)/cur_close
      min_pct_change = (min_future_price - cur_close)/cur_close
      return max_pct_change.item(),min_pct_change.item()
      # if max_pct_change > self.buy_threshold:
      #   return 'buy'
      # elif min_pct_change < self.sell_threshold:
      #   return 'sell'
      # else:
      #   return 'hold'
  def get_labels(self):
    print("getting labels ****")
    # labels = []
    max_pct = []
    min_pct = []
    for i in range(len(self.df)):
      ret = self.get_label(i)
      max_pct.append(ret[0])
      min_pct.append(ret[1])
      #labels.append(self.get_label(i))
    return max_pct , min_pct
  def assign_labels(self):
    # self.df['labels'] = self.get_labels()
      ret = self.get_labels()
      self.df['max_pct_change'] = ret[0]
      self.df['min_pct_change'] = ret[1]

In [67]:
labeler = generate_label(dfeatures,pipeline,buy_threshold=0.007,sell_threshold = -0.005,nsteps = 12)
labeler.assign_labels()

getting labels ****


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['max_pct_change'] = ret[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['min_pct_change'] = ret[1]


In [65]:
labeler.df['max_pct_change']

35     0.000000
36     0.000000
37     0.000000
38     0.000000
39     0.000000
         ...   
230    0.006861
231    0.006809
232    0.006809
233    0.006756
234    0.007171
Name: max_pct_change, Length: 200, dtype: float64

In [62]:
 torch.tensor(0.0069, dtype=torch.float64).item()

0.0069

In [33]:
labeler.df['labels'].value_counts()

labels
buy     33629
hold    25108
sell      499
Name: count, dtype: int64

In [68]:
# Save data to disk
labeler.df.to_csv('trained_data.csv',index = False)