In [7]:
import refinitiv.data as rd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import pandas as pd
from hmmlearn.hmm import GaussianHMM,GMMHMM, CategoricalHMM
import plotly.graph_objects as go
from plotly.graph_objs.scatter.marker import Line
from plotly.subplots import make_subplots
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
import warnings
import math

# Suppress every Python warning for the rest of the kernel session. This also
# hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
# Open a Refinitiv Data session; no arguments are passed, so whichever session
# the library is configured to use by default is opened (the recorded cell
# output shows a session named 'workspace').
rd.open_session()

<refinitiv.data.session.Definition object at 0x7fdfd0e275c0 {name='workspace'}>

## Data Ingestion and Engineering

In [7]:
class DataEngineering:
    """Turn a price history into moving-average returns for regime models.

    Parameters
    ----------
    prices : pandas.DataFrame
        Price history. ``prices.columns.name`` is used as the instrument
        identifier, and a column with that same name must exist (it is copied
        into the ``"<instrument>_close"`` column).
    split_date
        Boundary between the training and the test period. Rows whose date is
        strictly later than this value belong to the test period.
    """

    def __init__(self, prices, split_date):
        self.prices = prices
        self.split_date = split_date
        # The instrument identifier is stored as the name of the columns index.
        self.instrument = prices.columns.name

    def prepare_data(self):
        """Run the whole preparation pipeline.

        Returns
        -------
        tuple
            ``(prices_change, data, split_index)`` where ``prices_change`` is
            the date-indexed frame of moving-average returns plus the close
            column, ``data`` is the 2-D feature array for the models and
            ``split_index`` is the position of the first test row.
        """
        prices_change = self.calculate_change_of_ma(self.prices)
        data = self.prepare_data_for_model_input(prices_change)
        split_index = self.get_split_index(prices_change, self.split_date)

        # get_split_index no longer moves the dates into a column, so the
        # frame is still indexed by date and can be returned as it is.
        return prices_change, data, split_index

    def calculate_change_of_ma(self, prices, window=7):
        """Return the percentage change of the rolling mean of ``prices``.

        Parameters
        ----------
        prices : pandas.DataFrame
            Price history to smooth.
        window : int, default 7
            Number of observations in the rolling mean.

        Returns
        -------
        pandas.DataFrame
            One return column per input column plus ``"<instrument>_close"``
            holding the unsmoothed instrument price. Rows containing NaN
            (the rolling warm-up period) are dropped.
        """
        prices_ma = prices.rolling(window).mean()

        prices_change = prices_ma.pct_change()
        prices_change[f'{self.instrument}_close'] = prices[self.instrument]
        return prices_change.dropna()

    def prepare_data_for_model_input(self, prices_change):
        """Stack every column except the close price into a 2-D array.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(n_rows, n_feature_columns)``.
        """
        close_column = f'{self.instrument}_close'
        feature_columns = [
            column for column in prices_change.columns if column != close_column
        ]
        # column_stack needs a real sequence: handing it a dict view is
        # rejected by recent NumPy releases, so a list is built explicitly.
        return np.column_stack(
            [prices_change[column].to_numpy() for column in feature_columns]
        )

    def split_for_train_test(self, data, split_index):
        """Split ``data`` positionally into ``(train, test)`` at ``split_index``."""
        rets_train = data[:split_index]
        rets_test = data[split_index:]

        return rets_train, rets_test

    def get_split_index(self, prices_change, split_date):
        """Return the position of the first row dated after ``split_date``.

        The dates are read from a ``'Date'`` column when the frame has one and
        from the index otherwise. ``prices_change`` is left untouched, so the
        method can be called repeatedly on the same frame.

        Raises
        ------
        IndexError
            If no row is dated after ``split_date``.
        """
        if 'Date' in prices_change.columns:
            dates = prices_change['Date']
        else:
            dates = prices_change.index

        positions_after_split = np.flatnonzero(np.asarray(dates > split_date))
        if positions_after_split.size == 0:
            raise IndexError(
                f"no observation is dated after split_date={split_date!r}"
            )

        return int(positions_after_split[0])

## Modeling and Evaluation

In [10]:
class RegimeDetection:
    """Factory for the regime-detection models used in this notebook."""

    def get_regimes_hmm(self, input_data, n_components=2, random_state=1000):
        """Fit a full-covariance Gaussian HMM on ``input_data``.

        Parameters
        ----------
        input_data
            Observations passed straight to ``GaussianHMM.fit``.
        n_components : int, default 2
            Number of hidden states.
        random_state : default 1000
            Seed forwarded to the model.

        Returns
        -------
        The value returned by ``GaussianHMM.fit``.
        """
        hmm_model = GaussianHMM(
            n_components=n_components,
            n_iter=10000,
            covariance_type="full",
            random_state=random_state,
        )
        return hmm_model.fit(input_data)

    def get_regimes_clustering(self, model, n_clusters=2):
        """Build an unfitted clustering estimator.

        Parameters
        ----------
        model : str
            Either ``'AgglomerativeClustering'`` or ``'kmeans'``.
        n_clusters : int, default 2
            Number of clusters.

        Raises
        ------
        ValueError
            If ``model`` is not one of the two supported names.
        """
        if model == 'AgglomerativeClustering':
            try:
                return AgglomerativeClustering(
                    n_clusters=n_clusters, linkage='complete', metric='manhattan')
            except TypeError:
                # Older scikit-learn releases only know the previous name of
                # this keyword, `affinity`.
                return AgglomerativeClustering(
                    n_clusters=n_clusters, linkage='complete', affinity='manhattan')

        if model == 'kmeans':
            return KMeans(n_clusters=n_clusters)

        raise ValueError(
            f"unknown clustering model {model!r}; "
            "expected 'AgglomerativeClustering' or 'kmeans'"
        )

    def get_regimes_gmm(self, input_data, n_components=2, random_state=None):
        """Fit a full-covariance Gaussian mixture on ``input_data``.

        Parameters
        ----------
        input_data
            Observations passed straight to ``GaussianMixture.fit``.
        n_components : int, default 2
            Number of mixture components.
        random_state : default None
            Seed forwarded to the model; pass a value for reproducible fits.

        Returns
        -------
        The value returned by ``GaussianMixture.fit``.
        """
        gmm = GaussianMixture(
            n_components=n_components,
            covariance_type='full',
            max_iter=100000,
            n_init=30,
            init_params='kmeans',
            random_state=random_state,
        )
        return gmm.fit(input_data)

In [6]:
def plot_hidden_states(hidden_states, df, n_components):
    """Show one figure per state: the close price with that state's points marked.

    Parameters
    ----------
    hidden_states : array-like
        State label of every row of ``df``, in row order. Lists are accepted.
    df : pandas.DataFrame
        Frame containing the ``"<name>_close"`` column, where ``<name>`` is
        ``df.columns.name``.
    n_components : int
        Number of states to plot (labels ``0 .. n_components - 1``).
    """
    colors = ['blue', 'green', 'yellow', 'black', 'grey']
    # A plain list compared with an int gives a single False instead of an
    # element-wise mask, so the labels are converted to an array first.
    states = np.asarray(hidden_states)
    # The instrument name comes from the frame being plotted, not from a
    # notebook-level variable.
    instrument = df.columns.name
    close_column = f"{instrument}_close"

    for i in range(n_components):
        mask = states == i
        print('Number of observations for State ', i,":", len(df.index[mask]))

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=df.index, y=df[close_column],
                    mode='lines',
                    name = f'Price {instrument}',
                    line_color = 'red'))

        fig.add_trace(go.Scatter(x=df.index[mask], y=df[close_column][mask],
                    mode='markers',
                    name='Hidden State ' + str(i),
                    # Cycle through the palette when there are more states
                    # than colours.
                    marker=dict(size=4, color=colors[i % len(colors)])))

        fig.update_layout(height=300, width=600, legend=dict(
            yanchor="top", y=0.99,
            xanchor="left",x=0.01),
            margin=dict(l=20, r=20, t=20, b=20))

        fig.show()

In [5]:
def plot_hidden_states_cf(hidden_states, df, n_components):
    """Show one figure per state: the ``close`` series with that state's points marked.

    Parameters
    ----------
    hidden_states : array-like
        State label of every row of ``df``, in row order. Lists are accepted.
    df : pandas.DataFrame
        Frame containing a ``"close"`` column.
    n_components : int
        Number of states to plot (labels ``0 .. n_components - 1``).
    """
    colors = ['blue', 'green', 'yellow', 'black', 'grey']
    # A plain list compared with an int gives a single False instead of an
    # element-wise mask, so the labels are converted to an array first.
    states = np.asarray(hidden_states)

    for i in range(n_components):
        mask = states == i
        print('Number of observations for State ', i,":", len(df.index[mask]))

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=df.index, y=df["close"],
                    mode='lines',
                    name = 'Price',
                    line_color = 'red'))

        fig.add_trace(go.Scatter(x=df.index[mask], y=df["close"][mask],
                    mode='markers',
                    name='Hidden State ' + str(i),
                    # Cycle through the palette when there are more states
                    # than colours.
                    marker=dict(size=4, color=colors[i % len(colors)])))

        fig.update_layout(height=300, width=600, legend=dict(
            yanchor="top", y=0.99,
            xanchor="left",x=0.01),
            margin=dict(l=20, r=20, t=20, b=20))

        fig.show()

In [None]:
def feed_forward_training(model, prices, split_index, train_chunk_size):
    """Predict the state of each test observation with an expanding window.

    The model is first fitted on ``prices[:split_index]``. Then, for every
    later observation, the current model predicts states for the whole
    history up to and including that observation and the last state is
    recorded. Whenever the step counter is a multiple of ``train_chunk_size``
    (this includes the very first step) the model is refitted on that same
    history.

    Relies on the notebook-level ``regime_detection`` object for the fitting
    functions.

    Parameters
    ----------
    model : str
        ``'hmm'`` or ``'gmm'``.
    prices
        Sliceable sequence of observations accepted by the chosen model.
    split_index : int
        Number of leading observations used for the initial fit.
    train_chunk_size : int
        Refit interval, in observations. Must be positive.

    Returns
    -------
    list of int
        One predicted state per observation after ``split_index``.

    Raises
    ------
    ValueError
        If ``train_chunk_size`` is not positive.
    KeyError
        If ``model`` is not a supported name.
    """
    if train_chunk_size <= 0:
        # Fail before any fitting happens instead of hitting a
        # ZeroDivisionError in the loop after the initial training.
        raise ValueError(
            f"train_chunk_size must be positive, got {train_chunk_size!r}"
        )

    models = {'hmm': regime_detection.get_regimes_hmm,
              'gmm': regime_detection.get_regimes_gmm}
    fit_model = models[model]

    init_train_data = prices[:split_index]
    n_test = len(prices[split_index:])

    states_pred = []
    rd_model = fit_model(init_train_data)
    # Label the log lines with the model actually in use.
    print(f'{model} score initial training', rd_model.score(init_train_data))

    for i in range(n_test):
        # History up to and including the i-th test observation.
        history = prices[:split_index + i + 1]
        states_pred.append(int(rd_model.predict(history)[-1]))

        if i % train_chunk_size == 0:
            rd_model = fit_model(history)
            print(i, f'{model} retrain', rd_model.score(history))

    return states_pred