In [28]:
import pandas as pd
import numpy as np

# Load the BNP price history, parse the dates and order the rows by date so
# that diff() below compares each row with the chronologically previous one.
data_bnp = (
    pd.read_csv('../data/BNPPA.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values('Date')
    .reset_index(drop=True)
)

# 1) Filter invalid or missing close prices (log() needs a strictly positive
#    value). The explicit .copy() makes the filtered frame independent of the
#    loaded one, so the column assignments below cannot trigger pandas'
#    SettingWithCopyWarning.
has_valid_close = data_bnp['Close'].notna() & (data_bnp['Close'] > 0)
data_bnp = data_bnp.loc[has_valid_close].copy()

# 2) Compute log prices & returns
data_bnp['LogClose'] = np.log(data_bnp['Close'])
data_bnp['LogRet'] = data_bnp['LogClose'].diff()

# 3) Replace inf with NaN everywhere, then drop rows without a usable return
#    (this always removes the first row, whose diff() is NaN).
data_bnp = (
    data_bnp
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=['LogRet'])
)

data_bnp

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,LogClose,LogRet
1,1993-10-19,21.129074,21.228039,20.832178,20.881660,3.838838,8760516.0,3.038871,-0.025732
2,1993-10-20,20.881660,21.129074,20.782696,21.030109,3.866129,6174815.0,3.045955,0.007084
3,1993-10-21,20.980625,21.376488,20.980625,21.376488,3.929806,8263391.0,3.062292,0.016336
4,1993-10-22,21.425970,21.970278,21.376488,21.871313,4.020774,10181091.0,3.085176,0.022884
5,1993-10-25,21.821831,22.019760,21.673384,21.920795,4.029871,4038109.0,3.087436,0.002260
...,...,...,...,...,...,...,...,...,...
7517,2022-12-06,52.250000,52.529999,51.849998,52.459999,52.459999,1969471.0,3.960051,0.002672
7518,2022-12-07,52.349998,52.639999,51.730000,52.000000,52.000000,2131657.0,3.951244,-0.008807
7519,2022-12-08,52.259998,52.349998,51.860001,52.000000,52.000000,1764207.0,3.951244,0.000000
7520,2022-12-09,52.299999,52.889999,51.860001,52.889999,52.889999,2243527.0,3.968214,0.016971


In [29]:
# Final sanity check before the returns are handed to the model

# Observations must be 2D: one row per sample, a single feature column
observations = data_bnp['LogRet'].to_numpy().reshape(-1, 1)

# Abort early if any non-finite value (NaN or +/-Inf) survived the cleaning
if not np.isfinite(observations).all():
    raise ValueError("Observations still contain NaN or Inf - check data pipeline.")

observations

array([[-0.02573248],
       [ 0.00708391],
       [ 0.01633645],
       ...,
       [ 0.        ],
       [ 0.01697055],
       [-0.00397839]])

In [None]:
import os

from py2neo import Graph

# Connect to Neo4j.
# Connection settings are read from the environment so that real credentials
# never have to be written into the notebook; the fallbacks reproduce the
# original local-development values.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "password")
graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# WARNING: this deletes EVERY node and relationship in the connected database,
# not just the ones created by this notebook. Only run it against a scratch
# instance.
graph.run("MATCH (n) DETACH DELETE n")


In [38]:
import warnings

from hmmlearn import hmm

# Number of hidden states (market regimes) the HMM is allowed to use
n_components = 10


# Use the GaussianHMM class to create a new model
model = hmm.GaussianHMM(
    n_components=n_components,
    covariance_type='full',
    n_iter=100,      # Max # of EM iterations
    random_state=42  # For reproducibility
)

# Fit the model to the data using the Baum-Welch (EM) algorithm
model.fit(observations)

# EM stops after n_iter iterations whether or not it has converged, so make a
# non-converged fit visible instead of silently using its parameters.
if not model.monitor_.converged:
    warnings.warn(
        f"GaussianHMM did not converge within {model.n_iter} EM iterations; "
        "the fitted parameters may be unreliable.",
        RuntimeWarning,
    )

# --------------------------------------------------------------------------
# 4. Decode Hidden States (Viterbi)
# --------------------------------------------------------------------------
# hidden_states[i] = most likely hidden state index on day i
hidden_states = model.predict(observations)

# Model parameters we might want to store in Neo4j
start_probs = model.startprob_        # shape = (n_components,)
trans_probs = model.transmat_         # shape = (n_components, n_components)
means = model.means_.flatten()        # shape = (n_components,) because there is one feature
covars = model.covars_                # shape = (n_components, 1, 1): full covariance of one feature
# NOTE(review): the original comment said covariance_type='diag' would give
# shape (n_components, n_features); recent hmmlearn releases appear to expose
# covars_ as full matrices for every covariance_type - confirm for the
# installed version before relying on either shape.

transition_matrix = model.transmat_
transition_matrix_3dec = np.round(transition_matrix, 3)
print("Transition Matrix (3 decimals):")

print(transition_matrix_3dec)

Transition Matrix (3 decimals):
[[0.447 0.    0.    0.    0.    0.    0.    0.439 0.    0.114]
 [0.37  0.    0.    0.    0.    0.    0.455 0.175 0.    0.   ]
 [0.    0.009 0.    0.    0.    0.    0.    0.022 0.969 0.   ]
 [0.226 0.    0.    0.708 0.067 0.    0.    0.    0.    0.   ]
 [0.    0.    0.    0.    0.172 0.77  0.    0.004 0.    0.054]
 [0.    0.    0.    0.    0.973 0.    0.    0.    0.    0.027]
 [0.    0.    0.164 0.    0.    0.    0.    0.    0.836 0.   ]
 [0.    0.    0.008 0.014 0.    0.    0.001 0.978 0.    0.   ]
 [0.    0.    0.843 0.    0.    0.    0.127 0.    0.029 0.   ]
 [0.    0.    0.    0.045 0.    0.    0.    0.    0.    0.955]]


In [None]:
# --------------------------------------------------------------------------
# 5. Save the HMM in Neo4j
# --------------------------------------------------------------------------

hmm_name = "BNP_HMM"

# Remove any previously stored copy of THIS model only (e.g. left-over states
# from a run with more components). Unrelated data in the database is left
# alone, unlike a blanket "MATCH (n) DETACH DELETE n".
graph.run("MATCH (s:State {hmm_name: $hmm_name}) DETACH DELETE s", hmm_name=hmm_name)
graph.run("MATCH (h:HMM {name: $hmm_name}) DETACH DELETE h", hmm_name=hmm_name)

# A. Create a parent node for the entire HMM (optional but helps organization)
#    The description is passed as a parameter rather than interpolated into
#    the query text.
create_hmm_query = """
MERGE (h:HMM {name: $hmm_name})
SET h.description = $description
RETURN h
"""
graph.run(
    create_hmm_query,
    hmm_name=hmm_name,
    description=f"GaussianHMM with {n_components} hidden states",
)

# B. Create nodes for each hidden state with their parameters
#    We'll store: state index, mean, variance, start prob.
state_rows = []
for i in range(n_components):
    # With 1-D features the covariance of a state is a 1x1 matrix, so the mean
    # of its diagonal is simply the variance. Lower-dimensional entries are
    # reduced with mean() as well so float() always receives a scalar.
    cov_i = np.asarray(covars[i])
    var_value = cov_i.diagonal().mean() if cov_i.ndim > 1 else cov_i.mean()
    state_rows.append({
        "state_index": i,
        "mean": float(means[i]),
        "variance": float(var_value),
        "start_prob": float(start_probs[i]),
    })

# One round trip for all states. MERGE matches on the identifying keys only
# and SET always writes the current parameters, so re-running is idempotent.
create_states_query = """
UNWIND $states AS row
MERGE (s:State {hmm_name: $hmm_name, state_index: row.state_index})
SET s.mean = row.mean,
    s.variance = row.variance,
    s.start_prob = row.start_prob
"""
graph.run(create_states_query, hmm_name=hmm_name, states=state_rows)

# C. Create relationships for the transition probabilities
#    We'll create (s1)-[TRANSITION_TO {prob:...}]->(s2) for each pair.
transition_rows = [
    {"src": i, "dst": j, "prob": float(trans_probs[i, j])}
    for i in range(n_components)
    for j in range(n_components)
]

# MERGE on the relationship type only and SET the probability afterwards:
# merging on {prob: ...} would add a second TRANSITION_TO edge between the same
# two states whenever the probability value changes.
create_transitions_query = """
UNWIND $transitions AS row
MATCH (s1:State {hmm_name: $hmm_name, state_index: row.src}),
      (s2:State {hmm_name: $hmm_name, state_index: row.dst})
MERGE (s1)-[r:TRANSITION_TO]->(s2)
SET r.prob = row.prob
"""
graph.run(create_transitions_query, hmm_name=hmm_name, transitions=transition_rows)

print("HMM parameters have been stored in Neo4j.")

HMM parameters have been stored in Neo4j.


In [None]:
def forecast_log_returns(start_probs, transmat, means, steps=5):
    """
    Forecast the expected log return for each of the next `steps` periods.

    start_probs: posterior probability distribution over states for the last known day
    transmat: transition matrix from the fitted HMM; it is applied as
              `dist @ transmat`, i.e. row i holds the probabilities of moving
              out of state i
    means: array of shape (n_components,) - mean log return per state
    steps: how many steps ahead to forecast

    Returns a list of `steps` floats. Element k is the probability-weighted
    mean log return k + 1 periods after the last known day. The inputs are
    not modified.
    """
    # Work on float copies/views so lists are accepted and callers' arrays
    # are never mutated.
    dist = np.array(start_probs, dtype=float)
    transmat = np.asarray(transmat, dtype=float)
    means = np.asarray(means, dtype=float).ravel()

    forecasts = []
    for _ in range(steps):
        # start_probs describes the last *observed* day, so advance the state
        # distribution one step first; otherwise the first "forecast" would
        # just be the expected return of a day that is already known.
        dist = dist @ transmat
        # Weighted average log return under the predicted state distribution
        forecasts.append(float(dist @ means))

    return forecasts

# Posterior state probabilities for every observed day under the fitted model.
# Computed here explicitly so the cell works on a fresh kernel and always
# reflects the current `model`.
posterior_probs = model.predict_proba(observations)

# State distribution on the last observed day: the starting point of the forecast
last_day_probs = posterior_probs[-1, :]
multi_step_logret = forecast_log_returns(
    start_probs=last_day_probs,
    transmat=model.transmat_,
    means=model.means_.flatten(),
    steps=5
)

