# FPT Stock Prediction: HMM Regime + MultiDLinear

**Pipeline:**
1. HMM Clustering (60-day window) → 4 regimes
2. Train MultiDLinear per regime
3. Regime-probability-weighted ensemble prediction → 100-day forecast

In [None]:
# Imports
import os, random, warnings
from copy import deepcopy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from hmmlearn import hmm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')

# Seed
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random generators.

    Args:
        seed: Integer seed handed to every generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    # The explicit CUDA seeding is only performed when a GPU is visible.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Fix every RNG before any model or data loader is created.
set_seed(42)
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device: {device}")

In [None]:
# Config
CONFIG = dict(
    data_path='data/FPT_train.csv',  # CSV with the price history
    regime_window=60,                # rolling window used for the regime features
    n_regimes=4,                     # number of HMM hidden states
    seq_len=30,                      # input window length fed to the forecaster
    pred_len=100,                    # forecast horizon (number of output steps)
    batch_size=32,
    epochs=500,                      # upper bound on training epochs
    patience=80,                     # early-stopping patience, in epochs
    lr=1e-3,
    train_ratio=0.7,                 # leading fraction of rows used to fit the scalers
)
print("Config:", CONFIG)

## 1. Load & Prepare Data

In [None]:
# Load data
# Read the CSV, parse the timestamps and put the rows in chronological order.
df = (
    pd.read_csv(CONFIG['data_path'])
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .sort_values('time')
    .reset_index(drop=True)
)

# Log-transformed price columns.
for price_col in ('close', 'open', 'high', 'low'):
    df[f'{price_col}_log'] = np.log(df[price_col])
# Volume is shifted by one before the log so that zero volume stays finite.
df['volume_log'] = np.log(df['volume'] + 1)
# Intraday range relative to the close, and open-to-close move relative to the open.
df['hl_spread'] = (df['high'] - df['low']) / df['close']
df['oc_spread'] = (df['close'] - df['open']) / df['open']

first_day = df['time'].min().date()
last_day = df['time'].max().date()
print(f"Data: {len(df)} rows, {first_day} → {last_day}")

## 2. HMM Regime Detection

In [None]:
# Compute regime features
W = CONFIG['regime_window']

# Percentage change of the close over W rows.
df['return_W'] = df['close'].pct_change(W) * 100
# Rolling std of one-row returns, scaled by sqrt(252) and expressed in percent.
df['vol_W'] = df['close'].pct_change().rolling(W).std() * np.sqrt(252) * 100
# Slope of a straight-line fit through each window, relative to the window mean (percent).
# raw=True passes each window as a plain ndarray instead of building a Series per
# window; the computed value is the same but the rolling apply is much cheaper.
df['trend_W'] = df['close'].rolling(W).apply(
    lambda x: np.polyfit(np.arange(len(x)), x, 1)[0] / x.mean() * 100, raw=True
)

# HMM
# Only rows where all three rolling features are defined can be used.
df_hmm = df.dropna(subset=['return_W', 'vol_W', 'trend_W']).copy()
X_hmm = df_hmm[['return_W', 'vol_W', 'trend_W']].values

hmm_scaler = StandardScaler()
X_hmm_scaled = hmm_scaler.fit_transform(X_hmm)

model_hmm = hmm.GaussianHMM(n_components=CONFIG['n_regimes'], covariance_type='full',
                            n_iter=1000, random_state=42)
model_hmm.fit(X_hmm_scaled)
df_hmm['regime'] = model_hmm.predict(X_hmm_scaled)

# Name regimes by return
# Raw HMM state labels are arbitrary, so states are ranked by their mean window
# return (highest first) and the rank is used to pick a name and colour.
stats = df_hmm.groupby('regime')['return_W'].mean().sort_values(ascending=False)
REGIME_MAP = {r: i for i, r in enumerate(stats.index)}
REGIME_NAMES = ['Rally', 'Uptrend', 'Sideway', 'Downtrend']
REGIME_COLORS = ['#1B5E20', '#4CAF50', '#9E9E9E', '#C62828']

df_hmm['regime_id'] = df_hmm['regime'].map(REGIME_MAP)
df_hmm['regime_name'] = df_hmm['regime_id'].map(lambda x: REGIME_NAMES[x])

# Merge back
# Rows before the first full window have no regime; they inherit the nearest one.
df['regime'] = np.nan
df.loc[df_hmm.index, 'regime'] = df_hmm['regime'].values
df['regime'] = df['regime'].ffill().bfill().astype(int)
df['regime_id'] = df['regime'].map(REGIME_MAP)

# Summary
# Iterate over the states that were actually decoded: REGIME_MAP only holds states
# present in the predictions, so looping over range(n_regimes) would raise an
# IndexError whenever the HMM leaves one of its states unused.
print("\nRegime Summary:")
for orig, rid in REGIME_MAP.items():
    ret = stats.loc[orig]
    cnt = (df_hmm['regime'] == orig).sum()
    print(f"  {REGIME_NAMES[rid]}: {cnt} days ({cnt/len(df_hmm)*100:.1f}%), Return: {ret:+.1f}%")

In [None]:
# Plot
fig, ax = plt.subplots(figsize=(16, 5))
ax.plot(df['time'], df['close'], 'k-', lw=1)

# Shade every run of consecutive rows that share the same regime.
# Run boundaries are found in one vectorised comparison instead of reading the
# frame row by row with df.iloc inside a Python loop.
regime_ids = df['regime_id'].to_numpy()
times = df['time']
run_starts = np.flatnonzero(np.r_[True, regime_ids[1:] != regime_ids[:-1]])
# Each band extends to the first row of the next run (the last band to the final
# row), so bands are contiguous and a single-row regime is still visible.
run_ends = np.r_[run_starts[1:], len(df) - 1]
for run_start, run_end in zip(run_starts, run_ends):
    ax.axvspan(times.iloc[run_start], times.iloc[run_end],
               alpha=0.3, color=REGIME_COLORS[int(regime_ids[run_start])])

import matplotlib.patches as mpatches
ax.legend(handles=[mpatches.Patch(color=c, alpha=0.5, label=n)
                   for n, c in zip(REGIME_NAMES, REGIME_COLORS)], loc='upper left')
# The window length in the title follows the configured regime window.
ax.set_title(f"FPT Price with HMM Regimes ({CONFIG['regime_window']}d)", fontweight='bold')
ax.set_ylabel('Price')
plt.tight_layout()
plt.show()

## 3. Prepare for Model Training

In [None]:
# Scaling
# Only the leading train_ratio fraction of rows is used to fit the scalers.
train_cut = int(len(df) * CONFIG['train_ratio'])

# Target scaler
# Fitted on the same rows as the feature scaler, so it carries the same statistics
# as the feature scaler's close_log column and can invert scaled close predictions.
target_scaler = StandardScaler()
target_scaler.fit(df['close_log'].values[:train_cut].reshape(-1, 1))

# Feature scaler
FEAT_COLS = ['open_log', 'high_log', 'low_log', 'close_log', 'volume_log', 'hl_spread', 'oc_spread']
feat_scaler = StandardScaler()
feat_scaler.fit(df[FEAT_COLS].values[:train_cut])
X_scaled = feat_scaler.transform(df[FEAT_COLS].values)

n_features = len(FEAT_COLS)
# Look the close column up by name so the index cannot drift from FEAT_COLS.
close_idx = FEAT_COLS.index('close_log')
target = X_scaled[:, close_idx]
regimes = df['regime'].values

print(f"Features: {n_features}, Train cutoff: {train_cut}")

In [None]:
# Dataset
class RegimeDataset(Dataset):
    """Sliding-window dataset, optionally restricted to a single regime.

    Each sample pairs `seq_len` consecutive rows of `X` with the `pred_len`
    values of `y` that immediately follow them. When `regime_filter` is given,
    a window is kept only if the regime at its last input row equals the filter.
    """

    def __init__(self, X, y, regimes, seq_len, pred_len, regime_filter=None):
        self.X = X.astype(np.float32)
        self.y = y.astype(np.float32)
        self.seq_len = seq_len
        self.pred_len = pred_len

        # Number of start positions that leave room for a full input + target window.
        n_windows = len(X) - seq_len - pred_len + 1
        self.indices = []
        for start in range(n_windows):
            if regime_filter is not None and not (regimes[start + seq_len - 1] == regime_filter):
                continue
            self.indices.append(start)

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        start = self.indices[idx]
        split = start + self.seq_len
        window = torch.from_numpy(self.X[start:split])
        horizon = torch.from_numpy(self.y[split:split + self.pred_len])
        return (window, horizon)

In [None]:
# MultiDLinear Model
class MultiDLinear(nn.Module):
    """Two-branch linear forecaster with a trend/remainder split of the close column.

    The close column of the input window is split into a moving average and the
    remainder. Two copies of the window are built, one carrying the moving average
    and one carrying the remainder in the close column; each is flattened and fed
    to its own linear layer, and the two outputs are summed.
    """

    def __init__(self, seq_len, pred_len, n_feat, close_idx=3):
        super().__init__()
        self.seq_len = seq_len
        self.close_idx = close_idx
        # Moving-average width: a quarter of the window, but never fewer than 3 steps.
        self.kernel = max(3, seq_len // 4)
        flat_dim = seq_len * n_feat
        self.fc_t = nn.Linear(flat_dim, pred_len)
        self.fc_s = nn.Linear(flat_dim, pred_len)

    def forward(self, x):
        batch = x.size(0)
        close = x[:, :, self.close_idx]

        # Moving average over sliding windows, then edge-replicated back to seq_len.
        trend = close.unfold(-1, self.kernel, 1).mean(-1)
        missing = self.seq_len - trend.size(-1)
        left = missing // 2
        right = missing - left
        trend = F.pad(trend, (left, right), mode='replicate')
        remainder = close - trend

        # Work on copies so the caller's tensor is left untouched.
        trend_in = x.clone()
        trend_in[:, :, self.close_idx] = trend
        remainder_in = x.clone()
        remainder_in[:, :, self.close_idx] = remainder

        trend_out = self.fc_t(trend_in.reshape(batch, -1))
        remainder_out = self.fc_s(remainder_in.reshape(batch, -1))
        return trend_out + remainder_out

In [None]:
# Training function
class EarlyStopping:
    """Track the best validation loss and signal when training should stop.

    A loss counts as an improvement only if it beats the best value so far by
    more than 1e-5. On improvement a deep copy of the model's state dict is kept
    in `state`; otherwise the `wait` counter grows by one.
    """

    def __init__(self, patience):
        self.patience = patience
        self.best = float('inf')
        self.wait = 0
        self.state = None

    def step(self, loss, model):
        """Record `loss`; return True once `wait` has reached `patience`."""
        improved = loss < self.best - 1e-5
        if improved:
            snapshot = deepcopy(model.state_dict())
            self.best = loss
            self.wait = 0
            self.state = snapshot
        else:
            self.wait += 1
        return self.wait >= self.patience

def train(model, train_dl, val_dl, epochs, lr, patience):
    """Fit `model` with Adam and MSE loss, keeping the best-validation weights.

    Each epoch runs one pass over `train_dl` (gradients clipped to norm 1.0),
    then averages the per-batch MSE over `val_dl`. That validation loss drives
    both a ReduceLROnPlateau scheduler (halves the rate after 30 epochs without
    improvement) and early stopping.

    Args:
        model: Network to train; moved to the module-level `device`.
        train_dl: Loader yielding (input, target) training batches.
        val_dl: Loader yielding (input, target) validation batches.
        epochs: Maximum number of epochs.
        lr: Initial Adam learning rate.
        patience: Epochs without validation improvement before stopping.

    Returns:
        The same model, loaded with the weights of its best validation epoch.

    Raises:
        ValueError: If `val_dl` yields no batches.
    """
    # Fail before training instead of dividing by zero after the first epoch.
    if len(val_dl) == 0:
        raise ValueError("val_dl yields no batches; cannot compute a validation loss")

    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, factor=0.5, patience=30)
    stopper = EarlyStopping(patience)
    criterion = nn.MSELoss()
    model.to(device)

    for _ in range(epochs):
        model.train()
        for bx, by in train_dl:
            bx, by = bx.to(device), by.to(device)
            opt.zero_grad()
            loss = criterion(model(bx), by)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()

        model.eval()
        # Validation needs no gradients; no_grad avoids building autograd graphs.
        with torch.no_grad():
            val_loss = sum(criterion(model(bx.to(device)), by.to(device)).item()
                           for bx, by in val_dl) / len(val_dl)
        sched.step(val_loss)
        if stopper.step(val_loss, model):
            break

    # Restore the best snapshot, if any epoch produced one.
    if stopper.state is not None:
        model.load_state_dict(stopper.state)
    return model

## 4. Train Regime-Specific Models

In [None]:
# Train per regime
def _fit_on_split(dataset, train_frac):
    """Train a fresh MultiDLinear on the leading `train_frac` of `dataset`, validating on the rest."""
    total = len(dataset)
    split = int(total * train_frac)
    train_loader = DataLoader(torch.utils.data.Subset(dataset, range(split)),
                              CONFIG['batch_size'], shuffle=True)
    val_loader = DataLoader(torch.utils.data.Subset(dataset, range(split, total)),
                            CONFIG['batch_size'])
    net = MultiDLinear(CONFIG['seq_len'], CONFIG['pred_len'], n_features, close_idx)
    return train(net, train_loader, val_loader,
                 CONFIG['epochs'], CONFIG['lr'], CONFIG['patience'])


# One model per HMM state, keyed by the raw state label.
models = {}
unique_regimes = sorted(df['regime'].unique())

print("Training regime-specific models...\n")
for regime in unique_regimes:
    label = REGIME_NAMES[REGIME_MAP[regime]]
    regime_ds = RegimeDataset(X_scaled, target, regimes,
                              CONFIG['seq_len'], CONFIG['pred_len'], regime)
    sample_count = len(regime_ds)

    # Regimes with fewer than 20 windows get no dedicated model.
    if sample_count < 20:
        print(f"{label}: Skip (only {sample_count} samples)")
        continue

    models[regime] = _fit_on_split(regime_ds, 0.8)
    print(f"{label}: Trained ({sample_count} samples)")

# Global model
# Trained on every window regardless of regime, with a 90/10 split.
print("\nTraining global model...")
ds_all = RegimeDataset(X_scaled, target, regimes, CONFIG['seq_len'], CONFIG['pred_len'])
global_model = _fit_on_split(ds_all, 0.9)
print(f"Global: Trained ({len(ds_all)} samples)")

## 5. Generate Predictions

In [None]:
def predict(model, x):
    """Run `model` on one input window and return the output as a flat NumPy array.

    Args:
        model: Trained network, called in eval mode without gradient tracking.
        x: NumPy array for a single window; a leading batch axis is added here.
    """
    model.eval()
    batch = torch.from_numpy(x.astype(np.float32)).unsqueeze(0)
    with torch.no_grad():
        output = model(batch.to(device))
    return output.cpu().numpy().flatten()

def to_price(pred_scaled, scaler):
    """Undo `scaler` on the predictions and exponentiate the result.

    Args:
        pred_scaled: NumPy array of scaled predictions; reshaped to one column.
        scaler: Object exposing `inverse_transform` for a column array.

    Returns:
        1-D NumPy array with the exponential of the unscaled values.
    """
    column = pred_scaled.reshape(-1, 1)
    unscaled = scaler.inverse_transform(column).flatten()
    return np.exp(unscaled)

# Input data
# The most recent seq_len rows of scaled features form the forecasting input.
input_x = X_scaled[-CONFIG['seq_len']:]
current_regime = df['regime'].iloc[-1]

# Get HMM probabilities
# The posterior for the latest row must be computed from the whole observation
# sequence. Scoring the last row on its own treats it as the start of a new
# sequence, so the result is driven by the HMM's start probabilities rather than
# by the regime path leading up to that row.
hmm_feats = df_hmm[['return_W', 'vol_W', 'trend_W']].values
probs = model_hmm.predict_proba(hmm_scaler.transform(hmm_feats))[-1]

print(f"Current regime: {REGIME_NAMES[REGIME_MAP[current_regime]]}")
print(f"\nRegime probabilities:")
for r in unique_regimes:
    print(f"  {REGIME_NAMES[REGIME_MAP[r]]}: {probs[r]*100:.1f}%")

In [None]:
# Ensemble prediction
# Blend the regime models, each weighted by the posterior probability of its regime.
ensemble = np.zeros(CONFIG['pred_len'])
used_weight = 0
for regime_key, regime_model in models.items():
    weight = probs[regime_key]
    ensemble += weight * predict(regime_model, input_x)
    used_weight += weight

# Add global with remaining weight
# Probability mass of regimes without their own model goes to the global model.
if used_weight < 1:
    leftover = 1 - used_weight
    ensemble += leftover * predict(global_model, input_x)

final_prices = to_price(ensemble, target_scaler)

lowest, highest = final_prices.min(), final_prices.max()
print(f"\nPrediction range: {lowest:.2f} - {highest:.2f}")
last_close = df['close'].iloc[-1]
print(f"Change from last: {(final_prices[-1]/last_close-1)*100:+.1f}%")

## 6. Save Submission

In [None]:
# Save
# Make sure the output folder exists before writing into it.
os.makedirs('submissions', exist_ok=True)
sub_path = 'submissions/submission_hmm_multidlinear.csv'

# One row per forecast step, with ids counting from 1.
horizon = CONFIG['pred_len']
sub = pd.DataFrame({'id': range(1, horizon + 1), 'close': final_prices})
sub.to_csv(sub_path, index=False)

print(f"Saved: {sub_path}")
sub.head(10)

In [None]:
# Visualize
fig, ax = plt.subplots(figsize=(14, 6))
last_time = df['time'].iloc[-1]

# Historical: the final 150 rows of closing prices.
hist = df.iloc[-150:]
ax.plot(hist['time'], hist['close'], 'b-', lw=2, label='Historical')

# Forecast: plotted on business days following the last observed date.
future = pd.date_range(last_time + pd.Timedelta(days=1),
                       periods=CONFIG['pred_len'], freq='B')
ax.plot(future, final_prices, 'r--', lw=2, label='Forecast')
# Vertical marker at the last observed date.
ax.axvline(last_time, color='gray', ls='--', alpha=0.5)

ax.set_title('FPT Forecast: HMM + MultiDLinear Ensemble', fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Price (VND)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('submissions/forecast_hmm_multidlinear.png', dpi=150)
plt.show()