In [1]:
# Run once at top of notebook
!pip install -q xgboost==1.7.6 pandas matplotlib scikit-learn


  You can safely remove it manually.


In [2]:
# mcx_xgb_walkforward_light.py
# Paste and run this in a Colab cell (after installing xgboost and uploading MCX_SILVER.csv to /content).

import os, re, math, joblib, warnings
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
import xgboost as xgb
warnings.filterwarnings("ignore")
np.random.seed(42)

DATA_PATH = "/content/MCX_SILVER.csv"   # make sure file is uploaded here
OUT_DIR = "/content/mcx_xgb_results"
os.makedirs(OUT_DIR, exist_ok=True)

# ---- Kalman smoother ----
def kalman_smooth(signal, q=1e-5, r=1e-2):
    n = len(signal)
    xhat = np.zeros(n); P = np.zeros(n)
    xhat[0] = signal[0]; P[0] = 1.0
    for k in range(1, n):
        xhat_minus = xhat[k-1]
        P_minus = P[k-1] + q
        K_gain = P_minus / (P_minus + r + 1e-12)
        xhat[k] = xhat_minus + K_gain * (signal[k] - xhat_minus)
        P[k] = (1 - K_gain) * P_minus
    return xhat

# ---- Load and basic cleanup ----
df_raw = pd.read_csv(DATA_PATH)
df_raw.columns = [re.sub(r"[^0-9A-Za-z]+","_", str(c)).lower() for c in df_raw.columns]
date_col = next((c for c in df_raw.columns if "date" in c), df_raw.columns[0])
close_col = next((c for c in df_raw.columns if "close" in c or "settle" in c or "ltp" in c), None)
if close_col is None:
    # fallback to numeric column with max variance
    numeric_cols = [c for c in df_raw.columns if pd.api.types.is_numeric_dtype(df_raw[c])]
    close_col = numeric_cols[0] if numeric_cols else df_raw.columns[1]
df = df_raw[[date_col, close_col]].rename(columns={date_col:"Date", close_col:"Close"})
df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
df["Close"] = pd.to_numeric(df["Close"].astype(str).str.replace(r"[^\d\.\-eE]","", regex=True), errors="coerce")
df = df.dropna(subset=["Date","Close"]).sort_values("Date").reset_index(drop=True)
print("Loaded rows:", len(df))

# ---- Kalman smoothing ----
df["Close_kf"] = kalman_smooth(df["Close"].values, q=1e-5, r=1e-2)

# ---- Feature engineering (light) ----
def engineer(df):
    df = df.copy()
    df["close"] = df["Close_kf"]
    df["ret_1"] = df["close"].pct_change()
    for w in (5,10,20,50):
        df[f"sma_{w}"] = df["close"].rolling(w).mean()
        df[f"ema_{w}"] = df["close"].ewm(span=w, adjust=False).mean()
    df["roc_5"] = df["close"].pct_change(5)
    for w in (5,20,60):
        df[f"vol_{w}"] = df["ret_1"].rolling(w).std()
    delta = df["close"].diff()
    up = delta.clip(lower=0); down = -delta.clip(upper=0)
    roll_up = up.ewm(com=13).mean(); roll_down = down.ewm(com=13).mean()
    df["rsi_14"] = 100 - 100/(1 + roll_up / (roll_down + 1e-12))
    for lag in (1,2,3,5,10):
        df[f"ret_lag_{lag}"] = df["ret_1"].shift(lag)
        df[f"close_lag_{lag}"] = df["close"].shift(lag)
    return df

df_feat = engineer(df)
df_feat = df_feat.dropna().reset_index(drop=True)
df_feat["target_up"] = (df_feat["close"].shift(-1) > df_feat["close"]).astype(int)
df_feat = df_feat.dropna(subset=["target_up"]).reset_index(drop=True)

exclude = ["Date","Close","Close_kf","target_up"]
feature_cols = [c for c in df_feat.columns if c not in exclude and pd.api.types.is_numeric_dtype(df_feat[c])]
X_all = df_feat[feature_cols].values
y_all = df_feat["target_up"].values
dates_all = df_feat["Date"].values
print("Features:", len(feature_cols), "Samples:", len(y_all))

# ---- Last-step features for XGBoost ----
SEQ_LEN = 60
if len(X_all) <= SEQ_LEN + 10:
    raise SystemExit("Not enough rows after feature engineering. Need > SEQ_LEN + 10 rows.")
X_last = np.array([X_all[i-1] for i in range(SEQ_LEN, len(X_all))])
y_seq = y_all[SEQ_LEN:]
dates_seq = dates_all[SEQ_LEN:]
idxs = np.array([i for i in range(SEQ_LEN, len(X_all))])

# ---- Walk-forward setup ----
n = len(X_last)
WF_FOLDS = 5
test_block = max(int(n / (WF_FOLDS + 1)), 10)
folds = []
for f in range(WF_FOLDS):
    train_end = test_block*(f+1)
    test_start = train_end
    test_end = min(train_end + test_block, n)
    folds.append((0, train_end, test_start, test_end))
folds = [t for t in folds if t[1]-t[0] > 50 and t[3]-t[2] > 5]
print("Running folds:", folds)

summary = []
for i,(tr0,train_end,test_start,test_end) in enumerate(folds):
    X_train, y_train = X_last[tr0:train_end], y_seq[tr0:train_end]
    X_test, y_test = X_last[test_start:test_end], y_seq[test_start:test_end]
    dates_test = dates_seq[test_start:test_end]
    if len(y_train) < 50 or len(y_test) < 10:
        print("Skipping fold (too small)"); continue
    # scale
    scaler = StandardScaler().fit(X_train)
    X_train_s = scaler.transform(X_train); X_test_s = scaler.transform(X_test)
    # train XGBoost classifier
    dtrain = xgb.DMatrix(X_train_s, label=y_train); dtest = xgb.DMatrix(X_test_s, label=y_test)
    params = {"objective":"binary:logistic", "eta":0.05, "max_depth":4, "eval_metric":"auc"}
    bst = xgb.train(params, dtrain, num_boost_round=100, verbose_eval=False)
    y_prob = bst.predict(dtest); y_pred = (y_prob >= 0.5).astype(int)
    # metrics
    acc = accuracy_score(y_test, y_pred); f1 = f1_score(y_test, y_pred)
    try:
        auc = roc_auc_score(y_test, y_prob)
    except:
        auc = float("nan")
    # save model & scaler & metrics
    fold_dir = os.path.join(OUT_DIR, f"fold_{i+1}"); os.makedirs(fold_dir, exist_ok=True)
    bst.save_model(os.path.join(fold_dir, "xgb_model.json"))
    joblib.dump(scaler, os.path.join(fold_dir, "scaler.joblib"))
    joblib.dump({"acc":acc,"f1":f1,"auc":auc}, os.path.join(fold_dir, "metrics.joblib"))
    # small, lightweight plots (try/except)
    try:
        plt.figure(figsize=(8,2)); plt.plot(dates_test, y_test, label="True"); plt.plot(dates_test, y_prob, label="Pred prob"); plt.legend(loc="upper right"); plt.title(f"Fold {i+1} True vs Pred Prob"); plt.tight_layout(); plt.savefig(os.path.join(fold_dir,"pred_prob_timeseries.png")); plt.close()
    except Exception as e:
        print("Plot failed:", e)
    summary.append({"fold": i+1, "n_train": len(y_train), "n_test": len(y_test), "acc": acc, "f1": f1, "auc": auc})
    print(f"Fold {i+1} -> acc: {acc:.4f}, f1: {f1:.4f}, auc: {auc:.4f}")

summary_df = pd.DataFrame(summary)
summary_df.to_csv(os.path.join(OUT_DIR, "walkforward_summary.csv"), index=False)
print("\nWalk-forward summary:\n", summary_df.to_string(index=False))
print("\nAll artifacts saved to:", OUT_DIR)


FileNotFoundError: [Errno 2] No such file or directory: '/content/MCX_SILVER.csv'

In [None]:
!pip install -q tensorflow


In [None]:
# streaming_trend_model.py
# - Data path: /mnt/data/MCX_SILVER.csv
# - Outputs -> /mnt/data/streaming_model_results/
# Requirements: numpy, pandas, matplotlib, sklearn, tensorflow (optional for LSTM)
# If TF not present, the script will skip the LSTM and still run online model simulation.

import os, re, math, joblib, warnings
from datetime import datetime
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, f1_score
warnings.filterwarnings("ignore")
np.random.seed(42)

OUT = "/mnt/data/streaming_model_results"
os.makedirs(OUT, exist_ok=True)
DATA_PATH = "/content/MCX_SILVER.csv"

# ---------- utilities ----------
def safe_to_numeric(s):
    if isinstance(s, str):
        s2 = re.sub(r"[^\d\.\-eE]","", s)
        try:
            return float(s2)
        except:
            return np.nan
    return s

def kalman_smooth(signal, q=1e-5, r=1e-2):
    n = len(signal)
    xhat = np.zeros(n); P = np.zeros(n)
    xhat[0] = signal[0]; P[0] = 1.0
    for k in range(1,n):
        xhat_minus = xhat[k-1]; P_minus = P[k-1] + q
        K = P_minus / (P_minus + r + 1e-12)
        xhat[k] = xhat_minus + K * (signal[k] - xhat_minus)
        P[k] = (1 - K) * P_minus
    return xhat

# ---------- 1) Load & clean ----------
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Place your MCX CSV at: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
# normalize column names
df.columns = [re.sub(r"[^0-9A-Za-z]+","_", str(c)).lower() for c in df.columns]
date_col = next((c for c in df.columns if "date" in c), df.columns[0])
close_col = next((c for c in df.columns if "close" in c or "ltp" in c or "settle" in c), df.columns[1])
df = df[[date_col, close_col]].rename(columns={date_col:"Date", close_col:"Close"})
df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
df["Close"] = df["Close"].apply(safe_to_numeric)
df = df.dropna(subset=["Date","Close"]).sort_values("Date").reset_index(drop=True)
print("Loaded rows:", len(df), "Date range:", df['Date'].iloc[0].date(), "to", df['Date'].iloc[-1].date())

# ---------- 2) Smooth and features ----------
# Kalman smoothing (reduces micro-noise)
df["close_kf"] = kalman_smooth(df["Close"].values, q=1e-5, r=1e-2)

# Features: returns, log-returns, sma/ema, vol, rsi, lagged returns
df["ret_1"] = df["close_kf"].pct_change()
df["logret"] = np.log(df["close_kf"]).diff()
for w in (5,10,20,50):
    df[f"sma_{w}"] = df["close_kf"].rolling(w).mean()
    df[f"ema_{w}"] = df["close_kf"].ewm(span=w, adjust=False).mean()
df["roc_5"] = df["close_kf"].pct_change(5)
for w in (5,20,60):
    df[f"vol_{w}"] = df["ret_1"].rolling(w).std()
# RSI-ish
delta = df["close_kf"].diff()
up = delta.clip(lower=0); down = -delta.clip(upper=0)
roll_up = up.ewm(com=13).mean(); roll_down = down.ewm(com=13).mean()
df["rsi_14"] = 100 - 100/(1 + roll_up/(roll_down+1e-12))
for lag in (1,2,3,5,10):
    df[f"ret_lag_{lag}"] = df["ret_1"].shift(lag)
    df[f"close_lag_{lag}"] = df["close_kf"].shift(lag)

# Drop rows with NaNs from rolling windows
df = df.dropna().reset_index(drop=True)
print("Rows after features:", len(df))

# ---------- 3) Targets ----------
# Next-day return (magnitude) and direction label
df["target_ret_1"] = df["close_kf"].shift(-1) / df["close_kf"] - 1.0
df["target_dir_1"] = (df["target_ret_1"] > 0).astype(int)
# drop last row where target is NaN
df = df.dropna(subset=["target_ret_1","target_dir_1"]).reset_index(drop=True)

# ---------- 4) Feature matrix ----------
exclude = ["Date","Close","close_kf","target_ret_1","target_dir_1"]
feature_cols = [c for c in df.columns if c not in exclude and np.issubdtype(df[c].dtype, np.number)]
X = df[feature_cols].values
y_reg = df["target_ret_1"].values  # regression target (float)
y_clf = df["target_dir_1"].values  # classification target (0/1)
dates = df["Date"].values
print("Features count:", len(feature_cols))

# ---------- 5) Train/test time split ----------
# Use initial 70% for training batch LSTM and initial online fit; remaining for streaming simulation
n = len(X); train_n = int(n * 0.7)
print("Total samples:", n, "Train:", train_n, "Stream-test:", n-train_n)
X_train, X_test = X[:train_n], X[train_n:]
y_reg_train, y_reg_test = y_reg[:train_n], y_reg[train_n:]
y_clf_train, y_clf_test = y_clf[:train_n], y_clf[train_n:]
dates_train, dates_test = dates[:train_n], dates[train_n:]

# Scale features: fit on train
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

joblib.dump(scaler, os.path.join(OUT, "scaler.joblib"))

# ---------- 6) Build models ----------
# 6A: Online models (fast, adapt using partial_fit)
# For regression (magnitude)
online_reg = SGDRegressor(max_iter=1000, tol=1e-3)
# For classification (direction)
online_clf = SGDClassifier(max_iter=1000, tol=1e-3)

# Initialize online models with a small subset (partial_fit requires classes for classifier)
init_n = min(50, len(X_train_s))
online_reg.partial_fit(X_train_s[:init_n], y_reg_train[:init_n])
online_clf.partial_fit(X_train_s[:init_n], y_clf_train[:init_n], classes=np.array([0,1]))

# 6B: Batch model (LSTM) for regression (optional - will be skipped if TensorFlow not available)
use_lstm = True
try:
    import tensorflow as tf
    from tensorflow.keras import layers, models, callbacks
    tf.random.set_seed(42)
except Exception as e:
    print("TensorFlow not available or failed to import; skipping LSTM batch model. Error:", e)
    use_lstm = False

lstm_model = None
if use_lstm:
    # Prepare sequences for LSTM: use SEQ_LEN lookback
    SEQ_LEN = 30
    def make_sequences(Xa, seq_len):
        Xs = []
        for i in range(seq_len, len(Xa)):
            Xs.append(Xa[i-seq_len:i])
        return np.array(Xs)
    # Build sequence arrays aligned to regression y (target at i -> sequence ends at i)
    Xs_all = make_sequences(scaler.transform(X), SEQ_LEN)
    y_reg_seq = y_reg[SEQ_LEN:]
    # Train/test split accordingly
    seq_train_n = int(len(Xs_all) * 0.7)
    Xs_train, Xs_test = Xs_all[:seq_train_n], Xs_all[seq_train_n:]
    yseq_train, yseq_test = y_reg_seq[:seq_train_n], y_reg_seq[seq_train_n:]
    # model
    inp = layers.Input(shape=(SEQ_LEN, Xs_train.shape[2]))
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=False))(inp)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(32, activation="relu")(x)
    out = layers.Dense(1, activation="linear")(x)
    lstm_model = models.Model(inp, out)
    lstm_model.compile(optimizer="adam", loss="mse")
    es = callbacks.EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True)
    print("Training LSTM (this may take a moment)...")
    lstm_model.fit(Xs_train, yseq_train, validation_split=0.1, epochs=50, batch_size=32, callbacks=[es], verbose=0)
    lstm_model.save(os.path.join(OUT, "lstm_return_model.keras"))
    print("LSTM trained and saved.")

# ---------- 7) Streaming simulation over test set ----------
results = []
preds_online_reg = []
preds_online_clf = []
preds_lstm_reg = []

# We'll step through test samples one-by-one (simulate streaming). For each t:
#  - predict using online models (and optionally LSTM)
#  - record predictions
#  - then update online models using the true value at t (partial_fit)
# For LSTM we use the pre-trained model only (no online updates)
for i in range(len(X_test_s)):
    x_t = X_test_s[i].reshape(1,-1)
    # online predictions
    pred_mag_online = online_reg.predict(x_t)[0]
    pred_dir_online = online_clf.predict(x_t)[0]
    preds_online_reg.append(pred_mag_online)
    preds_online_clf.append(pred_dir_online)
    # LSTM prediction: need to build sequence ending here in full series index
    if use_lstm:
        # find absolute index in full X_s (train_n offset)
        abs_idx = train_n + i
        if abs_idx - SEQ_LEN + 1 >= 0:
            seq = scaler.transform(X)[abs_idx-SEQ_LEN+1:abs_idx+1]  # shape (SEQ_LEN, nfeat)
            if seq.shape[0] == SEQ_LEN:
                seq = seq.reshape(1, SEQ_LEN, seq.shape[1])
                pred_l = float(lstm_model.predict(seq, verbose=0)[0,0])
            else:
                pred_l = np.nan
        else:
            pred_l = np.nan
        preds_lstm_reg.append(pred_l)
    # record actual
    true_mag = y_reg_test[i]
    true_dir = y_clf_test[i]
    results.append({
        "date": dates_test[i], "true_mag": true_mag, "true_dir": int(true_dir),
        "pred_online_mag": pred_mag_online, "pred_online_dir": int(pred_dir_online),
        "pred_lstm_mag": preds_lstm_reg[-1] if use_lstm else np.nan
    })
    # now update online models with the true label (simulate learning from new tick)
    online_reg.partial_fit(x_t, np.array([true_mag]))
    online_clf.partial_fit(x_t, np.array([true_dir]))

# ---------- 8) Evaluate ----------
res_df = pd.DataFrame(results)
# Regression metrics (magnitude)
mse_online = mean_squared_error(res_df["true_mag"], res_df["pred_online_mag"])
mae_online = mean_absolute_error(res_df["true_mag"], res_df["pred_online_mag"])
# LSTM metrics (if used)
if use_lstm and res_df["pred_lstm_mag"].notna().sum() > 0:
    valid_mask = ~np.isnan(res_df["pred_lstm_mag"])
    mse_lstm = mean_squared_error(res_df["true_mag"][valid_mask], res_df["pred_lstm_mag"][valid_mask])
    mae_lstm = mean_absolute_error(res_df["true_mag"][valid_mask], res_df["pred_lstm_mag"][valid_mask])
else:
    mse_lstm = mae_lstm = np.nan

# Classification accuracy
acc_online = accuracy_score(res_df["true_dir"], res_df["pred_online_dir"])
f1_online = f1_score(res_df["true_dir"], res_df["pred_online_dir"])

print("ONLINE reg MSE:", mse_online, "MAE:", mae_online)
print("ONLINE dir  Acc:", acc_online, "F1:", f1_online)
print("LSTM reg MSE:", mse_lstm, "MAE:", mae_lstm)

# ---------- 9) Visuals ----------
# 9A: scatter predicted vs true (online reg)
plt.figure(figsize=(8,4))
plt.scatter(res_df["true_mag"], res_df["pred_online_mag"], alpha=0.4, s=8)
plt.xlabel("True next-day return"); plt.ylabel("Predicted (online reg)")
plt.title("Online reg: Pred vs True")
plt.grid(True)
plt.tight_layout(); plt.savefig(os.path.join(OUT, "pred_vs_true_online.png")); plt.close()

# 9B: time series of true vs predicted returns (first 300 points)
plt.figure(figsize=(12,3))
plt.plot(res_df["date"][:300], res_df["true_mag"][:300], label="true", linewidth=1)
plt.plot(res_df["date"][:300], res_df["pred_online_mag"][:300], label="online_pred", linewidth=1)
if use_lstm:
    plt.plot(res_df["date"][:300], res_df["pred_lstm_mag"][:300], label="lstm_pred", linewidth=1)
plt.legend(); plt.title("True vs Predicted next-day returns (first 300 samples)"); plt.tight_layout()
plt.savefig(os.path.join(OUT, "timeseries_preds.png")); plt.close()

# 9C: direction prediction accuracy over rolling window
res_df["correct_online_dir"] = (res_df["true_dir"] == res_df["pred_online_dir"]).astype(int)
res_df["rolling_acc_online"] = res_df["correct_online_dir"].rolling(50, min_periods=1).mean()
plt.figure(figsize=(10,3))
plt.plot(res_df["date"], res_df["rolling_acc_online"], label="rolling_acc_online")
plt.axhline(0.5, linestyle="--", color="k", alpha=0.6)
plt.title("Rolling accuracy (online classifier)"); plt.legend(); plt.tight_layout()
plt.savefig(os.path.join(OUT, "rolling_acc_online.png")); plt.close()

# 9D: simple strategy cumulative returns (apply predicted buy signal to actual next-day returns)
# Strategy: go long when model predicts up (pred_dir==1), else flat. Compare buy&hold.
strat_online = (res_df["pred_online_dir"].shift(0).fillna(0).astype(int) * res_df["true_mag"]).fillna(0)
cum_strat_online = np.cumprod(1 + strat_online) - 1
cum_hold = np.cumprod(1 + res_df["true_mag"]) - 1
plt.figure(figsize=(10,3))
plt.plot(res_df["date"], cum_strat_online, label="online strategy")
plt.plot(res_df["date"], cum_hold, label="buy & hold")
plt.legend(); plt.title("Cumulative returns: online strategy vs buy&hold"); plt.tight_layout()
plt.savefig(os.path.join(OUT, "cum_returns_online.png")); plt.close()

# Save results to csv
res_df.to_csv(os.path.join(OUT, "streaming_sim_results.csv"), index=False)
print("Saved results & plots to", OUT)

# ---------- 10) Quick diagnostics summary ----------
summary = {
    "online_reg_mse": float(mse_online), "online_reg_mae": float(mae_online),
    "online_dir_acc": float(acc_online), "online_dir_f1": float(f1_online),
    "lstm_mse": float(mse_lstm) if not math.isnan(mse_lstm) else None,
    "lstm_mae": float(mae_lstm) if not math.isnan(mae_lstm) else None,
    "n_test": len(res_df)
}
joblib.dump(summary, os.path.join(OUT, "summary.joblib"))
print("Summary:", summary)


Loaded rows: 2511 Date range: 2015-12-11 to 2025-12-08
Rows after features: 2451
Features count: 25
Total samples: 2450 Train: 1715 Stream-test: 735
Training LSTM (this may take a moment)...
LSTM trained and saved.
ONLINE reg MSE: 5.68716743584474e-07 MAE: 0.000576132347857092
ONLINE dir  Acc: 0.8884353741496599 F1: 0.917004048582996
LSTM reg MSE: 2.44319544791732e-06 MAE: 0.001168247725375198
Saved results & plots to /mnt/data/streaming_model_results
Summary: {'online_reg_mse': 5.68716743584474e-07, 'online_reg_mae': 0.000576132347857092, 'online_dir_acc': 0.8884353741496599, 'online_dir_f1': 0.917004048582996, 'lstm_mse': 2.44319544791732e-06, 'lstm_mae': 0.001168247725375198, 'n_test': 735}


In [None]:
# app.py - Streamlit app for streaming-style trend + magnitude predictions
# Usage:
#   pip install streamlit pandas numpy scikit-learn matplotlib joblib xgboost
#   streamlit run app.py

import streamlit as st
import pandas as pd
import numpy as np
import os, re, joblib
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

DEFAULT_PATHS = [
    "/mnt/data/streaming_model_results/streaming_sim_results.csv",
    "/mnt/data/MCX_SILVER.csv",
    "/content/MCX_SILVER.csv"
]

st.set_page_config(layout="wide", page_title="Streaming Trend Predictor")

st.title("Streaming Trend Predictor — Direction & Magnitude")

# ---------------------------
# Utilities
# ---------------------------
def safe_to_numeric(s):
    if isinstance(s, str):
        s2 = re.sub(r"[^\d\.\-eE]","", s)
        try:
            return float(s2)
        except:
            return np.nan
    return s

def kalman_smooth(signal, q=1e-5, r=1e-2):
    n = len(signal)
    xhat = np.zeros(n); P = np.zeros(n)
    xhat[0] = signal[0]; P[0] = 1.0
    for k in range(1,n):
        xhat_minus = xhat[k-1]; P_minus = P[k-1] + q
        K = P_minus / (P_minus + r + 1e-12)
        xhat[k] = xhat_minus + K * (signal[k] - xhat_minus)
        P[k] = (1 - K) * P_minus
    return xhat

def default_load_csv():
    for p in DEFAULT_PATHS:
        if os.path.exists(p):
            try:
                df = pd.read_csv(p)
                return df, p
            except:
                pass
    return None, None

def prepare_features(df):
    # Normalize column names
    df = df.copy()
    df.columns = [re.sub(r"[^0-9A-Za-z]+","_", str(c)).lower() for c in df.columns]
    date_col = next((c for c in df.columns if "date" in c), df.columns[0])
    close_col = next((c for c in df.columns if "close" in c or "ltp" in c or "settle" in c), None)
    if close_col is None:
        numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
        close_col = numeric_cols[0] if numeric_cols else df.columns[1]
    df = df[[date_col, close_col]].rename(columns={date_col:"Date", close_col:"Close"})
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
    df["Close"] = df["Close"].apply(safe_to_numeric)
    df = df.dropna(subset=["Date","Close"]).sort_values("Date").reset_index(drop=True)
    # smoothing + features
    df["close_kf"] = kalman_smooth(df["Close"].values, q=1e-5, r=1e-2)
    df["ret_1"] = df["close_kf"].pct_change()
    df["logret"] = np.log(df["close_kf"]).diff()
    for w in (5,10,20,50):
        df[f"sma_{w}"] = df["close_kf"].rolling(w).mean()
        df[f"ema_{w}"] = df["close_kf"].ewm(span=w, adjust=False).mean()
    df["roc_5"] = df["close_kf"].pct_change(5)
    for w in (5,20,60):
        df[f"vol_{w}"] = df["ret_1"].rolling(w).std()
    # RSI-ish
    delta = df["close_kf"].diff()
    up = delta.clip(lower=0); down = -delta.clip(upper=0)
    roll_up = up.ewm(com=13).mean(); roll_down = down.ewm(com=13).mean()
    df["rsi_14"] = 100 - 100/(1 + roll_up/(roll_down+1e-12))
    for lag in (1,2,3,5,10):
        df[f"ret_lag_{lag}"] = df["ret_1"].shift(lag)
        df[f"close_lag_{lag}"] = df["close_kf"].shift(lag)
    df = df.dropna().reset_index(drop=True)
    # targets
    df["target_ret_1"] = df["close_kf"].shift(-1) / df["close_kf"] - 1.0
    df["target_dir_1"] = (df["target_ret_1"] > 0).astype(int)
    df = df.dropna(subset=["target_ret_1", "target_dir_1"]).reset_index(drop=True)
    return df

# ---------------------------
# Sidebar - file + options
# ---------------------------
with st.sidebar:
    st.header("Input & Options")
    uploaded = st.file_uploader("Upload MCX CSV (optional)", type=["csv"])
    use_default = st.checkbox("Use default path if available (/mnt/data/...)", value=True)
    seq_len = st.slider("Sequence length for LSTM (if used)", min_value=10, max_value=120, value=30)
    train_frac = st.slider("Training fraction (initial batch)", min_value=0.5, max_value=0.9, value=0.7)
    use_lstm = st.checkbox("Attempt LSTM baseline (requires tensorflow)", value=False)
    run_button = st.button("Run streaming sim")

# ---------------------------
# Load data
# ---------------------------
df_raw = None; source = None
if uploaded is not None:
    try:
        df_raw = pd.read_csv(uploaded)
        source = "uploaded file"
    except Exception as e:
        st.error("Failed to read uploaded file: " + str(e))
elif use_default:
    df_raw, source = default_load_csv()
    if df_raw is None:
        st.info("No default CSV found; please upload a CSV.")
else:
    st.info("Upload a CSV or enable default path.")

if df_raw is not None:
    st.success(f"Loaded data from: {source}")
    st.write("Preview:")
    st.dataframe(df_raw.head(6))

# ---------------------------
# Main pipeline + streaming simulation
# ---------------------------
if df_raw is not None and run_button:
    with st.spinner("Preparing features and running streaming simulation..."):
        df = prepare_features(df_raw)
        st.write(f"Prepared features. Date range: {df['Date'].iloc[0].date()} → {df['Date'].iloc[-1].date()}")
        st.write("Sample after feature prep:")
        st.dataframe(df.head(5))

        # feature matrix
        exclude = ["Date","Close","close_kf","target_ret_1","target_dir_1"]
        feature_cols = [c for c in df.columns if c not in exclude and pd.api.types.is_numeric_dtype(df[c])]
        X = df[feature_cols].values
        y_reg = df["target_ret_1"].values
        y_clf = df["target_dir_1"].values
        dates = df["Date"].values

        n = len(X)
        train_n = int(n * train_frac)
        X_train, X_test = X[:train_n], X[train_n:]
        y_reg_train, y_reg_test = y_reg[:train_n], y_reg[train_n:]
        y_clf_train, y_clf_test = y_clf[:train_n], y_clf[train_n:]
        dates_train, dates_test = dates[:train_n], dates[train_n:]

        scaler = StandardScaler().fit(X_train)
        X_train_s = scaler.transform(X_train)
        X_test_s = scaler.transform(X_test)

        # Attempt to load pre-saved models (optional)
        saved_dir_candidates = ["/mnt/data/streaming_model_results", "/content/streaming_model_results"]
        saved_loaded = False
        model_info = {"online_reg":None, "online_clf":None, "scaler":None, "lstm":None}
        for d in saved_dir_candidates:
            try:
                if os.path.exists(os.path.join(d, "scaler.joblib")):
                    model_info["scaler"] = joblib.load(os.path.join(d, "scaler.joblib"))
                if os.path.exists(os.path.join(d, "lstm_return_model.keras")) and use_lstm:
                    import tensorflow as tf
                    model_info["lstm"] = tf.keras.models.load_model(os.path.join(d, "lstm_return_model.keras"))
                # we won't try to load SGD models from disk in this generic app
                saved_loaded = True
            except Exception:
                pass

        # Build online models and initialize
        online_reg = SGDRegressor(max_iter=1000, tol=1e-3)
        online_clf = SGDClassifier(max_iter=1000, tol=1e-3)
        init_n = min(50, len(X_train_s))
        if init_n <= 2:
            st.error("Not enough training rows to initialize online model. Increase data or reduce train fraction.")
        online_reg.partial_fit(X_train_s[:init_n], y_reg_train[:init_n])
        online_clf.partial_fit(X_train_s[:init_n], y_clf_train[:init_n], classes=np.array([0,1]))

        # If user requests LSTM baseline, try to train (may take time)
        lstm_model = None
        if use_lstm:
            try:
                import tensorflow as tf
                from tensorflow.keras import layers, models, callbacks
                tf.random.set_seed(42)
                # prepare sequences
                def make_sequences(Xa, seq_len):
                    Xs = []
                    for i in range(seq_len, len(Xa)):
                        Xs.append(Xa[i-seq_len:i])
                    return np.array(Xs)
                # Build sequence arrays aligned to regression y
                scaled_all = scaler.transform(X)
                Xs_all = make_sequences(scaled_all, seq_len)
                yseq_all = y_reg[seq_len:]
                seq_train_n = int(len(Xs_all) * 0.7)
                Xs_train = Xs_all[:seq_train_n]; yseq_train = yseq_all[:seq_train_n]
                # small LSTM
                inp = layers.Input(shape=(seq_len, Xs_train.shape[2]))
                x = layers.Bidirectional(layers.LSTM(32, return_sequences=False))(inp)
                x = layers.Dropout(0.2)(x)
                x = layers.Dense(16, activation="relu")(x)
                out = layers.Dense(1, activation="linear")(x)
                lstm_model = models.Model(inp, out)
                lstm_model.compile(optimizer="adam", loss="mse")
                es = callbacks.EarlyStopping(monitor="val_loss", patience=4, restore_best_weights=True)
                lstm_model.fit(Xs_train, yseq_train, validation_split=0.1, epochs=20, batch_size=32, callbacks=[es], verbose=0)
                st.success("LSTM baseline trained.")
            except Exception as e:
                st.warning("LSTM training failed or tensorflow not available: " + str(e))
                lstm_model = None

        # Streaming simulation over the test set
        results = []
        preds_online_reg = []
        preds_online_clf = []
        preds_lstm_reg = []

        for i in range(len(X_test_s)):
            x_t = X_test_s[i].reshape(1,-1)
            pred_mag_online = online_reg.predict(x_t)[0]
            pred_dir_online = online_clf.predict(x_t)[0]
            preds_online_reg.append(pred_mag_online)
            preds_online_clf.append(int(pred_dir_online))
            # LSTM pred if available (sequence ending at absolute index)
            if lstm_model is not None:
                abs_idx = train_n + i
                if abs_idx - seq_len + 1 >= 0:
                    seq = scaler.transform(X)[abs_idx-seq_len+1:abs_idx+1]
                    if seq.shape[0] == seq_len:
                        seq = seq.reshape(1, seq_len, seq.shape[1])
                        try:
                            pred_l = float(lstm_model.predict(seq, verbose=0)[0,0])
                        except:
                            pred_l = np.nan
                    else:
                        pred_l = np.nan
                else:
                    pred_l = np.nan
                preds_lstm_reg.append(pred_l)
            true_mag = y_reg_test[i]; true_dir = int(y_clf_test[i])
            results.append({
                "date": dates_test[i], "true_mag": true_mag, "true_dir": true_dir,
                "pred_online_mag": pred_mag_online, "pred_online_dir": int(pred_dir_online),
                "pred_lstm_mag": preds_lstm_reg[-1] if lstm_model is not None else np.nan
            })
            # update online models with the true label (simulate learning)
            online_reg.partial_fit(x_t, np.array([true_mag]))
            online_clf.partial_fit(x_t, np.array([true_dir]))

        res_df = pd.DataFrame(results)
        # metrics
        mse_online = mean_squared_error(res_df["true_mag"], res_df["pred_online_mag"])
        mae_online = mean_absolute_error(res_df["true_mag"], res_df["pred_online_mag"])
        acc_online = accuracy_score(res_df["true_dir"], res_df["pred_online_dir"])
        f1_online = f1_score(res_df["true_dir"], res_df["pred_online_dir"])
        if lstm_model is not None and res_df["pred_lstm_mag"].notna().sum() > 0:
            valid_mask = ~np.isnan(res_df["pred_lstm_mag"])
            mse_lstm = mean_squared_error(res_df["true_mag"][valid_mask], res_df["pred_lstm_mag"][valid_mask])
            mae_lstm = mean_absolute_error(res_df["true_mag"][valid_mask], res_df["pred_lstm_mag"][valid_mask])
        else:
            mse_lstm = mae_lstm = np.nan

        # Show summary table
        st.subheader("Performance Summary (stream-test)")
        summary_table = pd.DataFrame([
            {"Model":"Online (SGDReg/SGDClf)","MSE":mse_online,"MAE":mae_online,"Accuracy":acc_online,"F1":f1_online},
            {"Model":"LSTM (batch)","MSE":mse_lstm,"MAE":mae_lstm,"Accuracy":None,"F1":None}
        ])
        st.dataframe(summary_table.style.format({"MSE":"{:.3e}", "MAE":"{:.6f}", "Accuracy":"{:.4f}", "F1":"{:.4f}"}))

        # Latest prediction box
        st.subheader("Latest Prediction")
        latest = res_df.iloc[-1]
        col1, col2, col3 = st.columns(3)
        col1.metric("Date", str(pd.to_datetime(latest["date"]).date()))
        col2.metric("Predicted Direction", "UP" if latest["pred_online_dir"]==1 else "DOWN")
        col3.metric("Predicted Next-Day Return", f"{latest['pred_online_mag']*100:.3f}%")

        # Plots: pred vs true scatter, time series, rolling accuracy, cumulative returns
        st.subheader("Diagnostics & Plots")
        # 1) scatter
        fig, ax = plt.subplots(figsize=(5,4))
        ax.scatter(res_df["true_mag"], res_df["pred_online_mag"], alpha=0.4, s=8)
        ax.set_xlabel("True next-day return"); ax.set_ylabel("Predicted (online)")
        ax.set_title("Predicted vs True (online reg)")
        st.pyplot(fig)

        # 2) timeseries (last 400 points)
        maxpts = min(400, len(res_df))
        fig2, ax2 = plt.subplots(figsize=(10,3))
        ax2.plot(pd.to_datetime(res_df["date"][-maxpts:]), res_df["true_mag"][-maxpts:], label="true", linewidth=1)
        ax2.plot(pd.to_datetime(res_df["date"][-maxpts:]), res_df["pred_online_mag"][-maxpts:], label="online_pred", linewidth=1)
        if lstm_model is not None:
            ax2.plot(pd.to_datetime(res_df["date"][-maxpts:]), res_df["pred_lstm_mag"][-maxpts:], label="lstm_pred", linewidth=1)
        ax2.legend(); ax2.set_title("True vs Predictions (recent)")
        st.pyplot(fig2)

        # 3) rolling accuracy for direction
        res_df["correct_online_dir"] = (res_df["true_dir"] == res_df["pred_online_dir"]).astype(int)
        res_df["rolling_acc_online"] = res_df["correct_online_dir"].rolling(50, min_periods=1).mean()
        fig3, ax3 = plt.subplots(figsize=(10,2))
        ax3.plot(pd.to_datetime(res_df["date"]), res_df["rolling_acc_online"], label="rolling_acc_online")
        ax3.axhline(0.5, linestyle="--", color="k", alpha=0.6)
        ax3.set_ylim(0,1)
        ax3.set_title("Rolling accuracy (window=50)")
        st.pyplot(fig3)

        # 4) cumulative returns: strategy vs buy & hold
        strat_online = (res_df["pred_online_dir"].astype(int) * res_df["true_mag"]).fillna(0)
        cum_strat_online = np.cumprod(1 + strat_online) - 1
        cum_hold = np.cumprod(1 + res_df["true_mag"]) - 1
        fig4, ax4 = plt.subplots(figsize=(10,3))
        ax4.plot(pd.to_datetime(res_df["date"]), cum_strat_online, label="online strategy")
        ax4.plot(pd.to_datetime(res_df["date"]), cum_hold, label="buy & hold")
        ax4.legend(); ax4.set_title("Cumulative returns")
        st.pyplot(fig4)

        # export results
        out_dir = os.path.join(".", "streamlit_results")
        os.makedirs(out_dir, exist_ok=True)
        res_df.to_csv(os.path.join(out_dir, "stream_sim_results.csv"), index=False)
        joblib.dump({"summary": {"mse_online":mse_online, "mae_online":mae_online, "acc_online":acc_online, "f1_online":f1_online}}, os.path.join(out_dir, "summary.joblib"))
        st.success(f"Streaming sim finished. Results saved to {out_dir}/stream_sim_results.csv")
        st.info("Tip: Use transaction-cost thresholding before trading; only trade when predicted magnitude > threshold.")


