# In [None]:
# ===============================================================
# IMPROVED SYNTHETIC DATA WITH TRUE SIGNAL + HIGH ACCURACY MODEL
# ===============================================================

import numpy as np
import pandas as pd
import shap
from lime.lime_tabular import LimeTabularExplainer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

# ===============================================================
# 1. REALISTIC SYNTHETIC FINANCIAL DATA GENERATION
# ===============================================================

# Seed NumPy's global generator so every draw below is reproducible.
# NOTE: the order of the np.random calls in this section determines the
# values each series receives; do not reorder them.
np.random.seed(42)
N = 3000
dates = pd.date_range(start="2012-01-01", periods=N, freq="D")

# Close price: a random walk starting near 100, built as the running sum of
# Gaussian steps (mean 0, std 0.8).
daily_steps = np.random.normal(0, 0.8, N)
price = 100 + np.cumsum(daily_steps)

# Momentum: price change versus 5 and 10 rows earlier. The first 5 / 10
# entries have no look-back value and are therefore NaN.
price_series = pd.Series(price)
momentum_5 = price_series.diff(5)
momentum_10 = price_series.diff(10)

# Volatility: absolute value of N(1.5, 0.4) draws, so it is never negative.
volatility = np.abs(np.random.normal(1.5, 0.4, N))

# Sentiment: standard-normal draws.
sentiment = np.random.normal(0, 1, N)

# Yield spread: N(1.5, 0.2) draws.
yield_spread = np.random.normal(1.5, 0.2, N)

# Inflation: N(5, 0.3) draws.
inflation = np.random.normal(5, 0.3, N)

# Collect every series into one frame; column order is part of the contract.
columns = {
    "date": dates,
    "close_price": price,
    "momentum_5": momentum_5,
    "momentum_10": momentum_10,
    "volatility": volatility,
    "sentiment": sentiment,
    "yield_spread": yield_spread,
    "inflation": inflation,
}
df = pd.DataFrame(columns)

# ===============================================================
# 2. CREATE TRUE SIGNAL FOR FUTURE RETURNS
# ===============================================================

# "True" underlying formula (hidden from model)
# Hidden labelling rule (never shown to the model): a weighted vote over the
# sign of each feature. Volatility, yield spread and inflation are centred on
# their own sample mean before the sign is taken; volatility and inflation
# enter with a negative weight.
volatility_side = np.sign(volatility - volatility.mean())
yield_side = np.sign(yield_spread - yield_spread.mean())
inflation_side = np.sign(inflation - inflation.mean())

# Terms are accumulated in the same left-to-right order as the formula
# 0.6*m10 + 0.4*m5 + 0.8*sent - 0.7*vol + 0.5*yield - 0.3*infl.
true_signal = 0.6 * np.sign(momentum_10) + 0.4 * np.sign(momentum_5)
true_signal = true_signal + 0.8 * np.sign(sentiment)
true_signal = true_signal - 0.7 * volatility_side
true_signal = true_signal + 0.5 * yield_side
true_signal = true_signal - 0.3 * inflation_side

# Gaussian noise (std 0.7) blurs the rule; the label is 1 when the noisy
# signal is strictly positive and 0 otherwise (a NaN signal compares False).
noise = np.random.normal(0, 0.7, N)
direction = (true_signal + noise).gt(0).astype(int)

# Attach the label, then discard every row that still holds a missing value.
df = df.assign(target=direction).dropna()

# ===============================================================
# 3. TRAIN TEST SPLIT
# ===============================================================

# Model inputs (column order is reused later for SHAP / LIME feature names).
features = [
    "momentum_5",
    "momentum_10",
    "volatility",
    "sentiment",
    "yield_spread",
    "inflation",
]
X, y = df[features], df["target"]

# Positional, unshuffled split: the first 80% of rows train the model and the
# remaining rows are held out for testing.
split = int(0.8 * len(df))
X_train, y_train = X.iloc[:split], y.iloc[:split]
X_test, y_test = X.iloc[split:], y.iloc[split:]

# Fit the scaler on the training rows only, then apply the same transform to
# both partitions. After this point X_train / X_test are NumPy arrays.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# ===============================================================
# 4. HIGH ACCURACY MODEL — XGBOOST
# ===============================================================

# Hyper-parameters for the gradient-boosted classifier, kept together so they
# can be read (or logged) in one place.
xgb_params = {
    "n_estimators": 600,
    "learning_rate": 0.03,
    "max_depth": 5,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "eval_metric": "logloss",
    "random_state": 42,
}
model = XGBClassifier(**xgb_params)

# Train on the scaled training partition.
model.fit(X_train, y_train)

# ===============================================================
# 5. PERFORMANCE
# ===============================================================

# Probability assigned to class 1 for each test row, thresholded at 0.5
# (strictly greater) to obtain hard 0/1 predictions.
positive_proba = model.predict_proba(X_test)[:, 1]
preds = (positive_proba > 0.5).astype(int)

# Report held-out accuracy and the per-class precision / recall table.
accuracy = accuracy_score(y_test, preds)
report = classification_report(y_test, preds)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

# ===============================================================
# 6. SHAP INTERPRETATION
# ===============================================================

# SHAP values are computed for the (scaled) training rows using the
# tree-specific explainer built from the fitted model.
explainer = shap.TreeExplainer(model)
shap_vals = explainer.shap_values(X_train)

# Draw the default summary plot first, then the "bar" variant. X_train is a
# plain array here, so the column names are supplied explicitly.
for plot_kind in (None, "bar"):
    shap.summary_plot(
        shap_vals,
        X_train,
        feature_names=features,
        plot_type=plot_kind,
    )

# ===============================================================
# 7. LIME LOCAL EXPLANATION
# ===============================================================

# LIME explainer built from the scaled training data; class 0 is labelled
# "Down" and class 1 "Up", and continuous features are discretized.
lime_exp = LimeTabularExplainer(
    training_data=X_train,
    feature_names=features,
    class_names=["Down", "Up"],
    discretize_continuous=True,
)

# Explain a single held-out row (position 10 of the scaled test set) using
# all six features.
instance = X_test[10]
lime_ex = lime_exp.explain_instance(
    data_row=instance,
    predict_fn=model.predict_proba,
    num_features=6,
)

print("\nLIME Explanation:")
print(lime_ex.as_list())
