# 📈 Google Stock ML - Main Pipeline

Modular ML pipeline for predicting Google (GOOGL) stock next-day returns.

## Models
- **XGBoost** — Gradient boosting with 3-stage HPO
- **LSTM & GRU** — Recurrent neural networks
- **Hybrid** — Sequential & Parallel architectures

---
## 1. Setup & Configuration

In [None]:
# Detect Google Colab and make the project importable.
# On Colab, Drive is mounted and the project root is taken from MyDrive;
# anywhere else the current working directory is used as the project root.
import sys

IN_COLAB = "google.colab" in sys.modules

if not IN_COLAB:
    # Local development
    PROJECT_PATH = '.'
else:
    from google.colab import drive

    drive.mount('/content/drive')
    PROJECT_PATH = '/content/drive/MyDrive/google_stock_ml'

# Put the project root first on the import path (both environments).
sys.path.insert(0, PROJECT_PATH)

In [None]:
# Install dependencies if needed
!pip install -q yfinance xgboost optuna tensorflow scikit-learn pandas-datareader

In [None]:
# Imports
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from datetime import datetime
from pathlib import Path

# Project modules
from src.config import setup_paths, get_default_params
from src.utils import (
    save_json, save_pickle, load_pickle, copy_file,
    compute_sample_weights, save_run_outputs
)
from src.data.loaders import load_all_data
from src.features.engineering import build_all_features, add_target

print("[OK] Imports complete.")

In [None]:
# Resolve the run's directory layout and its default parameters.
# NOTE(review): the project roots passed here end in "my_project", while the
# import root added to sys.path on Colab ends in "google_stock_ml" — confirm
# that the two are meant to differ.
paths = setup_paths(
    local_project_root="/content/my_project",
    drive_project_root="/content/drive/MyDrive/my_project",
)
RUN_PARAMS = get_default_params(paths)

# Pull the frequently used entries out of the paths mapping
RUN_ID, PROJECT_ROOT = paths["RUN_ID"], paths["PROJECT_ROOT"]
LOCAL_PATHS, DRIVE_PATHS = paths["LOCAL_PATHS"], paths["DRIVE_PATHS"]
DATA_DIRS_LOCAL, DATA_DIRS_DRIVE = paths["DATA_DIRS_LOCAL"], paths["DATA_DIRS_DRIVE"]

print(f"[CONFIG] RUN_ID: {RUN_ID}")
print(f"[CONFIG] PROJECT_ROOT: {PROJECT_ROOT}")

# Persist the run parameters: local config dir first, then the Drive one
for config_root in (LOCAL_PATHS, DRIVE_PATHS):
    save_json(RUN_PARAMS, config_root["config_dir"] / "run_params.json")

print("[OK] Configuration complete.")

---
## 2. Load Data

In [None]:
# Tickers whose prices are requested; GOOGL is passed as the base ticker below.
PRICE_TICKERS = [
    "GOOGL",
    "MSFT",
    "NVDA",
    "^IXIC",
    "SPY",
    "QQQ",
    "^VIX",
    "^TNX",
    "XLK",
]

# Window: configured start date up to today's date (naive local clock),
# formatted as YYYY-MM-DD.
start = RUN_PARAMS["data"]["start_date"]
end = f"{datetime.now():%Y-%m-%d}"

# eu_config is None when RUN_PARAMS has no "eu_break_close" entry.
full_df = load_all_data(
    base_ticker="GOOGL",
    price_tickers=PRICE_TICKERS,
    start=start,
    end=end,
    load_macro=True,
    eu_config=RUN_PARAMS.get("eu_break_close"),
)

print(f"[OK] Data loaded. Shape: {full_df.shape}")

---
## 3. Feature Engineering

In [None]:
# Replace the raw frame with the feature-engineered one.
# build_all_features comes from src.features.engineering and is driven by
# RUN_PARAMS; the columns it adds are defined there.
full_df = build_all_features(full_df, RUN_PARAMS)

shape_after_features = full_df.shape
print(f"[OK] Features built. Shape: {shape_after_features}")

In [None]:
# Restrict the frame to the analysis period and to rows with a known target.
TARGET_COL = RUN_PARAMS["data"]["target_col"]
limit_date = RUN_PARAMS["data"]["limit_start_date"]

# Keep rows on/after the limit date, then drop those whose target is NaN.
in_period = full_df.index >= limit_date
full_df = full_df.loc[in_period].dropna(subset=[TARGET_COL])

first_date, last_date = full_df.index.min(), full_df.index.max()
print(f"[OK] After limiting: {full_df.shape}")
print(f"Date range: {first_date} to {last_date}")

---
## 4. Train/Valid/Test Split

In [None]:
# Chronological train / valid / test split driven by the configured boundaries.
split_cfg = RUN_PARAMS["data"]
train_end = split_cfg["train_end"]
valid_start = split_cfg["valid_start"]
valid_end = split_cfg["valid_end"]
test_start = split_cfg["test_start"]

# Compare on midnight-normalized timestamps so intraday times cannot push a
# row across a boundary. All boundaries are inclusive.
dates = full_df.index.normalize()
train_mask = dates <= train_end
valid_mask = (dates >= valid_start) & (dates <= valid_end)
test_mask = dates >= test_start

# Independent copies of each partition
train_df, valid_df, test_df = (
    full_df[mask].copy() for mask in (train_mask, valid_mask, test_mask)
)

# One summary line per partition: row count and covered date span
for prefix, part in (("Train: ", train_df), ("Valid: ", valid_df), ("Test:  ", test_df)):
    span_start = part.index.min().date()
    span_end = part.index.max().date()
    print(f"{prefix}{len(part)} ({span_start} to {span_end})")

In [None]:
# Build the feature matrices, targets and sample weights for each partition.
# Every column except the target and "sample_weight" is treated as a feature.
exclude_cols = [TARGET_COL, "sample_weight"]
feature_cols = [col for col in full_df.columns if col not in exclude_cols]

partitions = (train_df, valid_df, test_df)
X_train, X_valid, X_test = (part[feature_cols].copy() for part in partitions)
y_train, y_valid, y_test = (part[TARGET_COL].copy() for part in partitions)

# Weights are derived from each partition's own target with the same settings.
w_cfg = RUN_PARAMS["weights"]
weight_kwargs = {"c": w_cfg["c"], "max_w": w_cfg["max_w"]}
w_train, w_valid, w_test = (
    compute_sample_weights(target, **weight_kwargs)
    for target in (y_train, y_valid, y_test)
)

print(f"Features: {len(feature_cols)}")
print(f"[OK] Split complete.")

In [None]:
# Persist the splits under data/processed locally, then mirror them to Drive.
PROC_LOCAL = DATA_DIRS_LOCAL["processed"]
PROC_DRIVE = DATA_DIRS_DRIVE["processed"]

# Feature matrices go through DataFrame.to_pickle
feature_frames = {
    "X_train_xgb.pkl": X_train,
    "X_valid_xgb.pkl": X_valid,
    "X_test_xgb.pkl": X_test,
}
for filename, frame in feature_frames.items():
    frame.to_pickle(PROC_LOCAL / filename)

# Targets and weights go through the project's save_pickle helper
pickled_objects = {
    "y_train.pkl": y_train,
    "y_valid.pkl": y_valid,
    "y_test.pkl": y_test,
    "weights_train.pkl": w_train,
    "weights_valid.pkl": w_valid,
    "weights_test.pkl": w_test,
}
for filename, obj in pickled_objects.items():
    save_pickle(obj, PROC_LOCAL / filename)

# Mirror every *.pkl found in the local processed dir (not only the files
# written above) to the Drive processed dir.
for local_file in PROC_LOCAL.glob("*.pkl"):
    copy_file(local_file, PROC_DRIVE / local_file.name)

print("[OK] Splits saved.")

---
## 5. XGBoost Feature Selection

In [None]:
from src.models.xgboost_model import xgb_feature_selection

# Select features using the train and validation partitions only; the test
# partition is not passed in. Outputs are directed to the local fs_dir.
fs_local_dir = LOCAL_PATHS["fs_dir"]

selected_features, gain_df, perm_df = xgb_feature_selection(
    X_train, X_valid,
    y_train, y_valid,
    w_train, w_valid,
    output_dir=fs_local_dir,
    config=RUN_PARAMS["xgb_fs"],
)

# Mirror everything in the local feature-selection dir to Drive.
# NOTE(review): glob("*") also yields sub-directories — confirm copy_file
# copes with them, or that none are created.
fs_drive_dir = DRIVE_PATHS["fs_dir"]
for artifact in fs_local_dir.glob("*"):
    copy_file(artifact, fs_drive_dir / artifact.name)

print(f"\n[OK] Selected {len(selected_features)} features.")

In [None]:
# Narrow each feature matrix to the selected columns (independent copies).
X_train_sel, X_valid_sel, X_test_sel = (
    frame[selected_features].copy() for frame in (X_train, X_valid, X_test)
)

# Write all three locally first, then mirror exactly those files to Drive.
selected_frames = {
    "X_train_xgb_selected.pkl": X_train_sel,
    "X_valid_xgb_selected.pkl": X_valid_sel,
    "X_test_xgb_selected.pkl": X_test_sel,
}
for filename, frame in selected_frames.items():
    frame.to_pickle(PROC_LOCAL / filename)

for filename in selected_frames:
    copy_file(PROC_LOCAL / filename, PROC_DRIVE / filename)

print(f"[OK] Selected feature datasets saved. Shape: {X_train_sel.shape}")

---
## 6. XGBoost HPO

In [None]:
from src.models.xgboost_model import run_hpo, train_final_model

# Carve the validation partition into two date windows for HPO:
# one used for early stopping ("es") and one used for scoring ("score").
hpo_cfg = RUN_PARAMS["hpo"]
valid_es_start, valid_es_end, valid_score_start, valid_score_end = (
    hpo_cfg[key]
    for key in ("valid_es_start", "valid_es_end", "valid_score_start", "valid_score_end")
)

# Inclusive window bounds, compared on midnight-normalized dates
valid_dates = X_valid_sel.index.normalize()
es_mask = (valid_dates >= valid_es_start) & (valid_dates <= valid_es_end)
score_mask = (valid_dates >= valid_score_start) & (valid_dates <= valid_score_end)

# Apply each window to features, targets and weights alike
validation_parts = (X_valid_sel, y_valid, w_valid)
X_valid_es, y_valid_es, w_valid_es = (part[es_mask] for part in validation_parts)
X_valid_score, y_valid_score, w_valid_score = (part[score_mask] for part in validation_parts)

print(f"Valid ES: {len(X_valid_es)} | Valid Score: {len(X_valid_score)}")

In [None]:
# Run the XGBoost hyper-parameter search on the selected features.
# The early-stopping and scoring validation windows are passed separately.
best_params, all_trials = run_hpo(
    X_train_sel,
    X_valid_es, X_valid_score,
    y_train, y_valid_es, y_valid_score,
    w_train, w_valid_es, w_valid_score,
    config=hpo_cfg,
    random_state=RUN_PARAMS["random_state"]
)

# Save HPO results locally
ms_local_dir = LOCAL_PATHS["ms_dir"]
ms_drive_dir = DRIVE_PATHS["ms_dir"]
save_json(best_params, ms_local_dir / "best_params.json")
save_pickle(all_trials, ms_local_dir / "hpo_trials.pkl")

# Mirror both artifacts to Drive. The trial history is copied alongside the
# best parameters so it is not left only on the local disk, which does not
# outlive a Colab session.
for artifact_name in ("best_params.json", "hpo_trials.pkl"):
    copy_file(ms_local_dir / artifact_name, ms_drive_dir / artifact_name)

print("\n[OK] HPO complete. Best params saved.")

---
## 7. Train Final XGBoost Model

In [None]:
# Train the final XGBoost model with the best HPO parameters.
# All three partitions (with targets and weights) are handed to
# train_final_model, which returns the fitted model and a metrics mapping.
xgb_model, xgb_metrics = train_final_model(
    X_train_sel, X_valid_sel, X_test_sel,
    y_train, y_valid, y_test,
    w_train, w_valid, w_test,
    params=best_params,
    config=hpo_cfg
)

# Save model locally and mirror it to Drive
xgb_model.save_model(str(LOCAL_PATHS["models_dir"] / "xgb_final.json"))
copy_file(LOCAL_PATHS["models_dir"] / "xgb_final.json", DRIVE_PATHS["models_dir"] / "xgb_final.json")

# Save metrics locally and mirror them to Drive as well, so the metrics are
# kept next to the other Drive outputs instead of only on the local disk.
save_json(xgb_metrics, LOCAL_PATHS["outputs_dir"] / "xgb_metrics.json")
copy_file(LOCAL_PATHS["outputs_dir"] / "xgb_metrics.json", DRIVE_PATHS["outputs_dir"] / "xgb_metrics.json")

print("\n[OK] XGBoost training complete.")

---
## 8. Neural Network Training

In [None]:
from src.models.neural import (
    train_lstm,
    train_gru,
    train_hybrid_sequential,
    train_hybrid_parallel,
)

# Start the per-model results list with the XGBoost test metrics as its
# first row; the neural-network cells append one row each.
all_results = [
    {
        "run_id": RUN_ID,
        "model": "XGBoost",
        "feature_set": "xgb_selected",
        "test_wrmse": xgb_metrics["test_wrmse"],
        "test_diracc": xgb_metrics["test_diracc"],
    }
]

In [None]:
# Fit the LSTM on the XGBoost-selected features and record its test metrics.
# NOTE(review): output_dir is the local models dir and nothing here mirrors it
# to Drive — confirm whether the saved network should be copied as well.
lstm_model, lstm_results = train_lstm(
    X_train_sel, X_valid_sel, X_test_sel,
    y_train, y_valid, y_test,
    output_dir=LOCAL_PATHS["models_dir"],
    config=RUN_PARAMS["lstm"],
)

lstm_scores = lstm_results["metrics"]
all_results.append({
    "run_id": RUN_ID,
    "model": "LSTM",
    "feature_set": "xgb_selected",
    **{metric: lstm_scores[metric] for metric in ("test_wrmse", "test_diracc")},
})

In [None]:
# Fit the GRU on the XGBoost-selected features and record its test metrics.
# NOTE(review): output_dir is the local models dir and nothing here mirrors it
# to Drive — confirm whether the saved network should be copied as well.
gru_model, gru_results = train_gru(
    X_train_sel, X_valid_sel, X_test_sel,
    y_train, y_valid, y_test,
    output_dir=LOCAL_PATHS["models_dir"],
    config=RUN_PARAMS["gru"],
)

gru_scores = gru_results["metrics"]
all_results.append({
    "run_id": RUN_ID,
    "model": "GRU",
    "feature_set": "xgb_selected",
    **{metric: gru_scores[metric] for metric in ("test_wrmse", "test_diracc")},
})

In [None]:
# Fit the sequential hybrid model on the XGBoost-selected features and record
# its test metrics.
# NOTE(review): output_dir is the local models dir and nothing here mirrors it
# to Drive — confirm whether the saved network should be copied as well.
hybrid_seq_model, hybrid_seq_results = train_hybrid_sequential(
    X_train_sel, X_valid_sel, X_test_sel,
    y_train, y_valid, y_test,
    output_dir=LOCAL_PATHS["models_dir"],
    config=RUN_PARAMS["hybrid_seq"],
)

hybrid_seq_scores = hybrid_seq_results["metrics"]
all_results.append({
    "run_id": RUN_ID,
    "model": "Hybrid-Seq",
    "feature_set": "xgb_selected",
    **{metric: hybrid_seq_scores[metric] for metric in ("test_wrmse", "test_diracc")},
})

In [None]:
# Fit the parallel hybrid model on the XGBoost-selected features and record
# its test metrics.
# NOTE(review): output_dir is the local models dir and nothing here mirrors it
# to Drive — confirm whether the saved network should be copied as well.
hybrid_par_model, hybrid_par_results = train_hybrid_parallel(
    X_train_sel, X_valid_sel, X_test_sel,
    y_train, y_valid, y_test,
    output_dir=LOCAL_PATHS["models_dir"],
    config=RUN_PARAMS["hybrid_par"],
)

hybrid_par_scores = hybrid_par_results["metrics"]
all_results.append({
    "run_id": RUN_ID,
    "model": "Hybrid-Par",
    "feature_set": "xgb_selected",
    **{metric: hybrid_par_scores[metric] for metric in ("test_wrmse", "test_diracc")},
})

---
## 9. Results Summary

In [None]:
# Rank the collected models by test wRMSE, lowest (rank 1) first.
results_df = (
    pd.DataFrame(all_results)
    .sort_values("test_wrmse")
    .reset_index(drop=True)
)
results_df.insert(0, "rank", range(1, len(results_df) + 1))

banner = "=" * 60
print("\n" + banner)
print("RESULTS SUMMARY")
print(banner)
display(results_df)  # `display` is supplied by the notebook environment

# The top row after sorting is reported as the best model
best = results_df.iloc[0]
print(f"\nBest Model: {best['model']}")
print(f"Test wRMSE: {best['test_wrmse']:.6f}")
print(f"Test DirAcc: {best['test_diracc']:.4f}")

In [None]:
# Write the ranked summary as CSV (no index column) and mirror it to Drive.
summary_name = "results_summary.csv"
local_summary = LOCAL_PATHS["outputs_dir"] / summary_name

results_df.to_csv(local_summary, index=False)
copy_file(local_summary, DRIVE_PATHS["outputs_dir"] / summary_name)

print(f"\n[OK] Results saved to {LOCAL_PATHS['outputs_dir']}")
print(f"[OK] Pipeline complete!")