# SGDRegressor
# Import Dependencies
This notebook retraces the four-hour Solana feature engineering flow and trains an `SGDRegressor` on minute-ahead price deltas using the same structure as the legacy Random Forest walkthrough.

In [None]:
import os
import sys
import random
from pathlib import Path
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

sns.set_style("darkgrid")
plt.rcParams["figure.figsize"] = (12, 6)


def set_seed(seed: int = 42) -> None:
    """Seed the global RNGs and export ``PYTHONHASHSEED``.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed`` and
            written, as a string, to the ``PYTHONHASHSEED`` environment variable.
    """
    # CPython reads PYTHONHASHSEED at interpreter start-up, so this export
    # influences processes launched afterwards rather than hashing in the
    # interpreter that is already running.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)


def find_project_root() -> Path:
    """Return the nearest directory, from the CWD upwards, that holds ``src/mlProject``.

    The resolved current working directory is checked first, then each of its
    parents in order from closest to farthest.

    Returns:
        The first directory that contains ``src/mlProject``.

    Raises:
        RuntimeError: If neither the working directory nor any ancestor contains it.
    """
    start = Path.cwd().resolve()
    marker = Path("src") / "mlProject"
    root = next(
        (folder for folder in (start, *start.parents) if (folder / marker).exists()),
        None,
    )
    if root is None:
        raise RuntimeError("Unable to locate project root containing src/mlProject")
    return root


def ensure_pythonpath(root: Path) -> None:
    """Append ``<root>/src`` to ``sys.path`` unless it is already listed.

    Args:
        root: Directory whose ``src`` child should become importable.
    """
    entry = str(root / "src")
    if entry in sys.path:
        # Already registered; avoid stacking duplicates on repeated cell runs.
        return
    sys.path.append(entry)


def configure_environment(seed: int = 42) -> Path:
    """Prepare the notebook session and report where the project lives.

    Seeds the RNGs via ``set_seed``, locates the repository with
    ``find_project_root`` and registers its ``src`` directory on ``sys.path``
    through ``ensure_pythonpath``.

    Args:
        seed: Seed forwarded to ``set_seed``.

    Returns:
        The directory returned by ``find_project_root``.
    """
    set_seed(seed)
    root = find_project_root()
    ensure_pythonpath(root)
    return root


# Bootstrap the session once with the default seed; later cells build paths
# from PROJECT_ROOT.
PROJECT_ROOT = configure_environment()
print(f"Project root: {PROJECT_ROOT}")

# Fetch Last Four Hours of 1-Minute Data
We re-use the CryptoCompare minute endpoint to collect a synchronized four-hour SOL/USD window so the SGDRegressor sees the same inputs as the production pipelines.

In [None]:
CRYPTOCOMPARE_URL = "https://min-api.cryptocompare.com/data/v2/histominute"


def fetch_minute_data(symbol: str = "SOL", quote: str = "USD", minutes: int = 240) -> pd.DataFrame:
    """Download recent one-minute candles from the CryptoCompare ``histominute`` endpoint.

    Args:
        symbol: Base asset ticker; upper-cased and sent as ``fsym``.
        quote: Quote currency ticker; upper-cased and sent as ``tsym``.
        minutes: Value sent as the endpoint's ``limit`` parameter. Must be >= 1.

    Returns:
        A frame indexed by a UTC ``datetime`` index in ascending order with the
        columns ``price`` (candle ``close``), ``volume`` (``volumefrom``) and
        ``market_cap`` (``volumeto``).

    Raises:
        ValueError: If ``minutes`` is smaller than 1.
        requests.HTTPError: If the HTTP response carries an error status.
        RuntimeError: If the API reports a failure or the payload holds no usable candles.
    """
    # Reject nonsensical windows before spending a network round-trip.
    if minutes < 1:
        raise ValueError(f"minutes must be a positive integer, got {minutes!r}")

    params = {
        "fsym": symbol.upper(),
        "tsym": quote.upper(),
        "limit": minutes,
        "aggregate": 1,
    }
    response = requests.get(CRYPTOCOMPARE_URL, params=params, timeout=15)
    response.raise_for_status()
    payload = response.json()
    if payload.get("Response") != "Success":
        raise RuntimeError(f"CryptoCompare API error: {payload.get('Message')}")

    # A "Success" envelope with no rows would otherwise surface as an opaque
    # KeyError when the "time" column is looked up below.
    data = payload.get("Data")
    rows = data.get("Data") if isinstance(data, dict) else None
    if not rows:
        raise RuntimeError("CryptoCompare API returned no candle data")

    frame = pd.DataFrame(rows)
    required = {"time", "close", "volumefrom", "volumeto"}
    missing = sorted(required.difference(frame.columns))
    if missing:
        raise RuntimeError(f"CryptoCompare payload is missing fields: {missing}")

    frame["datetime"] = pd.to_datetime(frame["time"], unit="s", utc=True)
    # NOTE(review): "market_cap" is populated from `volumeto`; the name is kept
    # because the feature list later in this file selects that column.
    frame = frame.rename(
        columns={
            "close": "price",
            "volumefrom": "volume",
            "volumeto": "market_cap",
        }
    )
    frame = frame[["datetime", "price", "volume", "market_cap"]]
    frame = frame.set_index("datetime").sort_index()
    return frame


# Fetch with the function defaults (SOL / USD, limit of 240).
raw_minute_df = fetch_minute_data()
# NOTE(review): a notebook cell only renders its last bare expression, so this
# `.tail()` preview is presumably never displayed — confirm whether it should be.
raw_minute_df.tail()
raw_minute_df.shape

# Preprocess & Align Time Series
We align the raw candles to a continuous UTC minute index, interpolating any gaps to maintain evenly spaced samples for the downstream feature pipeline.

In [None]:
def align_minute_frame(frame: pd.DataFrame) -> pd.DataFrame:
    """Re-grid candles onto a gap-free one-minute UTC index.

    Duplicate timestamps keep their last row. The target grid ends at the
    latest timestamp and has as many rows as the de-duplicated input, so the
    window length stays fixed: when the input has gaps, the earliest rows fall
    off the front of the grid instead of lengthening it. Rows whose timestamps
    do not land on the grid are dropped by the reindex. Missing grid slots are
    filled by time-weighted interpolation, then back- and forward-filled.

    Args:
        frame: Candles indexed by a ``DatetimeIndex``. A tz-naive index is
            interpreted as UTC; a tz-aware one is converted to UTC.

    Returns:
        A new frame on the one-minute UTC grid, or an empty copy of ``frame``
        when it has no rows.

    Raises:
        TypeError: If a non-empty ``frame`` is not indexed by a ``DatetimeIndex``.
    """
    # Nothing to align; an empty index has no maximum to anchor the grid on.
    if len(frame) == 0:
        return frame.copy()
    if not isinstance(frame.index, pd.DatetimeIndex):
        raise TypeError("align_minute_frame expects a frame indexed by a DatetimeIndex")

    # Normalise to UTC first so the reindex below compares like with like;
    # a tz-naive index would not match the tz-aware grid.
    source_index = frame.index
    if source_index.tz is None:
        utc_index = source_index.tz_localize("UTC")
    else:
        utc_index = source_index.tz_convert("UTC")
    cleaned = frame.copy()
    cleaned.index = utc_index
    cleaned = cleaned[~cleaned.index.duplicated(keep="last")]

    full_index = pd.date_range(
        end=cleaned.index.max(),
        periods=len(cleaned),
        freq="1min",
        tz="UTC",
    )
    aligned = cleaned.reindex(full_index)
    # Interior gaps are interpolated; bfill/ffill cover any edge slots left over.
    return aligned.interpolate(method="time").bfill().ffill()


# Re-grid the fetched candles; the trailing expression previews the first rows.
aligned_minute_df = align_minute_frame(raw_minute_df)
aligned_minute_df.head()

# Feature Engineering Pipeline
We enrich the aligned series with the project technical indicators, create minute-ahead targets, and compute price deltas that the SGDRegressor will learn.

In [None]:
from mlProject.entity.config_entity import DataIngestionConfig
from mlProject.components.crypto_data_ingestion import CryptoDataIngestion

# Scratch directory for the ingestion config; created on demand.
TEMP_ARTIFACT_DIR = PROJECT_ROOT / "artifacts" / "notebook_tmp"
TEMP_ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# The ingestion component is only used here for its feature helpers, so the
# download-related settings point at placeholder locations.
_ingestion_kwargs = {
    "root_dir": TEMP_ARTIFACT_DIR,
    "source_url": "",
    "local_data_file": TEMP_ARTIFACT_DIR / "minute_data.zip",
    "unzip_dir": TEMP_ARTIFACT_DIR / "unzipped",
}
ingestion_config = DataIngestionConfig(**_ingestion_kwargs)
ingestor = CryptoDataIngestion(ingestion_config)

# Work on a copy so the aligned frame itself is not handed to the project helpers.
feature_enriched_df = ingestor.create_prediction_targets(
    ingestor.add_technical_indicators(aligned_minute_df.copy())
)
# Rows without a minute-ahead target are dropped; any remaining NaNs become 0.0.
feature_enriched_df = feature_enriched_df.dropna(subset=["target_price_1min"])
feature_enriched_df = feature_enriched_df.fillna(0.0)

# The regression target: next-minute price minus the current price.
feature_enriched_df["target_delta_1min"] = feature_enriched_df["target_price_1min"].sub(
    feature_enriched_df["price"]
)

# Model inputs, in the column order used to build the feature matrix.
CLEAN_FEATURES = [
    # raw candle fields
    "price",
    "volume",
    "market_cap",
    # moving averages
    "sma_7",
    "sma_14",
    "sma_30",
    "ema_7",
    "ema_14",
    # MACD family
    "macd",
    "macd_signal",
    "macd_histogram",
    "rsi",
    # Bollinger bands
    "bb_middle",
    "bb_upper",
    "bb_lower",
    # price changes
    "price_change_1h",
    "price_change_24h",
    "price_change_7d",
    # volume statistics
    "volume_sma",
    "volume_ratio",
    "volatility",
    # range features
    "high_14d",
    "low_14d",
    "price_position",
]

features = feature_enriched_df[CLEAN_FEATURES].values
base_prices = feature_enriched_df["price"].values
target_prices = feature_enriched_df["target_price_1min"].values
target_deltas = feature_enriched_df["target_delta_1min"].values

print(f"Features shape: {features.shape}")
print(f"Target deltas shape: {target_deltas.shape}")

# Train SGDRegressor
We split the series chronologically, scale inputs with a `StandardScaler`, and fit an `SGDRegressor` tuned for the same delta target the Streamlit pipeline consumes.

In [None]:
# Chronological hold-out: the first TRAIN_FRACTION of rows trains the model and
# the remainder validates it, so no future rows are used for fitting.
TRAIN_FRACTION = 0.8
split_index = int(len(features) * TRAIN_FRACTION)
X_train, X_valid = features[:split_index], features[split_index:]
y_train = target_deltas[:split_index]
y_valid = target_deltas[split_index:]
base_valid = base_prices[split_index:]
# Ground-truth next-minute prices for the validation rows (used by every metric).
price_valid = target_prices[split_index:]

# Fail fast with an actionable message instead of letting the scaler reject a
# zero-row matrix during fit.
if len(X_train) == 0:
    raise ValueError("Training window empty; collect more data before fitting.")

sgd_pipeline = Pipeline(
    steps=[
        # Features are standardised before they reach the SGD step.
        ("scaler", StandardScaler()),
        (
            "sgd",
            SGDRegressor(
                loss="squared_error",
                penalty="l2",
                alpha=1e-4,
                learning_rate="invscaling",
                eta0=1e-2,
                power_t=0.25,
                max_iter=2000,
                tol=1e-4,
                random_state=42,
                shuffle=True,
            ),
        ),
    ],
    memory=None,
)
sgd_pipeline.fit(X_train, y_train)

if len(X_valid) == 0:
    print("Validation window empty; collect more data before evaluating.")
else:
    # The model predicts deltas; add the current price back to score on price level.
    delta_valid_pred = sgd_pipeline.predict(X_valid)
    price_valid_pred = base_valid + delta_valid_pred
    mae = mean_absolute_error(price_valid, price_valid_pred)
    rmse = float(np.sqrt(mean_squared_error(price_valid, price_valid_pred)))
    r2 = r2_score(price_valid, price_valid_pred)
    print(f"Validation MAE : {mae:.6f}")
    print(f"Validation RMSE: {rmse:.6f}")
    print(f"Validation R^2 : {r2:.6f}")

# Evaluate Predictions
We visualize the minute-ahead forecasts against ground truth to confirm the SGDRegressor tracks the delta dynamics before wiring the artifact into Streamlit.

In [None]:
# Overlay actual and predicted next-minute prices for the validation rows;
# skipped entirely when there is nothing to validate against.
if len(X_valid) > 0:
    valid_timestamps = feature_enriched_df.index[split_index:]
    fig, ax = plt.subplots()
    curves = (
        ("Actual", target_prices[split_index:]),
        ("SGDRegressor", price_valid_pred),
    )
    for curve_label, curve_values in curves:
        ax.plot(valid_timestamps, curve_values, label=curve_label, linewidth=2)
    ax.set_title("SGDRegressor Minute-Ahead Forecast")
    ax.set_xlabel("Timestamp")
    ax.set_ylabel("Price (USD)")
    ax.legend()
    fig.tight_layout()
    plt.show()