# Cryptocurrency Classification Training
This notebook demonstrates a from-scratch training pipeline to predict bullish hours using engineered features.

## 1. Generate or load dataset

In [None]:
from pathlib import Path
from utils.build_dataset import generate_dataset

# Locations and version tag shared by the dataset generation step.
RAW_CSV = Path('data/raw/BTCUSDT_1h.csv')
OUTPUT_DIR = Path('data/processed/classification')
VERSION = 'v2'

# Regenerate unless BOTH the feature and label files are present. Checking
# only the feature file would skip generation after an interrupted run that
# left X on disk without y, and the loading step would then fail.
expected_files = [
    OUTPUT_DIR / f'{prefix}_{VERSION}.parquet' for prefix in ('X', 'y')
]
if not all(path.exists() for path in expected_files):
    generate_dataset(
        raw_path=str(RAW_CSV),
        output_dir=str(OUTPUT_DIR),
        version=VERSION,
        task='classification',
        horizon=3,
        ml_logger=None,
    )


## 2. Load processed features and labels

In [None]:
import pandas as pd

# Reuse OUTPUT_DIR and VERSION from the generation cell so the files loaded
# here are always the ones that cell checked for or created; hard-coding the
# path and 'v2' again would silently load stale data after a version bump.
base = OUTPUT_DIR
X = pd.read_parquet(base / f'X_{VERSION}.parquet')
# Squeeze only the column axis: a bare squeeze() would also collapse a
# single-row frame to a scalar, which breaks the .loc alignment below.
y = pd.read_parquet(base / f'y_{VERSION}.parquet').squeeze('columns')

# Sort features by index (presumably timestamps -- confirm against the
# dataset builder), then reorder the labels to match row for row.
X = X.sort_index()
y = y.loc[X.index]


## 3. Train/test split

In [None]:
# Positional hold-out split: the first 80% of rows train the models and the
# remaining rows are kept for testing. No shuffling is applied.
split_idx = int(0.8 * len(X))
train_rows = slice(None, split_idx)
test_rows = slice(split_idx, None)

X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]


## 4. Hyperparameter search

In [None]:
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import numpy as np

# Candidate estimators and the hyperparameter values sampled for each.
lr = LogisticRegression(max_iter=1000)
param_dist_lr = dict(
    C=np.logspace(-3, 1, num=20),
    penalty=['l2'],
)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
param_dist_xgb = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
)

# Both searches share the same settings: 10 sampled configurations,
# 3-fold TimeSeriesSplit, ROC AUC scoring, all cores, fixed seed.
cv = TimeSeriesSplit(n_splits=3)
search_options = dict(
    n_iter=10,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
)

# fit() returns the search object itself, so the fitted search is bound directly.
search_lr = RandomizedSearchCV(lr, param_dist_lr, **search_options).fit(X_train, y_train)
search_xgb = RandomizedSearchCV(xgb, param_dist_xgb, **search_options).fit(X_train, y_train)


## 5. Evaluate the best estimators on the test set

In [None]:
# Best estimator from each randomized search, keyed by display name.
models = {
    'LogisticRegression': search_lr.best_estimator_,
    'XGBClassifier': search_xgb.best_estimator_,
}

# Metrics computed from hard 0/1 predictions; ROC AUC is handled separately
# because it is scored on the positive-class probabilities instead.
threshold_metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

results = {}
for name, model in models.items():
    proba = model.predict_proba(X_test)[:, 1]
    # Label a row positive only when its probability is strictly above 0.5.
    preds = (proba > 0.5).astype(int)
    scores = {'roc_auc': roc_auc_score(y_test, proba)}
    for label, metric in threshold_metrics.items():
        scores[label] = metric(y_test, preds)
    results[name] = scores

results


## 6. Save the best model

In [None]:
from joblib import dump

# Keep the model with the highest test-set ROC AUC.
# NOTE(review): choosing the winner on the test split means the reported
# test metrics are no longer an unbiased estimate -- confirm this is intended.
best_name = max(results, key=lambda k: results[k]['roc_auc'])
best_model = models[best_name]

# Tag the artifact with VERSION (not a hard-coded 'v2') so the saved model
# always names the dataset version it was trained on.
model_path = Path('models') / f'classification_{best_name.lower()}_{VERSION}.joblib'
model_path.parent.mkdir(parents=True, exist_ok=True)
dump(best_model, model_path)
print('Saved', model_path)
