# Model A v1.4 - Simplified Training with Pre-Built Features

**Improvements over v1.3:**
- ✅ Uses pre-built extended feature dataset (1.12M samples)
- ✅ NEW technical features: mom_1, mom_3, vol_30, vol_ratio_30_90
- ✅ Class weighting for balanced predictions
- ✅ 35 features with >40% coverage
- ✅ 2,394 symbols

**Target**: 64-66% ROC-AUC (baseline: 60.3%)

**Instructions**: Upload `featureset_extended_latest.parquet` to Colab Files before running!

## Cell 1: Install Dependencies

In [None]:
%%capture
# Install the notebook's dependencies; %%capture hides pip's output.
# Only lightgbm is version-pinned (4.1.0); the other packages are unpinned.
!pip install lightgbm==4.1.0 pandas numpy scikit-learn joblib pyarrow

## Cell 2: Setup & Load Dataset

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score, mean_squared_error
import joblib
import json
from datetime import datetime
from zipfile import ZipFile

print('üîÑ Loading pre-built extended feature dataset...')
df = pd.read_parquet('featureset_extended_latest.parquet')

print(f'‚úÖ Dataset loaded:')
print(f'   Shape: {df.shape}')
print(f'   Symbols: {df["symbol"].nunique()}')
print(f'   Date range: {df["date"].min()} to {df["date"].max()}')

## Cell 3: Feature Selection

In [None]:
# Candidate features, grouped by family. A candidate is only used when its
# column exists in `df` and enough of its rows hold a usable value.
TECHNICAL_FEATURES = [
    'ret_1d', 'mom_1', 'mom_3', 'mom_6', 'mom_12_1',
    'vol_30', 'vol_60', 'vol_90', 'vol_ratio_30_90',
    'adv_20_median', 'adv_zscore',
    'trend_200', 'sma200_slope', 'sma200_slope_pos',
    'atr_pct', 'volume_skew_60'
]

FUNDAMENTAL_FEATURES = [
    'pe_ratio', 'pb_ratio', 'eps', 'market_cap',
    'pe_ratio_zscore', 'pb_ratio_zscore'
]

ALL_FEATURES = TECHNICAL_FEATURES + FUNDAMENTAL_FEATURES

# Minimum fraction of usable rows a candidate needs to be selected (inclusive).
MIN_COVERAGE = 0.40

# Coverage = share of rows holding a finite, non-missing value. +/-inf is
# counted as missing because the data-preparation step converts it to NaN and
# drops those rows; counting it as present would overstate coverage.
# Candidates that are absent from the dataset get coverage 0.
feature_coverage = {
    f: (
        df[f].replace([np.inf, -np.inf], np.nan).notna().mean()
        if f in df.columns
        else 0
    )
    for f in ALL_FEATURES
}

# Keep candidates that (1) exist in the dataset and (2) have at least
# MIN_COVERAGE usable rows.
FEATURES = [
    f for f in ALL_FEATURES
    if f in df.columns and feature_coverage[f] >= MIN_COVERAGE
]

# Per-candidate report. Candidates missing from the dataset are listed too, so
# a renamed or dropped column does not disappear from the report silently.
print('Feature Coverage:')
for f in ALL_FEATURES:
    status = '‚úÖ' if f in FEATURES else '‚ùå'
    feat_type = 'Tech' if f in TECHNICAL_FEATURES else 'Fund'
    if f in df.columns:
        detail = f'{feature_coverage[f] * 100:5.1f}%'
    else:
        detail = 'not in dataset'
    print(f'  {status} [{feat_type:4s}] {f:25s}: {detail}')

n_technical_selected = sum(f in TECHNICAL_FEATURES for f in FEATURES)
n_fundamental_selected = sum(f in FUNDAMENTAL_FEATURES for f in FEATURES)

print(f'\n‚úÖ Selected {len(FEATURES)} features')
print(f'   Technical: {n_technical_selected}')
print(f'   Fundamental: {n_fundamental_selected}')

# Target columns: the classifier is trained on TARGET_CLASS and the regressor
# on TARGET_REG. NOTE(review): presumably the sign and the value of the
# 1-month forward return — confirm against the dataset builder.
TARGET_CLASS = 'return_1m_fwd_sign'
TARGET_REG = 'return_1m_fwd'

## Cell 4: Data Preparation

In [None]:
# Clean data: treat +/-inf as missing, then keep only rows where every
# selected feature and both targets are present.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=FEATURES + [TARGET_CLASS, TARGET_REG])

# TimeSeriesSplit (used for cross-validation) splits purely by row position
# and assumes rows are in chronological order. The dataset mixes multiple
# symbols, so order the rows by date explicitly; otherwise a fold could train
# on rows that are later in time than its validation rows. The stable sort
# keeps the existing within-date order and changes nothing if the data is
# already sorted by date.
df = df.sort_values('date', kind='mergesort')

# Feature matrix and the two targets, all aligned on the same rows.
X = df[FEATURES]
y_class = df[TARGET_CLASS]
y_reg = df[TARGET_REG]

# Class balance; the labels are expected to be 0 (down) and 1 (up).
is_down = y_class == 0
is_up = y_class == 1

print('‚úÖ Dataset prepared:')
print(f'   Samples: {len(df):,}')
print(f'   Symbols: {df["symbol"].nunique()}')
print(f'   Features: {len(FEATURES)}')
print('\n   Class distribution:')
print(f'     Down (0): {is_down.sum():,} ({is_down.mean() * 100:.1f}%)')
print(f'     Up   (1): {is_up.sum():,} ({is_up.mean() * 100:.1f}%)')

## Cell 5: Train with Cross-Validation

In [None]:
print('üöÄ Training with 12-fold TimeSeriesSplit + Class Weighting\n')

tscv = TimeSeriesSplit(n_splits=12)
auc_scores = []
rmse_scores = []

for fold, (train_idx, val_idx) in enumerate(tscv.split(X), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_class.iloc[train_idx], y_class.iloc[val_idx]
    
    # Compute class weights
    class_counts = y_train.value_counts()
    weight_0 = len(y_train) / (2 * class_counts[0])
    weight_1 = len(y_train) / (2 * class_counts[1])
    
    # Classifier
    clf = lgb.LGBMClassifier(
        n_estimators=600,
        learning_rate=0.03,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.2,
        reg_lambda=0.4,
        class_weight={0: weight_0, 1: weight_1},
        random_state=fold,
        verbose=-1
    )
    clf.fit(X_train, y_train)
    val_pred = clf.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, val_pred)
    auc_scores.append(auc)
    
    # Regressor
    reg = lgb.LGBMRegressor(
        n_estimators=600,
        learning_rate=0.05,
        num_leaves=48,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=fold,
        verbose=-1
    )
    reg.fit(X_train, y_reg.iloc[train_idx])
    val_reg = reg.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_reg.iloc[val_idx], val_reg))
    rmse_scores.append(rmse)
    
    print(f'Fold {fold:2d}: ROC-AUC = {auc:.4f}, RMSE = {rmse:.4f}')

mean_auc = np.mean(auc_scores)
std_auc = np.std(auc_scores)
mean_rmse = np.mean(rmse_scores)
std_rmse = np.std(rmse_scores)

print(f'\n' + '='*60)
print(f'‚úÖ CROSS-VALIDATION RESULTS:')
print(f'   ROC-AUC: {mean_auc:.4f} ¬± {std_auc:.4f}')
print(f'   RMSE:    {mean_rmse:.4f} ¬± {std_rmse:.4f}')
print(f'='*60)

# Comparison to baselines
baseline_v1_2 = 0.6030
baseline_v1_3 = 0.6052
improvement_v1_2 = (mean_auc - baseline_v1_2) * 100
improvement_v1_3 = (mean_auc - baseline_v1_3) * 100

print(f'\nüìä vs Baselines:')
print(f'   v1.2 (baseline): {baseline_v1_2:.4f}  ‚Üí  {improvement_v1_2:+.2f} pp')
print(f'   v1.3 (prev):     {baseline_v1_3:.4f}  ‚Üí  {improvement_v1_3:+.2f} pp')

if mean_auc >= 0.64:
    print(f'\nüéâ TARGET ACHIEVED! (‚â•64%)')
elif mean_auc >= 0.62:
    print(f'\n‚úÖ Good progress! Close to target.')
else:
    print(f'\n‚ö†Ô∏è  Below target. Review feature importance.')

## Cell 6: Train Final Models

In [None]:
print('üèãÔ∏è Training final models on full dataset...\n')

# Class weights for full dataset
class_counts_full = y_class.value_counts()
weight_0_full = len(y_class) / (2 * class_counts_full[0])
weight_1_full = len(y_class) / (2 * class_counts_full[1])

# Final classifier
clf_final = lgb.LGBMClassifier(
    n_estimators=800,
    learning_rate=0.03,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.4,
    class_weight={0: weight_0_full, 1: weight_1_full},
    random_state=42,
    verbose=-1
)
clf_final.fit(X, y_class)
print('‚úÖ Classifier trained')

# Final regressor
reg_final = lgb.LGBMRegressor(
    n_estimators=600,
    learning_rate=0.05,
    num_leaves=48,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)
reg_final.fit(X, y_reg)
print('‚úÖ Regressor trained')

# Feature importance
importance_df = pd.DataFrame({
    'feature': FEATURES,
    'importance': clf_final.feature_importances_
}).sort_values('importance', ascending=False)

print(f'\nüìä Top 15 Features:')
for idx, row in importance_df.head(15).iterrows():
    feat_type = 'üìà Tech' if row['feature'] in TECHNICAL_FEATURES else 'üí∞ Fund'
    print(f'   {feat_type:7s} {row["feature"]:25s}: {row["importance"]:,.0f}')

## Cell 7: Save Models & Artifacts

In [None]:
print('üíæ Saving models and metadata...\n')

# Save models
joblib.dump(clf_final, 'model_a_v1_4_classifier.pkl')
joblib.dump(reg_final, 'model_a_v1_4_regressor.pkl')
print('‚úÖ Models saved')

# Save features
with open('model_a_v1_4_features.json', 'w') as f:
    json.dump({'features': FEATURES}, f, indent=2)
print('‚úÖ Features saved')

# Save metrics
metrics = {
    'model_version': 'v1_4',
    'improvements': [
        'Used pre-built extended feature dataset (1.12M samples)',
        'Added short-term momentum (mom_1, mom_3)',
        'Added multi-timeframe volatility (vol_30, vol_60, vol_ratio)',
        'Added SMA200 slope',
        'Class weighting for balanced predictions'
    ],
    'roc_auc_mean': float(mean_auc),
    'roc_auc_std': float(std_auc),
    'rmse_mean': float(mean_rmse),
    'rmse_std': float(std_rmse),
    'improvement_vs_v1_2': float(improvement_v1_2),
    'improvement_vs_v1_3': float(improvement_v1_3),
    'cv_folds': 12,
    'trained_at': datetime.utcnow().isoformat(),
    'n_samples': int(len(df)),
    'n_symbols': int(df['symbol'].nunique()),
    'n_features': len(FEATURES),
    'features': FEATURES,
    'feature_types': {
        'technical': len([f for f in FEATURES if f in TECHNICAL_FEATURES]),
        'fundamental': len([f for f in FEATURES if f in FUNDAMENTAL_FEATURES])
    },
    'top_10_features': importance_df.head(10)[['feature', 'importance']].to_dict('records')
}

with open('model_a_v1_4_metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)
print('‚úÖ Metrics saved')

# Create ZIP
with ZipFile('model_a_v1_4_artifacts.zip', 'w') as zipf:
    zipf.write('model_a_v1_4_classifier.pkl')
    zipf.write('model_a_v1_4_regressor.pkl')
    zipf.write('model_a_v1_4_features.json')
    zipf.write('model_a_v1_4_metrics.json')

print('\n' + '='*60)
print('‚úÖ ALL ARTIFACTS SAVED')
print('='*60)
print('\nüì¶ Download: model_a_v1_4_artifacts.zip')
print('\nüìä Model v1.4 Summary:')
print(f'   ROC-AUC:     {mean_auc:.4f}')
print(f'   vs v1.2:     {improvement_v1_2:+.2f} pp')
print(f'   vs v1.3:     {improvement_v1_3:+.2f} pp')
print(f'   Features:    {len(FEATURES)} ({len([f for f in FEATURES if f in TECHNICAL_FEATURES])} tech + {len([f for f in FEATURES if f in FUNDAMENTAL_FEATURES])} fund)')
print(f'   Samples:     {len(df):,}')
print(f'   Symbols:     {df["symbol"].nunique()}')