# Model A v1.3 - Improved Training with Extended Features

**Improvements over v1.2:**
- ✅ Extended feature set (fundamentals + technical)
- ✅ Class weighting for balanced predictions
- ✅ 50% fundamental coverage (1,502 symbols)

**Target**: 64-66% ROC-AUC (current baseline: 60.3%)

## Cell 1: Install Dependencies

In [None]:
%%capture
# Install the notebook's runtime dependencies; %%capture hides pip's output.
# Only lightgbm is pinned (4.1.0); every other package resolves to whatever
# version pip selects at install time.
!pip install lightgbm==4.1.0 psycopg2-binary pandas numpy scikit-learn joblib python-dotenv pyarrow

## Cell 2: Setup Database Connection

In [None]:
import os
import pandas as pd
import numpy as np
import psycopg2
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score, mean_squared_error
import joblib
import json
from datetime import datetime
from zipfile import ZipFile

# Database connection
# The connection string embeds the database password, so it must not live in
# the notebook source. It is read from the DATABASE_URL environment variable
# (set it in the runtime / secret store before running this cell).
# NOTE: the password that was previously hard-coded in this cell should be
# treated as compromised and rotated.
DATABASE_URL = os.environ.get('DATABASE_URL')
if not DATABASE_URL:
    # Fail here with a clear message rather than later inside psycopg2.connect.
    raise RuntimeError(
        'DATABASE_URL is not set. Export the PostgreSQL connection string as '
        'the DATABASE_URL environment variable before running this notebook.'
    )

print('‚úÖ Imports successful')

## Cell 3: Fetch Extended Feature Data from Database

In [None]:
print('üìä Fetching extended feature data from database...')

# Fetch technical features from prices
query_technical = """
WITH latest_data AS (
    SELECT 
        dt as date,
        symbol,
        close,
        volume,
        LAG(close, 1) OVER (PARTITION BY symbol ORDER BY dt) as close_lag1,
        LAG(close, 21) OVER (PARTITION BY symbol ORDER BY dt) as close_lag21,
        LAG(close, 63) OVER (PARTITION BY symbol ORDER BY dt) as close_lag63,
        LAG(close, 126) OVER (PARTITION BY symbol ORDER BY dt) as close_lag126,
        LAG(close, 252) OVER (PARTITION BY symbol ORDER BY dt) as close_lag252,
        LEAD(close, 21) OVER (PARTITION BY symbol ORDER BY dt) as close_fwd21
    FROM prices
    WHERE dt >= CURRENT_DATE - INTERVAL '36 months'
)
SELECT
    date,
    symbol,
    close,
    volume,
    (close - close_lag1) / NULLIF(close_lag1, 0) as ret_1d,
    (close - close_lag126) / NULLIF(close_lag126, 0) as mom_6,
    (close - close_lag252) / NULLIF(close_lag252, 0) as mom_12_1,
    (close_fwd21 - close) / NULLIF(close, 0) as return_1m_fwd
FROM latest_data
WHERE close_fwd21 IS NOT NULL
ORDER BY symbol, date
"""

conn = psycopg2.connect(DATABASE_URL)
df_tech = pd.read_sql(query_technical, conn)
print(f'‚úÖ Technical features: {len(df_tech):,} rows, {df_tech["symbol"].nunique()} symbols')

# Fetch fundamentals (latest per symbol)
query_fundamentals = """
SELECT DISTINCT ON (symbol)
    symbol,
    pe_ratio,
    pb_ratio,
    eps,
    market_cap,
    industry
FROM fundamentals
WHERE pe_ratio IS NOT NULL OR market_cap IS NOT NULL
ORDER BY symbol, updated_at DESC
"""

df_fund = pd.read_sql(query_fundamentals, conn)
conn.close()
print(f'‚úÖ Fundamentals: {len(df_fund):,} symbols')

# Merge fundamentals with technical
df = df_tech.merge(df_fund, on='symbol', how='left')
print(f'‚úÖ Merged dataset: {df.shape}')

## Cell 4: Compute Additional Technical Features

In [None]:
print('‚öôÔ∏è  Computing additional features...')

# Every rolling statistic below is computed per symbol over the rows in their
# current order; a window that is not yet full yields NaN.
# (assumes rows are ordered by date within each symbol - TODO confirm upstream)

# Volatility: 90-row rolling standard deviation of the 1-day return
df['vol_90'] = df.groupby('symbol')['ret_1d'].transform(lambda x: x.rolling(90).std())

# ADV (Average Daily Volume): 20-row rolling median volume times the current close
df['adv_20_median'] = (
    df.groupby('symbol')['volume'].transform(lambda x: x.rolling(20).median()) * df['close']
)

# SMA 200
df['sma_200'] = df.groupby('symbol')['close'].transform(lambda x: x.rolling(200).mean())
# A comparison against NaN is False, so an unmasked flag would encode "SMA not
# available yet" as 0 (below trend). Keep those rows as NaN so they are treated
# as missing instead of as a real bearish signal.
sma_available = df['sma_200'].notna()
df['trend_200'] = (df['close'] > df['sma_200']).astype(float).where(sma_available)

# SMA slope
def slope(series):
    """Return the least-squares linear slope of a window of values.

    The values are fitted with a degree-1 ``np.polyfit`` against their
    positions ``0, 1, ..., len(series) - 1``.

    Args:
        series: pandas Series holding the window values.

    Returns:
        The fitted slope, or ``np.nan`` when the window contains any
        missing value.
    """
    if series.isna().any():
        return np.nan
    values = series.values
    positions = np.arange(len(values))
    # polyfit returns coefficients highest degree first: [slope, intercept].
    return np.polyfit(positions, values, 1)[0]

# Slope of the 200-row SMA over the trailing 20 rows, computed per symbol
df['sma200_slope'] = df.groupby('symbol')['sma_200'].transform(
    lambda x: x.rolling(20).apply(slope, raw=False)
)
# NaN > 0 is False, so an unmasked flag would turn "slope not available" into
# 0 (non-positive slope). Keep those rows as NaN so they count as missing.
slope_available = df['sma200_slope'].notna()
df['sma200_slope_pos'] = (df['sma200_slope'] > 0).astype(float).where(slope_available)

# Target: 1 when the forward return is strictly positive, else 0.
# Rows whose forward return is NaN are labelled 0 here; they must be removed
# on 'return_1m_fwd' itself before training.
df['return_1m_fwd_sign'] = (df['return_1m_fwd'] > 0).astype(int)

print('‚úÖ Features computed')
print(f'\nColumns: {list(df.columns)}')

## Cell 5: Feature Selection & Data Preparation

In [None]:
# Candidate feature sets, grouped by origin

# Columns derived from price/volume history
TECHNICAL_FEATURES = [
    'ret_1d',
    'mom_6',
    'mom_12_1',
    'vol_90',
    'adv_20_median',
    'trend_200',
    'sma200_slope_pos',
]

# Columns taken from the fundamentals table
FUNDAMENTAL_FEATURES = [
    'pe_ratio',
    'pb_ratio',
    'eps',
    'market_cap',
]

# Full candidate list: technical columns first, then fundamental ones
ALL_FEATURES = [*TECHNICAL_FEATURES, *FUNDAMENTAL_FEATURES]

TARGET_CLASS = 'return_1m_fwd_sign'
TARGET_REG = 'return_1m_fwd'

# Treat +/-inf as missing BEFORE measuring coverage; otherwise infinite values
# would be counted as valid observations by notna().
df = df.replace([np.inf, -np.inf], np.nan)

# Filter features based on coverage (kept when >= 40% of rows are non-missing)
MIN_COVERAGE = 0.4
feature_coverage = {f: df[f].notna().mean() for f in ALL_FEATURES}
FEATURES = [f for f in ALL_FEATURES if feature_coverage[f] >= MIN_COVERAGE]
if not FEATURES:
    raise ValueError('No feature reaches the minimum coverage; nothing to train on.')

print(f'Feature coverage:')
for f in ALL_FEATURES:
    cov = feature_coverage.get(f, 0) * 100
    status = '‚úÖ' if f in FEATURES else '‚ùå'
    print(f'  {status} {f:20s}: {cov:5.1f}%')

print(f'\nSelected features: {len(FEATURES)}')
print(f'  Technical: {len([f for f in FEATURES if f in TECHNICAL_FEATURES])}')
print(f'  Fundamental: {len([f for f in FEATURES if f in FUNDAMENTAL_FEATURES])}')

# Clean data: keep only rows where every selected feature and both targets exist
df = df.dropna(subset=FEATURES + [TARGET_CLASS, TARGET_REG])
if df.empty:
    raise ValueError('No complete rows remain after dropping missing values.')

# The cross-validation below uses TimeSeriesSplit, which cuts folds purely by
# row position. Order rows chronologically (date first, symbol as tie-break)
# so that "earlier rows" really means "earlier in time".
df = df.sort_values(['date', 'symbol']).reset_index(drop=True)

X = df[FEATURES]
y_class = df[TARGET_CLASS]
y_reg = df[TARGET_REG]

print(f'\n‚úÖ Dataset prepared: {len(df):,} samples, {df["symbol"].nunique()} symbols')
print(f'   Class distribution: {y_class.value_counts().to_dict()}')

## Cell 6: Train with Cross-Validation (WITH CLASS WEIGHTING)

In [None]:
print('üöÄ Starting training with 12-fold TimeSeriesSplit...')
print('üéØ NEW: Using class weighting for balanced predictions\n')

tscv = TimeSeriesSplit(n_splits=12)
auc_scores = []
rmse_scores = []

for fold, (train_idx, val_idx) in enumerate(tscv.split(X), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_class.iloc[train_idx], y_class.iloc[val_idx]
    
    # Compute class weights (NEW!)
    class_counts = y_train.value_counts()
    weight_0 = len(y_train) / (2 * class_counts[0])
    weight_1 = len(y_train) / (2 * class_counts[1])
    
    # Classifier with class weighting
    clf = lgb.LGBMClassifier(
        n_estimators=600,
        learning_rate=0.03,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.2,
        reg_lambda=0.4,
        class_weight={0: weight_0, 1: weight_1},  # NEW!
        random_state=fold,
        verbose=-1
    )
    clf.fit(X_train, y_train)
    val_pred = clf.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, val_pred)
    auc_scores.append(auc)
    
    # Regressor
    reg = lgb.LGBMRegressor(
        n_estimators=600,
        learning_rate=0.05,
        num_leaves=48,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=fold,
        verbose=-1
    )
    reg.fit(X_train, y_reg.iloc[train_idx])
    val_reg = reg.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_reg.iloc[val_idx], val_reg))
    rmse_scores.append(rmse)
    
    print(f'Fold {fold:2d}: ROC-AUC = {auc:.4f}, RMSE = {rmse:.4f}')

# Aggregate the per-fold scores (np.std uses its default, the population std)
mean_auc, std_auc = np.mean(auc_scores), np.std(auc_scores)
mean_rmse, std_rmse = np.mean(rmse_scores), np.std(rmse_scores)

rule = '=' * 60
print('\n' + rule)
print('‚úÖ RESULTS:')
print(f'   ROC-AUC: {mean_auc:.4f} ¬± {std_auc:.4f}')
print(f'   RMSE:    {mean_rmse:.4f} ¬± {std_rmse:.4f}')
print(rule)

# Comparison to baseline: difference in ROC-AUC expressed in percentage points
baseline_auc = 0.6030
improvement = (mean_auc - baseline_auc) * 100
print('\nüìä vs Baseline (v1.2):')
print(f'   Baseline ROC-AUC: {baseline_auc:.4f}')
print(f'   New ROC-AUC:      {mean_auc:.4f}')
print(f'   Improvement:      {improvement:+.2f} percentage points')

# Verdict against the 0.64 target, with 0.62 as the "close" threshold
if mean_auc >= 0.64:
    verdict = '\nüéâ TARGET ACHIEVED! (‚â•64%)'
elif mean_auc >= 0.62:
    verdict = '\n‚úÖ Good progress! Close to target.'
else:
    verdict = '\n‚ö†Ô∏è  Below target. May need more features.'
print(verdict)

## Cell 7: Train Final Models on Full Dataset

In [None]:
print('üèãÔ∏è Training final models on full dataset...')

# Compute class weights for full dataset
class_counts_full = y_class.value_counts()
weight_0_full = len(y_class) / (2 * class_counts_full[0])
weight_1_full = len(y_class) / (2 * class_counts_full[1])

# Final classifier
clf_final = lgb.LGBMClassifier(
    n_estimators=800,
    learning_rate=0.03,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.4,
    class_weight={0: weight_0_full, 1: weight_1_full},
    random_state=42,
    verbose=-1
)
clf_final.fit(X, y_class)
print('‚úÖ Classifier trained')

# Final regressor
reg_final = lgb.LGBMRegressor(
    n_estimators=600,
    learning_rate=0.05,
    num_leaves=48,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)
reg_final.fit(X, y_reg)
print('‚úÖ Regressor trained')

# Feature importance of the final classifier, most important first
importance_table = {'feature': FEATURES, 'importance': clf_final.feature_importances_}
importance_df = pd.DataFrame(importance_table).sort_values('importance', ascending=False)

print('\nüìä Top 10 Features:')
top10 = importance_df.head(10)
for name, score in zip(top10['feature'], top10['importance']):
    # Marker distinguishes technical columns from fundamental ones
    if name in TECHNICAL_FEATURES:
        feat_type = 'üìà'
    else:
        feat_type = 'üí∞'
    print(f'   {feat_type} {name:20s}: {score:.4f}')

## Cell 8: Save Models & Metadata

In [None]:
print('üíæ Saving models and metadata...')

# Save models
joblib.dump(clf_final, 'model_a_v1_3_classifier.pkl')
joblib.dump(reg_final, 'model_a_v1_3_regressor.pkl')
print('‚úÖ Models saved')

# Save features
with open('model_a_v1_3_features.json', 'w') as f:
    json.dump({'features': FEATURES}, f, indent=2)
print('‚úÖ Features saved')

# Save metrics
# Cell-local import: only `datetime` itself is imported at the top of the notebook.
from datetime import timezone

n_fundamental_selected = sum(f in FUNDAMENTAL_FEATURES for f in FEATURES)

metrics = {
    'model_version': 'v1_3',
    'improvements': [
        'Extended features (fundamentals + technical)',
        'Class weighting for balanced predictions',
        f'{n_fundamental_selected} fundamental features added'
    ],
    # Cross-validation aggregates; cast to plain floats so json can serialise them
    'roc_auc_mean': float(mean_auc),
    'roc_auc_std': float(std_auc),
    'rmse_mean': float(mean_rmse),
    'rmse_std': float(std_rmse),
    'improvement_vs_v1_2': float(improvement),
    # Read the fold count from the splitter that was actually used, so the
    # metadata cannot drift from the cross-validation configuration.
    'cv_folds': int(tscv.n_splits),
    # Timezone-aware UTC timestamp (ISO 8601, ends in +00:00).
    # datetime.utcnow() is deprecated and returns a naive value whose
    # timezone is ambiguous to consumers.
    'trained_at': datetime.now(timezone.utc).isoformat(),
    'n_samples': int(len(df)),
    'n_symbols': int(df['symbol'].nunique()),
    'n_features': len(FEATURES),
    'features': FEATURES,
    'feature_coverage': {f: float(feature_coverage[f]) for f in FEATURES}
}

with open('model_a_v1_3_metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)
print('‚úÖ Metrics saved')

# Bundle the four artifacts into a single archive for download
artifact_names = (
    'model_a_v1_3_classifier.pkl',
    'model_a_v1_3_regressor.pkl',
    'model_a_v1_3_features.json',
    'model_a_v1_3_metrics.json',
)
with ZipFile('model_a_v1_3_artifacts.zip', 'w') as zipf:
    for artifact in artifact_names:
        zipf.write(artifact)

rule = '=' * 60
print('\n' + rule)
print('‚úÖ ALL ARTIFACTS SAVED')
print(rule)
print('\nüì¶ Download these files:')
print('   ‚Ä¢ model_a_v1_3_artifacts.zip (contains all 4 files)')

# Short recap of the run
n_technical = len([f for f in FEATURES if f in TECHNICAL_FEATURES])
n_fundamental = len([f for f in FEATURES if f in FUNDAMENTAL_FEATURES])
print('\nüìä Model v1.3 Summary:')
print(f'   ROC-AUC:     {mean_auc:.4f} (target: ‚â•0.64)')
print(f'   Improvement: {improvement:+.2f} pp vs v1.2')
print(f'   Features:    {len(FEATURES)} ({n_technical} technical + {n_fundamental} fundamental)')
print(f'   Samples:     {len(df):,}')
print(f'   Symbols:     {df["symbol"].nunique()}')

## Cell 9: Display Final Metrics

In [None]:
# Final report: performance, improvement over the baseline, dataset size,
# top features, and suggested next steps depending on the achieved ROC-AUC.
rule = '=' * 60
print('\n' + rule)
print('üìä FINAL RESULTS - Model A v1.3')
print(rule)

print('\nPerformance:')
print(f'  ROC-AUC:  {mean_auc:.4f} ¬± {std_auc:.4f}')
print(f'  RMSE:     {mean_rmse:.4f} ¬± {std_rmse:.4f}')

# `improvement` is in percentage points; express it relative to the baseline too
relative_gain = improvement / (baseline_auc * 100) * 100
print(f'\nImprovement vs v1.2 (baseline: {baseline_auc:.4f}):')
print(f'  Absolute: {improvement:+.2f} percentage points')
print(f'  Relative: {relative_gain:+.1f}%')

print('\nDataset:')
print(f'  Samples:  {len(df):,}')
print(f'  Symbols:  {df["symbol"].nunique()}')
print(f'  Features: {len(FEATURES)}')

print('\nTop 5 Features:')
top5 = importance_df.head(5)
for name, score in zip(top5['feature'], top5['importance']):
    if name in TECHNICAL_FEATURES:
        feat_type = 'Technical'
    else:
        feat_type = 'Fundamental'
    print(f'  {name:20s} ({feat_type:11s}): {score:.4f}')
print('\n' + rule)

# Pick the closing message block by ROC-AUC tier (0.64 target, 0.62 "close")
if mean_auc >= 0.64:
    closing_lines = [
        '\nüéâ SUCCESS! Target achieved (‚â•64%)',
        '\n‚úÖ Next steps:',
        '   1. Download model_a_v1_3_artifacts.zip',
        '   2. Upload to GitHub (models/ directory)',
        '   3. Deploy to Render',
        '   4. Generate signals with jobs/generate_signals.py',
        '   5. Test live API',
    ]
elif mean_auc >= 0.62:
    closing_lines = [
        '\n‚úÖ Good progress! Close to target.',
        '\n‚ö° Optional improvements:',
        '   - Add mom_1, mom_3 (short-term momentum)',
        '   - Add vol_30, vol_ratio_30_90 (volatility features)',
        '   - Hyperparameter tuning with Optuna',
    ]
else:
    closing_lines = [
        '\n‚ö†Ô∏è  Below target. Consider:',
        '   - Adding more technical features',
        '   - Checking data quality',
        '   - Feature engineering',
    ]
for line in closing_lines:
    print(line)