# Repository Activity Forecasting Experiments

This notebook demonstrates:
1. **Rolling time-series cross-validation**
2. **Autoregressive forecasting** with multiple models
3. **Binary classification** metrics (active/inactive)
4. **Visualization** of predictions vs actuals

## Rolling Evaluation Strategy

```
Train: 2015-01 → 2018-06  |  Predict: 2018-07 → 2018-09  (Q3 2018)
Train: 2015-01 → 2018-09  |  Predict: 2018-10 → 2018-12  (Q4 2018)
Train: 2015-01 → 2018-12  |  Predict: 2019-01 → 2019-03  (Q1 2019)
...
```

In [None]:
import sys
sys.path.append('../models')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, f1_score

from forecaster import (
    NaiveForecaster, MovingAverageForecaster,
    LSTMForecaster, GRUForecaster,
    create_sequences
)

# Notebook-wide configuration: seaborn grid theme, default figure size, and a
# fixed seed for NumPy's global random state.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})
np.random.seed(42)

## 1. Load Labeled Quarterly Data

In [None]:
# Read the labeled quarterly dataset. When the file is missing, `df` is set to
# None so the cells below can skip themselves instead of raising.
data_path = '../data/processed/quarters_labeled.parquet'

try:
    df = pd.read_parquet(data_path)
except FileNotFoundError:
    print(f"❌ Data not found: {data_path}")
    print("   Run: python preprocessing/label_activity.py")
    df = None
else:
    # Quick sanity summary: size, number of repositories, covered period.
    print(f"✅ Loaded {len(df):,} records")
    print(f"   Repositories: {df['repo_id'].nunique():,}")
    print(f"   Date range: {df['quarter_start'].min()} to {df['quarter_end'].max()}")
    display(df.head())

## 2. Define Metric Columns and Sequence Length

In [None]:
if df is not None:
    # Numeric columns that must NOT be treated as forecast targets:
    #   - 'repo_id' is the repository key (see the load cell); when it is stored
    #     as an integer it passes the numeric-dtype check and would otherwise be
    #     "forecast" like a metric.
    #   - the remaining names are calendar/bookkeeping fields and the labels
    #     ('activity_score', 'is_active').
    non_metric_cols = {
        'repo_id',
        'year', 'quarter', 'quarter_index',
        'quarters_since_creation', 'total_quarters',
        'activity_score', 'is_active',
    }

    # Every other numeric column is a metric to forecast; order follows df.columns.
    metric_cols = [
        col for col in df.columns
        if col not in non_metric_cols
        and pd.api.types.is_numeric_dtype(df[col])
    ]

    print(f"Metrics to forecast: {metric_cols}")

    # Sequence length (number of past quarters fed to the models as input)
    SEQUENCE_LENGTH = 4
    print(f"Sequence length: {SEQUENCE_LENGTH} quarters")

## 3. Simple Train/Test Split Example

First, let's do a simple example: train on all data before 2019, predict 2019 onwards.

In [None]:
if df is not None:
    # Chronological hold-out: quarters starting before the cutoff are used for
    # training, quarters starting on or after it are held out for testing.
    split_date = '2019-01-01'

    starts_before_cutoff = df['quarter_start'] < split_date
    starts_from_cutoff = df['quarter_start'] >= split_date
    df_train, df_test = df.loc[starts_before_cutoff], df.loc[starts_from_cutoff]

    print(f"Train: {len(df_train):,} records ({df_train['quarter_start'].min()} to {df_train['quarter_end'].max()})")
    print(f"Test:  {len(df_test):,} records ({df_test['quarter_start'].min()} to {df_test['quarter_end'].max()})")

    # Turn each split into model-ready sequences. The training and test frames
    # are windowed independently of each other.
    print("\nCreating training sequences...")
    train_sequences = create_sequences(df_train, SEQUENCE_LENGTH, metric_cols)
    X_train, y_train, train_repo_ids = train_sequences

    print(f"Training sequences: {X_train.shape}")
    print(f"Training targets: {y_train.shape}")

    print("\nCreating test sequences...")
    test_sequences = create_sequences(df_test, SEQUENCE_LENGTH, metric_cols)
    X_test, y_test, test_repo_ids = test_sequences

    print(f"Test sequences: {X_test.shape}")
    print(f"Test targets: {y_test.shape}")

## 4. Train and Evaluate Models

In [None]:
def _fit_and_report(title, model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model``, evaluate it, and print its error metrics under a banner.

    Args:
        title: Heading printed between two separator lines before fitting, so
            that any output produced while fitting appears under it.
        model: Forecaster exposing ``fit(X, y)`` and ``evaluate(X, y)``.
        X_fit, y_fit: Sequences/targets passed to ``model.fit``.
        X_eval, y_eval: Sequences/targets passed to ``model.evaluate``.

    Returns:
        The object returned by ``model.evaluate``; it is indexed with the keys
        ``'mse'``, ``'mae'`` and ``'rmse'`` for printing.
    """
    print("\n" + "="*60)
    print(title)
    print("="*60)

    model.fit(X_fit, y_fit)
    metrics = model.evaluate(X_eval, y_eval)

    print(f"MSE:  {metrics['mse']:.6f}")
    print(f"MAE:  {metrics['mae']:.6f}")
    print(f"RMSE: {metrics['rmse']:.6f}")
    return metrics


if df is not None and X_train is not None:
    results = {}

    # Naive baseline
    naive_model = NaiveForecaster()
    naive_metrics = _fit_and_report(
        "NAIVE FORECASTER", naive_model, X_train, y_train, X_test, y_test
    )
    results['naive'] = naive_metrics

    # Moving Average
    ma_model = MovingAverageForecaster(window_size=3)
    ma_metrics = _fit_and_report(
        "MOVING AVERAGE FORECASTER", ma_model, X_train, y_train, X_test, y_test
    )
    results['moving_average'] = ma_metrics

    # LSTM (small model for demo)
    lstm_model = LSTMForecaster(
        input_size=len(metric_cols),
        hidden_size=32,
        num_layers=1,
        epochs=20,
        batch_size=64
    )

    # Train on a smaller subset for quick training (raise the cap for full
    # training). The subset is drawn at random with a fixed seed rather than
    # taken from the head of the array: a head slice only covers whichever
    # sequences happen to come first, which is not a representative sample.
    sample_size = min(5000, len(X_train))
    if sample_size < len(X_train):
        subset_idx = np.sort(
            np.random.default_rng(42).choice(len(X_train), size=sample_size, replace=False)
        )
        X_fit, y_fit = X_train[subset_idx], y_train[subset_idx]
    else:
        X_fit, y_fit = X_train, y_train

    lstm_metrics = _fit_and_report(
        "LSTM FORECASTER", lstm_model, X_fit, y_fit, X_test, y_test
    )
    results['lstm'] = lstm_metrics

## 5. Compare Model Performance

In [None]:
if 'results' in locals():
    # Build the comparison table: one row per model, one column per error metric.
    model_names = list(results.keys())
    metric_labels = ['MSE', 'MAE', 'RMSE']

    table = {'Model': model_names}
    for metric in metric_labels:
        table[metric] = [results[name][metric.lower()] for name in model_names]
    comparison = pd.DataFrame(table)

    print("\n" + "="*60)
    print("MODEL COMPARISON")
    print("="*60)
    display(comparison)

    # One bar chart per metric, side by side.
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    for ax, metric in zip(axes, metric_labels):
        ax.bar(comparison['Model'], comparison[metric], alpha=0.7)
        ax.set_ylabel(metric)
        ax.set_title(f'{metric} Comparison')
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.savefig('../data/processed/model_comparison.png', dpi=150, bbox_inches='tight')
    plt.show()

## 6. Visualize Predictions for Sample Repositories

In [None]:
if 'lstm_model' in locals() and X_test is not None:
    # Forecast the first 100 test sequences with the LSTM and keep the
    # corresponding targets for comparison.
    actuals = y_test[:100]
    predictions = lstm_model.predict(X_test[:100])  # Sample

    # 2x3 grid: each panel shows every metric of one sample, actual next to predicted.
    fig, axes = plt.subplots(2, 3, figsize=(15, 8))
    axes = axes.flatten()

    bar_width = 0.35
    positions = np.arange(len(metric_cols))
    tick_labels = [m[:10] for m in metric_cols]  # truncate long metric names

    # At most six panels; fewer when there are fewer predictions.
    for i in range(min(6, len(predictions))):
        panel = axes[i]
        panel.bar(positions - bar_width/2, actuals[i], bar_width, label='Actual', alpha=0.7)
        panel.bar(positions + bar_width/2, predictions[i], bar_width, label='Predicted', alpha=0.7)
        panel.set_xlabel('Metric')
        panel.set_ylabel('Value')
        panel.set_title(f'Sample {i+1}')
        panel.set_xticks(positions)
        panel.set_xticklabels(tick_labels, rotation=45, ha='right')
        panel.legend()

    plt.tight_layout()
    plt.savefig('../data/processed/sample_predictions.png', dpi=150, bbox_inches='tight')
    plt.show()

## 7. Binary Classification: Active vs Inactive

Convert forecasts to binary labels and compute classification metrics.

In [None]:
def compute_activity_scores(metric_values):
    """Collapse per-metric values into one heuristic activity score per row.

    The score of a row is the sum of ``log1p(value)`` over its columns.
    Negative inputs are clipped to zero first: ``log1p`` yields NaN below -1
    and -inf at -1, and a NaN score compares False against any threshold, so
    an unconstrained model output could otherwise be labeled "inactive" by
    accident.
    NOTE(review): clipping presumes the metrics are non-negative quantities
    (e.g. counts) — confirm against the preprocessing step.

    Args:
        metric_values: 2-D array-like, one row per sample, one column per metric.

    Returns:
        1-D NumPy array with one score per row.
    """
    values = np.asarray(metric_values)
    return np.log1p(np.clip(values, 0, None)).sum(axis=1)


if 'predictions' in locals() and 'actuals' in locals():
    import json
    from sklearn.metrics import (
        precision_recall_curve, roc_curve, auc,
        precision_score, recall_score, accuracy_score, confusion_matrix,
    )

    # Load threshold from labeling; fall back to the median activity score when
    # the config is missing, unreadable, malformed, or lacks a usable value.
    # Only these expected failures are caught so that e.g. KeyboardInterrupt
    # is not swallowed.
    try:
        with open('../config/threshold_config.json', 'r') as f:
            threshold_config = json.load(f)
        threshold = float(threshold_config['threshold'])
    except (OSError, ValueError, KeyError, TypeError) as exc:
        print(f"⚠️ Could not load threshold config ({exc!r}); using median activity score")
        threshold = df['activity_score'].median()

    print(f"Using activity threshold: {threshold:.4f}")

    # Compute activity scores for predictions and actuals
    # Simple heuristic: sum of log-transformed metrics
    pred_scores = compute_activity_scores(predictions)
    actual_scores = compute_activity_scores(actuals)

    # Binary labels: 1 = active, 0 = inactive
    pred_labels = (pred_scores >= threshold).astype(int)
    actual_labels = (actual_scores >= threshold).astype(int)

    # Classification metrics
    precision = precision_score(actual_labels, pred_labels, zero_division=0)
    recall = recall_score(actual_labels, pred_labels, zero_division=0)
    f1 = f1_score(actual_labels, pred_labels, zero_division=0)
    accuracy = accuracy_score(actual_labels, pred_labels)

    print("\n" + "="*60)
    print("BINARY CLASSIFICATION METRICS")
    print("="*60)
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"Accuracy:  {accuracy:.4f}")

    # Confusion matrix. Passing labels=[0, 1] keeps it 2x2 even when only one
    # class occurs in the sample, so it always matches the two tick labels.
    cm = confusion_matrix(actual_labels, pred_labels, labels=[0, 1])

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Inactive', 'Active'],
                yticklabels=['Inactive', 'Active'])
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.title('Confusion Matrix: Active/Inactive Classification')
    plt.savefig('../data/processed/confusion_matrix.png', dpi=150, bbox_inches='tight')
    plt.show()

## 8. Rolling Cross-Validation (Simplified)

Demonstrate two steps of rolling evaluation (Q3 and Q4 2018) with an expanding training window, using the naive baseline for speed.

In [None]:
if df is not None:
    print("\n" + "="*60)
    print("ROLLING CROSS-VALIDATION EXAMPLE")
    print("="*60)

    # Define rolling windows as (train_start, train_end, test_start, test_end).
    # The training window expands; each step holds out one quarter.
    windows = [
        ('2015-01-01', '2018-06-30', '2018-07-01', '2018-09-30'),  # Q3 2018
        ('2015-01-01', '2018-09-30', '2018-10-01', '2018-12-31'),  # Q4 2018
    ]

    rolling_results = []

    for train_start, train_end, test_start, test_end in windows:
        print(f"\nTrain: {train_start} → {train_end}")
        print(f"Test:  {test_start} → {test_end}")

        # Split data: a quarter belongs to a period when it lies fully inside it.
        df_train_roll = df[
            (df['quarter_start'] >= train_start) &
            (df['quarter_end'] <= train_end)
        ]
        df_test_roll = df[
            (df['quarter_start'] >= test_start) &
            (df['quarter_end'] <= test_end)
        ]

        if len(df_train_roll) == 0 or len(df_test_roll) == 0:
            print("  ⚠️ Insufficient data for this window")
            continue

        print(f"  Train samples: {len(df_train_roll):,}")
        print(f"  Test samples:  {len(df_test_roll):,}")

        # The held-out quarter alone cannot yield a sequence: every sample needs
        # SEQUENCE_LENGTH earlier quarters as input. Prepend exactly that many
        # preceding quarters as context. They serve as inputs only; with at most
        # SEQUENCE_LENGTH + 1 quarters per repository in this frame, the only
        # possible target is the held-out quarter itself.
        # NOTE(review): assumes create_sequences builds each sample from
        # SEQUENCE_LENGTH consecutive rows of a repository plus the next row as
        # target — confirm against forecaster.py.
        context_start = (
            pd.Timestamp(test_start) - pd.DateOffset(months=3 * SEQUENCE_LENGTH)
        ).strftime('%Y-%m-%d')
        df_test_context = df[
            (df['quarter_start'] >= context_start) &
            (df['quarter_end'] <= test_end)
        ]

        # Create sequences
        X_train_roll, y_train_roll, _ = create_sequences(
            df_train_roll, SEQUENCE_LENGTH, metric_cols
        )
        X_test_roll, y_test_roll, _ = create_sequences(
            df_test_context, SEQUENCE_LENGTH, metric_cols
        )

        # Treat a None result like an empty one so a window that yields no
        # sequences is skipped rather than crashing on len(None).
        if (X_train_roll is None or X_test_roll is None
                or len(X_train_roll) == 0 or len(X_test_roll) == 0):
            print("  ⚠️ Could not create sequences")
            continue

        # Train simple model (Naive for speed)
        model = NaiveForecaster()
        model.fit(X_train_roll, y_train_roll)
        metrics = model.evaluate(X_test_roll, y_test_roll)

        print(f"  MSE: {metrics['mse']:.6f}, MAE: {metrics['mae']:.6f}")

        rolling_results.append({
            'test_period': f"{test_start} to {test_end}",
            'mse': metrics['mse'],
            'mae': metrics['mae'],
            'rmse': metrics['rmse']
        })

    if rolling_results:
        print("\n" + "="*60)
        print("ROLLING RESULTS SUMMARY")
        print("="*60)
        rolling_df = pd.DataFrame(rolling_results)
        display(rolling_df)

        print(f"\nAverage MSE across windows: {rolling_df['mse'].mean():.6f}")
        print(f"Average MAE across windows: {rolling_df['mae'].mean():.6f}")

## Summary

This notebook demonstrated:

1. ✅ Loading labeled quarterly data
2. ✅ Creating time series sequences
3. ✅ Training multiple forecasting models (Naive, MA, LSTM)
4. ✅ Computing forecast metrics (MSE, MAE, RMSE)
5. ✅ Binary classification metrics (Precision, Recall, F1)
6. ✅ Rolling cross-validation example
7. ✅ Visualizations of predictions vs actuals

## Next Steps

- Tune hyperparameters for LSTM/GRU models
- Implement full rolling evaluation pipeline
- Try more sophisticated models (Transformer, TCN)
- Add repository-specific features
- Analyze per-metric forecasting performance