# Notebook 05: LightGBM Modeling

## Mục Tiêu
- Xây dựng LightGBM model cho traffic forecasting
- Time Series Cross-Validation
- Feature Importance Analysis
- Hyperparameter Tuning

---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import warnings

warnings.filterwarnings('ignore')

# Thêm src vào path
sys.path.insert(0, os.path.abspath('..'))

from src.data.preprocessor import load_timeseries, split_train_test
from src.features.feature_engineering import TimeSeriesFeatureEngineer
from src.models.lightgbm_forecaster import LightGBMForecaster
from src.models.evaluation import calculate_metrics, calculate_forecast_accuracy

# Settings: global matplotlib style and default figure size.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

# Later cells write figures and the pickled model into these folders.
# savefig raises FileNotFoundError when the target directory is missing,
# so create both up front (no-op if they already exist).
for output_dir in ('../reports/figures', '../models'):
    os.makedirs(output_dir, exist_ok=True)

print("Libraries loaded successfully!")

## 1. Load & Prepare Data

In [None]:
# Read the processed 15-minute series from disk.
df = load_timeseries('../data/processed/timeseries_15min.parquet')

# Keep only the rows whose storm flag is 0; the copy keeps `df` untouched
# by anything done to `df_clean` later on.
outside_storm = df['is_storm_period'] == 0
df_clean = df.loc[outside_storm].copy()

print(f"Clean records: {len(df_clean)}")
first_ts, last_ts = df_clean.index.min(), df_clean.index.max()
print(f"Date range: {first_ts} to {last_ts}")

In [None]:
# Build the model features from the storm-free series.
print("Creating features...")
fe = TimeSeriesFeatureEngineer(df_clean)

# Same target column and granularity as the loaded 15-minute dataset.
feature_settings = {'target_col': 'request_count', 'granularity': '15min'}
df_features = fe.create_all_features(**feature_settings)

print(f"Feature DataFrame shape: {df_features.shape}")

In [None]:
# Ask the feature engineer which columns of `df_features` are model inputs.
feature_cols = fe.get_feature_columns(df_features)
print(f"Number of features: {len(feature_cols)}")

# Turn the feature frame into an (X, y) pair for a horizon of 1.
# NOTE(review): presumably horizon 1 means one 15-minute step ahead — confirm
# against TimeSeriesFeatureEngineer.prepare_supervised.
supervised_settings = dict(
    target_col='request_count',
    feature_cols=feature_cols,
    forecast_horizon=1,
)
X, y = fe.prepare_supervised(df_features, **supervised_settings)

for label, data in (("X", X), ("y", y)):
    print(f"{label} shape: {data.shape}")

In [None]:
# Chronological split: everything strictly before `test_start` is training
# data, everything from `test_start` onwards is held out for testing.
test_start = '1995-08-23'
train_mask = X.index < test_start
test_mask = ~train_mask

X_train_full, y_train_full = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

print(f"Train: {len(X_train_full)} samples")
print(f"Test: {len(X_test)} samples")

In [None]:
# Carve the validation set off the END of the training period (last 20%),
# so validation rows are always later in time than the rows fitted on.
val_size = len(X_train_full) // 5

# Slice by an explicit split position rather than by `-val_size`:
# when val_size is 0, `iloc[-0:]` selects EVERYTHING and `iloc[:-0]` selects
# NOTHING, which would silently swap an empty train set for a full
# validation set.
split_at = len(X_train_full) - val_size
X_train, X_val = X_train_full.iloc[:split_at], X_train_full.iloc[split_at:]
y_train, y_val = y_train_full.iloc[:split_at], y_train_full.iloc[split_at:]

print(f"Train: {len(X_train)} samples")
print(f"Validation: {len(X_val)} samples")
print(f"Test: {len(X_test)} samples")

## 2. Train LightGBM Model

In [None]:
# Up to 1000 boosting rounds, with an early-stopping patience of 50 rounds.
lgbm_settings = dict(n_estimators=1000, early_stopping_rounds=50)
model = LightGBMForecaster(**lgbm_settings)

# Fit on the training slice; the validation slice is handed to fit() as well.
# `verbose=100` is forwarded to the forecaster unchanged.
print("Training LightGBM model...")
model.fit(X_train, y_train, X_val, y_val, verbose=100)

In [None]:
# Report what the fit above recorded: the best iteration and the best score
# stored on the underlying model object.
best_iter = model.best_iteration
print(f"\nBest iteration: {best_iter}")
best_val_score = model.model.best_score
print(f"Best validation score: {best_val_score}")

## 3. Feature Importance Analysis

In [None]:
# Importance table limited to 20 rows; `fi` is reused by the summary cell.
fi = model.get_feature_importance(20)

rule = "=" * 50
print("Top 20 Most Important Features:")
print(rule)
# Plain-text rendering without the DataFrame index column.
print(fi.to_string(index=False))

In [None]:
# Horizontal bar chart of the 20 rows returned by get_feature_importance.
fig, ax = plt.subplots(figsize=(10, 8))

fi_plot = model.get_feature_importance(20)
positions = range(len(fi_plot))

bars = ax.barh(positions, fi_plot['importance'], align='center', color='steelblue')
ax.set_yticks(positions)
ax.set_yticklabels(fi_plot['feature'])
ax.set_xlabel('Importance (Gain)')
ax.set_title('Top 20 Feature Importances - LightGBM')
# Flip the y axis so the first row of the table is drawn at the top.
ax.invert_yaxis()

fig.tight_layout()
fig.savefig('../reports/figures/lightgbm_feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Time Series Cross-Validation

In [None]:
# Cross-validation on the full training data (train + validation slices).
print("Running Time Series Cross-Validation...")
cv_settings = dict(n_splits=5, verbose=True)
cv_results = model.cross_validate(X_train_full, y_train_full, **cv_settings)

In [None]:
# Print mean and standard deviation of each cross-validation metric.
print("\nCross-Validation Summary:")
print("=" * 50)

rmse_mean, rmse_std = cv_results['rmse_mean'], cv_results['rmse_std']
print(f"  RMSE: {rmse_mean:.4f} (+/- {rmse_std:.4f})")

mae_mean, mae_std = cv_results['mae_mean'], cv_results['mae_std']
print(f"  MAE:  {mae_mean:.4f} (+/- {mae_std:.4f})")

mape_mean, mape_std = cv_results['mape_mean'], cv_results['mape_std']
print(f"  MAPE: {mape_mean:.2f}% (+/- {mape_std:.2f}%)")

## 5. Generate Predictions

In [None]:
# Retrain with a fixed number of boosting rounds.
print("Retraining on full training data...")

# The round count comes from the early-stopped fit of `model` on
# X_train / X_val (it is NOT derived from the cross-validation run).
# If that fit recorded no best iteration (None or 0), fall back to the
# 1000-round budget `model` was created with instead of passing a
# meaningless n_estimators to the final model.
final_n_estimators = model.best_iteration or 1000

final_model = LightGBMForecaster(
    n_estimators=final_n_estimators,
    early_stopping_rounds=None  # fixed round count; no early stopping here
)

# The last 10% of the training period is passed as the validation set.
# NOTE(review): those rows are presumably only monitored, not fitted on, so
# the final model effectively sees ~90% of the training data — confirm
# against LightGBMForecaster.fit and decide whether that is intended.
val_size_final = len(X_train_full) // 10

# Explicit split position: slicing with `-val_size_final` would invert the
# split when the size is 0 (`iloc[:-0]` is empty, `iloc[-0:]` is everything).
fit_end = len(X_train_full) - val_size_final
final_model.fit(
    X_train_full.iloc[:fit_end],
    y_train_full.iloc[:fit_end],
    X_train_full.iloc[fit_end:],
    y_train_full.iloc[fit_end:],
    verbose=0
)

In [None]:
# Score the held-out test features with the retrained model.
predictions = final_model.predict(X_test)

# Quick sanity check of the output: shape and value range.
print(f"Predictions shape: {predictions.shape}")
lowest = predictions.min()
print(f"Min prediction: {lowest:.2f}")
highest = predictions.max()
print(f"Max prediction: {highest:.2f}")

In [None]:
# Overlay the forecast on the actual test series, sharing the test timestamps.
fig, ax = plt.subplots(figsize=(14, 6))

timestamps = y_test.index
ax.plot(timestamps, y_test.values, label='Actual', alpha=0.8)
ax.plot(timestamps, predictions, label='LightGBM Forecast', alpha=0.8, linestyle='--')

ax.set_xlabel('Timestamp')
ax.set_ylabel('Request Count')
ax.set_title('LightGBM Forecast vs Actual (Test Set)')
ax.legend()

# Tilt the x tick labels so the timestamps do not overlap.
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig('../reports/figures/lightgbm_forecast.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Evaluate Model

In [None]:
# Compare test targets with the forecast; `metrics` is reused by the summary.
metrics = calculate_metrics(y_test.values, predictions)

print("\nLightGBM Model Metrics:")
print("=" * 50)
# One line per metric returned, in the order the mapping provides them.
for metric_name, metric_value in metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Accuracy analysis with a 20 (percent) threshold passed to the helper.
accuracy = calculate_forecast_accuracy(y_test.values, predictions, threshold_pct=20)

print("\nForecast Accuracy Analysis:")
print("=" * 50)

share_within = accuracy['accuracy_within_threshold']
print(f"  Accuracy within 20%: {share_within:.2f}%")

mean_error = accuracy['mean_error']
print(f"  Mean Error: {mean_error:.2f}")

mean_pct_error = accuracy['mean_pct_error']
print(f"  Mean % Error: {mean_pct_error:.2f}%")

In [None]:
# Signed forecast error: positive values mean the model over-predicted.
errors = predictions - y_test.values

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
hist_ax, scatter_ax, time_ax = axes

# Panel 1: distribution of the errors, with a reference line at zero.
hist_ax.hist(errors, bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(x=0, color='red', linestyle='--')
hist_ax.set_title('Error Distribution')
hist_ax.set_xlabel('Error')

# Panel 2: predicted against actual, with the y = x diagonal drawn from 0 up
# to the largest value seen in either series.
scatter_ax.scatter(y_test.values, predictions, alpha=0.5, s=10)
max_val = max(y_test.values.max(), predictions.max())
diagonal = [0, max_val]
scatter_ax.plot(diagonal, diagonal, 'r--')
scatter_ax.set_xlabel('Actual')
scatter_ax.set_ylabel('Predicted')
scatter_ax.set_title('Predicted vs Actual')

# Panel 3: errors along the test timeline, with a reference line at zero.
time_ax.plot(y_test.index, errors, alpha=0.7)
time_ax.axhline(y=0, color='red', linestyle='--')
time_ax.set_title('Error over Time')
time_ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
fig.savefig('../reports/figures/lightgbm_error_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Hyperparameter Tuning (Optional)

In [None]:
# Exhaustive search over a small grid: 3 leaf counts x 3 learning rates.
param_grid = {
    'num_leaves': [15, 31, 50],
    'learning_rate': [0.01, 0.05, 0.1]
}

# Running best, updated only on a strictly lower validation RMSE.
best_params = None
best_rmse = float('inf')
results = []

# Every (num_leaves, learning_rate) pair, leaf count varying slowest.
candidates = [
    (leaves, rate)
    for leaves in param_grid['num_leaves']
    for rate in param_grid['learning_rate']
]

print("Running hyperparameter search...")
for num_leaves, lr in candidates:
    params = {'num_leaves': num_leaves, 'learning_rate': lr}

    # Smaller budget than the main model: 500 rounds, patience of 30.
    temp_model = LightGBMForecaster(
        params=params,
        n_estimators=500,
        early_stopping_rounds=30
    )
    temp_model.fit(X_train, y_train, X_val, y_val, verbose=0)

    # Score each candidate by RMSE on the validation slice.
    preds = temp_model.predict(X_val)
    residuals = y_val.values - preds
    rmse = np.sqrt(np.mean(residuals ** 2))

    results.append(dict(num_leaves=num_leaves, learning_rate=lr, rmse=rmse))

    if rmse < best_rmse:
        best_rmse, best_params = rmse, params.copy()

    print(f"  num_leaves={num_leaves}, lr={lr}: RMSE={rmse:.4f}")

print(f"\nBest params: {best_params}")
print(f"Best RMSE: {best_rmse:.4f}")

In [None]:
# Reshape the search log into a num_leaves x learning_rate grid of RMSE values.
results_df = pd.DataFrame(results)
pivot = results_df.pivot(index='num_leaves', columns='learning_rate', values='rmse')

# Reversed red-yellow-green colormap: lower RMSE is drawn in green.
heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(pivot, annot=True, fmt='.2f', cmap='RdYlGn_r', ax=heat_ax)
heat_ax.set_title('Hyperparameter Search Results (RMSE)')

heat_fig.tight_layout()
heat_fig.savefig('../reports/figures/lightgbm_hyperparam.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Save Model

In [None]:
# Persist the retrained model next to the other project artifacts.
model_path = '../models/lightgbm_15min.pkl'
final_model.save(model_path)
print("Model saved successfully!")

In [None]:
# Round-trip check: read the file written by the previous cell back in and
# report how many feature names the restored forecaster carries.
loaded_model = LightGBMForecaster.load('../models/lightgbm_15min.pkl')
restored_feature_count = len(loaded_model.feature_names)
print(f"Model loaded with {restored_feature_count} features")

## 9. Summary

In [None]:
# Final text report assembled from values computed in the cells above.
banner = "=" * 60

print(banner)
print("            LIGHTGBM MODEL SUMMARY")
print(banner)

# Model description and data sizes.
print("\nModel: LightGBM Gradient Boosting")
print(f"Features: {len(feature_cols)}")
print("Granularity: 15 minutes")
print("\nTraining Data:")
print(f"  Train: {len(X_train)} samples")
print(f"  Validation: {len(X_val)} samples")
print(f"\nTest Data: {len(X_test)} samples")

# Cross-validation aggregates.
print("\nCross-Validation (5-fold):")
print(f"  RMSE: {cv_results['rmse_mean']:.2f} (+/- {cv_results['rmse_std']:.2f})")
print(f"  MAE:  {cv_results['mae_mean']:.2f} (+/- {cv_results['mae_std']:.2f})")

# Held-out test metrics.
print("\nTest Set Performance:")
print(f"  RMSE: {metrics['RMSE']:.2f} requests/interval")
print(f"  MAE: {metrics['MAE']:.2f} requests/interval")
print(f"  MAPE: {metrics['MAPE']:.2f}%")

# First five rows of the importance table built from `model`.
print("\nTop 5 Features:")
top_five = fi.head(5)
for feature_name, importance in zip(top_five['feature'], top_five['importance']):
    print(f"  {feature_name}: {importance:.2f}")

print("\nSaved to: ../models/lightgbm_15min.pkl")
print(banner)