# Notebook 03: Feature Engineering

## Mục Tiêu
- Tạo features từ time series data cho forecasting models
- Temporal features (hour, day_of_week, cyclical encoding)
- Lag features và rolling statistics
- Chuẩn bị data cho supervised learning

---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Thêm src vào path
sys.path.insert(0, os.path.abspath('..'))

from src.features.feature_engineering import TimeSeriesFeatureEngineer
from src.data.preprocessor import load_timeseries, split_train_test

# Notebook-wide plotting and display defaults
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})
pd.options.display.max_columns = 50

print("Libraries loaded successfully!")

## 1. Load Data

In [None]:
# Read the 15-minute time series from the processed data directory
df_15min = load_timeseries('../data/processed/timeseries_15min.parquet')

# Quick overview: dimensions, covered index range and available columns
first_ts, last_ts = df_15min.index.min(), df_15min.index.max()
print(f"Data shape: {df_15min.shape}")
print(f"Date range: {first_ts} to {last_ts}")
print(f"\nColumns: {df_15min.columns.tolist()}")
df_15min.head()

In [None]:
# Keep only the rows that are not flagged as storm period (used for training).
# NOTE(review): dropping rows leaves gaps in the index; confirm that the
# lag/rolling logic in the feature engineer tolerates non-contiguous data.
is_normal = df_15min['is_storm_period'] == 0
df_clean = df_15min.loc[is_normal].copy()

n_removed = len(df_15min) - len(df_clean)
print(f"Clean data: {len(df_clean)} records (removed {n_removed} storm records)")

## 2. Feature Engineering với TimeSeriesFeatureEngineer

In [None]:
# Build the feature engineer on the storm-free data
fe = TimeSeriesFeatureEngineer(df_clean)

# Generate the full feature set for the target at 15-minute granularity
df_features = fe.create_all_features(
    target_col='request_count',
    granularity='15min'
)

# Report the number of feature columns rather than the raw column count:
# df_features still contains the target column ('request_count'), so
# len(df_features.columns) overstates how many features were created.
n_features_created = len(fe.get_feature_columns(df_features))

print(f"Feature DataFrame shape: {df_features.shape}")
print(f"\nNumber of features created: {n_features_created}")

In [None]:
# Print a numbered list (starting at 1) of every column in the feature frame
print("All features:")
for position, column_name in enumerate(df_features.columns, start=1):
    print(f"  {position}. {column_name}")

## 3. Chi Tiết Các Loại Features

### 3.1 Temporal Features

In [None]:
# Calendar columns followed by their sine/cosine counterparts
calendar_cols = ['hour', 'day_of_week', 'day_of_month', 'is_weekend', 'is_business_hour']
cyclical_cols = ['hour_sin', 'hour_cos', 'dow_sin', 'dow_cos']
temporal_cols = calendar_cols + cyclical_cols

print("Temporal Features:")
df_features[temporal_cols].head(10)

In [None]:
# Visualize the cyclical encodings: scatter each cos/sin pair, coloured and
# annotated by the raw value it encodes.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# One entry per panel: (raw column, sin column, cos column, title, label formatter)
panels = [
    ('hour', 'hour_sin', 'hour_cos', 'Hour Cyclical Encoding',
     lambda value: f"{int(value)}h"),
    ('day_of_week', 'dow_sin', 'dow_cos', 'Day of Week Cyclical Encoding',
     lambda value: days[int(value)]),
]

for ax, (raw_col, sin_col, cos_col, title, make_label) in zip(axes, panels):
    # One point per distinct combination, ordered by the raw value
    points = df_features[[raw_col, sin_col, cos_col]].drop_duplicates().sort_values(raw_col)
    ax.scatter(points[cos_col], points[sin_col], c=points[raw_col], cmap='viridis', s=100)
    ax.set_xlabel(cos_col)
    ax.set_ylabel(sin_col)
    ax.set_title(title)
    for _, row in points.iterrows():
        ax.annotate(make_label(row[raw_col]), (row[cos_col], row[sin_col]), fontsize=8)

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists
os.makedirs('../reports/figures', exist_ok=True)
plt.savefig('../reports/figures/cyclical_encoding.png', dpi=150, bbox_inches='tight')
plt.show()

### 3.2 Lag Features

In [None]:
# Collect every column whose name contains the '_lag_' marker
lag_marker = '_lag_'
lag_cols = [name for name in df_features.columns if lag_marker in name]
n_lag = len(lag_cols)
print(f"Lag features ({n_lag}): {lag_cols}")

# Preview the target next to the lag columns
preview_cols = ['request_count', *lag_cols]
df_features[preview_cols].head(10)

In [None]:
# Correlation of each lag feature with the target, shown as a bar chart
if lag_cols:
    # Take the target's column of the correlation matrix and drop the
    # target's correlation with itself.
    lag_corr = (
        df_features[['request_count'] + lag_cols]
        .corr()['request_count']
        .drop('request_count')
    )

    plt.figure(figsize=(10, 5))
    lag_corr.plot(kind='bar', color='steelblue')
    plt.title('Correlation của Lag Features với Request Count')
    plt.xlabel('Lag Feature')
    plt.ylabel('Correlation')
    plt.xticks(rotation=45)
    plt.axhline(y=0, color='red', linestyle='--', alpha=0.5)
    plt.tight_layout()
    # savefig does not create missing directories, so make sure the target exists
    os.makedirs('../reports/figures', exist_ok=True)
    plt.savefig('../reports/figures/lag_correlation.png', dpi=150, bbox_inches='tight')
    plt.show()

    print("\nLag Correlations:")
    print(lag_corr.sort_values(ascending=False))
else:
    print("No lag features found!")

### 3.3 Rolling Statistics

In [None]:
# Collect every column whose name contains the '_rolling_' marker
rolling_marker = '_rolling_'
rolling_cols = [name for name in df_features.columns if rolling_marker in name]
n_rolling = len(rolling_cols)
print(f"Rolling features ({n_rolling}): {rolling_cols}")

In [None]:
# Plot a 200-row window (rows 200-399) of the target with rolling means overlaid
sample_data = df_features.iloc[200:400].copy()

fig, ax = plt.subplots(figsize=(14, 6))

ax.plot(sample_data.index, sample_data['request_count'], label='Actual', alpha=0.8)

# Overlay the first rolling-mean column and, when there are at least two,
# also the last one (columns are discovered by name).
rolling_mean_cols = [col for col in sample_data.columns if '_rolling_mean_' in col]
overlay_cols = rolling_mean_cols[:1]
if len(rolling_mean_cols) >= 2:
    overlay_cols = overlay_cols + rolling_mean_cols[-1:]
for overlay_col in overlay_cols:
    ax.plot(sample_data.index, sample_data[overlay_col], label=overlay_col, alpha=0.7)

ax.set_xlabel('Timestamp')
ax.set_ylabel('Request Count')
ax.set_title('Request Count với Rolling Means')
ax.legend()
plt.xticks(rotation=45)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists
os.makedirs('../reports/figures', exist_ok=True)
plt.savefig('../reports/figures/rolling_features.png', dpi=150, bbox_inches='tight')
plt.show()

### 3.4 Derived Features (Diff, Pct Change, EWM)

In [None]:
# Collect columns whose name contains any of the derived-feature markers
derived_markers = ('_diff_', '_pct_change_', '_ewm_')
derived_cols = [
    name for name in df_features.columns
    if any(marker in name for marker in derived_markers)
]
print(f"Derived features ({len(derived_cols)}): {derived_cols}")

In [None]:
# Distribution of the 1-step diff and 1-step percentage-change features.
# A plain substring test for '_diff_1' would also match '_diff_10',
# '_diff_12', ...; prefer an exact suffix match and only fall back to the
# substring test when no column ends with the marker.
all_cols = list(df_features.columns)
diff_col = (
    [col for col in all_cols if col.endswith('_diff_1')]
    or [col for col in all_cols if '_diff_1' in col]
)
pct_col = (
    [col for col in all_cols if col.endswith('_pct_change_1')]
    or [col for col in all_cols if '_pct_change_1' in col]
)

if diff_col:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: histogram of the diff feature with a marker at zero change
    axes[0].hist(df_features[diff_col[0]].dropna(), bins=50, edgecolor='black', alpha=0.7)
    axes[0].axvline(x=0, color='red', linestyle='--')
    axes[0].set_title(f'Distribution của {diff_col[0]}')
    axes[0].set_xlabel('Change')
    axes[0].set_ylabel('Frequency')

    # Right panel: histogram of the percentage change, when such a column exists
    if pct_col:
        pct_data = df_features[pct_col[0]].dropna()
        # Keep only values strictly inside (-2, 2) so outliers do not
        # stretch the histogram
        pct_data = pct_data[(pct_data > -2) & (pct_data < 2)]
        axes[1].hist(pct_data, bins=50, edgecolor='black', alpha=0.7)
        axes[1].axvline(x=0, color='red', linestyle='--')
        axes[1].set_title(f'Distribution của {pct_col[0]}')
        axes[1].set_xlabel('% Change')
        axes[1].set_ylabel('Frequency')

    plt.tight_layout()
    # savefig does not create missing directories, so make sure the target exists
    os.makedirs('../reports/figures', exist_ok=True)
    plt.savefig('../reports/figures/derived_features.png', dpi=150, bbox_inches='tight')
    plt.show()
else:
    print("No diff features found!")

## 4. Feature Selection

In [None]:
# Get the list of feature columns from the feature engineer
feature_cols = fe.get_feature_columns(df_features)

n_feature_cols = len(feature_cols)
print(f"Total feature columns: {n_feature_cols}")
print(f"\nFeatures: {feature_cols}")

In [None]:
# Correlation heatmap for the features most correlated with the target.
# The number of features shown is kept in one place so that the selection
# and the plot title cannot drift apart.
top_n = 15

# Correlation of every feature column with the target (target itself removed)
corr_with_target = df_features[feature_cols + ['request_count']].corr()['request_count'].drop('request_count')
# Rank by absolute correlation so strong negative relationships are kept too
top_features = corr_with_target.abs().nlargest(top_n).index.tolist()

# Pairwise correlations among the selected features and the target
corr_matrix = df_features[top_features + ['request_count']].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r', center=0, fmt='.2f')
plt.title(f'Correlation Matrix - Top {top_n} Features')
plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists
os.makedirs('../reports/figures', exist_ok=True)
plt.savefig('../reports/figures/feature_correlation.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Rank features by absolute correlation, but print the signed value
print("Top 20 Features by Correlation với Target:")
print("=" * 50)
strongest = corr_with_target.abs().nlargest(20).index
for rank, feat in enumerate(strongest, start=1):
    signed_corr = corr_with_target[feat]
    print(f"{rank:2d}. {feat:30s}: {signed_corr:+.4f}")

## 5. Prepare Supervised Learning Data

In [None]:
# Build the supervised-learning inputs (X) and target (y)
supervised_kwargs = {
    'target_col': 'request_count',
    'feature_cols': feature_cols,
    'forecast_horizon': 1,  # predict one step ahead
}
X, y = fe.prepare_supervised(df_features, **supervised_kwargs)

for label, data in (("X", X), ("y", y)):
    print(f"{label} shape: {data.shape}")
print("\nFirst few rows of X:")
X.head()

In [None]:
# Split on the index: rows before the cut-off go to train, the rest to test
test_start = '1995-08-23'
train_mask = X.index < test_start
test_mask = ~train_mask

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

for split_name, split_X in (("Train", X_train), ("Test", X_test)):
    print(f"{split_name} set: {len(split_X)} samples ({split_X.index.min()} to {split_X.index.max()})")

In [None]:
# Count nulls per column of X and report the columns that have any
missing = X.isnull().sum()
missing_cols = missing.loc[missing.gt(0)]

if missing_cols.empty:
    print("No missing values in feature columns!")
else:
    print("Columns with missing values:")
    print(missing_cols)

## 6. Save Processed Data

In [None]:
# Persist the full feature frame
processed_dir = '../data/processed'
df_features.to_parquet(f'{processed_dir}/features_15min.parquet')
print("Saved: features_15min.parquet")

# Persist the train/test splits; the targets are converted with to_frame()
# before being written
splits = [('X_train', X_train), ('X_test', X_test), ('y_train', y_train), ('y_test', y_test)]
for split_name, data in splits:
    frame = data.to_frame() if split_name.startswith('y_') else data
    frame.to_parquet(f'{processed_dir}/{split_name}_15min.parquet')

print("\nTrain/Test data saved!")

## 7. Summary

In [None]:
# Final summary of the feature-engineering run
banner = "=" * 60

# Lag columns are detected elsewhere in this notebook by the embedded
# '_lag_' marker, so a bare 'lag_' prefix check alone would miss them and
# print an empty list. Accept both spellings.
lag_feature_names = [c for c in feature_cols if c.startswith('lag_') or '_lag_' in c]

print(banner)
print("            FEATURE ENGINEERING SUMMARY")
print(banner)
print("\nGranularity: 15 minutes")
print(f"Total records: {len(df_features)}")
print(f"\nFeatures created: {len(feature_cols)}")
print("  - Temporal: hour, day_of_week, is_weekend, cyclical encoding")
print(f"  - Lag: {lag_feature_names}")
print("  - Rolling: mean, std, max, min")
print("  - Derived: diff, pct_change, ewm")
print(f"\nTrain samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print("\nTop 5 Features by Correlation:")
# Rank by absolute correlation, print the signed value
for feat in corr_with_target.abs().nlargest(5).index:
    print(f"  - {feat}: {corr_with_target[feat]:+.4f}")
print(banner)