# Experiment 06: Bagging SVM with RobustScaler

## Overview
This notebook introduces ensemble learning to stabilize the SVM model.
- **Feature Extraction**: Per-filter flux statistics (Max, Min, Mean, Std, Skew), SNR (Max, Mean), g-r color and ratio, per-filter amplitude, and the number of observations per object.
- **Preprocessing**: Median Imputation + RobustScaler (less sensitive to outliers than standard scaling) + SMOTE + SelectKBest feature selection.
- **Model**: BaggingClassifier with SVC base estimator.
- **Optimization**: RandomizedSearchCV for Bagging SVM.
- **Refinement**: Threshold Tuning.

In [None]:
# 1. Import & Config
%load_ext autoreload
%autoreload 2

import os
import gc
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix, precision_recall_curve

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

%matplotlib inline
warnings.filterwarnings("ignore")

In [None]:
# 2. Feature Extraction
from scipy.stats import skew

def extract_features_from_split(csv_path):
    """Build one row of light-curve features per object from a split CSV.

    The CSV is read with ``pd.read_csv`` and must provide the columns
    ``object_id``, ``Filter``, ``Flux`` and ``Flux_err``.

    Args:
        csv_path: Path of the light-curve CSV to read.

    Returns:
        A DataFrame indexed by ``object_id`` holding the per-filter flux and
        SNR aggregates (columns named ``<value>_<stat>_<filter>``), the g/r
        color and ratio when both filters are present, one ``amp_<filter>``
        column per filter and the ``n_obs`` row count; ``None`` when
        ``csv_path`` does not exist.
    """
    if not os.path.exists(csv_path):
        return None

    light_curves = pd.read_csv(csv_path)

    # 1. Signal-to-noise ratio; the epsilon keeps a zero Flux_err from
    #    producing a division by zero.
    light_curves['snr'] = light_curves['Flux'] / (light_curves['Flux_err'] + 1e-6)

    # 2. Basic statistics per (object, filter); `skew` is scipy.stats.skew.
    per_band_stats = {
        'Flux': ['max', 'min', 'mean', 'std', skew],
        'snr': ['max', 'mean'],
    }
    grouped = light_curves.groupby(['object_id', 'Filter'])
    aggs = grouped.agg(per_band_stats).unstack()
    # Flatten the (value, statistic, filter) column levels into one name.
    aggs.columns = [f'{value}_{stat}_{band}' for value, stat, band in aggs.columns]

    # 3. Color and amplitude features.
    if 'Flux_max_g' in aggs.columns and 'Flux_max_r' in aggs.columns:
        peak_g = aggs['Flux_max_g']
        peak_r = aggs['Flux_max_r']
        aggs['color_g_r'] = peak_g - peak_r
        # Ratio counterpart of the color difference; the denominator is
        # shifted by 1.
        aggs['ratio_g_r'] = peak_g / (peak_r + 1)

    for band in light_curves['Filter'].unique():
        peak_col = f'Flux_max_{band}'
        trough_col = f'Flux_min_{band}'
        if peak_col in aggs.columns and trough_col in aggs.columns:
            aggs[f'amp_{band}'] = aggs[peak_col] - aggs[trough_col]

    # 4. Number of observations (rows) per object, across all filters.
    n_obs = light_curves.groupby('object_id').size().to_frame('n_obs')

    # Join the aggregates and the counts on the shared object_id index.
    return aggs.merge(n_obs, left_index=True, right_index=True)

# Helper that loads the features of every split directory.
def load_all_splits(base_path, mode='train', n_splits=20):
    """Extract and concatenate light-curve features from all split folders.

    For each ``i`` in ``1..n_splits`` the file
    ``<base_path>/split_<ii>/<mode>_full_lightcurves.csv`` is passed to
    ``extract_features_from_split``. Splits whose file does not exist are
    skipped and reported once the loop finishes.

    Args:
        base_path: Directory that contains the ``split_XX`` folders.
        mode: File-name prefix of the light-curve CSV (``'train'`` or
            ``'test'`` in this notebook).
        n_splits: Number of split folders to scan; defaults to 20.

    Returns:
        A single DataFrame with the features of every split that was found.

    Raises:
        ValueError: If no split file could be loaded at all.
    """
    all_features = []
    missing_splits = []
    print(f"Bắt đầu xử lý dữ liệu {mode} từ {n_splits} splits...")

    for i in range(1, n_splits + 1):
        split_name = f'split_{i:02d}'  # zero-padded: 01, 02, ...
        file_name = f'{mode}_full_lightcurves.csv'
        full_path = os.path.join(base_path, split_name, file_name)

        print(f"Processing {split_name}...", end='\r')

        feats = extract_features_from_split(full_path)
        if feats is None:
            # The extractor returns None when the CSV is absent.
            missing_splits.append(split_name)
        else:
            all_features.append(feats)

        # Ask the garbage collector to reclaim temporaries before the next
        # split is read.
        del feats
        gc.collect()

    print(f"\nĐã xử lý xong {mode}!")

    if not all_features:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # raise the same exception type with an actionable message instead.
        raise ValueError(
            f"No '{mode}_full_lightcurves.csv' file found in any of the "
            f"{n_splits} split folders under {base_path!r}"
        )
    if missing_splits:
        print(f"Warning: skipped {len(missing_splits)} missing split(s): "
              f"{', '.join(missing_splits)}")

    # Stack the per-split frames into one large DataFrame.
    return pd.concat(all_features)

In [None]:
# 3. Load & Process Data
BASE_PATH = 'data/raw'

print("Loading Train features...")
train_lc_features = load_all_splits(BASE_PATH, mode='train')
test_lc_features = load_all_splits(BASE_PATH, mode='test')

print("Loading Log...")
train_log = pd.read_csv(os.path.join(BASE_PATH, 'train_log.csv'))
test_log = pd.read_csv(os.path.join(BASE_PATH, 'test_log.csv'))

# Left joins keep every object listed in the logs, including objects that
# have no extracted light-curve features.
full_train = train_log.merge(train_lc_features, on='object_id', how='left')
full_test = test_log.merge(test_lc_features, on='object_id', how='left')

# Missing values are deliberately left as NaN here. Filling them with 0 at
# this point would make the SimpleImputer(strategy='median') step of the
# modelling pipeline a no-op and would present "not measured" to the model
# as a genuine zero; the pipeline imputes them with medians learned from the
# training data instead.

display(full_train.head(3))

In [None]:
# 4. Prepare Data
# Columns that are excluded from the model inputs.
drop_cols = ['object_id', 'SpecType', 'English Translation', 'split', 'target', 'Z_err']
feature_cols = [name for name in full_train.columns if name not in drop_cols]

# The same feature columns are taken from the train and the test frame.
X = full_train.loc[:, feature_cols]
X_test_sub = full_test.loc[:, feature_cols]
y = full_train['target']

# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the same class proportions.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train_org, X_val_org, y_train_org, y_val_org = split_parts

# Count of validation rows whose target equals 1.
n_positive_val = (y_val_org == 1).sum()
print(f"TDE in Val: {n_positive_val}")

In [None]:
# 5. Pipeline & RandomizedSearchCV
# Base learner: RBF-kernel SVC with probability estimates enabled and
# class weights balanced.
svc_base = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42)

# Ensemble of 10 SVCs, each fitted on a bootstrap sample of 80% of the rows.
bagged_svm = BaggingClassifier(
    estimator=svc_base,
    n_estimators=10,
    max_samples=0.8,
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)

# Step order: impute -> scale -> oversample -> select features -> classify.
svm_pipeline = ImbPipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
    ('smote', SMOTE(random_state=42, k_neighbors=7)),
    ('select', SelectKBest(score_func=f_classif)),
    ('bagging', bagged_svm),
])

# Candidate values, addressed as <step>__<parameter>.
param_grid = {
    'select__k': [20, 30, 'all'],
    'bagging__estimator__C': [10, 50, 100],
    'bagging__estimator__gamma': ['scale', 0.1],
    'smote__sampling_strategy': [0.5, 0.8],
}

print("Running RandomizedSearchCV...")
# 10 sampled parameter combinations, 3-fold cross-validation, scored by F1.
search = RandomizedSearchCV(
    estimator=svm_pipeline,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    scoring='f1',
    verbose=2,
    random_state=42,
    n_jobs=1,
)

search.fit(X_train_org, y_train_org)
best_model = search.best_estimator_

print("\nBest params:", search.best_params_)

In [None]:
# 6. Threshold Tuning
# Second predict_proba column; presumably the probability of class 1 --
# confirm against best_model.classes_.
y_val_prob = best_model.predict_proba(X_val_org)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_val_org, y_val_prob)

# F1 at every point of the curve; points where precision + recall == 0 keep
# the 0 pre-filled in `out` instead of dividing by zero.
pr_sum = precisions + recalls
f1_scores = np.divide(
    2 * precisions * recalls,
    pr_sum,
    out=np.zeros_like(precisions),
    where=pr_sum != 0,
)

# NOTE(review): precision_recall_curve returns one more precision/recall pair
# than thresholds. Per the scikit-learn docs that final pair is (1, 0), whose
# F1 is 0, and argmax returns the first maximum, so best_idx stays a valid
# index into `thresholds`.
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
best_f1 = f1_scores[best_idx]

print(f"Best Threshold: {best_threshold:.4f}")
print(f"Max Validation F1: {best_f1:.4f}")

In [None]:
# 7. Submission
# Second predict_proba column for the test set, cut at the threshold chosen
# on the validation split.
y_test_prob = best_model.predict_proba(X_test_sub)[:, 1]
above_threshold = y_test_prob >= best_threshold
final_predictions = above_threshold.astype(int)

# Two-column frame: object_id taken from the test log plus the 0/1 label.
submission = full_test[['object_id']].assign(prediction=final_predictions)

# Show how many objects fall in each predicted class before writing the file.
print(submission['prediction'].value_counts())
submission.to_csv('submission_svm_bagging_robust.csv', index=False)
print("Done.")