# Experiment 06: SVM New Features (Standard GridSearch)

## Overview
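This experiment adds three new feature groups to the SVM pipeline: per-filter signal-to-noise ratio (SNR = Flux / Flux_err), light-curve duration (last minus first observation time), and the g/r peak-flux ratio.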
- **Feature Extraction**: Enhanced (SNR, Duration, Ratio).
- **Pipeline**: Imputer -> Scaler -> SMOTE -> SelectKBest -> SVC.


In [None]:
# 1. Import & Config
%load_ext autoreload
%autoreload 2

import os
import gc
import warnings
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

warnings.filterwarnings("ignore")

In [None]:
# 2. Feature Extraction (Enhanced)
from scipy.stats import skew, kurtosis

def extract_features_from_split(csv_path):
    """Build one row of light-curve features per object from a split CSV.

    The CSV must provide the columns ``object_id``, ``Filter``, ``Flux``,
    ``Flux_err`` and ``Time (MJD)``.

    Args:
        csv_path: Path to a ``*_full_lightcurves.csv`` file.

    Returns:
        ``None`` if ``csv_path`` does not exist. Otherwise a DataFrame indexed
        by ``object_id`` whose columns are, in order:

        * ``Flux_{max,min,mean,std,skew}_<filter>`` and
          ``snr_{max,mean}_<filter>`` for every filter in the file,
        * ``color_g_r`` / ``ratio_g_r`` (only when both g and r are present),
        * ``amp_<filter>`` (peak-to-trough flux per filter),
        * ``n_obs`` (number of rows for the object),
        * ``duration`` (last minus first ``Time (MJD)``).
    """
    if not os.path.exists(csv_path):
        return None

    lightcurves = pd.read_csv(csv_path)

    # Per-observation signal-to-noise; the epsilon keeps a zero Flux_err
    # from producing a division by zero.
    lightcurves['snr'] = lightcurves['Flux'] / (lightcurves['Flux_err'] + 1e-6)

    # --- 1. Per-(object, filter) statistics, pivoted to one row per object ---
    band_stats = (
        lightcurves
        .groupby(['object_id', 'Filter'])
        .agg({
            'Flux': ['max', 'min', 'mean', 'std', skew],
            'snr': ['max', 'mean'],
        })
        .unstack()
    )
    # Flatten the (source, statistic, filter) column MultiIndex.
    band_stats.columns = [
        f'{source}_{stat}_{band}' for source, stat, band in band_stats.columns
    ]

    # --- 2. Time feature: span between first and last observation ---
    mjd_by_object = lightcurves.groupby('object_id')['Time (MJD)']
    duration = (mjd_by_object.max() - mjd_by_object.min()).to_frame('duration')

    # --- 3. Colour and amplitude features ---
    if {'Flux_max_g', 'Flux_max_r'}.issubset(band_stats.columns):
        peak_g = band_stats['Flux_max_g']
        peak_r = band_stats['Flux_max_r']
        band_stats['color_g_r'] = peak_g - peak_r
        # The +1 offset in the denominator is part of the feature definition.
        band_stats['ratio_g_r'] = peak_g / (peak_r + 1)

    for band in lightcurves['Filter'].unique():
        peak_col = f'Flux_max_{band}'
        trough_col = f'Flux_min_{band}'
        if peak_col in band_stats.columns and trough_col in band_stats.columns:
            band_stats[f'amp_{band}'] = band_stats[peak_col] - band_stats[trough_col]

    # --- 4. Attach observation count and duration (index-aligned merges) ---
    n_obs = lightcurves.groupby('object_id').size().to_frame('n_obs')

    return (
        band_stats
        .merge(n_obs, left_index=True, right_index=True)
        .merge(duration, left_index=True, right_index=True)
    )

def load_all_splits(base_path, mode='train', n_splits=20):
    """Extract features from every available split and stack them.

    Looks for ``<base_path>/split_XX/<mode>_full_lightcurves.csv`` for
    ``XX`` in ``01 .. n_splits`` and runs ``extract_features_from_split`` on
    each one. Splits whose file does not exist are skipped silently.

    Args:
        base_path: Directory containing the ``split_XX`` sub-directories.
        mode: File-name prefix selecting the dataset (e.g. ``'train'`` or
            ``'test'``).
        n_splits: Number of split directories to scan (defaults to 20).

    Returns:
        A DataFrame with the per-split feature frames concatenated row-wise.

    Raises:
        ValueError: If none of the split files could be found, so there is
            nothing to concatenate.
    """
    all_features = []
    print(f"Bắt đầu xử lý dữ liệu {mode} từ {n_splits} splits...")

    # The file name is the same in every split directory.
    file_name = f'{mode}_full_lightcurves.csv'

    for i in range(1, n_splits + 1):
        split_name = f'split_{i:02d}'
        full_path = os.path.join(base_path, split_name, file_name)
        # '\r' keeps the progress message on a single console line.
        print(f"Processing {split_name}...", end='\r')
        feats = extract_features_from_split(full_path)
        if feats is not None:
            all_features.append(feats)
        # Drop the loop-local reference and force a collection so temporaries
        # from this split are released before the next CSV is read.
        del feats
        gc.collect()
    print(f"\nĐã xử lý xong {mode}!")

    if not all_features:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail with the same exception type but an actionable message.
        raise ValueError(
            f"No '{file_name}' file found in any of the {n_splits} split "
            f"directories under {base_path!r}"
        )
    return pd.concat(all_features)

In [None]:
# 3. Load & Prepare Data
BASE_PATH = 'data/raw'

# Per-object light-curve features, stacked over all splits.
train_lc_features = load_all_splits(BASE_PATH, mode='train')
test_lc_features = load_all_splits(BASE_PATH, mode='test')

# Per-object metadata tables.
train_log = pd.read_csv(os.path.join(BASE_PATH, 'train_log.csv'))
test_log = pd.read_csv(os.path.join(BASE_PATH, 'test_log.csv'))

# Left-join so every logged object is kept even without light-curve
# features, then replace every missing value with 0.
# NOTE(review): filling with 0 here leaves nothing for the pipeline's
# SimpleImputer to impute -- confirm that this is intended.
full_train = train_log.merge(train_lc_features, on='object_id', how='left').fillna(0)
full_test = test_log.merge(test_lc_features, on='object_id', how='left').fillna(0)

# Everything except these columns is used as a model feature; the original
# column order of full_train is preserved.
drop_cols = ['object_id', 'SpecType', 'English Translation', 'split', 'target', 'Z_err']
feature_cols = full_train.columns[~full_train.columns.isin(drop_cols)].tolist()

X = full_train[feature_cols]
y = full_train['target']
X_test_sub = full_test[feature_cols]

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train_org, X_val_org, y_train_org, y_val_org = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# 4. Pipeline & GridSearch
# An imblearn pipeline is used so that SMOTE resampling is applied only when
# the pipeline is fitted, never when it predicts.
pipeline_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42, k_neighbors=5)),
    ('select', SelectKBest(score_func=f_classif)),
    # NOTE(review): probability=True is not required by the 'f1' scorer and
    # makes every fit slower; presumably kept so best_model can provide
    # probabilities later -- confirm.
    ('svm', SVC(probability=True, kernel='rbf', class_weight='balanced', random_state=42)),
]
svm_pipeline = ImbPipeline(pipeline_steps)

# Keys follow the "<step name>__<parameter>" convention.
param_grid = dict(
    select__k=[20, 30, 'all'],
    svm__C=[1, 10, 100],
    svm__gamma=['scale', 0.1],
    smote__sampling_strategy=[0.5, 1.0],
)

print("Running GridSearch...")
grid = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train_org, y_train_org)

print("Best params:", grid.best_params_)
best_model = grid.best_estimator_

In [None]:
# 5. Evaluation
# Score the tuned pipeline on the hold-out validation split.
y_pred_val = best_model.predict(X_val_org)

val_f1 = f1_score(y_val_org, y_pred_val)
print("Validation F1:", val_f1)

val_report = classification_report(y_val_org, y_pred_val)
print(val_report)