# Experiment 07: SVM with Probability Calibration

## Overview
This notebook focuses on improving the reliability of the model's probability outputs.
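- **Feature Extraction**: per-filter flux statistics (max, min, mean, std, skew), per-filter SNR statistics (max, mean), g - r peak-flux color, per-filter amplitude (max - min), and the number of observations per object.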
- **Pipeline**: Imputer -> StandardScaler -> SMOTE -> SelectKBest -> SVC.
- **New Step**: `CalibratedClassifierCV` (Isotonic) to refine probabilities.
- **Evaluation**: Threshold Tuning on Calibrated Probabilities.

In [None]:
# 1. Import & Config
%load_ext autoreload
%autoreload 2

import os
import gc
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score, classification_report, confusion_matrix, precision_recall_curve

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

%matplotlib inline
warnings.filterwarnings("ignore")

In [None]:
# 2. Feature Extraction
from scipy.stats import skew

def extract_features_from_split(csv_path):
    """Build one row of light-curve features per object from a split CSV.

    The CSV is read with pandas and must provide the columns ``object_id``,
    ``Filter``, ``Flux`` and ``Flux_err``.

    Args:
        csv_path: Path to the light-curve CSV of a single split.

    Returns:
        A DataFrame indexed by ``object_id`` containing
        ``Flux_<stat>_<filter>`` and ``snr_<stat>_<filter>`` columns,
        ``color_g_r`` (when both g and r peak fluxes exist), one
        ``amp_<filter>`` column per filter and ``n_obs``; or ``None`` when
        ``csv_path`` does not exist.
    """
    if not os.path.exists(csv_path):
        return None

    light_curves = pd.read_csv(csv_path)

    # Signal-to-noise ratio; the small epsilon keeps a zero error from
    # producing a division by zero.
    light_curves['snr'] = light_curves['Flux'] / (light_curves['Flux_err'] + 1e-6)

    # Per (object, filter) summary statistics, pivoted so that every filter
    # becomes its own set of columns.
    stat_spec = {
        'Flux': ['max', 'min', 'mean', 'std', skew],
        'snr': ['max', 'mean'],
    }
    per_filter = light_curves.groupby(['object_id', 'Filter']).agg(stat_spec)
    wide = per_filter.unstack()
    # Flatten the (value, statistic, filter) column tuples into plain names.
    wide.columns = [f'{value}_{stat}_{band}' for value, stat, band in wide.columns]

    # Color: difference between the g and r peak fluxes.
    if {'Flux_max_g', 'Flux_max_r'}.issubset(wide.columns):
        wide['color_g_r'] = wide['Flux_max_g'] - wide['Flux_max_r']

    # Amplitude: peak-to-trough flux range within each filter.
    for band in light_curves['Filter'].unique():
        peak, trough = f'Flux_max_{band}', f'Flux_min_{band}'
        if peak not in wide.columns or trough not in wide.columns:
            continue
        wide[f'amp_{band}'] = wide[peak] - wide[trough]

    # Total number of observations per object, across all filters.
    n_obs = light_curves.groupby('object_id').size().to_frame('n_obs')

    return wide.merge(n_obs, left_index=True, right_index=True)

def load_all_splits(base_path, mode='train', n_splits=20):
    """Extract and concatenate light-curve features from every split folder.

    For i = 1..n_splits the file
    ``<base_path>/split_<ii>/<mode>_full_lightcurves.csv`` is passed to
    ``extract_features_from_split``. Splits whose file does not exist are
    skipped and listed in a printed notice.

    Args:
        base_path: Directory that contains the ``split_XX`` folders.
        mode: File-name prefix selecting the data set (``'train'`` or
            ``'test'`` in this notebook).
        n_splits: Number of split folders to scan, numbered from 1.

    Returns:
        A single DataFrame with the per-split feature frames stacked
        row-wise.

    Raises:
        ValueError: If none of the scanned splits contains the expected file.
    """
    all_features = []
    missing_splits = []
    print(f"Bắt đầu xử lý dữ liệu {mode} từ {n_splits} splits...")

    # The file name depends only on `mode`, so build it once.
    file_name = f'{mode}_full_lightcurves.csv'

    for i in range(1, n_splits + 1):
        split_name = f'split_{i:02d}'  # zero-padded: 01, 02, ...
        full_path = os.path.join(base_path, split_name, file_name)

        print(f"Processing {split_name}...", end='\r')

        feats = extract_features_from_split(full_path)
        if feats is None:
            missing_splits.append(split_name)
        else:
            all_features.append(feats)

        # Best-effort memory reclamation before the next CSV is parsed.
        gc.collect()

    print(f"\nĐã xử lý xong {mode}!")

    if missing_splits:
        # Make skipped splits visible instead of silently training on less data.
        print(
            f"Skipped {len(missing_splits)} split(s) without {file_name}: "
            f"{', '.join(missing_splits)}"
        )

    if not all_features:
        # pd.concat([]) would raise an uninformative
        # "No objects to concatenate"; fail with the actual cause instead.
        raise ValueError(
            f"No '{file_name}' found in any of the {n_splits} split folders "
            f"under {base_path!r}"
        )

    # Stack all splits into one large DataFrame.
    return pd.concat(all_features)

In [None]:
# 3. Load Data
# Root folder holding the split_XX directories and the *_log.csv tables.
BASE_PATH = 'data/raw'

# Per-object light-curve features for both data sets.
print("Loading Train features...")
train_lc_features = load_all_splits(BASE_PATH, mode='train')
test_lc_features = load_all_splits(BASE_PATH, mode='test')

# Object-level tables, one row per object_id.
train_log = pd.read_csv(os.path.join(BASE_PATH, 'train_log.csv'))
test_log = pd.read_csv(os.path.join(BASE_PATH, 'test_log.csv'))

# Left-join so that every logged object is kept even when it has no
# light-curve features, then replace every missing value with 0.
# NOTE(review): because of this zero-fill, the median SimpleImputer at the
# start of the modelling pipeline further down receives no NaNs from these
# frames - confirm that 0 is the intended fill value.
full_train = pd.merge(train_log, train_lc_features, how='left', on='object_id').fillna(0)
full_test = pd.merge(test_log, test_lc_features, how='left', on='object_id').fillna(0)

display(full_train.head(3))

In [None]:
# 4. Prepare Data
# Columns excluded from the model inputs (includes the label 'target').
drop_cols = ['object_id', 'SpecType', 'English Translation', 'split', 'target', 'Z_err']
# Every remaining training column, in its original order, is a feature.
feature_cols = full_train.columns[~full_train.columns.isin(drop_cols)].tolist()

X, y = full_train[feature_cols], full_train['target']
# The test frame is restricted to exactly the columns used for training.
X_test_sub = full_test[feature_cols]

# Stratified 80/20 hold-out split so both parts keep the class ratio of y.
X_train_org, X_val_org, y_train_org, y_val_org = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Number of positive (target == 1) rows that landed in the validation part.
print(f"TDE in Val: {(y_val_org == 1).sum()}")

In [None]:
# 5. Pipeline & Calibration
# class_weight='balanced' is deliberately left off the SVC (removed on
# request); SMOTE is the only rebalancing step in this pipeline.
# Every preprocessing step lives inside the imblearn pipeline, so it is re-fit
# on the training part of each CV fold and SMOTE only resamples during fit.
# NOTE(review): CalibratedClassifierCV prefers decision_function when the
# estimator exposes one, so probability=True (SVC's internal Platt scaling) is
# presumably redundant for the calibrated model - confirm before removing.
svm_pipeline = ImbPipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42, k_neighbors=5)),
    ('select', SelectKBest(score_func=f_classif)),
    ('svm', SVC(probability=True, kernel='rbf', random_state=42)),
])

# GridSearch over feature count, SVM regularisation/kernel width and the
# minority/majority ratio SMOTE resamples to.
param_grid = {
    'select__k': [20, 30],
    'svm__C': [10, 50, 100],
    'svm__gamma': ['scale', 0.1],
    'smote__sampling_strategy': [0.5, 0.75]
}

print("Finding Best SVM...")
grid = GridSearchCV(svm_pipeline, param_grid, cv=3, scoring='f1', verbose=1, n_jobs=-1)
grid.fit(X_train_org, y_train_org)

print("Best params:", grid.best_params_)
best_pipeline = grid.best_estimator_

# --- CALIBRATION ---
print("Calibrating probabilities...")
# Fit the isotonic calibrator by cross-validation on the TRAINING split only.
# Calibrating a prefit model on (X_val_org, y_val_org) and then scoring and
# threshold-tuning on those same rows yields an optimistic F1, because the
# isotonic map has already been fitted to those labels.
# CalibratedClassifierCV clones best_pipeline (keeping the tuned
# hyper-parameters); for each fold it fits the clone on the fold's training
# part and fits the calibrator on the fold's held-out part. The validation
# split therefore stays unseen until it is scored below.
# Three folds mirror the grid search's cv=3.
calibration_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
calibrated_svm = CalibratedClassifierCV(best_pipeline, method='isotonic', cv=calibration_cv)
calibrated_svm.fit(X_train_org, y_train_org)

# Held-out estimate at the default decision rule of the calibrated model.
y_pred_val_calibrated = calibrated_svm.predict(X_val_org)
print("\nValidation F1 (Calibrated):", f1_score(y_val_org, y_pred_val_calibrated))

In [None]:
# 6. Threshold Tuning
# Calibrated probability of the positive class for every validation row.
y_val_prob = calibrated_svm.predict_proba(X_val_org)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_val_org, y_val_prob)

# F1 at every point of the PR curve; points where precision + recall == 0
# are assigned F1 = 0 instead of NaN.
with np.errstate(divide='ignore', invalid='ignore'):
    f1_scores = np.where(
        (precisions + recalls) != 0,
        2 * precisions * recalls / (precisions + recalls),
        0.0,
    )

# NOTE(review): precision_recall_curve returns one more precision/recall pair
# than thresholds; that final pair is documented as (1, 0), i.e. F1 = 0, so
# argmax only reaches an index without a threshold if it were the unique
# maximum - which a zero F1 cannot be.
best_idx = f1_scores.argmax()
best_threshold = thresholds[best_idx]
print(f"Best Threshold: {best_threshold:.4f}")
print(f"Max F1 Val: {f1_scores[best_idx]:.4f}")

In [None]:
# 7. Submission
# Calibrated positive-class probability for every test object.
y_test_prob = calibrated_svm.predict_proba(X_test_sub)[:, 1]
# Label 1 when the probability reaches the tuned threshold, otherwise 0.
final_predictions = np.where(y_test_prob >= best_threshold, 1, 0)

# One row per test object: its id and the predicted label.
submission = full_test[['object_id']].assign(prediction=final_predictions)

# Quick sanity check of the predicted class balance before writing the file.
print(submission['prediction'].value_counts())
submission.to_csv('submission_svm_calibrated.csv', index=False)
print("Done.")