# Experiment 04: SVM Improved (Standard GridSearch)

## Overview
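This experiment tunes an RBF-kernel SVM with a standard `GridSearchCV` (3-fold cross-validation, F1 scoring). The grid covers the `SelectKBest` feature count (k = 20, 30, or all), the SVM's `C` and `gamma`, and the SMOTE sampling strategy.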
- **Feature Extraction**: Standard (User Engine).
- **Pipeline**: Imputer -> Scaler -> SMOTE -> SelectKBest -> SVC.


In [None]:
# 1. Import & Config
%load_ext autoreload
%autoreload 2

import os
import gc
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Custom module
from src.data_processing import load_all_splits

%matplotlib inline
warnings.filterwarnings("ignore")

In [None]:
# 2. Load Data
BASE_PATH = 'data/raw'

# Feature tables produced by the project loader, one per mode.
train_lc_features = load_all_splits(BASE_PATH, mode='train')
test_lc_features = load_all_splits(BASE_PATH, mode='test')

# Log tables; they are joined to the feature tables on 'object_id' below.
train_log = pd.read_csv(os.path.join(BASE_PATH, 'train_log.csv'))
test_log = pd.read_csv(os.path.join(BASE_PATH, 'test_log.csv'))

# Left joins keep every row of the log tables; rows without a match in the
# feature table come back as NaN and are then replaced with 0, as is every
# other missing value in the merged frame.
# NOTE(review): after this zero-fill there are no NaNs left for a downstream
# mean imputer to act on - confirm that 0 is the intended fill value.
full_train = pd.merge(train_log, train_lc_features, on='object_id', how='left').fillna(0)
full_test = pd.merge(test_log, test_lc_features, on='object_id', how='left').fillna(0)

In [None]:
# 3. Prepare Data
# Columns excluded from the model inputs (this includes the label column 'target').
drop_cols = ['object_id', 'SpecType', 'English Translation', 'split', 'target', 'Z_err']

# Keep the remaining columns in their original frame order.
excluded = set(drop_cols)
feature_cols = [col for col in full_train.columns if col not in excluded]

y = full_train['target']
X = full_train[feature_cols]
# The test frame is projected onto the same feature columns as the training frame.
X_test_sub = full_test[feature_cols]

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train_org, X_val_org, y_train_org, y_val_org = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# 4. Pipeline & GridSearch (Standard)
# Steps, in order: mean imputation -> standardization -> SMOTE oversampling ->
# univariate feature selection (ANOVA F-test) -> RBF-kernel SVM.
# The imbalanced-learn Pipeline is used so that the SMOTE step is part of the
# estimator being cross-validated rather than applied to the data up front.
svm_pipeline = ImbPipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42, k_neighbors=5)),
    ('select', SelectKBest(score_func=f_classif)),
    # NOTE(review): probability=True makes every SVC fit more expensive, while
    # the F1 scoring below only needs predict(); keep it only if predict_proba
    # is required by later cells.
    ('svm', SVC(probability=True, kernel='rbf', class_weight='balanced', random_state=42))
])

# Only search integer k values the data can actually satisfy. Asking
# SelectKBest for more features than exist either fails the fit or silently
# behaves like 'all' (depending on the scikit-learn version), which wastes
# grid candidates. With at least 30 features the grid is unchanged.
n_features = X_train_org.shape[1]
k_candidates = [k for k in (20, 30) if k <= n_features] + ['all']

param_grid = {
    'select__k': k_candidates,
    'svm__C': [1, 10, 100],
    'svm__gamma': ['scale', 0.1],
    'smote__sampling_strategy': [0.5, 1.0]
}

print("Running Standard GridSearch...")
# Exhaustive search over param_grid with 3-fold CV, F1 scoring, all CPU cores.
grid = GridSearchCV(svm_pipeline, param_grid, cv=3, scoring='f1', verbose=2, n_jobs=-1)
grid.fit(X_train_org, y_train_org)

print("Best params:", grid.best_params_)
# Best pipeline found by the search (refit by GridSearchCV on the full training split).
best_model = grid.best_estimator_

In [None]:
# 5. Evaluation
# Score the selected pipeline on the held-out validation split.
y_pred_val = best_model.predict(X_val_org)

val_f1 = f1_score(y_val_org, y_pred_val)
print("Validation F1:", val_f1)

# Per-class precision / recall / F1 summary.
val_report = classification_report(y_val_org, y_pred_val)
print(val_report)