# Experiment 03: Basic SVM Pipeline (No Threshold Tuning)

## Overview
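This notebook implements the SVM pipeline without the threshold-tuning step: labels are taken directly from `SVC.predict`, i.e. the model's default decision boundary (decision function at 0), rather than from a tuned probability threshold.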
- **Core**: Feature Extraction (User Engine).
- **Preprocessing**: StandardScaler + SMOTE.
- **Model**: SVC (RBF kernel) + GridSearch.
- **Evaluation**: Standard F1 Score (Default Threshold).

In [None]:
# 1. Import & Config
%load_ext autoreload
%autoreload 2

import gc
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Custom module
from src.data_processing import load_all_splits

%matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
# 2. Feature Extraction
# Root folder that holds the raw inputs; later cells read the log CSVs from it too.
BASE_PATH = 'data/raw'

# Run the project loader once per mode, train first and then test.
print("Loading Train features...")
train_lc_features = load_all_splits(BASE_PATH, mode='train')

print("Loading Test features...")
test_lc_features = load_all_splits(BASE_PATH, mode='test')

# Report the size of both feature tables as a quick sanity check.
for label, table in (("Train", train_lc_features), ("Test", test_lc_features)):
    print(f"Shape {label}:", table.shape)

In [None]:
# 3. Merge Metadata
# Read the train / test log tables stored next to the raw data.
train_log_path = os.path.join(BASE_PATH, 'train_log.csv')
test_log_path = os.path.join(BASE_PATH, 'test_log.csv')
train_log = pd.read_csv(train_log_path)
test_log = pd.read_csv(test_log_path)

# Left-join the extracted features onto each log by object_id so every log row
# is kept, then replace every missing value (including rows with no matching
# features) with 0.
full_train = train_log.merge(train_lc_features, on='object_id', how='left').fillna(0)
full_test = test_log.merge(test_lc_features, on='object_id', how='left').fillna(0)

display(full_train.head(3))

In [None]:
# 4. Preprocessing (Scale & SMOTE)
# Columns excluded from the model inputs (includes the 'target' label itself).
drop_cols = ['object_id', 'SpecType', 'English Translation', 'split', 'target', 'Z_err']
feature_cols = [col for col in full_train.columns if col not in drop_cols]

print(f"Using {len(feature_cols)} features.")

# Label vector and feature matrices; the test matrix uses the same column list.
y = full_train['target']
X = full_train[feature_cols]
X_test_final = full_test[feature_cols]

# Scaling: statistics are learned from the training features only and then
# re-used, unchanged, for the test features.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test_final)

# SMOTE: oversample the scaled training data, printing the number of rows with
# target == 1 before and after resampling.
n_tde_before = int((y == 1).sum())
print(f"Original TDE: {n_tde_before}")
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)
n_tde_after = int((y_resampled == 1).sum())
print(f"After SMOTE TDE: {n_tde_after}")

In [None]:
# 5. Training & Grid Search
# Hold out a validation set made only of REAL observations. Splitting the
# SMOTE-resampled arrays instead would place synthetic points (interpolated
# from training rows) in the validation set and inflate the reported F1.
# `stratify=y` keeps the class ratio identical in both parts.
# Caveat: X_scaled was standardised in step 4 with statistics from all training
# rows, so the validation rows still influenced the scaling.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

svm = SVC(kernel='rbf', probability=True, random_state=42)

# SMOTE is a pipeline step so it is re-fit on the training part of every CV
# fold; an imblearn Pipeline applies samplers during fit only, so predict()
# on validation / test data never resamples.
# Note: SMOTE's default k_neighbors=5 needs at least 6 minority samples in each
# training fold; failed fits would only surface as NaN scores because warnings
# are silenced in the config cell.
svm_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('svc', svm),
])

# Parameters are addressed through the 'svc' step of the pipeline.
param_grid = {
    'svc__C': [1, 10, 100],
    'svc__gamma': ['scale', 0.1, 0.01]
}

print("Starting Grid Search...")
grid = GridSearchCV(svm_pipeline, param_grid, cv=3, scoring='f1', verbose=2, n_jobs=-1)
grid.fit(X_train, y_train)

# best_model is the whole pipeline (SMOTE + SVC) refit on X_train; it exposes
# predict / predict_proba exactly like a bare classifier.
best_model = grid.best_estimator_
print("Best params:", grid.best_params_)

# Validation eval on untouched, real data. predict() uses the SVC's own
# decision boundary (sign of the decision function); no probability threshold
# is tuned or applied here.
y_pred_val = best_model.predict(X_val)
print("Val F1:", f1_score(y_val, y_pred_val))
print(classification_report(y_val, y_pred_val))

In [None]:
# 6. Threshold Tuning (Commented Out as per User request)
# This whole cell is intentionally disabled; it is kept only as a reference for
# how a probability threshold could be tuned on the validation set.
# NOTE(review): per the scikit-learn docs, precision_recall_curve returns one
# more precision/recall value than thresholds, and precisions + recalls can be
# 0 (giving NaN F1) -- guard both cases before re-enabling. TODO confirm.
# from sklearn.metrics import precision_recall_curve

# # Predict probabilities instead of labels (requires probability=True when creating the SVM)
# y_val_prob = best_model.predict_proba(X_val)[:, 1]

# # Find the threshold that maximises F1
# precisions, recalls, thresholds = precision_recall_curve(y_val, y_val_prob)
# f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
# best_threshold = thresholds[np.argmax(f1_scores)]

# print(f"Ngưỡng tối ưu: {best_threshold}")
# print(f"F1 Score tốt nhất: {np.max(f1_scores)}")

# # Apply this threshold to the Test set
# y_test_prob = best_model.predict_proba(X_test_scaled)[:, 1]
# final_predictions = (y_test_prob >= best_threshold).astype(int)

In [None]:
# 7. Submission
# Labels come straight from predict(); no tuned threshold is applied.
final_predictions = best_model.predict(X_test_scaled)

# Two-column frame: the test object ids alongside their predicted labels.
submission = full_test[['object_id']].assign(prediction=final_predictions)

# Show the predicted class balance, then write the file without the index column.
prediction_counts = submission['prediction'].value_counts()
print(prediction_counts)
submission.to_csv('submission_svm_improved.csv', index=False)
print("Done.")