In [None]:
# 0. Reset the working directory (Kaggle-style): removes all non-hidden files and folders under /kaggle/working
!rm -rf /kaggle/working/*

In [None]:
# 1. Imports & config
import os
import gc
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
%matplotlib inline
from scipy.stats import skew, kurtosis
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

In [None]:
# 2. Core feature-extraction engine
def extract_features_from_split(csv_path):
    """Build one row of light-curve features per object from a split CSV.

    The CSV is read with pandas and must provide the columns ``object_id``,
    ``Filter`` and ``Flux``; ``Flux_err`` is optional.

    For every (object_id, Filter) pair the flux max, min, mean, std and
    scipy ``skew`` are computed and pivoted into ``<stat>_<filter>`` columns.
    On top of that the function adds:

    * ``color_g_r`` / ``color_u_z``: difference of the per-filter flux maxima,
      only when both required ``max_*`` columns exist;
    * ``amp_<filter>``: ``max - min`` of the flux for each filter;
    * ``n_obs``: number of rows recorded for the object across all filters.

    Returns
    -------
    pandas.DataFrame or None
        Feature table indexed by ``object_id``, or ``None`` when ``csv_path``
        does not exist.
    """
    # A missing file is reported with None instead of an exception.
    if not os.path.exists(csv_path):
        return None

    lightcurves = pd.read_csv(csv_path)

    # Signal-to-noise ratio; the small epsilon guards against division by zero.
    # NOTE: this column is added to the raw frame but is not aggregated below,
    # so it does not appear in the returned features.
    if 'Flux_err' in lightcurves.columns:
        lightcurves['snr'] = lightcurves['Flux'] / (lightcurves['Flux_err'] + 1e-6)

    # Per-object, per-filter flux statistics, pivoted so filters become columns.
    flux_by_band = lightcurves.groupby(['object_id', 'Filter'])['Flux']
    stats = flux_by_band.agg(['max', 'min', 'mean', 'std', skew]).unstack()
    stats.columns = ['{}_{}'.format(stat_name, band) for stat_name, band in stats.columns]

    # Colour features: difference between the peak fluxes of two filters.
    for first_band, second_band in (('g', 'r'), ('u', 'z')):
        first_col = f'max_{first_band}'
        second_col = f'max_{second_band}'
        if first_col in stats.columns and second_col in stats.columns:
            stats[f'color_{first_band}_{second_band}'] = stats[first_col] - stats[second_col]

    # Amplitude (peak-to-trough flux) for every filter present in the file.
    for band in lightcurves['Filter'].unique():
        peak_col = f'max_{band}'
        trough_col = f'min_{band}'
        if peak_col in stats.columns and trough_col in stats.columns:
            stats[f'amp_{band}'] = stats[peak_col] - stats[trough_col]

    # Total number of observations per object, joined on the object_id index.
    obs_counts = lightcurves.groupby('object_id').size().to_frame('n_obs')
    return stats.merge(obs_counts, left_index=True, right_index=True)

def load_all_splits(base_path, mode='train', n_splits=20):
    """Extract light-curve features from every split folder and stack them.

    For ``i = 1 .. n_splits`` the file
    ``<base_path>/split_<ii>/<mode>_full_lightcurves.csv`` is passed to
    ``extract_features_from_split``. Splits for which that function returns
    ``None`` are skipped; the remaining feature tables are concatenated
    row-wise.

    Parameters
    ----------
    base_path : str
        Directory that contains the ``split_01``, ``split_02``, ... folders.
    mode : str, default 'train'
        Prefix of the CSV file name inside each split folder
        (``'train'`` or ``'test'`` in the notebook's example calls).
    n_splits : int, default 20
        Number of consecutively numbered split folders to scan.

    Returns
    -------
    pandas.DataFrame
        All per-split feature tables concatenated in split order.

    Raises
    ------
    ValueError
        If none of the splits yielded a feature table (for example because
        ``base_path`` is wrong), instead of pandas' generic
        "No objects to concatenate" message.
    """
    all_features = []
    print(f"Bắt đầu xử lý dữ liệu {mode} từ {n_splits} splits...")
    for i in range(1, n_splits + 1):
        split_name = f'split_{i:02d}'
        file_name = f'{mode}_full_lightcurves.csv'
        full_path = os.path.join(base_path, split_name, file_name)
        # The line terminator is written as an escape sequence; a raw line
        # break inside the quotes is a syntax error.
        print(f"Processing {split_name}...", end='\n')
        feats = extract_features_from_split(full_path)
        if feats is not None:
            all_features.append(feats)
        # Drop the local reference and collect eagerly to keep peak memory low
        # while iterating over large split files.
        del feats
        gc.collect()
    print(f"\nĐã xử lý xong {mode}!")
    if not all_features:
        raise ValueError(
            f"No '{mode}_full_lightcurves.csv' file was found in any of the "
            f"{n_splits} split folders under {base_path!r}"
        )
    return pd.concat(all_features)

In [None]:
# 3. Run feature extraction (example base path)
# Root directory handed to load_all_splits; it is expected to contain the
# split_XX sub-folders with the <mode>_full_lightcurves.csv files.
BASE_PATH = '/kaggle/input/mallorn-dataset'
# The calls below are left commented out because they are long-running;
# uncomment them to build the train/test feature tables.
# train_lc_features = load_all_splits(BASE_PATH, mode='train')
# test_lc_features = load_all_splits(BASE_PATH, mode='test')
# print('Train features shape:', train_lc_features.shape)
# print('Test features shape:', test_lc_features.shape)

In [None]:
# 4. Preprocessing: select features, scale, SMOTE (example; `full_train` is not built in the cells above and must be created first)
# drop_cols = ['object_id','SpecType','English Translation','split','target','Z_err']
# feature_cols = [c for c in full_train.columns if c not in drop_cols]
# X = full_train[feature_cols]
# y = full_train['target']
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

In [None]:
# 5. Model training: SVM baseline (GridSearch example; X_train/X_val/y_train/y_val are not created in the cells above, e.g. via train_test_split)
# svm = SVC(kernel='rbf', probability=True, random_state=42)
# param_grid = {'C':[1,10,100], 'gamma':['scale',0.1,0.01]}
# grid = GridSearchCV(svm, param_grid, cv=3, scoring='f1', verbose=2, n_jobs=-1)
# grid.fit(X_train, y_train)
# best_model = grid.best_estimator_
# y_pred_val = best_model.predict(X_val)
# print('Validation F1:', f1_score(y_val, y_pred_val))

In [None]:
# 6. XGBoost GPU example (commented; requires a CUDA GPU and xgboost >= 2.0 for the `device` parameter)
# from xgboost import XGBClassifier
# xgb = XGBClassifier(tree_method='hist', device='cuda', eval_metric='logloss', random_state=42, use_label_encoder=False)
# # param grid / randomized search snippet here

---
Notes:
- Cells that run for a long time (feature extraction, grid search) are left commented out so that the notebook stored in the repo has no heavy outputs.
- To run: uncomment the required cells and make sure `BASE_PATH` points to your dataset (`data/raw` or the Kaggle input directory).