In [None]:
# version 9
# 0. Reset output
!rm -rf /kaggle/working/*

# 1. Imports and configuration
# ===============================
# 📦 CORE LIBRARY IMPORTS
# ===============================
import os
import gc
import warnings

import numpy as np
import pandas as pd

# ===============================
# üìä VISUALIZATION
# ===============================
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
warnings.filterwarnings("ignore")

# ===============================
# ü§ñ MACHINE LEARNING - SKLEARN
# ===============================
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    StratifiedKFold
)

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    f1_score,
    classification_report,
    confusion_matrix
)

# ===============================
# ‚öñÔ∏è IMBALANCED LEARNING
# ===============================
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# 2. Feature-extraction function (core engine)
from scipy.stats import skew

def extract_features_from_split(csv_path):
    """Build one row of light-curve features per object from a split CSV.

    The CSV is read with the columns ``object_id``, ``Filter``, ``Flux`` and
    ``Flux_err``.  Returns ``None`` when ``csv_path`` does not exist; otherwise
    a DataFrame indexed by ``object_id`` holding per-filter statistics of
    ``Flux`` and of the signal-to-noise ratio, a g-r colour, per-filter
    amplitudes and the total observation count ``n_obs``.
    """
    if not os.path.exists(csv_path):
        return None

    lightcurves = pd.read_csv(csv_path)

    # Signal-to-noise ratio; the small constant keeps the denominator non-zero
    # when Flux_err is exactly 0.
    lightcurves['snr'] = lightcurves['Flux'] / (lightcurves['Flux_err'] + 1e-6)

    # Per (object, filter) statistics, then pivot the filter into the columns.
    stat_spec = {
        'Flux': ['max', 'min', 'mean', 'std', skew],
        'snr': ['max', 'mean'],
    }
    per_band = lightcurves.groupby(['object_id', 'Filter']).agg(stat_spec)
    aggs = per_band.unstack()
    # Flatten (column, statistic, filter) into "column_statistic_filter".
    aggs.columns = [f'{source}_{stat}_{band}' for source, stat, band in aggs.columns]

    # Colour: difference between the peak fluxes in g and r.
    if all(name in aggs.columns for name in ('Flux_max_g', 'Flux_max_r')):
        aggs['color_g_r'] = aggs['Flux_max_g'] - aggs['Flux_max_r']

    # Amplitude (max - min flux) for every filter present in the file.
    for band in lightcurves['Filter'].unique():
        peak_col = f'Flux_max_{band}'
        floor_col = f'Flux_min_{band}'
        if peak_col not in aggs.columns or floor_col not in aggs.columns:
            continue
        aggs[f'amp_{band}'] = aggs[peak_col] - aggs[floor_col]

    # Number of observations per object across all filters.
    n_obs = lightcurves.groupby('object_id').size().to_frame('n_obs')

    return aggs.merge(n_obs, left_index=True, right_index=True)

# !!! REMEMBER TO RE-RUN THE DATA-LOADING FUNCTION SO THAT full_train IS UPDATED !!!

# Helper that loads all 20 splits
def load_all_splits(base_path, mode='train', n_splits=20):
    """Extract features from every split folder and stack them into one frame.

    Args:
        base_path: Dataset root containing ``split_01`` ... ``split_NN`` folders.
        mode: File prefix; ``{mode}_full_lightcurves.csv`` is read in each split.
        n_splits: Number of split folders to scan, numbered from 1 and
            zero-padded to two digits.  Defaults to 20.

    Returns:
        The per-split feature frames concatenated with ``pd.concat``.

    Raises:
        ValueError: If none of the expected split files exists.
    """
    all_features = []
    print(f"B·∫Øt ƒë·∫ßu x·ª≠ l√Ω d·ªØ li·ªáu {mode} t·ª´ {n_splits} splits...")

    for i in range(1, n_splits + 1):
        split_name = f'split_{i:02d}'  # Formats as 01, 02, ...
        file_name = f'{mode}_full_lightcurves.csv'
        full_path = os.path.join(base_path, split_name, file_name)

        print(f"Processing {split_name}...", end='\r')

        # Missing files yield None and are skipped.
        feats = extract_features_from_split(full_path)
        if feats is not None:
            all_features.append(feats)

        # Drop the local reference and collect to keep peak RAM down.
        del feats
        gc.collect()

    if not all_features:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail with the same exception type but say what was looked for.
        raise ValueError(
            f"No '{mode}_full_lightcurves.csv' file found in any of the "
            f"{n_splits} split folders under {base_path!r}"
        )

    print(f"\nƒê√£ x·ª≠ l√Ω xong {mode}!")
    # Combine all splits into one large DataFrame.
    return pd.concat(all_features)

# 3. Run the data processing (takes about 2-5 minutes)
BASE_PATH = '/kaggle/input/mallorn-dataset'

# Step 1: light-curve features for both partitions (the slowest step).
train_lc_features = load_all_splits(base_path=BASE_PATH, mode='train')
test_lc_features = load_all_splits(base_path=BASE_PATH, mode='test')

print("K√≠ch th∆∞·ªõc Train features:", train_lc_features.shape)
print("K√≠ch th∆∞·ªõc Test features:", test_lc_features.shape)

# Step 2: per-object metadata logs.
train_log = pd.read_csv(f'{BASE_PATH}/train_log.csv')
test_log = pd.read_csv(f'{BASE_PATH}/test_log.csv')

# Step 3: left-join the features onto the logs so every log row is kept, in
# log order, then zero-fill the NaNs the join produces (e.g. an object with
# no observations in one filter).
# NOTE(review): the zero-fill runs over every column, so the imputer step in
# the model pipeline below no longer sees any NaN in these frames.
full_train = pd.merge(train_log, train_lc_features, how='left', on='object_id').fillna(0)
full_test = pd.merge(test_log, test_lc_features, how='left', on='object_id').fillna(0)

display(full_train.head(3))

# 4. Chu·∫©n b·ªã d·ªØ li·ªáu cho SVM (Scale & SMOTE)
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.calibration import CalibratedClassifierCV # <--- V≈® KH√ç M·ªöI
from sklearn.metrics import f1_score, classification_report
import numpy as np

# 1. Assemble the model inputs
# Identifier, label and free-text columns that must not be used as features.
drop_cols = ['object_id', 'SpecType', 'English Translation', 'split', 'target', 'Z_err']
feature_cols = list(filter(lambda name: name not in drop_cols, full_train.columns))

X = full_train.loc[:, feature_cols]
y = full_train.loc[:, 'target']
X_test_sub = full_test.loc[:, feature_cols]

# Stratified hold-out split so both parts keep the same class ratio.
(X_train_org, X_val_org, y_train_org, y_val_org) = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# 5. Train and tune the SVM
# Cross-validation is used instead of a single run so the result is stable.

# 2. Pipeline (back to the variant that scored 0.3848)
# Note: class_weight='balanced' is left out because SMOTE already rebalances
# the classes; using both can make the model noisy ("double balancing").
svm_pipeline = ImbPipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),  # back to StandardScaler
    ('smote', SMOTE(k_neighbors=5, random_state=42)),
    ('select', SelectKBest(score_func=f_classif)),
    ('svm', SVC(kernel='rbf', probability=True, random_state=42)),  # no class_weight
])

# 3. Grid search
param_grid = dict(
    select__k=[20, 30],
    svm__C=[10, 50, 100],                    # larger C penalises errors harder
    svm__gamma=['scale', 0.1],
    smote__sampling_strategy=[0.5, 0.75],    # moderate oversampling ratio
)

print("ƒêang t√¨m Best SVM...")
grid = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train_org, y_train_org)

print("Best params:", grid.best_params_)
best_pipeline = grid.best_estimator_

# --- NEW STEP: PROBABILITY CALIBRATION ---
# A raw SVM is often over- or under-confident; CalibratedClassifierCV reshapes
# its scores into better-behaved probabilities.
#
# The calibrator is cross-fitted on the TRAINING split only.  Fitting it on
# X_val_org (the previous cv='prefit' approach) meant the F1 printed below and
# the later threshold tuning were measured on the very rows the calibrator had
# learned from, which makes both optimistic.  cv='prefit' is also deprecated
# in recent scikit-learn releases.
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

print("ƒêang hi·ªáu ch·ªânh x√°c su·∫•t (Calibration)...")
calibrated_svm = CalibratedClassifierCV(
    clone(best_pipeline),  # unfitted copy carrying the best hyper-parameters
    method='isotonic',
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
)
calibrated_svm.fit(X_train_org, y_train_org)

# Sanity check on the validation split, which the calibrated model never saw.
y_pred_val_calibrated = calibrated_svm.predict(X_val_org)
print("\nValidation F1 (Calibrated):", f1_score(y_val_org, y_pred_val_calibrated))

# Threshold tuning
from sklearn.metrics import precision_recall_curve

# Positive-class probabilities from the calibrated model.
y_val_prob = calibrated_svm.predict_proba(X_val_org)[:, 1]

# F1 at every candidate threshold; points where precision + recall == 0 keep
# the 0 written into the output buffer instead of dividing by zero.
precisions, recalls, thresholds = precision_recall_curve(y_val_org, y_val_prob)
f1_scores = np.divide(
    2 * precisions * recalls,
    precisions + recalls,
    out=np.zeros_like(precisions),
    where=(precisions + recalls) != 0,
)

best_idx = f1_scores.argmax()
best_threshold = thresholds[best_idx]
print(f"Ng∆∞·ª°ng t·ªëi ∆∞u: {best_threshold:.4f}")
print(f"Max F1 Val: {f1_scores[best_idx]:.4f}")

# 6. Predict and write the submission
# Score the test set and binarise with the tuned threshold.
y_test_prob = calibrated_svm.predict_proba(X_test_sub)[:, 1]
final_predictions = np.asarray(y_test_prob >= best_threshold).astype(int)

submission = pd.DataFrame(
    {
        'object_id': full_test['object_id'],
        'prediction': final_predictions,
    }
)

# Class balance of the predictions, as a quick sanity check.
print(submission['prediction'].value_counts())
submission.to_csv('submission_svm_calibrated.csv', index=False)
print('ƒê√£ l∆∞u file: submission_svm_calibrated.csv')


In [None]:
# version 8
# 0. Reset output
!rm -rf /kaggle/working/*

# 1. Imports and configuration
# ===============================
# 📦 CORE LIBRARY IMPORTS
# ===============================
import os
import gc
import warnings

import numpy as np
import pandas as pd

# ===============================
# üìä VISUALIZATION
# ===============================
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
warnings.filterwarnings("ignore")

# ===============================
# ü§ñ MACHINE LEARNING - SKLEARN
# ===============================
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    StratifiedKFold
)

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    f1_score,
    classification_report,
    confusion_matrix
)

# ===============================
# ‚öñÔ∏è IMBALANCED LEARNING
# ===============================
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# 2. Feature-extraction function (core engine)
from scipy.stats import skew, kurtosis

# Reset h√†m feature t·ªët nh·∫•t
from scipy.stats import skew

def extract_features_from_split(csv_path):
    """Build one row of light-curve features per object from a split CSV.

    The CSV is read with the columns ``object_id``, ``Filter``, ``Flux`` and
    ``Flux_err``.  Returns ``None`` when ``csv_path`` does not exist; otherwise
    a DataFrame indexed by ``object_id`` with per-filter statistics of ``Flux``
    and of the signal-to-noise ratio, a g-r colour and flux ratio, per-filter
    amplitudes and the total observation count ``n_obs``.
    """
    if not os.path.exists(csv_path):
        return None

    lightcurves = pd.read_csv(csv_path)

    # 1. Signal-to-noise ratio (epsilon guards against Flux_err == 0).
    lightcurves['snr'] = lightcurves['Flux'] / (lightcurves['Flux_err'] + 1e-6)

    # 2. Basic statistics per (object, filter), pivoted so each filter becomes
    #    its own set of columns.
    stat_spec = {
        'Flux': ['max', 'min', 'mean', 'std', skew],
        'snr': ['max', 'mean'],
    }
    grouped = lightcurves.groupby(['object_id', 'Filter'])
    aggs = grouped.agg(stat_spec).unstack()
    aggs.columns = [f'{source}_{stat}_{band}' for source, stat, band in aggs.columns]

    # 3. Colour features from the peak fluxes in g and r.
    has_g = 'Flux_max_g' in aggs.columns
    has_r = 'Flux_max_r' in aggs.columns
    if has_g and has_r:
        peak_g = aggs['Flux_max_g']
        aggs['color_g_r'] = peak_g - aggs['Flux_max_r']
        # Ratio form of the same comparison; +1 offsets the denominator.
        aggs['ratio_g_r'] = peak_g / (aggs['Flux_max_r'] + 1)

    # Amplitude (max - min flux) for every filter present in the file.
    for band in lightcurves['Filter'].unique():
        peak_col, floor_col = f'Flux_max_{band}', f'Flux_min_{band}'
        if peak_col in aggs.columns and floor_col in aggs.columns:
            aggs[f'amp_{band}'] = aggs[peak_col] - aggs[floor_col]

    # 4. Number of observations per object across all filters.
    n_obs = lightcurves.groupby('object_id').size().to_frame('n_obs')

    # Attach the counts by object_id index.
    return aggs.merge(n_obs, left_index=True, right_index=True)

# --- RE-RUN THE DATA-LOADING STEP ---
# (Remember to re-run load_all_splits and rebuild full_train and full_test.)
# ...

# Helper that loads all 20 splits
def load_all_splits(base_path, mode='train', n_splits=20):
    """Extract features from every split folder and stack them into one frame.

    Args:
        base_path: Dataset root containing ``split_01`` ... ``split_NN`` folders.
        mode: File prefix; ``{mode}_full_lightcurves.csv`` is read in each split.
        n_splits: Number of split folders to scan, numbered from 1 and
            zero-padded to two digits.  Defaults to 20.

    Returns:
        The per-split feature frames concatenated with ``pd.concat``.

    Raises:
        ValueError: If none of the expected split files exists.
    """
    all_features = []
    print(f"B·∫Øt ƒë·∫ßu x·ª≠ l√Ω d·ªØ li·ªáu {mode} t·ª´ {n_splits} splits...")

    for i in range(1, n_splits + 1):
        split_name = f'split_{i:02d}'  # Formats as 01, 02, ...
        file_name = f'{mode}_full_lightcurves.csv'
        full_path = os.path.join(base_path, split_name, file_name)

        print(f"Processing {split_name}...", end='\r')

        # Missing files yield None and are skipped.
        feats = extract_features_from_split(full_path)
        if feats is not None:
            all_features.append(feats)

        # Drop the local reference and collect to keep peak RAM down.
        del feats
        gc.collect()

    if not all_features:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail with the same exception type but say what was looked for.
        raise ValueError(
            f"No '{mode}_full_lightcurves.csv' file found in any of the "
            f"{n_splits} split folders under {base_path!r}"
        )

    print(f"\nƒê√£ x·ª≠ l√Ω xong {mode}!")
    # Combine all splits into one large DataFrame.
    return pd.concat(all_features)

# 3. Run the data processing (takes about 2-5 minutes)
BASE_PATH = '/kaggle/input/mallorn-dataset'

# Step 1: light-curve features for both partitions (the slowest step).
train_lc_features = load_all_splits(base_path=BASE_PATH, mode='train')
test_lc_features = load_all_splits(base_path=BASE_PATH, mode='test')

print("K√≠ch th∆∞·ªõc Train features:", train_lc_features.shape)
print("K√≠ch th∆∞·ªõc Test features:", test_lc_features.shape)

# Step 2: per-object metadata logs.
train_log = pd.read_csv(f'{BASE_PATH}/train_log.csv')
test_log = pd.read_csv(f'{BASE_PATH}/test_log.csv')

# Step 3: left-join the features onto the logs so every log row is kept, in
# log order, then zero-fill the NaNs the join produces (e.g. an object with
# no observations in one filter).
full_train = pd.merge(train_log, train_lc_features, how='left', on='object_id').fillna(0)
full_test = pd.merge(test_log, test_lc_features, how='left', on='object_id').fillna(0)

display(full_train.head(3))

# 4. Prepare the data for the SVM (scaling and SMOTE happen in the pipeline)
# --- STEP 4: DATA PREPARATION (REVISED) ---
# Feature selection: everything except identifiers, labels and free text.
drop_cols = ['object_id', 'SpecType', 'English Translation', 'split', 'target', 'Z_err']
feature_cols = list(filter(lambda name: name not in drop_cols, full_train.columns))
X = full_train.loc[:, feature_cols]
y = full_train.loc[:, 'target']
X_test_sub = full_test.loc[:, feature_cols]

# Split the RAW data (not yet scaled, not yet oversampled) into train and
# validation parts; stratifying keeps the TDE ratio equal in both.
(X_train_org, X_val_org, y_train_org, y_val_org) = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"S·ªë l∆∞·ª£ng TDE trong t·∫≠p Val th·ª±c t·∫ø: {sum(y_val_org==1)}")

# 5. Hu·∫•n luy·ªán SVM & Tinh ch·ªânh (Training & Tuning)
# changes for v8: Bagging + RobustScaler + RandomizedSearchCV
from sklearn.ensemble import BaggingClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import RobustScaler  # <--- THAY ƒê·ªîI 1
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, classification_report, precision_recall_curve

# 1. DATA PREPARATION
# Rebuild the inputs from full_train (already produced by the feature
# extraction step).  This repeats the preparation earlier in this cell with
# the same arguments and the same random_state.
drop_cols = ['object_id', 'SpecType', 'English Translation', 'split', 'target', 'Z_err']
feature_cols = list(filter(lambda name: name not in drop_cols, full_train.columns))

X = full_train.loc[:, feature_cols]
y = full_train.loc[:, 'target']
X_test_sub = full_test.loc[:, feature_cols]

# Train/validation split, stratified to preserve the TDE ratio.
(X_train_org, X_val_org, y_train_org, y_val_org) = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# 2. BUILD THE "BAGGING SVM" PIPELINE
# Base estimator: an RBF SVM.  probability=True is required so the bagging
# ensemble can average class probabilities.
svc_base = SVC(
    kernel='rbf',
    class_weight='balanced',
    probability=True,
    random_state=42,
)

svm_pipeline = ImbPipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),     # median rather than mean
    ('scaler', RobustScaler()),                        # robust scaling against outliers
    ('smote', SMOTE(k_neighbors=7, random_state=42)),  # 7 neighbours for smoother samples
    ('select', SelectKBest(score_func=f_classif)),
    # CHANGE 2: the SVM is wrapped in a bagging ensemble.
    ('bagging', BaggingClassifier(
        estimator=svc_base,
        n_estimators=10,   # ten sub-models
        max_samples=0.8,   # each sees 80% of the data
        bootstrap=True,
        n_jobs=-1,         # fit sub-models in parallel
        random_state=42,
    )),
])

# 3. HYPER-PARAMETER SEARCH (grid / randomized)
# Parameter names follow the step__parameter convention; parameters of the
# SVM inside the bagging step are addressed as bagging__estimator__parameter.
param_grid = dict(
    select__k=[20, 30, 'all'],                  # number of features kept
    bagging__estimator__C=[10, 50, 100],        # C of each sub-SVM
    bagging__estimator__gamma=['scale', 0.1],
    smote__sampling_strategy=[0.5, 0.8],        # oversampling ratio
)

print("ƒêang ch·∫°y RandomizedSearchCV cho Bagging SVM...")
search = RandomizedSearchCV(
    estimator=svm_pipeline,
    param_distributions=param_grid,
    n_iter=10,        # number of sampled combinations
    scoring='f1',
    cv=3,
    n_jobs=1,         # kept at 1 because the bagging step already runs in parallel
    random_state=42,
    verbose=2,
)
search.fit(X_train_org, y_train_org)

best_model = search.best_estimator_

print("\nBest params:", search.best_params_)

# Threshold tuning
# 4. FIND THE BEST DECISION THRESHOLD
# Positive-class probabilities on the validation split.
y_val_prob = best_model.predict_proba(X_val_org)[:, 1]

# Precision-recall curve and the F1 at each point; points where
# precision + recall == 0 keep the 0 written into the output buffer.
precisions, recalls, thresholds = precision_recall_curve(y_val_org, y_val_prob)
f1_scores = np.divide(
    2 * precisions * recalls,
    precisions + recalls,
    out=np.zeros_like(precisions),
    where=(precisions + recalls) != 0,
)

# Threshold with the highest F1.
best_idx = f1_scores.argmax()
best_threshold = thresholds[best_idx]
print(f"Ng∆∞·ª°ng t·ªëi ∆∞u: {best_threshold:.4f}")
print(f"Validation F1 (Max): {f1_scores[best_idx]:.4f}")

# 6. Predict and write the submission
# 5. SUBMIT
# Score the test set and binarise with the tuned threshold.
y_test_prob = best_model.predict_proba(X_test_sub)[:, 1]
final_predictions = np.asarray(y_test_prob >= best_threshold).astype(int)

submission = pd.DataFrame(
    {
        'object_id': full_test['object_id'],
        'prediction': final_predictions,
    }
)

print("\nTh·ªëng k√™ d·ª± ƒëo√°n:")
print(submission['prediction'].value_counts())

submission.to_csv('submission_svm_bagging_robust.csv', index=False)
print("ƒê√£ l∆∞u file: submission_svm_bagging_robust.csv")


In [None]:
# version 6
# 0. Reset output
!rm -rf /kaggle/working/*

# 1. Imports and configuration
# ===============================
# 📦 CORE LIBRARY IMPORTS
# ===============================
import os
import gc
import warnings

import numpy as np
import pandas as pd

# ===============================
# üìä VISUALIZATION
# ===============================
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
warnings.filterwarnings("ignore")

# ===============================
# ü§ñ MACHINE LEARNING - SKLEARN
# ===============================
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    StratifiedKFold
)

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    f1_score,
    classification_report,
    confusion_matrix
)

# ===============================
# ‚öñÔ∏è IMBALANCED LEARNING
# ===============================
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# 2. Feature-extraction function (core engine)
from scipy.stats import skew, kurtosis

# Reset h√†m feature t·ªët nh·∫•t
from scipy.stats import skew

def extract_features_from_split(csv_path):
    """Build one row of light-curve features per object from a split CSV.

    The CSV is read with the columns ``object_id``, ``Filter``, ``Flux`` and
    ``Flux_err``.  Returns ``None`` when ``csv_path`` does not exist; otherwise
    a DataFrame indexed by ``object_id`` with per-filter statistics of ``Flux``
    and of the signal-to-noise ratio, a g-r colour and flux ratio, per-filter
    amplitudes and the total observation count ``n_obs``.
    """
    if not os.path.exists(csv_path):
        return None

    observations = pd.read_csv(csv_path)

    # 1. Signal-to-noise ratio (epsilon guards against Flux_err == 0).
    observations['snr'] = observations['Flux'] / (observations['Flux_err'] + 1e-6)

    # 2. Basic statistics per (object, filter); unstack moves the filter level
    #    into the columns.
    aggregations = {
        'Flux': ['max', 'min', 'mean', 'std', skew],
        'snr': ['max', 'mean'],
    }
    aggs = (
        observations
        .groupby(['object_id', 'Filter'])
        .agg(aggregations)
        .unstack()
    )
    aggs.columns = [f'{source}_{stat}_{band}' for source, stat, band in aggs.columns]

    # 3. Colour and ratio between the g and r peak fluxes.
    if all(name in aggs.columns for name in ('Flux_max_g', 'Flux_max_r')):
        aggs['color_g_r'] = aggs['Flux_max_g'] - aggs['Flux_max_r']
        # Ratio form of the same comparison; +1 offsets the denominator.
        aggs['ratio_g_r'] = aggs['Flux_max_g'] / (aggs['Flux_max_r'] + 1)

    # Amplitude (max - min flux) for every filter present in the file.
    for band in observations['Filter'].unique():
        peak_col = f'Flux_max_{band}'
        floor_col = f'Flux_min_{band}'
        if peak_col not in aggs.columns or floor_col not in aggs.columns:
            continue
        aggs[f'amp_{band}'] = aggs[peak_col] - aggs[floor_col]

    # 4. Number of observations per object across all filters.
    n_obs = observations.groupby('object_id').size().to_frame('n_obs')

    # Attach the counts by object_id index.
    return aggs.merge(n_obs, left_index=True, right_index=True)

# Helper that loads all 20 splits
def load_all_splits(base_path, mode='train', n_splits=20):
    """Extract features from every split folder and stack them into one frame.

    Args:
        base_path: Dataset root containing ``split_01`` ... ``split_NN`` folders.
        mode: File prefix; ``{mode}_full_lightcurves.csv`` is read in each split.
        n_splits: Number of split folders to scan, numbered from 1 and
            zero-padded to two digits.  Defaults to 20.

    Returns:
        The per-split feature frames concatenated with ``pd.concat``.

    Raises:
        ValueError: If none of the expected split files exists.
    """
    all_features = []
    print(f"B·∫Øt ƒë·∫ßu x·ª≠ l√Ω d·ªØ li·ªáu {mode} t·ª´ {n_splits} splits...")

    for i in range(1, n_splits + 1):
        split_name = f'split_{i:02d}'  # Formats as 01, 02, ...
        file_name = f'{mode}_full_lightcurves.csv'
        full_path = os.path.join(base_path, split_name, file_name)

        print(f"Processing {split_name}...", end='\r')

        # Missing files yield None and are skipped.
        feats = extract_features_from_split(full_path)
        if feats is not None:
            all_features.append(feats)

        # Drop the local reference and collect to keep peak RAM down.
        del feats
        gc.collect()

    if not all_features:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail with the same exception type but say what was looked for.
        raise ValueError(
            f"No '{mode}_full_lightcurves.csv' file found in any of the "
            f"{n_splits} split folders under {base_path!r}"
        )

    print(f"\nƒê√£ x·ª≠ l√Ω xong {mode}!")
    # Combine all splits into one large DataFrame.
    return pd.concat(all_features)

# 3. Run the data processing (takes about 2-5 minutes)
BASE_PATH = '/kaggle/input/mallorn-dataset'

# Step 1: light-curve features for both partitions (the slowest step).
train_lc_features = load_all_splits(base_path=BASE_PATH, mode='train')
test_lc_features = load_all_splits(base_path=BASE_PATH, mode='test')

print("K√≠ch th∆∞·ªõc Train features:", train_lc_features.shape)
print("K√≠ch th∆∞·ªõc Test features:", test_lc_features.shape)

# Step 2: per-object metadata logs.
train_log = pd.read_csv(f'{BASE_PATH}/train_log.csv')
test_log = pd.read_csv(f'{BASE_PATH}/test_log.csv')

# Step 3: left-join the features onto the logs so every log row is kept, in
# log order, then zero-fill the NaNs the join produces (e.g. an object with
# no observations in one filter).
full_train = pd.merge(train_log, train_lc_features, how='left', on='object_id').fillna(0)
full_test = pd.merge(test_log, test_lc_features, how='left', on='object_id').fillna(0)

display(full_train.head(3))

# 4. Prepare the data for the SVM (scaling and SMOTE happen in the pipeline)
# --- STEP 4: DATA PREPARATION (REVISED) ---
# Feature selection: everything except identifiers, labels and free text.
drop_cols = ['object_id', 'SpecType', 'English Translation', 'split', 'target', 'Z_err']
feature_cols = list(filter(lambda name: name not in drop_cols, full_train.columns))
X = full_train.loc[:, feature_cols]
y = full_train.loc[:, 'target']
X_test_sub = full_test.loc[:, feature_cols]

# Split the RAW data (not yet scaled, not yet oversampled) into train and
# validation parts; stratifying keeps the TDE ratio equal in both.
(X_train_org, X_val_org, y_train_org, y_val_org) = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"S·ªë l∆∞·ª£ng TDE trong t·∫≠p Val th·ª±c t·∫ø: {sum(y_val_org==1)}")

# 5. Train and tune the SVM
svm_pipeline = ImbPipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),        # 1. fill missing values
    ('scaler', StandardScaler()),                       # 2. standardise
    ('smote', SMOTE(k_neighbors=5, random_state=42)),   # 3. oversample (training folds only)
    ('select', SelectKBest(score_func=f_classif)),      # 4. keep the best features
    ('svm', SVC(kernel='rbf', class_weight='balanced',  # 5. the classifier
                probability=True, random_state=42)),
])

# Grid-search space
param_grid = dict(
    select__k=[20, 30, 'all'],              # keep 20, 30 or all features
    svm__C=[1, 10, 100],                    # SVM regularisation
    svm__gamma=['scale', 0.1],              # RBF kernel width
    smote__sampling_strategy=[0.5, 1.0],    # oversampling ratio
)

print("ƒêang ch·∫°y GridSearch cho SVM (Quy tr√¨nh chu·∫©n)...")
# cv=3: three-fold cross-validation
grid = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Note: fit on the RAW training split; the pipeline does the preprocessing.
grid.fit(X_train_org, y_train_org)

print("\nBest params:", grid.best_params_)
best_model = grid.best_estimator_

# --- HONEST EVALUATION ---
# Predict on the raw validation split; the pipeline scales it internally.
y_pred_val = best_model.predict(X_val_org)
print("\nValidation F1 Score (Real):", f1_score(y_true=y_val_org, y_pred=y_pred_val))
print(classification_report(y_true=y_val_org, y_pred=y_pred_val))

# Second tuning round, with a different grid
svm_pipeline = ImbPipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('smote', SMOTE(k_neighbors=5, random_state=42)),
    # Default of 20 features; the grid below also searches select__k.
    ('select', SelectKBest(score_func=f_classif, k=20)),
    ('svm', SVC(kernel='rbf', class_weight='balanced',
                probability=True, random_state=42)),
])

param_grid = dict(
    select__k=[15, 25, 'all'],          # try 15, 25 or every feature
    svm__C=[1, 10, 50],                 # larger C penalises errors harder (overfits more easily)
    svm__gamma=['scale', 0.01, 0.1],
    smote__sampling_strategy=[0.5, 0.8, 1.0],
)

print("ƒêang ch·∫°y GridSearch cho SVM (Quy tr√¨nh chu·∫©n)...")
grid = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train_org, y_train_org)

print("\nBest params:", grid.best_params_)
best_model = grid.best_estimator_

# --- HONEST EVALUATION ---
y_pred_val = best_model.predict(X_val_org)
print("\nValidation F1 Score (Real):", f1_score(y_true=y_val_org, y_pred=y_pred_val))
print(classification_report(y_true=y_val_org, y_pred=y_pred_val))

# Threshold tuning
from sklearn.metrics import precision_recall_curve

# --- 1. FIND THE BEST DECISION THRESHOLD ---
# Positive-class probabilities on the validation split.
y_val_prob = best_model.predict_proba(X_val_org)[:, 1]

# Precision-recall curve over all candidate thresholds.
precisions, recalls, thresholds = precision_recall_curve(y_val_org, y_val_prob)

# F1 per threshold; where precision + recall == 0 the pre-filled 0 is kept
# instead of dividing by zero.
f1_scores = np.divide(
    2 * precisions * recalls,
    precisions + recalls,
    out=np.zeros_like(precisions),
    where=(precisions + recalls) != 0,
)

# Threshold with the highest F1.
best_idx = f1_scores.argmax()
best_threshold, best_f1 = thresholds[best_idx], f1_scores[best_idx]

print(f"Ng∆∞·ª°ng t·ªëi ∆∞u (Best Threshold): {best_threshold:.4f}")
print(f"F1 Score k·ª≥ v·ªçng t·∫°i ng∆∞·ª°ng n√†y: {best_f1:.4f}")

# 6. Predict and write the submission
# --- 2. PREDICT AND SUBMIT ---

# Probabilities on the real test set.
# IMPORTANT: pass the raw X_test_sub; the pipeline does its own scaling.
y_test_prob = best_model.predict_proba(X_test_sub)[:, 1]

# Binarise with the tuned threshold.
final_predictions = np.asarray(y_test_prob >= best_threshold).astype(int)

# Build the submission table.
submission = pd.DataFrame(
    {
        'object_id': full_test['object_id'],
        'prediction': final_predictions,
    }
)

# Inspect the distribution of predicted classes.
print("\nTh·ªëng k√™ d·ª± ƒëo√°n (Test Set):")
print(submission['prediction'].value_counts())

submission.to_csv('submission_svm_final.csv', index=False)
print("ƒê√£ l∆∞u file: submission_svm_final.csv")
