# Import Libraries

In [5]:
# %load_ext autoreload
# %reload_ext autoreload # This line is causing the error and can be removed.
# %autoreload 2 # This line is also causing an error and can be removed

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from IPython import display

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc, accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Data Load

In [7]:
# Read the train and test CSV files at import/cell-execution time.
# The paths are relative to the current working directory.
# NOTE(review): the functions visible in this file load their own copies from
# `data_dir`, so these module-level frames look like notebook conveniences only.
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')



# Preprocessing

In [8]:
def preprocessing_feature_01(df_data, is_train=True, is_debug=True, age_median=None, **kwargs):
    """Encode the raw passenger table into a purely numeric feature frame.

    Columns produced, in order: ``Sex``, ``Age``, ``Fare``, ``Pclass``,
    ``SibSp``, ``Parch``, ``Cabin``, ``Embarked``, ``Surname`` and, when
    ``is_train`` is true, ``Output`` (a copy of ``Survived``).

    Args:
        df_data: DataFrame holding the columns ``Sex``, ``Age``, ``Fare``,
            ``Pclass``, ``SibSp``, ``Parch``, ``Cabin``, ``Embarked`` and
            ``Name`` (plus ``Survived`` when ``is_train`` is true).
        is_train: When true, append the ``Output`` target column.
        is_debug: When true, print/display summary statistics of ``df_data``
            and copy the function's locals into the module globals.
        age_median: Value used to fill missing ages. Defaults to the median
            of ``df_data["Age"]``. Pass the training-set median when encoding
            a test split so both splits are imputed with the same statistic.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        Tuple ``(df_output, None)``; the second element is always ``None``.

    Raises:
        KeyError: If ``Sex``, the first letter of ``Cabin``, ``Embarked`` or
            the title token parsed from ``Name`` is not in the lookup tables
            below.
    """
    df_output = pd.DataFrame()

    # Sex: binary code.
    cls_sex = {'female': 0, 'male' : 1}
    df_output["Sex"] = df_data["Sex"].apply(lambda x: cls_sex[x])
    # Age: fill gaps with the supplied median, else this frame's own median.
    if age_median is None:
        age_median = df_data["Age"].median()
    df_output["Age"] = df_data["Age"].fillna(age_median)
    # Fare, Pclass, SibSp, Parch are copied through unchanged (NaNs included).
    for name in ['Fare', 'Pclass', 'SibSp', 'Parch']:
        df_output[name] = df_data[name]
    # Cabin: code of the first letter (deck); 'Z' -> 0 stands for "missing".
    cls_cabin = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'T':8, 'Z':0}
    df_output["Cabin"] = df_data['Cabin'].apply(lambda x: cls_cabin['Z'] if pd.isna(x) else cls_cabin[x[0]])
    # Embarked: port code; '0' -> 0 stands for "missing".
    cls_embarked = {'0': 0, 'C':1, 'Q':2, 'S':3}
    df_output["Embarked"] =  df_data['Embarked'].apply(lambda x: cls_embarked['0'] if pd.isna(x) else cls_embarked[x])
    # Surname: actually the title token, i.e. the first word after the comma
    # in "Last, Title. First ..."; 'the' covers names like "the Countess.".
    surnames = ['Capt.', 'Col.', 'Don.', 'Dr.', 'Jonkheer.', 'Lady.', 'Major.',
            'Master.', 'Miss.', 'Mlle.', 'Mme.', 'Mr.', 'Mrs.', 'Ms.', 'Rev.', 'Sir.', 'the', 'Dona.']
    cls_surnames = dict(zip(surnames, range(len(surnames))))
    df_output["Surname"] = df_data['Name'].apply(lambda x: cls_surnames[x.split(',')[1].split(' ')[1]])

    if is_train:
        df_output["Output"] = df_data["Survived"]

    if is_debug:
        # Label now matches the number of rows actually displayed.
        print("head(5)")
        display.display(df_data.head(5))
        print("isna")
        display.display(df_data.isna().sum())
        # Sex: class counts.
        print("sex")
        display.display(np.unique(df_data['Sex'], return_counts=True))
        # Age: missing count and the frame's own median.
        print(f'Age IsNa: {df_data["Age"].isna().sum()}')
        print(f"Age Median: {df_data['Age'].median()}")
        # Fare
        display.display(df_data["Fare"].describe())
        # Cabin: separator was previously the literal string "-*10".
        print("-"*10, "Cabin")
        display.display(np.unique(df_data['Cabin'].apply(
            lambda x: 'Z0' if pd.isna(x) else x), return_counts=True))
        # Embarked
        display.display(
            np.unique(df_data['Embarked'].apply(lambda x: '0' if pd.isna(x) else x), return_counts=True)
        )
        # NOTE(review): exports every local (df_data, df_output, lookup tables,
        # ...) into the module globals for interactive inspection. This
        # silently overwrites same-named globals; consider removing.
        globals().update(**locals())

    return df_output, None

# df_train = pd.read_csv(f'{data_dir}/train.csv')
# preprocessing_feature_01(df_train)

# Train

In [9]:
def _load_feature_arrays(feat_path):
    """Read the four arrays from the feature archive and close the file."""
    # np.load keeps the archive open until closed; the context manager
    # releases the file handle once the arrays have been read.
    with np.load(feat_path) as data:
        return data['X_train'], data['y_train'], data['X_test'], data['X_cols']


def _cross_validate_model(model, X_train, y_train, skf):
    """Refit `model` on each fold; return fold AUCs and pooled out-of-fold outputs."""
    fold_aucs = []
    all_y_val = []
    all_y_pred_proba = []
    all_y_pred = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):
        # Split this fold.
        X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
        y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

        # Fit on the training part of the fold.
        model.fit(X_train_fold, y_train_fold)

        # Score the held-out part: column 1 of predict_proba and hard labels.
        y_pred_proba = model.predict_proba(X_val_fold)[:, 1]
        y_pred = model.predict(X_val_fold)

        fold_auc = roc_auc_score(y_val_fold, y_pred_proba)
        fold_aucs.append(fold_auc)

        all_y_val.extend(y_val_fold)
        all_y_pred_proba.extend(y_pred_proba)
        all_y_pred.extend(y_pred)

        print(f"  Fold {fold} AUC: {fold_auc:.4f}")

    return fold_aucs, all_y_val, all_y_pred_proba, all_y_pred


def train_and_evaluate(feat_path, seed):
    """Load saved features, cross-validate several classifiers and report metrics.

    Four models (logistic regression, k-nearest neighbours, SVC and random
    forest) are each evaluated with 5-fold stratified cross-validation.
    Per-fold AUC and metrics pooled over all out-of-fold predictions are
    printed.

    Args:
        feat_path: Path to an ``.npz`` archive containing ``X_train``,
            ``y_train``, ``X_test`` and ``X_cols``.
        seed: Random state for the fold shuffling and the seeded models.

    Returns:
        Dict keyed by model name. Each value is a dict with ``mean_auc``,
        ``std_auc``, ``overall_auc``, ``overall_accuracy``,
        ``overall_classification_report``, ``overall_confusion_matrix``,
        ``fpr``, ``tpr`` and ``thresholds``. Returns ``None`` when the
        archive cannot be loaded (the error is printed).
    """
    print(f"Loading features from: {feat_path}")
    try:
        # X_test is read so a missing key is reported here, but it is unused.
        X_train, y_train, _, X_cols = _load_feature_arrays(feat_path)
        print(f"Features loaded: {X_cols}")
    except Exception as e:
        # Deliberately broad: any load problem is reported and turned into a
        # None result rather than a crash.
        print(f"Error loading {feat_path}: {e}")
        return

    # Candidate models.
    models = {
        'LogisticRegression': LogisticRegression(random_state=seed, max_iter=1000),
        'KNeighbors': KNeighborsClassifier(),
        'SVC': SVC(probability=True, random_state=seed),
        'RandomForest': RandomForestClassifier(random_state=seed)
    }

    # Stratified K-fold setup; the same splitter is reused for every model.
    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    print("\n" + "="*20, "Model Training", "="*20)

    results = {}
    for model_name, model in models.items():
        print(f"\n--- Training {model_name} ---")
        fold_aucs, all_y_val, all_y_pred_proba, all_y_pred = _cross_validate_model(
            model, X_train, y_train, skf
        )

        mean_auc = np.mean(fold_aucs)
        std_auc = np.std(fold_aucs)
        print(f"-> Mean AUC for {model_name}: {mean_auc:.4f} +/- {std_auc:.4f}")

        # Metrics over the pooled out-of-fold predictions of all folds.
        overall_auc = roc_auc_score(all_y_val, all_y_pred_proba)
        overall_accuracy = accuracy_score(all_y_val, all_y_pred)
        overall_classification_report = classification_report(all_y_val, all_y_pred)
        overall_confusion_matrix = confusion_matrix(all_y_val, all_y_pred)
        fpr, tpr, thresholds = roc_curve(all_y_val, all_y_pred_proba)

        print(f"\nOverall Metrics for {model_name}:")
        print(f"  Overall AUC: {overall_auc:.4f}")
        print(f"  Overall Accuracy: {overall_accuracy:.4f}")
        print(f"  Overall Classification Report:\n{overall_classification_report}")
        print(f"  Overall Confusion Matrix:\n{overall_confusion_matrix}")

        results[model_name] = {
            "mean_auc": mean_auc,
            "std_auc": std_auc,
            "overall_auc": overall_auc,
            "overall_accuracy": overall_accuracy,
            "overall_classification_report": overall_classification_report,
            "overall_confusion_matrix": overall_confusion_matrix,
            "fpr": fpr,
            "tpr": tpr,
            "thresholds": thresholds
            }

    print("\n" + "="*20, "Training Complete", "="*20)
    return results

# Main

In [12]:
def run_feature_engineering(params_cfg):
    """Build numeric features from train.csv/test.csv and save them as data.npz.

    Args:
        params_cfg: Mapping that must provide ``data_dir`` (folder holding
            ``train.csv`` and ``test.csv``), ``save_dir`` (output folder,
            created if missing) and ``verbose`` (enables the extra
            column/debug printouts).

    Raises:
        SystemExit: With status 1 when either CSV file is not found.
    """
    print("\n[ACTION]: Running Feature Engineering (main_feat01)")
    data_dir = params_cfg["data_dir"]
    save_dir = params_cfg["save_dir"]
    verbose = params_cfg["verbose"]

    # Load the raw data.
    train_path = os.path.join(data_dir, 'train.csv')
    test_path = os.path.join(data_dir, 'test.csv')

    try:
        df_train = pd.read_csv(train_path)
        df_test = pd.read_csv(test_path)
    except FileNotFoundError:
        print(f"Error: train.csv or test.csv not found in {data_dir}")
        print("Please make sure the 'data_dir' path is correct.")
        # `exit()` is the interactive helper from `site` and reports status 0;
        # raise SystemExit directly with a failing status instead.
        raise SystemExit(1)

    if verbose:
        print("-"*10, "information", "-"*10)
        train_cols = set(df_train.columns)
        test_cols = set(df_test.columns)
        print(f'train-col: {train_cols}')
        print(f'test-col: {test_cols}')
        # This line prints the shared columns, so it is labelled accordingly.
        print("Intersection:", train_cols.intersection(test_cols))
        print("Difference:", train_cols.difference(test_cols))

    # Feature processing.
    print("\nProcessing training data...")
    df_train_feat, _ = preprocessing_feature_01(df_train, is_train=True, is_debug=verbose)
    print("\nProcessing test data...")
    # Forward the training-split Age median so the test split can be imputed
    # with the same statistic. A preprocessing function that does not use
    # `age_median` simply ignores it via **kwargs.
    df_test_feat, _ = preprocessing_feature_01(
        df_test, is_train=False, is_debug=verbose,
        age_median=df_train["Age"].median(),
    )

    # Prepare the arrays to save; drop the target column once and reuse it.
    train_features = df_train_feat.drop('Output', axis=1)
    y_train = df_train_feat['Output'].values
    X_train = train_features.values
    X_test = df_test_feat.values
    X_cols = train_features.columns.tolist()

    # Save to an .npz archive; make sure the target folder exists first.
    os.makedirs(save_dir, exist_ok=True)
    feat_save_path = os.path.join(save_dir, 'data.npz')
    np.savez(feat_save_path,
             X_train=X_train,
             y_train=y_train,
             X_test=X_test,
             X_cols=X_cols)

    print(f"\nFeatures saved successfully to: {feat_save_path}")

def run_training(params_cfg):
    """Train/evaluate all models on the saved features and print a summary table.

    Args:
        params_cfg: Mapping that must provide ``feat_path`` (path to the
            saved feature archive) and ``seed`` (random state passed on to
            training).

    Returns:
        None. Returns early, after printing an error, when the feature file
        does not exist or when training yields no results.
    """
    print("\n[ACTION]: Running Model Training (train)")
    feat_path = params_cfg["feat_path"]
    seed = params_cfg["seed"]

    if not os.path.exists(feat_path):
        print(f"Error: Feature file not found at {feat_path}")
        print("Please run the 'main_feat01' action first to generate features.")
        return

    # Run training and evaluation.
    training_results = train_and_evaluate(feat_path=feat_path, seed=seed)

    # train_and_evaluate returns None when the archive cannot be loaded;
    # without this guard the summary below fails with AttributeError.
    if not training_results:
        print("Error: Training produced no results; skipping summary.")
        return

    # Build the summary table.
    print("\n" + "="*20, "Final Training Summary", "="*20)

    summary_data = [
        {
            "Model": model_name,
            "Overall Accuracy": metrics['overall_accuracy'],
            "Overall AUC": metrics['overall_auc'],
            "Mean Fold AUC": metrics['mean_auc'],
            "Std Fold AUC": metrics['std_auc'],
        }
        for model_name, metrics in training_results.items()
    ]

    # Sort by Overall AUC, best model first.
    summary_df = pd.DataFrame(summary_data)
    summary_df = summary_df.sort_values(by="Overall AUC", ascending=False)

    # to_string keeps the console table aligned with fixed float precision.
    print(summary_df.to_string(index=False, float_format="%.4f"))

# --- Main Execution Block ---
if __name__ == "__main__":

    # --- STEP 1: Feature Engineering ---
    # Run this configuration first (swap it in for the STEP 2 block below):
    # params_cfg = {
    #     "action"  : "main_feat01",
    #     "seed"    : 42, # Set random seed
    #     "exp_dir" : os.path.abspath('./exps'), # relative path
    #     'exp_name': 'featbase_251028',
    #     "data_dir": os.path.abspath("./"), # assumes train.csv/test.csv sit in this folder
    #     "verbose" : True,
    # }

    # --- STEP 2: Model Training ---
    # Run second, AFTER step 1 has written the feature archive.
    params_cfg = {
        "action": "train",
        "feat_path": "./exps/featbase_251028/data.npz",  # archive saved by step 1
        "seed": 42,  # random seed
        "exp_dir": os.path.abspath('./exps'),
        'exp_name': 'trainbase_251028',
        "data_dir": os.path.abspath("./"),
        "verbose": True,
    }

    # The output folder is derived from the experiment root and name.
    params_cfg["save_dir"] = os.path.abspath(
        f'{params_cfg["exp_dir"]}/{params_cfg["exp_name"]}'
    )

    # Echo the effective configuration.
    for key, value in params_cfg.items():
        print(f'+ {key}: {value}')

    # Create the output folder if it does not exist yet.
    os.makedirs(params_cfg["save_dir"], exist_ok=True)

    # Map each supported action name to the function that performs it.
    action_handlers = {
        "main_feat01": run_feature_engineering,
        "train": run_training,
    }
    handler = action_handlers.get(params_cfg["action"])
    if handler is None:
        print(f"Error: Unknown action '{params_cfg['action']}' in params_cfg.")
    else:
        handler(params_cfg)

+ action: train
+ feat_path: ./exps/featbase_251028/data.npz
+ seed: 42
+ exp_dir: D:\ML\Titanic\Basic-of-Machine-learning\train\exps
+ exp_name: trainbase_251028
+ data_dir: D:\ML\Titanic\Basic-of-Machine-learning\train
+ verbose: True
+ save_dir: D:\ML\Titanic\Basic-of-Machine-learning\train\exps\trainbase_251028

[ACTION]: Running Model Training (train)
Loading features from: ./exps/featbase_251028/data.npz
Features loaded: ['Sex' 'Age' 'Fare' 'Pclass' 'SibSp' 'Parch' 'Cabin' 'Embarked' 'Surname']


--- Training LogisticRegression ---
  Fold 1 AUC: 0.8829
  Fold 2 AUC: 0.8520
  Fold 3 AUC: 0.8328
  Fold 4 AUC: 0.8334
  Fold 5 AUC: 0.8763
-> Mean AUC for LogisticRegression: 0.8555 +/- 0.0210

Overall Metrics for LogisticRegression:
  Overall AUC: 0.8550
  Overall Accuracy: 0.8013
  Overall Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       549
           1       0.75      0.72      0.73       342

    accura

# End