# Import Libraries

In [222]:
# %load_ext autoreload
# %reload_ext autoreload # This line is causing the error and can be removed.
# %autoreload 2 # This line is also causing an error and can be removed

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from IPython import display

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc, accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Data Load

In [223]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')



# Preprocessing and Fearture Engineering

## Tạo cột FamilySize = SibSp + Parch + 1

In [224]:
def create_family_size(df):
    return df["SibSp"] + df["Parch"] + 1

## Mã hóa giới tính: female=0, male=1

In [225]:
def encode_sex(df):
    cls_sex = {'female': 0, 'male': 1}
    return df["Sex"].map(cls_sex)

## Điền giá trị thiếu của cột Age bằng median

In [226]:
def fill_age(df):
    return df["Age"].fillna(df["Age"].median())

## Sao chép các cột cơ bản giữ nguyên

In [227]:
def copy_basic_features(df, feature_list):
    return df[feature_list].copy()

## Mã hóa Cabin: lấy ký tự đầu tiên và gán giá trị số

In [228]:
def encode_cabin(df):
    cls_cabin = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'T':8, 'Z':0}
    return df['Cabin'].apply(lambda x: cls_cabin['Z'] if pd.isna(x) else cls_cabin.get(x[0], cls_cabin['Z']))


## Mã hóa Embarked: C=1, Q=2, S=3, NaN=0

In [229]:
def encode_embarked(df):
    cls_embarked = {'0': 0, 'C':1, 'Q':2, 'S':3}
    return df['Embarked'].apply(lambda x: cls_embarked['0'] if pd.isna(x) else cls_embarked.get(x, 0))

## Trích xuất danh xưng (title) từ cột Name

In [230]:
def extract_surname(df):
    surnames = [
        'Capt.', 'Col.', 'Don.', 'Dr.', 'Jonkheer.', 'Lady.', 'Major.',
        'Master.', 'Miss.', 'Mlle.', 'Mme.', 'Mr.', 'Mrs.', 'Ms.',
        'Rev.', 'Sir.', 'the', 'Dona.'
    ]
    cls_surnames = dict(zip(surnames, range(len(surnames))))
    def get_title(name):
        try:
            title = name.split(',')[1].split(' ')[1]
            return cls_surnames.get(title, -1)
        except Exception:
            return -1

    return df['Name'].apply(get_title)

## Hiển thị thông tin thống kê để kiểm tra dữ liệu

In [231]:
def debug_info(df):

    print("=== DEBUG MODE ===")
    print("🔹 Dữ liệu mẫu (head):")
    display.display(df.head(5))

    print("🔹 Dữ liệu thiếu:")
    display.display(df.isna().sum())

    print("🔹 Thống kê Age & Fare:")
    print(f"  Age missing: {df['Age'].isna().sum()}")
    print(f"  Age median: {df['Age'].median()}")
    display.display(df["Fare"].describe())

    print("🔹 Thống kê Cabin:")
    display.display(np.unique(
        df['Cabin'].apply(lambda x: 'Z0' if pd.isna(x) else x),
        return_counts=True
    ))

    print("🔹 Thống kê Embarked:")
    display.display(np.unique(
        df['Embarked'].apply(lambda x: '0' if pd.isna(x) else x),
        return_counts=True
    ))

## Tiền xử lý dữ liệu đầu vào và lựa chọn đặc trưng cho mô hình học máy .

In [232]:
def preprocessing_feature_01(df_data, is_train = True, is_debug = True, **kwargs):

    df_output = pd.DataFrame()

    df_output["FamilySize"] = create_family_size(df_data)
    df_output["Sex"] = encode_sex(df_data)
    df_output["Age"] = fill_age(df_data)

    basic_features = copy_basic_features(df_data, ['Fare', 'Pclass'])
    df_output = pd.concat([df_output, basic_features], axis=1)

    df_output["Cabin"] = encode_cabin(df_data)
    df_output["Embarked"] = encode_embarked(df_data)
    df_output["Surname"] = extract_surname(df_data)

    # --- Nếu là tập train, thêm nhãn đầu ra ---
    if is_train:
        df_output["Output"] = df_data["Survived"]

    # --- In thông tin debug ---
    if is_debug:
        debug_info(df_data)

    return df_output, None

# df_train = pd.read_csv(f'{data_dir}/train.csv')

Hàm này tổng hợp các bước làm sạch và chuyển đổi dữ liệu,
giúp dữ liệu sẵn sàng cho việc huấn luyện mô hình.

----------------------------------------------------------
🔹 Các bước xử lý:
1. Tạo đặc trưng mới `FamilySize` = SibSp + Parch + 1
2. Mã hóa giới tính (`Sex`) thành số: female=0, male=1
3. Điền giá trị thiếu cho `Age` bằng median (trung vị)
4. Giữ lại các cột cơ bản: Fare, Pclass, SibSp, Parch
5. Mã hóa Cabin → ký tự đầu tiên (A–T) → số; NaN → Z=0
6. Mã hóa cổng lên tàu (`Embarked`): C=1, Q=2, S=3, NaN=0
7. Trích xuất danh xưng (`Surname`) từ cột Name
8. Nếu là tập huấn luyện → thêm cột `Output` (Survived)
9. Nếu bật debug → in ra thống kê dữ liệu

----------------------------------------------------------
Tham số:
- df_data : pandas.DataFrame  
    Dữ liệu gốc chứa các cột Sex, Age, Fare, Pclass, Cabin, Embarked, Name, ...
- is_train : bool, mặc định = True  
    Nếu True, thêm cột "Output" = Survived.
- is_debug : bool, mặc định = True  
    Nếu True, hiển thị thông tin kiểm tra dữ liệu.
- **kwargs : dict  
    Tham số mở rộng cho tương lai (hiện chưa sử dụng).

----------------------------------------------------------
Giá trị trả về:
- df_output : pandas.DataFrame  
    Dữ liệu đã tiền xử lý, sẵn sàng cho mô hình.
- None : placeholder để tương thích pipeline.

----------------------------------------------------------

# Train

Tải dữ liệu đặc trưng và huấn luyện nhiều mô hình, hiển thị thêm các metrics.

In [233]:
def load_features(feat_path):
    print(f" Loading features from: {feat_path}")
    try:
        data = np.load(feat_path)
        X_train = data['X_train']
        y_train = data['y_train']
        X_test = data['X_test']
        X_cols = data['X_cols']
        print(f" Features loaded: {list(X_cols)}")
        return X_train, y_train, X_test, X_cols
    except Exception as e:
        print(f" Error loading {feat_path}: {e}")
        return None


In [234]:
models = {
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'KNeighbors': KNeighborsClassifier(),
    'SVC': SVC(probability=True, random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42)
}

In [235]:
def train_and_evaluate_model_with_kfold(feat_path, seed=42):
    X_train, y_train, X_test, X_cols = load_features(feat_path)
    # Thiết lập Stratified K-Fold
    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    results = {}
    for model_name, model in models.items():
            print(f"\n--- Training {model_name} ---")
            fold_aucs = []
            all_y_val = []
            all_y_pred_proba = []
            all_y_pred = []


            for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
                # Phân chia dữ liệu
                X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
                y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

                # Huấn luyện
                model.fit(X_train_fold, y_train_fold)

                # Đánh giá
                y_pred_proba = model.predict_proba(X_val_fold)[:, 1]
                y_pred = model.predict(X_val_fold)

                fold_auc = roc_auc_score(y_val_fold, y_pred_proba)
                fold_aucs.append(fold_auc)

                all_y_val.extend(y_val_fold)
                all_y_pred_proba.extend(y_pred_proba)
                all_y_pred.extend(y_pred)


                print(f"  Fold {fold+1} AUC: {fold_auc:.4f}")
                # print(f"  Fold {fold+1} Accuracy: {accuracy_score(y_val_fold, y_pred):.4f}")
                # print(f"  Fold {fold+1} Classification Report:\n{classification_report(y_val_fold, y_pred)}")
                # print(f"  Fold {fold+1} Confusion Matrix:\n{confusion_matrix(y_val_fold, y_pred)}")


            mean_auc = np.mean(fold_aucs)
            std_auc = np.std(fold_aucs)
            print(f"-> Mean AUC for {model_name}: {mean_auc:.4f} +/- {std_auc:.4f}")

            # Calculate and print overall metrics across all folds
            overall_auc = roc_auc_score(all_y_val, all_y_pred_proba)
            overall_accuracy = accuracy_score(all_y_val, all_y_pred)
            overall_classification_report = classification_report(all_y_val, all_y_pred)
            overall_confusion_matrix = confusion_matrix(all_y_val, all_y_pred)
            fpr, tpr, thresholds = roc_curve(all_y_val, all_y_pred_proba)

            print(f"\nOverall Metrics for {model_name}:")
            print(f"  Overall AUC: {overall_auc:.4f}")
            print(f"  Overall Accuracy: {overall_accuracy:.4f}")
            print(f"  Overall Classification Report:\n{overall_classification_report}")
            print(f"  Overall Confusion Matrix:\n{overall_confusion_matrix}")


            results[model_name] = {
                "mean_auc": mean_auc,
                "std_auc": std_auc,
                "overall_auc": overall_auc,
                "overall_accuracy": overall_accuracy,
                "overall_classification_report": overall_classification_report,
                "overall_confusion_matrix": overall_confusion_matrix,
                "fpr": fpr,
                "tpr": tpr,
                "thresholds": thresholds
                }
    print("\n" + "="*20, "Training Complete", "="*20)
    return results


In [236]:
def train_and_evaluate_no_kfold(feat_path, seed):

    X_train, y_train, X_test, X_cols = load_features(feat_path)

    results = {}
    for model_name, model in models.items():
        print(f"\n--- Training {model_name} ---")

        # --- Huấn luyện mô hình ---
        model.fit(X_train, y_train)

        # --- Dự đoán ---
        y_pred_proba = model.predict_proba(X_train)[:, 1]
        y_pred = model.predict(X_train)

        # --- Tính toán các chỉ số đánh giá ---
        auc = roc_auc_score(y_train, y_pred_proba)
        accuracy = accuracy_score(y_train, y_pred)
        report = classification_report(y_train, y_pred)
        conf_matrix = confusion_matrix(y_train, y_pred)
        fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba)

        # --- In kết quả ---
        print(f"{model_name} Results:")
        print(f"AUC: {auc:.4f}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Classification Report:\n{report}")
        print(f"Confusion Matrix:\n{conf_matrix}")

        # --- Lưu kết quả ---
        results[model_name] = {
            "auc": auc,
            "accuracy": accuracy,
            "classification_report": report,
            "confusion_matrix": conf_matrix,
            "fpr": fpr,
            "tpr": tpr,
            "thresholds": thresholds
        }

    print("\n" + "="*20, "Training Complete", "="*20)
    return results

# Main

In thông tin cơ bản về các cột trong train/test.

In [237]:
def print_data_info(df_train, df_test):

    print("-" * 10, "Dataset Information", "-" * 10)
    print(f"Train columns: {set(df_train.columns)}")
    print(f"Test columns:  {set(df_test.columns)}")
    print("Union:", set(df_train.columns).intersection(set(df_test.columns)))
    print("Difference:", set(df_train.columns).difference(set(df_test.columns)))


Tiền xử lý dữ liệu train/test bằng hàm preprocessing_feature_01.

In [238]:
def process_features(df_train, df_test, verbose=True):
    print("\n Processing training data...")
    df_train_feat, _ = preprocessing_feature_01(df_train, is_train=True, is_debug=verbose)

    print(" Processing test data...")
    df_test_feat, _ = preprocessing_feature_01(df_test, is_train=False, is_debug=verbose)

    y_train = df_train_feat['Output'].values
    X_train = df_train_feat.drop('Output', axis=1).values
    X_test = df_test_feat.values
    X_cols = df_train_feat.drop('Output', axis=1).columns.tolist()

    return X_train, y_train, X_test, X_cols


Lưu dữ liệu đặc trưng đã xử lý vào file .npz.

In [239]:
def save_features(X_train, y_train, X_test, X_cols, save_dir):
    os.makedirs(save_dir, exist_ok=True)
    feat_save_path = os.path.join(save_dir, 'data.npz')

    np.savez(
        feat_save_path,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        X_cols=X_cols
    )

    print(f" Features saved successfully to: {feat_save_path}")


Hàm chính điều phối toàn bộ pipeline Feature Engineering.

In [240]:
def run_feature_engineering(params_cfg):

    data_dir = params_cfg["data_dir"]
    save_dir = params_cfg["save_dir"]
    verbose = params_cfg.get("verbose", True)

    if verbose:
        print_data_info(df_train, df_test)

    X_train, y_train, X_test, X_cols = process_features(df_train, df_test, verbose=verbose)

    # --- Step 4: Lưu dữ liệu ---
    save_features(X_train, y_train, X_test, X_cols, save_dir)


Chạy huấn luyện mô hình sau khi đã có file .npz.

In [241]:
def run_training(params_cfg):

    print("\n[ACTION]: Running Model Training (train)")

    feat_path = params_cfg["feat_path"]
    seed = params_cfg["seed"]

    if not os.path.exists(feat_path):
        print(f"❌ Error: Feature file not found at {feat_path}")
        print("👉 Please run the 'main_feat01' action first to generate features.")
        return

    # Gọi hàm train model
    training_results_kfold = train_and_evaluate_model_with_kfold(feat_path=feat_path, seed=seed)
    training_results_no_kfold = train_and_evaluate_no_kfold(feat_path=feat_path, seed=seed)

    # --- Tóm tắt kết quả ---
    print("\n" + "=" * 20, "📊 Final Training Summary", "=" * 20)

    # --- Tóm tắt kết quả Cross-Validation ---
    print("\n" + "=" * 20, "📊 Final Training Summary (Cross-Validation)", "=" * 20)
    summary_data_kfold = []
    for name, m in training_results_kfold.items():
        # Extract F1-score for class 1 (positive class, Survived=1) from the classification report
        report_lines = m['overall_classification_report'].split('\n')
        f1_score_line = [line for line in report_lines if '1       ' in line]
        f1_score = float(f1_score_line[0].split()[3]) if f1_score_line else np.nan

        summary_data_kfold.append({
            "Model": name,
            "Overall Accuracy": m['overall_accuracy'],
            "Overall AUC": m['overall_auc'],
            "Overall F1-Score (Survived=1)": f1_score,
            "Mean Fold AUC": m.get('mean_auc', np.nan),
            "Std Fold AUC": m.get('std_auc', np.nan)
        })

    summary_df_kfold = pd.DataFrame(summary_data_kfold).sort_values(by="Overall AUC", ascending=False)
    print("Cross-Validation Summary:")
    display.display(summary_df_kfold)


    # --- Tóm tắt kết quả Train-Test Split ---
    print("\n" + "=" * 20, "📊 Final Training Summary (Train-Test Split)", "=" * 20)
    summary_data_no_kfold = []
    for name, m in training_results_no_kfold.items():
        # Extract F1-score for class 1 (positive class, Survived=1) from the classification report
        report_lines = m['classification_report'].split('\n')
        f1_score_line = [line for line in report_lines if '1       ' in line]
        f1_score = float(f1_score_line[0].split()[3]) if f1_score_line else np.nan

        summary_data_no_kfold.append({
            "Model": name,
            "Accuracy": m['accuracy'],
            "AUC": m['auc'],
            "F1-Score (Survived=1)": f1_score
        })

    summary_df_no_kfold = pd.DataFrame(summary_data_no_kfold).sort_values(by="AUC", ascending=False)
    print("Train-Test Split Summary (on Training Data):")
    display.display(summary_df_no_kfold)

main - khối thực thi

In [242]:
if __name__ == "__main__":

    params_cfg = {
        "action"   : "train",  # hoặc "train"
        "seed"     : 42,
        "exp_dir"  : os.path.abspath('./exps'),
        "exp_name" : "featbase_251028",
        "data_dir" : os.path.abspath("./"),  # train.csv/test.csv
        "verbose"  : True,
    }

    params_cfg.update({
        "save_dir": os.path.join(params_cfg["exp_dir"], params_cfg["exp_name"]),
        "feat_path": os.path.join(params_cfg["exp_dir"], params_cfg["exp_name"], "data.npz")
    })

    os.makedirs(params_cfg["save_dir"], exist_ok=True)

    for k, v in params_cfg.items():
        print(f"+ {k}: {v}")

    if params_cfg["action"] == "main_feat01":
        run_feature_engineering(params_cfg)
    elif params_cfg["action"] == "train":
        run_training(params_cfg)
    else:
        print(f"❌ Unknown action '{params_cfg['action']}'")

+ action: train
+ seed: 42
+ exp_dir: /content/exps
+ exp_name: featbase_251028
+ data_dir: /content
+ verbose: True
+ save_dir: /content/exps/featbase_251028
+ feat_path: /content/exps/featbase_251028/data.npz

[ACTION]: Running Model Training (train)
 Loading features from: /content/exps/featbase_251028/data.npz
 Features loaded: [np.str_('FamilySize'), np.str_('Sex'), np.str_('Age'), np.str_('Fare'), np.str_('Pclass'), np.str_('Cabin'), np.str_('Embarked'), np.str_('Surname')]

--- Training LogisticRegression ---
  Fold 1 AUC: 0.8881
  Fold 2 AUC: 0.8501
  Fold 3 AUC: 0.8338
  Fold 4 AUC: 0.8294
  Fold 5 AUC: 0.8736
-> Mean AUC for LogisticRegression: 0.8550 +/- 0.0227

Overall Metrics for LogisticRegression:
  Overall AUC: 0.8539
  Overall Accuracy: 0.8002
  Overall Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       549
           1       0.75      0.71      0.73       342

    accuracy                    

Unnamed: 0,Model,Overall Accuracy,Overall AUC,Overall F1-Score (Survived=1),Mean Fold AUC,Std Fold AUC
3,RandomForest,0.814815,0.87086,0.75,0.872328,0.023752
0,LogisticRegression,0.800224,0.853926,0.73,0.855,0.022674
1,KNeighbors,0.725028,0.766511,0.61,0.764493,0.022681
2,SVC,0.681257,0.730773,0.44,0.746081,0.026087



Train-Test Split Summary (on Training Data):


Unnamed: 0,Model,Accuracy,AUC,F1-Score (Survived=1)
3,RandomForest,0.984287,0.997806,0.98
1,KNeighbors,0.817059,0.895453,0.75
0,LogisticRegression,0.813692,0.859223,0.75
2,SVC,0.687991,0.760585,0.44


# End