# Import Libraries

In [67]:
# %load_ext autoreload
# %reload_ext autoreload # This line is causing the error and can be removed.
# %autoreload 2 # This line is also causing an error and can be removed

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from IPython import display

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc, accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Data Load

In [68]:
# Load the raw train/test CSV files from the current working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# `params_cfg` is only created by a later cell of this notebook, so on a fresh
# kernel (Restart & Run All) it does not exist yet when this cell executes.
# Fall back to verbose output instead of failing with NameError.
_verbose = globals().get("params_cfg", {}).get("verbose", True)

if _verbose:
    train_cols = set(df_train.columns)
    test_cols = set(df_test.columns)
    print("-"*10, "information", "-"*10)
    print(f'train-col: {train_cols}')
    print(f'test-col: {test_cols}')
    # Columns present in both frames; this is a set intersection, so label it as such.
    print("Intersection:", train_cols.intersection(test_cols))
    # Columns that exist only in the training frame.
    print("Difference:", train_cols.difference(test_cols))

---------- information ----------
train-col: {'Pclass', 'Cabin', 'Ticket', 'Name', 'Survived', 'SibSp', 'Fare', 'Sex', 'Parch', 'Embarked', 'Age', 'PassengerId'}
test-col: {'Pclass', 'Cabin', 'Ticket', 'Name', 'SibSp', 'Fare', 'Sex', 'Parch', 'Embarked', 'Age', 'PassengerId'}
Union: {'Pclass', 'Cabin', 'Ticket', 'Name', 'SibSp', 'Fare', 'Sex', 'Parch', 'Embarked', 'Age', 'PassengerId'}
Difference: {'Survived'}


# Preprocessing

In [69]:
def preprocessing_feature_01(df_data, is_train = True, is_debug = True, **kwargs):
    """Encode a raw passenger table into a purely numeric feature frame.

    Args:
        df_data: DataFrame with the columns ``Sex``, ``Age``, ``Fare``,
            ``Pclass``, ``SibSp``, ``Parch``, ``Cabin``, ``Embarked`` and
            ``Name`` (plus ``Survived`` when ``is_train`` is true).
        is_train: When true, ``Survived`` is copied into an ``Output`` column.
        is_debug: When true, summary statistics of ``df_data`` are displayed
            and the function's local variables are exported into the module
            globals.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        A tuple ``(df_output, None)``. ``df_output`` has the columns, in this
        order: ``Sex``, ``Age``, ``Fare``, ``Pclass``, ``SibSp``, ``Parch``,
        ``Cabin``, ``Embarked``, ``Surname`` and, if ``is_train``, ``Output``.

    Raises:
        KeyError: If a ``Sex`` value, the first letter of a ``Cabin`` value,
            an ``Embarked`` value or the title token of a ``Name`` is not in
            the corresponding lookup table.
        IndexError: If a ``Name`` does not follow the ``"Last, Title ..."``
            layout that the title extraction relies on.
    """
    df_output = pd.DataFrame()

    # Sex: binary encoding
    cls_sex = {'female': 0, 'male' : 1}
    df_output["Sex"] = df_data["Sex"].apply(lambda x: cls_sex[x])
    # Age: missing values are replaced by the median of this very frame
    df_output["Age"] = df_data["Age"].fillna(df_data["Age"].median())
    # Fare: imputed the same way as Age; a missing Fare would otherwise
    # propagate NaN into the saved feature matrix.
    df_output["Fare"] = df_data["Fare"].fillna(df_data["Fare"].median())
    # Pclass, SibSp, Parch: copied through unchanged
    for name in ['Pclass', 'SibSp', 'Parch']:
        df_output[name] = df_data[name]
    # Cabin: only the first character is encoded; missing cabins map to 'Z' -> 0
    cls_cabin = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'T':8, 'Z':0}
    df_output["Cabin"] = df_data['Cabin'].apply(lambda x: cls_cabin['Z'] if pd.isna(x) else cls_cabin[x[0]])
    # Embarked: missing ports map to '0' -> 0
    cls_embarked = {'0': 0, 'C':1, 'Q':2, 'S':3}
    df_output["Embarked"] =  df_data['Embarked'].apply(lambda x: cls_embarked['0'] if pd.isna(x) else cls_embarked[x])
    # Surname: despite the column name, this encodes the token that follows
    # the comma in Name (e.g. 'Mr.', 'Mrs.'; 'the' covers "the Countess.").
    surnames = ['Capt.', 'Col.', 'Don.', 'Dr.', 'Jonkheer.', 'Lady.', 'Major.',
            'Master.', 'Miss.', 'Mlle.', 'Mme.', 'Mr.', 'Mrs.', 'Ms.', 'Rev.', 'Sir.', 'the', 'Dona.']
    cls_surnames = dict(zip(surnames, range(len(surnames))))
    df_output["Surname"] = df_data['Name'].apply(lambda x: cls_surnames[x.split(',')[1].split(' ')[1]])

    if is_train:
        df_output["Output"] = df_data["Survived"]

    if is_debug:
        # The label matches the number of rows actually displayed.
        print("head(5)")
        display.display(df_data.head(5))
        print("isna")
        display.display(df_data.isna().sum())
        # Sex: value counts
        print("sex")
        display.display(np.unique(df_data['Sex'], return_counts=True))
        # Age: number of missing values and the median used for imputation
        print(f'Age IsNa: {df_data["Age"].isna().sum()}')
        print(f"Age Median: {df_data['Age'].median()}")
        # Fare
        display.display(df_data["Fare"].describe())
        # Cabin: value counts, with missing cabins shown as 'Z0'
        print("-"*10, "Cabin")
        display.display(np.unique(df_data['Cabin'].apply(
            lambda x: 'Z0' if pd.isna(x) else x), return_counts=True))
        # Embarked: value counts, with missing ports shown as '0'
        display.display(
            np.unique(df_data['Embarked'].apply(lambda x: '0' if pd.isna(x) else x), return_counts=True)
        )
        # NOTE(review): exports every local (df_data, df_output, name, cls_*, ...)
        # into the module namespace and silently overwrites same-named globals.
        # Kept because other cells may rely on it; consider removing.
        globals().update(**locals())

    return df_output, None

# df_train = pd.read_csv(f'{data_dir}/train.csv')
# preprocessing_feature_01(df_train)

# Train

In [70]:
def train_and_evaluate(feat_path, seed, n_splits=5):
    """Cross-validate several classifiers on a saved feature archive.

    Reads ``X_train``, ``y_train``, ``X_test`` and ``X_cols`` from the ``.npz``
    archive at ``feat_path`` and evaluates LogisticRegression,
    KNeighborsClassifier, SVC and RandomForestClassifier with shuffled
    stratified K-fold cross-validation. Per-fold AUC and metrics pooled over
    all out-of-fold predictions are printed for each model.

    Args:
        feat_path: Path of the ``.npz`` feature archive.
        seed: ``random_state`` passed to the fold splitter and to every model
            that accepts one.
        n_splits: Number of cross-validation folds (defaults to 5).

    Returns:
        A dict mapping model name to a dict with the keys ``mean_auc``,
        ``std_auc``, ``overall_auc``, ``overall_accuracy``,
        ``overall_classification_report``, ``overall_confusion_matrix``,
        ``fpr``, ``tpr`` and ``thresholds``. Returns ``None`` (after printing
        the error) when the archive cannot be loaded.
    """
    print(f"Loading features from: {feat_path}")
    try:
        # np.load on an .npz returns a lazy NpzFile that keeps the file open;
        # the context manager releases the handle once the arrays are read.
        with np.load(feat_path) as data:
            X_train = data['X_train']
            y_train = data['y_train']
            # X_test is not used for cross-validation below; it is still read
            # so that an incomplete archive is reported here.
            X_test = data['X_test']
            X_cols = data['X_cols']
        print(f"Features loaded: {X_cols}")
    except Exception as e:
        print(f"Error loading {feat_path}: {e}")
        return None

    # Candidate models
    models = {
        'LogisticRegression': LogisticRegression(random_state=seed, max_iter=1000),
        'KNeighbors': KNeighborsClassifier(),
        'SVC': SVC(probability=True, random_state=seed),
        'RandomForest': RandomForestClassifier(random_state=seed)
    }

    # Stratified K-fold splitter shared by all models
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    print("\n" + "="*20, "Model Training", "="*20)

    results = {}
    for model_name, model in models.items():
        print(f"\n--- Training {model_name} ---")
        fold_aucs = []
        # Out-of-fold targets and predictions, pooled across folds
        all_y_val = []
        all_y_pred_proba = []
        all_y_pred = []

        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
            # Split into the training and validation part of this fold
            X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
            y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

            # Fit on the training part
            model.fit(X_train_fold, y_train_fold)

            # Score the held-out part (column 1 = probability of the positive class)
            y_pred_proba = model.predict_proba(X_val_fold)[:, 1]
            y_pred = model.predict(X_val_fold)

            fold_auc = roc_auc_score(y_val_fold, y_pred_proba)
            fold_aucs.append(fold_auc)

            all_y_val.extend(y_val_fold)
            all_y_pred_proba.extend(y_pred_proba)
            all_y_pred.extend(y_pred)

            print(f"  Fold {fold+1} AUC: {fold_auc:.4f}")

        mean_auc = np.mean(fold_aucs)
        std_auc = np.std(fold_aucs)
        print(f"-> Mean AUC for {model_name}: {mean_auc:.4f} +/- {std_auc:.4f}")

        # Metrics computed once over the pooled out-of-fold predictions
        overall_auc = roc_auc_score(all_y_val, all_y_pred_proba)
        overall_accuracy = accuracy_score(all_y_val, all_y_pred)
        overall_classification_report = classification_report(all_y_val, all_y_pred)
        overall_confusion_matrix = confusion_matrix(all_y_val, all_y_pred)
        fpr, tpr, thresholds = roc_curve(all_y_val, all_y_pred_proba)

        print(f"\nOverall Metrics for {model_name}:")
        print(f"  Overall AUC: {overall_auc:.4f}")
        print(f"  Overall Accuracy: {overall_accuracy:.4f}")
        print(f"  Overall Classification Report:\n{overall_classification_report}")
        print(f"  Overall Confusion Matrix:\n{overall_confusion_matrix}")

        results[model_name] = {
            "mean_auc": mean_auc,
            "std_auc": std_auc,
            "overall_auc": overall_auc,
            "overall_accuracy": overall_accuracy,
            "overall_classification_report": overall_classification_report,
            "overall_confusion_matrix": overall_confusion_matrix,
            "fpr": fpr,
            "tpr": tpr,
            "thresholds": thresholds
            }

    print("\n" + "="*20, "Training Complete", "="*20)
    return results

# Main

In [72]:
# --- Active configuration: feature engineering ---
params_cfg = dict(
    action="main_feat01",
    seed=42,  # random seed
    exp_dir=os.path.abspath('./exps'),  # resolved from a relative path
    exp_name='featbase_251028',
    data_dir=os.path.abspath("./"),  # train.csv / test.csv are expected in the working directory
    verbose=True,
)

# --- STEP 2: Model Training ---
# (Second run, AFTER step 1 has been executed.)
# (Comment out the params_cfg block above and uncomment this one.)
# params_cfg = {
#     "action"   : "train", # switch the action to "train"
#     "feat_path": "./exps/featbase_251028/data.npz", # path of the saved feature file
#     "seed"     : 42, # Set random seed
#     "exp_dir"  : os.path.abspath('./exps'),
#     'exp_name' : 'trainbase_251028',
#     "data_dir" : os.path.abspath("./"),
#     "verbose"  : True,
# }

# Derived entry: absolute output directory of this experiment, <exp_dir>/<exp_name>.
params_cfg["save_dir"] = os.path.abspath(
    f'{params_cfg["exp_dir"]}/{params_cfg["exp_name"]}'
)

# Echo the effective configuration, one "+ key: value" line per entry.
print("\n".join(f'+ {cfg_key}: {cfg_value}' for cfg_key, cfg_value in params_cfg.items()))

# Publish every configuration entry as a module-level variable
# (e.g. `action`, `seed`, `save_dir`).
globals().update(params_cfg)
# ----------------------------------------------------


# Main execution block: dispatches on `action` ("main_feat01" or "train").
if __name__ == "__main__":

    # Create the output directory if it does not exist yet
    os.makedirs(save_dir, exist_ok=True)

    if action == "main_feat01":
        print("\n[ACTION]: Running Feature Engineering (main_feat01)")

        # Locate the raw data
        train_path = os.path.join(data_dir, 'train.csv')
        test_path = os.path.join(data_dir, 'test.csv')

        # Track the outcome of THIS load explicitly. Checking whether a
        # `df_train` name exists is not enough: an earlier cell may already
        # have defined it, and stale frames would then be processed silently.
        try:
            # Tuple assignment: either both frames are rebound or neither is.
            df_train, df_test = pd.read_csv(train_path), pd.read_csv(test_path)
        except FileNotFoundError:
            print(f"Error: train.csv or test.csv not found in {data_dir}")
            print("Please make sure the 'data_dir' path is correct.")
            data_loaded = False
        else:
            data_loaded = True

        if data_loaded:
            if verbose:
                train_cols = set(df_train.columns)
                test_cols = set(df_test.columns)
                print("-"*10, "information", "-"*10)
                print(f'train-col: {train_cols}')
                print(f'test-col: {test_cols}')
                # Columns shared by both frames; this is a set intersection.
                print("Intersection:", train_cols.intersection(test_cols))
                print("Difference:", train_cols.difference(test_cols))

            # Build the feature frames
            print("\nProcessing training data...")
            df_train_feat, _ = preprocessing_feature_01(df_train, is_train=True, is_debug=verbose)
            print("\nProcessing test data...")
            df_test_feat, _ = preprocessing_feature_01(df_test, is_train=False, is_debug=verbose)

            # Separate the target from the predictors
            train_predictors = df_train_feat.drop('Output', axis=1)
            y_train = df_train_feat['Output'].values
            X_train = train_predictors.values
            X_test = df_test_feat.values
            X_cols = train_predictors.columns.tolist()

            # Persist everything in a single .npz archive
            feat_save_path = os.path.join(save_dir, 'data.npz')
            np.savez(feat_save_path,
                     X_train=X_train,
                     y_train=y_train,
                     X_test=X_test,
                     X_cols=X_cols)

            print(f"\nFeatures saved successfully to: {feat_save_path}")

    elif action == "train":
        print("\n[ACTION]: Running Model Training (train)")

        if "feat_path" not in params_cfg:
            print("Error: 'feat_path' not defined in params_cfg for 'train' action.")
        elif not os.path.exists(feat_path):
            print(f"Error: Feature file not found at {feat_path}")
            print("Please run the 'main_feat01' action first to generate features.")
        else:
            # Train and evaluate all models
            training_results = train_and_evaluate(feat_path=feat_path, seed=seed)

            if not training_results:
                # train_and_evaluate returns None when the feature file cannot
                # be loaded; it has already printed the reason.
                print("Error: training produced no results; skipping the summary.")
            else:
                print("\n" + "="*20, "Final Training Summary", "="*20)

                # One summary row per model
                summary_data = [
                    {
                        "Model": model_name,
                        "Overall Accuracy": metrics['overall_accuracy'],
                        "Overall AUC": metrics['overall_auc'],
                        "Mean Fold AUC": metrics['mean_auc'],
                        "Std Fold AUC": metrics['std_auc'],
                    }
                    for model_name, metrics in training_results.items()
                ]

                # Best model first (descending Overall AUC)
                summary_df = pd.DataFrame(summary_data)
                summary_df = summary_df.sort_values(by="Overall AUC", ascending=False)

                # to_string keeps the table aligned in plain console output
                print(summary_df.to_string(index=False, float_format="%.4f"))

    else:
        print(f"Error: Unknown action '{action}' in params_cfg.")

+ action: main_feat01
+ seed: 42
+ exp_dir: /content/exps
+ exp_name: featbase_251028
+ data_dir: /content
+ verbose: True
+ save_dir: /content/exps/featbase_251028

[ACTION]: Running Feature Engineering (main_feat01)
---------- information ----------
train-col: {'Pclass', 'Cabin', 'Ticket', 'Name', 'Survived', 'SibSp', 'Fare', 'Sex', 'Parch', 'Embarked', 'Age', 'PassengerId'}
test-col: {'Pclass', 'Cabin', 'Ticket', 'Name', 'SibSp', 'Fare', 'Sex', 'Parch', 'Embarked', 'Age', 'PassengerId'}
Union: {'Pclass', 'Cabin', 'Ticket', 'Name', 'SibSp', 'Fare', 'Sex', 'Parch', 'Embarked', 'Age', 'PassengerId'}
Difference: {'Survived'}

Processing training data...
head(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


tail(10)
isna


Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


sex


(array(['female', 'male'], dtype=object), array([314, 577]))

Age IsNa: 177
Age Median: 28.0


Unnamed: 0,Fare
count,891.0
mean,32.204208
std,49.693429
min,0.0
25%,7.9104
50%,14.4542
75%,31.0
max,512.3292


-*10 Cabin


(array(['A10', 'A14', 'A16', 'A19', 'A20', 'A23', 'A24', 'A26', 'A31',
        'A32', 'A34', 'A36', 'A5', 'A6', 'A7', 'B101', 'B102', 'B18',
        'B19', 'B20', 'B22', 'B28', 'B3', 'B30', 'B35', 'B37', 'B38',
        'B39', 'B4', 'B41', 'B42', 'B49', 'B5', 'B50', 'B51 B53 B55',
        'B57 B59 B63 B66', 'B58 B60', 'B69', 'B71', 'B73', 'B77', 'B78',
        'B79', 'B80', 'B82 B84', 'B86', 'B94', 'B96 B98', 'C101', 'C103',
        'C104', 'C106', 'C110', 'C111', 'C118', 'C123', 'C124', 'C125',
        'C126', 'C128', 'C148', 'C2', 'C22 C26', 'C23 C25 C27', 'C30',
        'C32', 'C45', 'C46', 'C47', 'C49', 'C50', 'C52', 'C54', 'C62 C64',
        'C65', 'C68', 'C7', 'C70', 'C78', 'C82', 'C83', 'C85', 'C86',
        'C87', 'C90', 'C91', 'C92', 'C93', 'C95', 'C99', 'D', 'D10 D12',
        'D11', 'D15', 'D17', 'D19', 'D20', 'D21', 'D26', 'D28', 'D30',
        'D33', 'D35', 'D36', 'D37', 'D45', 'D46', 'D47', 'D48', 'D49',
        'D50', 'D56', 'D6', 'D7', 'D9', 'E10', 'E101', 'E12', 'E121',

(array(['0', 'C', 'Q', 'S'], dtype=object), array([  2, 168,  77, 644]))


Processing test data...
head(10)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


tail(10)
isna


Unnamed: 0,0
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,86
SibSp,0
Parch,0
Ticket,0
Fare,1
Cabin,327


sex


(array(['female', 'male'], dtype=object), array([152, 266]))

Age IsNa: 86
Age Median: 27.0


Unnamed: 0,Fare
count,417.0
mean,35.627188
std,55.907576
min,0.0
25%,7.8958
50%,14.4542
75%,31.5
max,512.3292


-*10 Cabin


(array(['A11', 'A18', 'A21', 'A29', 'A34', 'A9', 'B10', 'B11', 'B24',
        'B26', 'B36', 'B41', 'B45', 'B51 B53 B55', 'B52 B54 B56',
        'B57 B59 B63 B66', 'B58 B60', 'B61', 'B69', 'B71', 'B78', 'C101',
        'C105', 'C106', 'C116', 'C130', 'C132', 'C22 C26', 'C23 C25 C27',
        'C28', 'C31', 'C32', 'C39', 'C46', 'C51', 'C53', 'C54', 'C55 C57',
        'C6', 'C62 C64', 'C7', 'C78', 'C80', 'C85', 'C86', 'C89', 'C97',
        'D', 'D10 D12', 'D15', 'D19', 'D21', 'D22', 'D28', 'D30', 'D34',
        'D37', 'D38', 'D40', 'D43', 'E31', 'E34', 'E39 E41', 'E45', 'E46',
        'E50', 'E52', 'E60', 'F', 'F E46', 'F E57', 'F G63', 'F2', 'F33',
        'F4', 'G6', 'Z0'], dtype=object),
 array([  1,   1,   1,   1,   2,   1,   1,   1,   1,   1,   1,   1,   2,
          1,   1,   3,   1,   1,   1,   1,   1,   2,   1,   1,   2,   1,
          1,   1,   2,   1,   2,   1,   1,   1,   1,   1,   1,   2,   2,
          1,   1,   2,   2,   1,   1,   2,   1,   1,   1,   1,   1,   1,
          1,

(array(['C', 'Q', 'S'], dtype=object), array([102,  46, 270]))


Features saved successfully to: /content/exps/featbase_251028/data.npz


# End