# Titanic - Logistic Regression Baseline

## Trainning model

In [3]:
from os.path import join
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

# === Load dữ liệu ===
path_dir = join("..", "..", "..")
input_dir = join(path_dir, "data", "feature_engineered", "baseline")

df_train = pd.read_csv(join(input_dir, "baseline_engineered_train.csv"))
df_test = pd.read_csv(join(input_dir, "baseline_engineered_test.csv"))

X = df_train.drop(['Survived'], axis=1)
y = df_train['Survived']

# === KFold ===
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies, precisions, recalls, f1s = [], [], [], []

fold_index = 1
for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # === Mô hình Logistic Regression ===
    model = LogisticRegression(
        max_iter=1000,     # tăng để đảm bảo hội tụ
        solver='lbfgs',    # ổn định cho dữ liệu nhỏ
        random_state=42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # === Tính metric ===
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1s.append(f1)

    print(f"Fold {fold_index} - Acc: {acc:.4f} | F1: {f1:.4f}")
    fold_index += 1

# === Mean và Std ===
mean_acc = np.mean(accuracies)
mean_prec = np.mean(precisions)
mean_rec = np.mean(recalls)
mean_f1 = np.mean(f1s)
std_acc = np.std(accuracies)

print("\n==== Mean metrics ====")
print(f"Accuracy: {mean_acc:.4f}")
print(f"Precision: {mean_prec:.4f}")
print(f"Recall: {mean_rec:.4f}")
print(f"F1-score: {mean_f1:.4f}")
print(f"Std (Accuracy): {std_acc:.4f}")


Fold 1 - Acc: 0.8101 | F1: 0.7639
Fold 2 - Acc: 0.7865 | F1: 0.6935
Fold 3 - Acc: 0.8427 | F1: 0.7971
Fold 4 - Acc: 0.7697 | F1: 0.6822
Fold 5 - Acc: 0.7865 | F1: 0.7077

==== Mean metrics ====
Accuracy: 0.7991
Precision: 0.7510
Recall: 0.7090
F1-score: 0.7289
Std (Accuracy): 0.0253


## Save model into logger and dump model to pickle file

In [4]:
import os
import sys
import joblib
import pandas as pd
import numpy as np
from os.path import join
from sklearn.linear_model import LogisticRegression

# === Load dữ liệu ===
path_dir = join("..", "..", "..")
input_dir = join(path_dir, "data", "feature_engineered", "familySize")

df_train = pd.read_csv(join(input_dir, "familySize_engineered_train.csv"))
df_test = pd.read_csv(join(input_dir, "familySize_engineered_test.csv"))

X = df_train.drop(['Survived'], axis=1)
y = df_train['Survived']

# === Chuẩn hóa dữ liệu (giúp Logistic Regression hội tụ ổn định hơn) ===
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test_scaled = scaler.transform(df_test.drop(columns=['Survived'], errors='ignore'))

# === KFold ===
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies, precisions, recalls, f1s = [], [], [], []

fold_index = 1
for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # === Mô hình Logistic Regression ===
    model = LogisticRegression(
        max_iter=1000,
        solver='lbfgs',
        random_state=42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # === Metrics ===
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1s.append(f1)

    print(f"Fold {fold_index} - Acc: {acc:.4f} | F1: {f1:.4f}")
    fold_index += 1

# === Mean & Std ===
mean_acc = np.mean(accuracies)
mean_prec = np.mean(precisions)
mean_rec = np.mean(recalls)
mean_f1 = np.mean(f1s)
std_acc = np.std(accuracies)

print("\n==== Mean metrics ====")
print(f"Accuracy: {mean_acc:.4f}")
print(f"Precision: {mean_prec:.4f}")
print(f"Recall: {mean_rec:.4f}")
print(f"F1-score: {mean_f1:.4f}")
print(f"Std (Accuracy): {std_acc:.4f}")

# === Import log_experiment ===
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..", "..", "..")))
from log.experiment_logger import log_experiment

# === Ghi log kết quả vào CSV ===
log_path = join(path_dir, "log", "experiment_log.csv")
log_experiment(
    output_path=log_path,
    model_name="LogisticRegression",
    feature_name="familySize_engineered",
    params={
        "solver": "lbfgs",
        "max_iter": 1000,
        "random_state": 42
    },
    kfold=5,
    f1=mean_f1,
    acc=mean_acc,
    rec=mean_rec,
    prec=mean_prec,
    std=std_acc,
    author="Thang"
)

# === Huấn luyện lại trên toàn bộ dữ liệu train ===
final_model = LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
    random_state=42
)
final_model.fit(X, y)

# === Lưu model ===
model_dir = join(path_dir, "notebook", "model", "logistic_regression", "Model Pickles")
os.makedirs(model_dir, exist_ok=True)
model_path = join(model_dir, "lr_familySize.pkl")
joblib.dump((final_model, scaler), model_path)  # lưu luôn scaler để predict sau
print(f"✅ Model saved to {model_path}")

# === Tạo file submission ===
y_test_pred = final_model.predict(X_test_scaled)

submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],  # phải có trong test
    'Survived': y_test_pred
})

sub_dir = join(path_dir, "notebook", "model", "logistic_regression", "submissions")
os.makedirs(sub_dir, exist_ok=True)
submission_path = join(sub_dir, "submission_lr_familySize.csv")
submission.to_csv(submission_path, index=False)
print(f"📤 Submission file saved to {submission_path}")


Fold 1 - Acc: 0.8045 | F1: 0.7552
Fold 2 - Acc: 0.7865 | F1: 0.6885
Fold 3 - Acc: 0.8371 | F1: 0.7883
Fold 4 - Acc: 0.7584 | F1: 0.6614
Fold 5 - Acc: 0.7865 | F1: 0.7121

==== Mean metrics ====
Accuracy: 0.7946
Precision: 0.7484
Recall: 0.6975
F1-score: 0.7211
Std (Accuracy): 0.0258
Logged experiment to ..\..\..\log\experiment_log.csv
✅ Model saved to ..\..\..\notebook\model\logistic_regression\Model Pickles\lr_familySize.pkl
📤 Submission file saved to ..\..\..\notebook\model\logistic_regression\submissions\submission_lr_familySize.csv


# The end