# Titanic - Random Forest Classifier

## Trainning model

In [11]:
from os.path import join
import sys
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tuningModel import tune_random_forest

# === Load dữ liệu ===
path_dir = join("..", "..", "..")
input_dir = join(path_dir, "data", "feature_engineered", "sexAndPclass")

df_train = pd.read_csv(join(input_dir, "sexAndPclass_engineered_train.csv"))
df_test = pd.read_csv(join(input_dir, "sexAndPclass_engineered_test.csv"))

In [12]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = list()
train_accuracies = list()
max_attributes = len(list(df_test))
depth_range = range(1, max_attributes + 1)

for depth in depth_range:
    fold_accuracy = []
    fold_train_accuracy = []
    tree_model = RandomForestClassifier(max_depth=depth, n_estimators=100, random_state=42)
    for train_fold, valid_fold in kf.split(df_train):
        f_train = df_train.loc[train_fold] 
        f_valid = df_train.loc[valid_fold] 

        X_train = f_train.drop(['Survived'], axis=1)
        y_train = f_train["Survived"]
        X_valid = f_valid.drop(['Survived'], axis=1)
        y_valid = f_valid["Survived"]

        # Fit the model
        model = tree_model.fit(X_train, y_train) 
        
        # Calculate and store validation accuracy
        valid_acc = model.score(X_valid, y_valid)
        fold_accuracy.append(valid_acc)

        # Calculate and store training accuracy
        train_acc = model.score(X_train, y_train)
        fold_train_accuracy.append(train_acc)

    avg_valid_acc = sum(fold_accuracy) / len(fold_accuracy)
    accuracies.append(avg_valid_acc)
    
    avg_train_acc = sum(fold_train_accuracy) / len(fold_train_accuracy)
    train_accuracies.append(avg_train_acc)
    # print("Accuracy per fold: ", fold_accuracy, "\n")
    # print("Average accuracy: ", avg)
    # print("\n")
    
df = pd.DataFrame({
    "Max Depth": depth_range, 
    "Average Train Accuracy": train_accuracies,
    "Average Validation Accuracy": accuracies
})

print(df.to_string(index=False))

 Max Depth  Average Train Accuracy  Average Validation Accuracy
         1                0.780026                     0.781163
         2                0.787319                     0.786780
         3                0.800786                     0.792392
         4                0.829399                     0.808116
         5                0.842310                     0.824920
         6                0.853532                     0.832779
         7                0.870928                     0.829408
         8                0.893376                     0.832773
         9                0.916383                     0.831643
        10                0.934902                     0.837254
        11                0.947810                     0.830525


In [13]:
X = df_train.drop(['Survived'], axis=1)
y = df_train['Survived']

# best_model, results_df = tune_random_forest(X, y)
# === KFold ===
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies, precisions, recalls, f1s = [], [], [], []

fold_index = 1
for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    model = RandomForestClassifier(random_state=42, max_depth=6) # depth = 10 after testing
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1s.append(f1)

    print(f"Fold {fold_index} - Acc: {acc:.4f} | F1: {f1:.4f}")
    fold_index += 1

# === Mean và Std ===
mean_acc = np.mean(accuracies)
mean_prec = np.mean(precisions)
mean_rec = np.mean(recalls)
mean_f1 = np.mean(f1s)
std_acc = np.std(accuracies)

print("\n==== Mean metrics ====")
print(f"Accuracy: {mean_acc:.4f}")
print(f"Precision: {mean_prec:.4f}")
print(f"Recall: {mean_rec:.4f}")
print(f"F1-score: {mean_f1:.4f}")
print(f"Std (Accuracy): {std_acc:.4f}")

"""
Fold 1 - Acc: 0.8212 | F1: 0.7681
Fold 2 - Acc: 0.8258 | F1: 0.7304
Fold 3 - Acc: 0.8708 | F1: 0.8189
Fold 4 - Acc: 0.7978 | F1: 0.6786
Fold 5 - Acc: 0.8483 | F1: 0.7692

==== Mean metrics ====
Accuracy: 0.8328
Precision: 0.8614
Recall: 0.6708
F1-score: 0.7531
Std (Accuracy): 0.0249
"""

Fold 1 - Acc: 0.8268 | F1: 0.7704
Fold 2 - Acc: 0.8258 | F1: 0.7207
Fold 3 - Acc: 0.8820 | F1: 0.8346
Fold 4 - Acc: 0.7978 | F1: 0.6727
Fold 5 - Acc: 0.8315 | F1: 0.7414

==== Mean metrics ====
Accuracy: 0.8328
Precision: 0.8751
Recall: 0.6558
F1-score: 0.7480
Std (Accuracy): 0.0273


'\nFold 1 - Acc: 0.8212 | F1: 0.7681\nFold 2 - Acc: 0.8258 | F1: 0.7304\nFold 3 - Acc: 0.8708 | F1: 0.8189\nFold 4 - Acc: 0.7978 | F1: 0.6786\nFold 5 - Acc: 0.8483 | F1: 0.7692\n\n==== Mean metrics ====\nAccuracy: 0.8328\nPrecision: 0.8614\nRecall: 0.6708\nF1-score: 0.7531\nStd (Accuracy): 0.0249\n'

## Save model into logger and dump model to pickle file

In [14]:
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..", "..", "..")))
from log.experiment_logger import log_experiment
import joblib

log_path = join(path_dir, "log", "experiment_log.csv")
log_experiment(
    output_path=log_path,
    model_name="RandomForestClassifier",
    feature_name="sexAndPclass_engineered",
    params=model.get_params(),
    kfold=5,
    f1=mean_f1,
    acc=mean_acc,
    rec=mean_rec,
    prec=mean_prec,
    std=std_acc,
    author="Thien"
)

# === Huấn luyện lại trên toàn bộ dữ liệu train ===
final_model = model
final_model.fit(X, y)

# === Dump model ra .pkl ===
model_dir = join(path_dir,"notebook","model","random_forest", "Model Pickles")
os.makedirs(model_dir, exist_ok=True)
model_path = join(model_dir, "rf_sexAndPclass.pkl")
joblib.dump(final_model, model_path)
print(f"✅ Model saved to {model_path}")
df_original = pd.read_csv(join(path_dir,"data","raw","test.csv"))
passenger_ids = df_original["PassengerId"]
# === Tạo file submission ===
X_test = df_test.copy()
y_test_pred = final_model.predict(X_test)

# Tạo DataFrame submission (giả sử Kaggle cần 'PassengerId' và 'Survived')
submission = pd.DataFrame({
    'PassengerId': passenger_ids,  # nếu test có cột này
    'Survived': y_test_pred
})

sub_dir = join(path_dir,"notebook","model","random_forest", "submissions")
os.makedirs(sub_dir, exist_ok=True)
submission_path = join(sub_dir, "submission_rf_sexAndPclass.csv")
submission.to_csv(submission_path, index=False)
print(f"Submission file saved to {submission_path}")

Logged experiment to ..\..\..\log\experiment_log.csv
✅ Model saved to ..\..\..\notebook\model\random_forest\Model Pickles\rf_sexAndPclass.pkl
Submission file saved to ..\..\..\notebook\model\random_forest\submissions\submission_rf_sexAndPclass.csv


# The end