In [18]:
import pandas as pd
import numpy as np

def preprocess(df, mean_age=None, mode_embarked=None, fare_median=None, train_columns=None):
    """
    Titanic data preprocessing (final Kaggle-ready version).
    Works for both train & test sets.
    Keeps consistent columns for model training and submission.

    Args:
        df (pd.DataFrame): Input DataFrame
        mean_age, mode_embarked, fare_median: (optional) Stats from train set
        train_columns (list): List of columns from training set to align test set

    Returns:
        df_processed, mean_age, mode_embarked, fare_median
    """
    df = df.copy()
    
    # ---------------- Missing Values ----------------
    if mean_age is None:
        mean_age = df['Age'].mean()
    if mode_embarked is None:
        mode_embarked = df['Embarked'].mode()[0]
    if fare_median is None:
        fare_median = df['Fare'].median()

    df['Age'] = df['Age'].fillna(mean_age)
    df['Embarked'] = df['Embarked'].fillna(mode_embarked)
    df['Fare'] = df['Fare'].fillna(fare_median)

    # ---------------- Feature Engineering ----------------
    # Title extraction
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace({
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
        'Lady': 'Rare', 'Countess': 'Rare', 'Capt': 'Rare',
        'Col': 'Rare', 'Don': 'Rare', 'Dr': 'Rare', 'Major': 'Rare',
        'Rev': 'Rare', 'Sir': 'Rare', 'Jonkheer': 'Rare', 'Dona': 'Rare'
    })

    # Fill Age by Title median (smarter than global mean)
    title_medians = df.groupby('Title')['Age'].median()
    df['Age'] = df.apply(lambda x: title_medians[x['Title']] if pd.isna(x['Age']) else x['Age'], axis=1)

    # Ticket info (prefix and group size)
    df['Ticket_prefix'] = df['Ticket'].str.extract('([A-Za-z./]+)', expand=False).fillna('NONE')
    rare_prefix = df['Ticket_prefix'].value_counts()[df['Ticket_prefix'].value_counts() < 10].index
    df['Ticket_prefix'] = df['Ticket_prefix'].replace(rare_prefix, 'Rare')
    ticket_counts = df['Ticket'].value_counts()
    df['TicketGroupSize'] = df['Ticket'].map(ticket_counts)

    # Deck from Cabin
    df['Deck'] = df['Cabin'].str[0].fillna('U')

    # Family-related features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Interaction features
    df['Age*Class'] = df['Age'] * df['Pclass']
    df['Age*Fare'] = df['Age'] * df['Fare']

    # Binning
    df['AgeBand'] = pd.cut(df['Age'], bins=[0, 12, 20, 40, 60, np.inf], labels=[0, 1, 2, 3, 4]).astype(int)
    df['FareBand'] = pd.qcut(df['Fare'], q=4, labels=[0, 1, 2, 3]).astype(int)

    # Log transform
    df['Fare_log'] = np.log1p(df['Fare'])

    # ---------------- One-hot Encoding ----------------
    df = pd.get_dummies(
        df,
        columns=['Sex', 'Embarked', 'Title', 'Ticket_prefix', 'Deck', 'Pclass'],
        drop_first=False,
        dtype=int
    )

    # ---------------- Drop Unused Columns ----------------
    drop_cols = ['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
    df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True)

    # ---------------- Scaling ----------------
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = (df[numeric_cols] - df[numeric_cols].mean()) / df[numeric_cols].std()

    # ---------------- Align test columns ----------------
    if train_columns is not None:
        # Add missing cols in test
        for col in train_columns:
            if col not in df.columns:
                df[col] = 0
        # Drop extra cols not in train
        df = df[train_columns]

    return df, mean_age, mode_embarked, fare_median


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier

# ---------------- Import preprocess function (của bạn ở trên) ----------------
# giả sử bạn đã định nghĩa preprocess(df, ...) rồi

# ---------------- Load data ----------------
train_df = pd.read_csv("../Titanic project/input/train.csv")

# Chia label và feature
y = train_df["Survived"]
X_raw = train_df.drop("Survived", axis=1)

# ---------------- Train/Valid Split ----------------
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(
    X_raw, y, test_size=0.5, random_state=42, stratify=y
)

# ---------------- Preprocess ----------------
X_train, mean_age, mode_embarked, fare_median = preprocess(X_train_raw)
train_columns = X_train.columns.tolist()

X_valid, _, _, _ = preprocess(
    X_valid_raw, mean_age, mode_embarked, fare_median, train_columns
)

print(f"✅ Preprocessing done. Train shape: {X_train.shape}, Valid shape: {X_valid.shape}")

# ---------------- Train model (ví dụ RandomForest) ----------------
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    random_state=42,
)
rf.fit(X_train, y_train)

# ---------------- Evaluate ----------------
y_pred = rf.predict(X_valid)

acc = accuracy_score(y_valid, y_pred)
prec = precision_score(y_valid, y_pred)
rec = recall_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)

print("\n📊 Evaluation Results (Validation Set)")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-Score : {f1:.4f}")


✅ Preprocessing done. Train shape: (445, 38), Valid shape: (446, 38)

📊 Evaluation Results (Validation Set)
Accuracy : 0.8184
Precision: 0.8214
Recall   : 0.6725
F1-Score : 0.7395


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

models = {
    "Logistic": LogisticRegression(max_iter=3000, solver="saga", penalty="l2"),
    "AdaBoost": AdaBoostClassifier(n_estimators=500, learning_rate=0.5, random_state=42),
    "CatBoost": CatBoostClassifier(
        iterations=500, learning_rate=0.05, depth=4, l2_leaf_reg=2, verbose=0, random_state=42
    ),
    "RandomForest": RandomForestClassifier(n_estimators=500, random_state=42)
}

results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    acc = accuracy_score(y_valid, y_pred)
    prec = precision_score(y_valid, y_pred)
    rec = recall_score(y_valid, y_pred)
    f1 = f1_score(y_valid, y_pred)
    results.append([name, acc, prec, rec, f1])

results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1"])
print("\n📋 Model Comparison:")
print(results_df.sort_values(by="Accuracy", ascending=False).to_string(index=False))



📋 Model Comparison:
       Model  Accuracy  Precision   Recall       F1
    Logistic  0.825112   0.778443 0.760234 0.769231
RandomForest  0.818386   0.821429 0.672515 0.739550
    AdaBoost  0.800448   0.753086 0.713450 0.732733
    CatBoost  0.795964   0.759740 0.684211 0.720000


In [None]:
# Train set
train_df = pd.read_csv("../Titanic project/input/train.csv")
y = train_df['Survived']
X, mean_age, mode_embarked, fare_median = preprocess(train_df.drop('Survived', axis=1))[:4]
train_columns = X.columns.tolist()

# Test set
test_df = pd.read_csv("../Titanic project/input/test.csv")
X_test, _, _, _ = preprocess(test_df, mean_age, mode_embarked, fare_median, train_columns)

# Fit model (ví dụ)
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500, random_state=42)
rf.fit(X, y)
pred = rf.predict(X_test)

# Save submit
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': pred})
submission.to_csv("submission.csv", index=False)


: 