In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from transformers.custom_transformer import *

In [9]:
# -------------------------------
# Load and preprocess data
# -------------------------------
# Read the raw passenger table and lower-case every column label in one step,
# so the rest of the notebook can use lower-case column names throughout.
data = pd.read_csv('../data/Titanic-Dataset.csv').rename(columns=str.lower)

# Target vector first, then the feature matrix (everything except the target).
y = data['survived']
X = data.drop(columns='survived')

# 70/30 hold-out split. `stratify=y` keeps the class ratio of `survived`
# equal in both parts; the fixed `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [10]:
# -------------------------------
# Helper: ColumnTransformer → DataFrame
# -------------------------------
def ct_to_df(ct, X):
    """Apply a fitted ColumnTransformer to ``X`` and return a labelled DataFrame.

    Parameters
    ----------
    ct : fitted transformer
        Must expose ``transform(X)`` and ``get_feature_names_out()``.
    X : pandas.DataFrame or array-like
        Input rows to transform. When ``X`` has an ``index`` attribute it is
        reused as the row index of the result; otherwise the default
        ``RangeIndex`` is used.

    Returns
    -------
    pandas.DataFrame
        One column per name returned by ``ct.get_feature_names_out()``.
    """
    transformed = ct.transform(X)

    # ColumnTransformer can return a scipy sparse matrix when the stacked
    # output is sparse enough (see its `sparse_threshold` parameter). The
    # DataFrame constructor cannot take that together with explicit labels,
    # so convert to a dense array first.
    if hasattr(transformed, "toarray"):
        transformed = transformed.toarray()

    return pd.DataFrame(
        transformed,
        columns=ct.get_feature_names_out(),
        # Plain ndarrays carry no index; None lets pandas build a RangeIndex.
        index=getattr(X, "index", None),
    )

In [11]:
# -------------------------------
# ColumnTransformer
# -------------------------------
preprocessor = ColumnTransformer(
    transformers=[
        # Each entry is (step name, transformer instance, input columns).
        # The step name becomes the prefix of the generated feature names
        # (e.g. 'pclass__...'), and the order of the entries fixes the
        # column order of the transformed output.
        ('pclass', PClassEncoder(), ['pclass']),
        ('name', NameExtractor(), ['name']),
        ('sex', SexEncoder(), ['sex']),
        ('sibsp', SibspBinning(), ['sibsp']),
        ('parch', ParchBinning(), ['parch']),
        ('ticket', TicketExtractorAdvanced(top_k=5), ['ticket']),
        ('fare', FareBinning(method='quantile', q=4), ['fare']),
        ('age', AgeImmputer(), ['age']),
        ('embarked', EmbarkedEncoder(), ['embarked']),
    ],
    # Columns that are not routed to a transformer above are left out.
    remainder='drop',
)

In [12]:
# -------------------------------
# Fit and transform
# -------------------------------
# Fit on the training split only; the test split is transformed with the
# parameters learned here and is never used for fitting.
preprocessor.fit(X_train)

# Training split -> labelled DataFrame.
X_train_df = ct_to_df(preprocessor, X_train)
print("Train shape:", X_train_df.shape)

# Test split, run through the same fitted preprocessor.
X_test_df = ct_to_df(preprocessor, X_test)
print("Test shape:", X_test_df.shape)

# Last expression of the cell: preview of the transformed training features.
X_train_df.head()

Train shape: (623, 12)
Test shape: (268, 12)


Unnamed: 0,pclass__pclass_1,pclass__pclass_3,name__title,name__last_name,sex__is_male,sibsp__bin_sibsp,parch__bin_parch,ticket__ticket_prefix,ticket__ticket_number,fare__fare_bin,age__age,embarked__is_s
748,1,0,Mr,Marvin,1,small,alone,none,113773.0,bin4,19.0,0
45,0,1,Mr,Rogers,1,alone,alone,none,23567.0,bin2,30.045476,0
28,0,1,Miss,O'Dwyer,0,alone,alone,none,330959.0,bin1,30.045476,0
633,1,0,Mr,Parr,1,alone,alone,none,112052.0,bin1,30.045476,0
403,0,1,Mr,Hakkarainen,1,small,alone,none,3101279.0,bin3,28.0,0
