In [1]:
!wget https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv

--2026-02-17 10:20:37--  https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 60302 (59K) [text/plain]
Saving to: ‘titanic.csv’


2026-02-17 10:20:37 (4.57 MB/s) - ‘titanic.csv’ saved [60302/60302]



In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read the passenger table from the CSV in the working directory.
df = pd.read_csv("titanic.csv")

# The label is the "Survived" column; everything else is a candidate feature,
# except Name, Ticket and Cabin, which are dropped along with the label.
y = df["Survived"]
X = df.drop(["Survived", "Name", "Ticket", "Cabin"], axis="columns")


In [4]:
# Fill missing ages with the median age, then bucket ages into coarse groups.
# NOTE(review): the median is taken over every row of X, i.e. before the
# train/test split performed later in the notebook, so held-out rows influence
# the fill value. Consider imputing inside the model pipeline instead.
X["Age"] = X["Age"].fillna(X["Age"].median())

# Bins are right-closed: (0,12], (12,18], (18,35], (35,60], (60,100].
# include_lowest=True also closes the first bin on the left, so an age of
# exactly 0 is labelled "Child" instead of silently becoming NaN.
# Ages above 100 still fall outside every bin and yield NaN.
X["AgeGroup"] = pd.cut(
    X["Age"],
    bins=[0, 12, 18, 35, 60, 100],
    labels=["Child", "Teen", "YoungAdult", "Adult", "Senior"],
    include_lowest=True,
)

In [5]:
# Replace missing fares with the median fare (computed over all rows of X),
# then add a log(1 + fare) column alongside the raw fare.
fare_median = X["Fare"].median()
X["Fare"] = X["Fare"].fillna(fare_median)
X["Fare_log"] = X["Fare"].pipe(np.log1p)


In [6]:
# Interaction feature: passenger class multiplied element-wise by fare.
X["Pclass_Fare"] = X["Pclass"].mul(X["Fare"])


In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split the same on every run.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.2)


In [8]:
# Baseline: logistic regression on the original columns only.
cat_cols = ["Sex", "Embarked"]
num_cols = ["Pclass", "Age", "Fare"]
basic_features = ["Pclass", "Sex", "Age", "Fare", "Embarked"]

# One-hot encode the categorical columns and standardise the numeric ones.
preprocess_basic = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols),
    ]
)

# Preprocessing and classifier chained so both are fitted on training rows only.
baseline_model = Pipeline(
    steps=[
        ("prep", preprocess_basic),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

# Fit on the training rows, predict on the held-out rows, report accuracy.
baseline_model.fit(X_train[basic_features], y_train)
pred_base = baseline_model.predict(X_test[basic_features])

print("Baseline Accuracy:", accuracy_score(y_test, pred_base))


Baseline Accuracy: 0.7988826815642458


In [9]:
# Enhanced model: the original columns plus the engineered AgeGroup, Fare_log
# and Pclass_Fare features. Note that cat_cols / num_cols are rebound here.
cat_cols = ["Sex", "Embarked", "AgeGroup"]
num_cols = ["Pclass", "Age", "Fare", "Fare_log", "Pclass_Fare"]

# One-hot encode the categorical columns and standardise the numeric ones.
# Columns of the input frame that are not listed are not passed to the model
# (ColumnTransformer's default remainder behaviour).
preprocess_adv = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols),
    ]
)

enhanced_model = Pipeline(
    steps=[
        ("prep", preprocess_adv),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

# Fit on the full training frame, predict on the held-out frame, report accuracy.
enhanced_model.fit(X_train, y_train)
pred_adv = enhanced_model.predict(X_test)

print("Enhanced Accuracy:", accuracy_score(y_test, pred_adv))


Enhanced Accuracy: 0.8100558659217877
