# Survival Prediction — Titanic dataset

This notebook uses the public Titanic dataset (CSV) and demonstrates a reproducible ML workflow:

- Data loading and exploratory data analysis (EDA)
- Feature engineering and preprocessing
- Cross-validated model evaluation and hyperparameter tuning
- Visualizations and model interpretability using SHAP and LIME

Optionally run the downloader first; if `data/titanic.csv` is missing, the first code cell runs the same script automatically:

```bash
python data/download_titanic.py
```

In [None]:
# Imports and dataset load
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot styling applied to every seaborn/matplotlib figure drawn below.
sns.set(style='whitegrid')

# Location of the CSV relative to the notebook's working directory. When the
# file is absent, the downloader script under data/ is executed in-process.
DATA_PATH = Path('data', 'titanic.csv')
dataset_missing = not DATA_PATH.exists()
if dataset_missing:
    print('Dataset not found locally, downloading...')
    import runpy
    runpy.run_path('data/download_titanic.py')

# Read the raw table; `df` is the frame every later cell starts from.
print('Loading', DATA_PATH)
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Basic EDA: table size, target balance, and two quick distribution plots.
print('shape:', df.shape)
print('\nSurvived value counts:')
survival_counts = df['Survived'].value_counts()
print(survival_counts)

# Target balance as a bar chart.
fig_target, ax_target = plt.subplots(figsize=(6, 4))
sns.countplot(x='Survived', data=df, ax=ax_target)
ax_target.set_title('Survived (0 = Died, 1 = Survived)')
plt.show()

# Left panel: histogram + KDE of the known ages (missing values dropped).
# Right panel: fare spread for each value of the target.
fig_dist, (ax_age, ax_fare) = plt.subplots(1, 2, figsize=(10, 4))
known_ages = df['Age'].dropna()
sns.histplot(known_ages, kde=True, ax=ax_age)
ax_age.set_title('Age distribution')
sns.boxplot(x='Survived', y='Fare', data=df, ax=ax_fare)
ax_fare.set_title('Fare by Survival')
plt.show()

In [None]:
# Feature engineering
# Work on a copy so the raw frame `df` loaded above is left untouched.
df2 = df.copy()

# Title extraction: capture the text between the first comma and the next
# period of Name (e.g. "Surname, Mr. Given" -> "Mr"). The pattern is a raw
# string so that `\s` and `\.` reach the regex engine unchanged instead of
# being parsed as (invalid) Python string escapes.
df2['Title'] = df2['Name'].str.extract(r',\s*([^\.]+)\.', expand=False).str.strip()

# Group rare titles. Because the pattern captures everything between the comma
# and the period, a title written with an article is extracted as
# 'the Countess'; it needs its own key or it would stay a singleton category.
title_map = {
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
    'Lady': 'Rare', 'Countess': 'Rare', 'the Countess': 'Rare', 'Capt': 'Rare',
    'Col': 'Rare', 'Don': 'Rare', 'Dona': 'Rare', 'Dr': 'Rare', 'Major': 'Rare',
    'Rev': 'Rare', 'Sir': 'Rare', 'Jonkheer': 'Rare'
}
df2['Title'] = df2['Title'].replace(title_map)

# Family size (siblings/spouses + parents/children + the passenger) and a
# flag for passengers travelling alone.
df2['FamilySize'] = df2['SibSp'] + df2['Parch'] + 1
df2['IsAlone'] = (df2['FamilySize'] == 1).astype(int)

# NOTE(review): the fill values below (mode / medians) are computed on the full
# table, i.e. before the train/test split done in the modeling cell, so held-out
# rows influence the imputed values. Move the imputation into the pipeline if
# strict train/test separation is required.

# Embarked: fill missing values with the most frequent port.
df2['Embarked'] = df2['Embarked'].fillna(df2['Embarked'].mode()[0])

# Age: fill with the median age of the passenger's Title; rows whose Title is
# missing or whose Title group has no known age fall back to the overall median.
# (Vectorized equivalent of a row-wise apply.)
age_medians = df2.groupby('Title')['Age'].median()
title_median_age = df2['Title'].map(age_medians)
df2['Age'] = df2['Age'].fillna(title_median_age).fillna(df2['Age'].median())

# Fare: fill the few missing values with the overall median.
df2['Fare'] = df2['Fare'].fillna(df2['Fare'].median())

# Fare band: quartile index (0-3) of Fare. Not part of FEATURES below.
df2['FareBand'] = pd.qcut(df2['Fare'], 4, labels=False)

# Keep a compact set of features for modeling
FEATURES = ['Pclass','Sex','Age','Fare','Embarked','Title','FamilySize','IsAlone']
TARGET = 'Survived'
df_model = df2[FEATURES + [TARGET]].copy()
df_model.head()

In [None]:
# Preprocessing and modeling pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# Separate the label from the predictors.
y = df_model[TARGET]
X = df_model.drop(columns=TARGET)

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Column groups routed to the two preprocessing branches.
numeric_features = ['Age', 'Fare', 'FamilySize']
categorical_features = ['Pclass','Sex','Embarked','Title','IsAlone']

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation, then one-hot encoding that
# ignores categories not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing + random forest. The step names 'preprocessor' and
# 'clf' are referenced by later cells.
forest = RandomForestClassifier(random_state=42)
pipe = Pipeline([('preprocessor', preprocessor), ('clf', forest)])

# 5-fold stratified cross-validation on the training part only.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_res = cross_validate(
    pipe,
    X_train,
    y_train,
    cv=cv,
    scoring=['accuracy','precision','recall'],
    n_jobs=1,
)

# Report mean ± standard deviation across folds for each metric.
report_templates = {
    'test_accuracy': 'CV accuracy: {:.3f} ± {:.3f}',
    'test_precision': 'CV precision: {:.3f} ± {:.3f}',
    'test_recall': 'CV recall: {:.3f} ± {:.3f}',
}
for result_key, template in report_templates.items():
    fold_scores = cv_res[result_key]
    print(template.format(fold_scores.mean(), fold_scores.std()))

In [None]:
# Hyperparameter tuning with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Search space for the random forest; the `clf__` prefix addresses the
# classifier step of the pipeline.
param_dist = dict(
    clf__n_estimators=[100, 200, 300, 500],
    clf__max_depth=[None, 5, 8, 15, 25],
    clf__min_samples_split=[2, 5, 10],
    clf__max_features=['sqrt', 'log2'],
)

# 25 random draws from the space, scored by accuracy on the same CV splitter
# used for the baseline, run on all available cores.
rs = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=25,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
rs.fit(X_train, y_train)

print('Best score:', rs.best_score_)
print('Best params:', rs.best_params_)

# Best pipeline found by the search; used by all following cells.
best = rs.best_estimator_

In [None]:
# Final evaluation on hold-out test set
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

# Class predictions on the held-out rows and the per-class metric table.
y_pred = best.predict(X_test)
print(classification_report(y_test, y_pred))

# ROC analysis needs probability scores, so it only runs when the estimator
# exposes predict_proba. Column 1 is used as the score for the second class.
if hasattr(best, 'predict_proba'):
    y_proba = best.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    print('ROC AUC:', auc)
    fpr, tpr, _ = roc_curve(y_test, y_proba)

    fig_roc, ax_roc = plt.subplots(figsize=(6, 4))
    ax_roc.plot(fpr, tpr, label=f'AUC={auc:.3f}')
    # Diagonal reference line.
    ax_roc.plot([0, 1], [0, 1], '--', color='gray')
    ax_roc.set_xlabel('FPR')
    ax_roc.set_ylabel('TPR')
    ax_roc.set_title('ROC Curve')
    ax_roc.legend()
    plt.show()

# Confusion matrix as an annotated heatmap of integer counts.
cm = confusion_matrix(y_test, y_pred)
fig_cm, ax_cm = plt.subplots(figsize=(4, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_title('Confusion matrix')
plt.show()

In [None]:
# Feature importance (permutation importance)
from sklearn.inspection import permutation_importance

# Build feature names (numeric + one-hot categories) for the transformed matrix.
# The column groups are read back from the fitted ColumnTransformer instead of
# being typed out again, so the labels cannot drift from what the pipeline
# was actually fitted on.
pre = best.named_steps['preprocessor']
column_spec = {name: cols for name, _, cols in pre.transformers_}
numeric_cols = list(column_spec['num'])
categorical_cols = list(column_spec['cat'])

# categories_ is ordered like the columns handed to the encoder.
cat_ohe = pre.named_transformers_['cat'].named_steps['onehot']
onehot_names = []
for name, cats in zip(categorical_cols, cat_ohe.categories_):
    onehot_names += [f'{name}_{c}' for c in cats]
feature_names = numeric_cols + onehot_names

# The ColumnTransformer may return a sparse matrix when the one-hot block
# dominates; densify so the permutation step and later cells get a plain array.
X_test_trans = pre.transform(X_test)
if hasattr(X_test_trans, 'toarray'):
    X_test_trans = X_test_trans.toarray()
assert len(feature_names) == X_test_trans.shape[1], 'feature names do not match transformed columns'

# Importance is measured on the already-transformed columns, i.e. each one-hot
# indicator is shuffled on its own, using only the classifier step.
res = permutation_importance(best.named_steps['clf'], X_test_trans, y_test, n_repeats=10, random_state=42, n_jobs=-1)
imp_df = (
    pd.DataFrame({'feature': feature_names, 'importance': res.importances_mean})
    .sort_values('importance', ascending=False)
    .head(20)
)

plt.figure(figsize=(8,5))
sns.barplot(data=imp_df, x='importance', y='feature', palette='viridis')
plt.title('Permutation importance')
plt.show()

In [None]:
# SHAP explainability (TreeExplainer for tree-based model)
import shap

# Use a small sample for plotting speed (capped at the size of the training set).
X_sample = X_train.sample(min(200, len(X_train)), random_state=42)
X_sample_trans = pre.transform(X_sample)
# Densify in case the preprocessor returned a sparse matrix.
if hasattr(X_sample_trans, 'toarray'):
    X_sample_trans = X_sample_trans.toarray()

explainer = shap.TreeExplainer(best.named_steps['clf'])
shap_values = explainer.shap_values(X_sample_trans)

# The return layout depends on the installed SHAP version: older releases give
# a list with one (n_samples, n_features) array per class, newer ones a single
# (n_samples, n_features, n_classes) array. Indexing the latter with [1] would
# silently pick *sample* 1 instead of *class* 1, so select the class explicitly.
if isinstance(shap_values, list):
    shap_values_class1 = shap_values[1]
else:
    shap_values_class1 = np.asarray(shap_values)
    if shap_values_class1.ndim == 3:
        shap_values_class1 = shap_values_class1[:, :, 1]

# Global summary for class 1.
shap.summary_plot(shap_values_class1, X_sample_trans, feature_names=feature_names, show=True)

In [None]:
# LIME explanation example
from lime.lime_tabular import LimeTabularExplainer

# LIME perturbs a purely numeric array and calls the prediction function with
# numeric arrays, while the fitted pipeline selects columns by name and expects
# the original string categories. Bridge the two: integer-encode the categorical
# columns for LIME and decode them back to a DataFrame before predicting.
lime_columns = X_train.columns.tolist()

# Code <-> category vocabulary per categorical column. Built from train and test
# rows together so every value that can be explained has a code; this is only a
# relabelling, nothing is fitted on it.
lime_vocab_source = pd.concat([X_train, X_test])
lime_categories = {
    col: pd.Categorical(lime_vocab_source[col]).categories
    for col in categorical_features
}


def encode_for_lime(frame):
    """Return a float copy of `frame` with categorical columns replaced by integer codes."""
    encoded = frame.copy()
    for col, cats in lime_categories.items():
        encoded[col] = pd.Categorical(frame[col], categories=cats).codes
    return encoded.astype(float)


def predict_proba_from_lime(encoded_rows):
    """Decode LIME's numeric rows back to the pipeline's input format and predict."""
    frame = pd.DataFrame(np.asarray(encoded_rows, dtype=float), columns=lime_columns)
    for col, cats in lime_categories.items():
        codes = frame[col].round().astype(int).to_numpy()
        frame[col] = cats.take(codes).to_numpy()
    return best.predict_proba(frame)


X_train_for_lime = encode_for_lime(X_train)
explainer = LimeTabularExplainer(
    X_train_for_lime.values,
    feature_names=lime_columns,
    # Tell LIME which columns hold category codes and how to label them.
    categorical_features=[lime_columns.index(col) for col in lime_categories],
    categorical_names={
        lime_columns.index(col): [str(c) for c in cats]
        for col, cats in lime_categories.items()
    },
    class_names=['died','survived'],
    discretize_continuous=True,
    random_state=42,
)

# Explain one held-out passenger (positional row 7 of the test split).
instance = X_test.iloc[7]
instance_encoded = encode_for_lime(X_test.iloc[[7]]).values[0]
exp = explainer.explain_instance(instance_encoded, predict_proba_from_lime, num_features=6)
exp.show_in_notebook(show_table=True)

## Summary & next steps

- We replaced the synthetic demo with the actual Titanic dataset and added realistic EDA, feature engineering, cross-validation, and hyperparameter tuning.
- We used permutation importance and SHAP for global explanations, and LIME for a local (single-passenger) explanation.

Next steps (optional): try a gradient-boosted model (XGBoost / LightGBM), add calibration, and push a Binder/Colab-friendly environment file (`environment.yml`) to ensure reproducible hosted runs.