In [73]:
# Scenario Question: Predicting Titanic Survival
# Researchers are studying the Titanic disaster and want to build models that predict whether a
#  passenger would survive or not survive based on their information.
# - Features used:
# - Passenger class (pclass)
# - Gender (sex)
# - Age (age)
# - Number of siblings/spouses aboard (sibsp)
# - Number of parents/children aboard (parch)
# - Ticket fare (fare)
# - Label:
# - 1 = Survived
# - 0 = Died
# The researchers train three different models:
# - Logistic Regression
# - K-Nearest Neighbors (KNN) with k=5
# - Decision Tree with max depth = 4
# They then evaluate each model using a classification report (precision, recall, F1-score, accuracy).

# ❓ Questions for Learners
# - Which model performs best at predicting survival, and why?
# - How does Logistic Regression differ from Decision Tree in terms of interpretability?
# # - Why is scaling applied before training Logistic Regression and KNN, but not strictly needed
#  for Decision Trees?
# - Looking at the classification report, what do precision and recall mean in the context of survival
#  predictions?
# - Precision → Of those predicted to survive, how many actually survived?
# - Recall → Of all who truly survived, how many were correctly predicted?
# - If you were a historian, which model would you trust more to explain survival patterns, and why?

# Use the below pre-loaded dataset:
# 1. Load data (use seaborn's built-in dataset)
# import seaborn as sns
# df = sns.load_dataset('titanic')


In [74]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

In [75]:
df = sns.load_dataset('titanic')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [76]:
df = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'survived']]

df['age'] = df['age'].fillna(df['age'].median())
df['fare'] = df['fare'].fillna(df['fare'].median())

df['sex'] = df['sex'].map({'male': 0, 'female': 1})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['age'] = df['age'].fillna(df['age'].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fare'] = df['fare'].fillna(df['fare'].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sex'] = df['sex'].map({'male': 0, 'female': 1})


In [77]:
df

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,survived
0,3,0,22.0,1,0,7.2500,0
1,1,1,38.0,1,0,71.2833,1
2,3,1,26.0,0,0,7.9250,1
3,1,1,35.0,1,0,53.1000,1
4,3,0,35.0,0,0,8.0500,0
...,...,...,...,...,...,...,...
886,2,0,27.0,0,0,13.0000,0
887,1,1,19.0,0,0,30.0000,1
888,3,1,28.0,1,2,23.4500,0
889,1,0,26.0,0,0,30.0000,1


In [86]:
X = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']]
Y = df['survived']

In [87]:
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [88]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

In [89]:
# LOGISTIC REGRESSION

model = LogisticRegression(max_iter=200)
model.fit(X_train_scaled, y_train)
model_pred = model.predict(X_test_scaled)

print("Logistic Regression Report")
print(classification_report(y_test, model_pred))
print("Accuracy:", accuracy_score(y_test, model_pred))


Logistic Regression Report
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

Accuracy: 0.7988826815642458


In [90]:
# K-NEAREST NEIGHBOUR

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
knn_pred = knn.predict(X_test_scaled)

print("KNN (k=5) Report")
print(classification_report(y_test, knn_pred))
print("Accuracy:", accuracy_score(y_test, knn_pred))


KNN (k=5) Report
              precision    recall  f1-score   support

           0       0.81      0.83      0.82       105
           1       0.75      0.73      0.74        74

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

Accuracy: 0.7877094972067039


In [91]:
# DECISION TREE

dt = DecisionTreeClassifier(max_depth=4, random_state=42)
dt.fit(X_train, y_train)   # no scaling needed
model_pred = dt.predict(X_test)

print("Decision Tree Report")
print(classification_report(y_test, model_pred))
print("Accuracy:", accuracy_score(y_test, model_pred))


Decision Tree Report
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       105
           1       0.80      0.69      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

Accuracy: 0.7988826815642458


In [92]:
# Which model performs best at predicting survival, and why?
# ANS ---  The best model is the one with the highest F1-score and accuracy.
# Typically, the Decision Tree performs well

In [93]:
# How does Logistic Regression differ from Decision Tree in terms of interpretability?
#aNS ---- logistic regression - indicates how each feature increase or decrease survival probability.
# Decision Tree - rule-based interpretability (if-else paths)

In [94]:
# Why is scaling applied before training Logistic Regression and KNN, but not strictly needed for Decision Trees?
# Ans ---- Logistic Regression & KNN depend on distance/magnitude

In [95]:
# Looking at the classification report, what do precision and recall mean in the context of survival predictions?
# Ans ----- Precision - Of passengers predicted to survive, how many actually survived.

# Predicted survive = 100
# Actually survived = 80
# Precision = 80/100 = 0.80

    # Recall - Of all real survivors, how many were correctly predicted.

# Actual survivors = 120
# Correctly predicted = 90
# Recall = 90/120 = 0.75

In [96]:
# If you were a historian, which model would you trust more to explain survival patterns, and why?
# Ans -- Decision Tree....provide clear data