In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [2]:
# Load only the columns this notebook uses from the public Titanic CSV.
TITANIC_URL = 'https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv'
TITANIC_COLUMNS = ['Age', 'Pclass', 'Fare', 'Survived']

df = pd.read_csv(TITANIC_URL, usecols=TITANIC_COLUMNS)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [4]:
# Percentage of missing values in each column.
df.isna().mean().mul(100)

Unnamed: 0,0
Survived,0.0
Pclass,0.0
Age,19.86532
Fare,0.0


In [5]:
# Separate the target column from the predictor columns.
y = df['Survived']
X = df.drop('Survived', axis='columns')

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [7]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,Pclass,Age,Fare
30,1,40.0,27.7208
10,3,4.0,16.7
873,3,47.0,9.0
182,3,9.0,31.3875
876,3,20.0,9.8458


---

## **Applying KNN Imputation**

---



In [61]:
# KNN imputer: fill each missing value from the 3 nearest rows, with
# neighbours weighted by distance instead of uniformly.
knn = KNNImputer(
    weights='distance',
    n_neighbors=3,
)

In [62]:
# Fit the imputer on the training split only, then apply the fitted imputer
# to both the training and the test split.
knn.fit(X_train)
X_train_trf = knn.transform(X_train)
X_test_trf = knn.transform(X_test)

In [63]:
# Wrap the imputed array in a DataFrame for display. Column labels are taken
# from X_train rather than a hand-typed list, so they cannot drift from the
# real feature names (e.g. 'PClass' instead of the actual 'Pclass').
pd.DataFrame(X_train_trf, columns=X_train.columns)

Unnamed: 0,PClass,Age,Fare
0,1.0,40.000000,27.7208
1,3.0,4.000000,16.7000
2,3.0,47.000000,9.0000
3,3.0,9.000000,31.3875
4,3.0,20.000000,9.8458
...,...,...,...
707,3.0,30.000000,8.6625
708,3.0,26.959410,8.7125
709,1.0,71.000000,49.5042
710,1.0,32.666667,221.7792


In [64]:
# Train a logistic regression on the KNN-imputed features and report its
# accuracy on the held-out split.
lr = LogisticRegression().fit(X_train_trf, y_train)
y_pred = lr.predict(X_test_trf)

accuracy_score(y_true=y_test, y_pred=y_pred)

0.7039106145251397

---

## **Applying Mean Imputation**

---

In [65]:
# Baseline imputer: replace every NaN with the mean of its column.
si = SimpleImputer(missing_values=np.nan, strategy='mean')

In [67]:
# Fit the mean imputer on the training split only, then apply the fitted
# imputer to both the training and the test split.
si.fit(X_train)
X_train_trf2 = si.transform(X_train)
X_test_trf2 = si.transform(X_test)

In [71]:
# Fresh, unfitted classifier for the mean-imputation run. This rebinds `lr`,
# so the model fitted on the KNN-imputed data is no longer reachable by name.
lr = LogisticRegression()

In [72]:
# Fit on the mean-imputed features and report accuracy on the held-out split.
y_pred2 = lr.fit(X_train_trf2, y_train).predict(X_test_trf2)

accuracy_score(y_true=y_test, y_pred=y_pred2)

0.6927374301675978