In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [4]:
# Read only the target column and the three candidate features from the CSV.
df = pd.read_csv(
    'train.csv',
    usecols=['Age', 'Fare', 'Pclass', 'Survived'],
)

In [5]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [6]:
# Share of missing values per column, expressed as a percentage.
df.isna().mean().mul(100)

Survived     0.00000
Pclass       0.00000
Age         19.86532
Fare         0.00000
dtype: float64

In [7]:
# Separate the target from the feature columns.
y = df.loc[:, 'Survived']
X = df.drop('Survived', axis=1)

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Shapes of the four splits, in the order X_train, X_test, y_train, y_test.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((712, 3), (179, 3), (712,), (179,))

In [10]:
# Cast Pclass to string labels so they match the string categories
# ('3', '2', '1') handed to OrdinalEncoder in the preprocessing cells.
#
# `assign` returns a new frame instead of writing a column into the objects
# returned by train_test_split; setting a column on those directly can raise
# pandas' SettingWithCopyWarning. Rebinding the names also keeps this cell
# safe to re-run (casting an already-string column is a no-op).
X_train = X_train.assign(Pclass=X_train['Pclass'].astype(str))
X_test = X_test.assign(Pclass=X_test['Pclass'].astype(str))

In [11]:
# Pclass: ordinal-encode with an explicit category order, so '3' -> 0,
# '2' -> 1 and '1' -> 2.
pclass_oe = Pipeline([
    ('OrdinalEncoding', OrdinalEncoder(categories=[['3', '2', '1']])),
])

# Age: fill missing values with the column mean (SimpleImputer's default
# strategy, spelled out here for readability).
age_impute = Pipeline([
    ('Imputer', SimpleImputer(strategy='mean')),
])

# Route each column to its own sub-pipeline. Columns that are not listed
# (here: Fare) are dropped, because ColumnTransformer defaults to
# remainder='drop'.
preprocessor = ColumnTransformer(
    transformers=[
        ('pipe1', age_impute, ['Age']),
        ('pipe2', pclass_oe, ['Pclass']),
    ],
)

In [12]:
# Full model: column preprocessing followed by a default LogisticRegression.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [13]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [14]:
# Predict survival for the held-out rows with the mean-imputation pipeline.
y_pred = pipeline.predict(X=X_test)

In [15]:
from sklearn.metrics import accuracy_score

In [16]:
# Fraction of held-out rows classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7597765363128491

## KNN-Imputer

In [56]:
# KNNImputer finds neighbours by distance over the features that are present.
# Fed only 'Age', a row with a missing age has no coordinate to measure
# distance on, so scikit-learn falls back to the training-set mean -- the same
# result as SimpleImputer. That is consistent with the identical accuracy
# recorded for both runs in this notebook. 'Fare' has no missing values, so it
# is passed in as the feature on which neighbours are found.
#
# NOTE: the accuracy stored under the KNN section was produced before this
# change; re-run the cells below to refresh it.
age_impute = Pipeline(steps=[
    ('Imputer', KNNImputer(n_neighbors=3, weights='distance')),
    # The imputer returns its columns in input order (Age, Fare). Keep only
    # column 0, the imputed Age, so the classifier sees the same features as
    # the mean-imputation pipeline and only the imputation strategy differs.
    ('KeepAge', ColumnTransformer([('age', 'passthrough', [0])])),
])
# Same explicit category order as before: '3' -> 0, '2' -> 1, '1' -> 2.
pclass_oe = Pipeline(steps=[
    ('OrdinalEncoding', OrdinalEncoder(categories=[['3', '2', '1']]))
])
preprocessor = ColumnTransformer([
    # 'Fare' is only a helper for the imputer; 'KeepAge' drops it afterwards.
    ('pipe1', age_impute, ['Age', 'Fare']),
    ('pipe2', pclass_oe, ['Pclass'])
])

In [58]:
# Second model: the KNN-based preprocessing followed by a default
# LogisticRegression.
pipeline2 = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [60]:
# Fit the KNN-imputation pipeline on the training split only.
pipeline2.fit(X=X_train, y=y_train)

In [62]:
# Predict on the held-out rows; this rebinds y_pred, replacing the predictions
# of the first pipeline.
y_pred = pipeline2.predict(X=X_test)

In [64]:
# Held-out accuracy of the KNN-imputation pipeline.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7597765363128491