## Import libraries and dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load the Titanic data and discard the columns this notebook does not use.
# Chaining .drop() returns a new frame instead of mutating one in place.
df = (
    pd.read_csv('titanicfull.csv')
    .drop(columns=['name', 'ticket', 'age', 'cabin'])
)
df.head()

Unnamed: 0,pclass,survived,sex,sibsp,parch,fare,embarked
0,1,1,female,0,0,211.3375,S
1,1,1,male,1,2,151.55,S
2,1,0,female,1,2,151.55,S
3,1,0,male,1,2,151.55,S
4,1,0,female,1,2,151.55,S


## Splitting data

In [3]:
# Separate the target column ('survived') from the feature columns.
y = df['survived']
X = df.drop(columns='survived')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the shape of each split, one per line.
print(
    f'X_train : {X_train.shape}',
    f'X_test : {X_test.shape}',
    f'y_train : {y_train.shape}',
    f'y_test : {y_test.shape}',
    sep='\n',
)

X_train : (1047, 6)
X_test : (262, 6)
y_train : (1047,)
y_test : (262,)


## Preprocessor

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [5]:
# Numeric features: fill missing values with the column mean, then rescale
# each column with the min/max learned during fit (MinMaxScaler defaults).
numerical_pipeline = Pipeline([
    ("Imputer", SimpleImputer(strategy="mean")),
    ("Scaler", MinMaxScaler())
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode.
# handle_unknown="ignore" encodes a category that was not seen during fit as
# an all-zero row instead of raising, so a rare category that is absent from
# one cross-validation training fold (or from the training split) cannot make
# transform/predict fail.
categorical_pipeline = Pipeline([
    ("Imputer", SimpleImputer(strategy="most_frequent")),
    ("Onehot", OneHotEncoder(handle_unknown="ignore"))
])

In [6]:
from sklearn.compose import ColumnTransformer 

In [7]:
# Preview the first rows of the training features; the column names shown
# here are the ones referenced when building the ColumnTransformer.
X_train.head()

Unnamed: 0,pclass,sex,sibsp,parch,fare,embarked
999,3,female,0,0,7.75,Q
392,2,female,1,0,27.7208,C
628,3,female,4,2,31.275,S
1165,3,male,0,0,7.225,C
604,3,female,0,0,7.65,S


In [8]:
# Route each group of columns through its matching sub-pipeline:
# counts/fare go through the numeric pipeline, class/sex/port through the
# categorical one.
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numerical_pipeline, ['sibsp', 'parch', 'fare']),
        ("categoric", categorical_pipeline, ['pclass', 'sex', 'embarked']),
    ]
)

## Final pipeline

In [9]:
from sklearn.neighbors import KNeighborsClassifier

In [10]:
# End-to-end model: preprocessing ("prep") followed by a k-nearest-neighbours
# classifier ("algo"). The step names are the prefixes used later when
# addressing hyperparameters (e.g. "algo__n_neighbors").
pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("algo", KNeighborsClassifier()),
    ]
)

In [11]:
# Fit the preprocessing steps and the KNN classifier on the training split only.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('Imputer',
                                                                   SimpleImputer()),
                                                                  ('Scaler',
                                                                   MinMaxScaler())]),
                                                  ['sibsp', 'parch', 'fare']),
                                                 ('categoric',
                                                  Pipeline(steps=[('Imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('Onehot',
                                                                   OneHotEncoder())]),
                                                  ['pclass', 'sex',
                           

In [12]:
# Baseline: accuracy of the untuned (default-parameter) pipeline on the held-out test split.
pipeline.score(X_test, y_test)

0.7824427480916031

## GridSearchCV

In [13]:
from sklearn.model_selection import GridSearchCV

In [20]:
# Hyperparameter grid for the "algo" (KNN) step of the pipeline:
#   - n_neighbors: odd values 1, 3, ..., 49
#   - weights:     uniform vs. distance-weighted voting
#   - p:           Minkowski power (1 = Manhattan, 2 = Euclidean)
# 25 * 2 * 2 = 100 candidate combinations.
parameter = {
    "algo__n_neighbors": range(1, 51, 2),
    "algo__weights": ["uniform", "distance"],
    "algo__p": [1, 2],
}

# Exhaustive search with 3-fold cross-validation on the training split,
# using all available cores.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('Imputer',
                                                                                          SimpleImputer()),
                                                                                         ('Scaler',
                                                                                          MinMaxScaler())]),
                                                                         ['sibsp',
                                                                          'parch',
                                                                          'fare']),
                                                                        ('categoric',
                                                                         Pipeline(steps=[('Impu

In [24]:
# Tabulate the cross-validation results of every candidate, best-ranked first.
pd.DataFrame(model.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algo__n_neighbors,param_algo__p,param_algo__weights,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
36,0.133929,0.011279,0.127253,0.041959,19,1,uniform,"{'algo__n_neighbors': 19, 'algo__p': 1, 'algo_...",0.799427,0.813754,0.762178,0.791786,0.021738,1
48,0.067131,0.032728,0.067407,0.014312,25,1,uniform,"{'algo__n_neighbors': 25, 'algo__p': 1, 'algo_...",0.787966,0.810888,0.765043,0.787966,0.018716,2
38,0.090127,0.024454,0.094943,0.026881,19,2,uniform,"{'algo__n_neighbors': 19, 'algo__p': 2, 'algo_...",0.785100,0.813754,0.762178,0.787011,0.021099,3
28,0.069869,0.020787,0.083134,0.021751,15,1,uniform,"{'algo__n_neighbors': 15, 'algo__p': 1, 'algo_...",0.796562,0.799427,0.765043,0.787011,0.015577,3
32,0.058914,0.013020,0.069154,0.011886,17,1,uniform,"{'algo__n_neighbors': 17, 'algo__p': 1, 'algo_...",0.793696,0.799427,0.765043,0.786055,0.015041,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,0.101151,0.014653,0.075674,0.013977,5,1,uniform,"{'algo__n_neighbors': 5, 'algo__p': 1, 'algo__...",0.765043,0.759312,0.747851,0.757402,0.007147,96
3,0.049339,0.005796,0.028297,0.001511,1,2,distance,"{'algo__n_neighbors': 1, 'algo__p': 2, 'algo__...",0.770774,0.736390,0.722063,0.743075,0.020440,97
2,0.070262,0.016074,0.100059,0.028305,1,2,uniform,"{'algo__n_neighbors': 1, 'algo__p': 2, 'algo__...",0.770774,0.736390,0.722063,0.743075,0.020440,97
1,0.090819,0.014687,0.038842,0.006658,1,1,distance,"{'algo__n_neighbors': 1, 'algo__p': 1, 'algo__...",0.770774,0.736390,0.722063,0.743075,0.020440,97


In [26]:
# Hyperparameter combination that achieved the best mean cross-validated score.
model.best_params_

{'algo__n_neighbors': 19, 'algo__p': 1, 'algo__weights': 'uniform'}

In [27]:
 # Accuracy of the tuned model on the training split vs. the held-out test split
 # (a large gap between the two would suggest overfitting).
 model.score(X_train, y_train), model.score(X_test, y_test)

(0.8013371537726839, 0.8091603053435115)