In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [11]:
#import data dan drop kolom
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/titanicfull.csv')


In [12]:
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


In [13]:
df.drop(columns=["name", "ticket", "age", "cabin"], inplace=True)

In [15]:
# Dataset Splitting
X = df.drop(columns='survived')
y = df.survived
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1047, 6), (262, 6), (1047,), (262,))

In [29]:
#preeprocessor
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('numeric', numerical_pipeline, ['sibsp', 'parch', 'fare']),
    ('categorical', categorical_pipeline, ['pclass', 'sex', 'embarked'])
])


In [30]:
# pipeline
pipeline = Pipeline([
    ("prep", preprocessor),
    ("algo", KNeighborsClassifier())
])

In [31]:
#parameter tuning
parameter = {
    "algo__n_neighbors": np.arange(1, 51, 2),
    "algo__weights": ["uniform", "distance"],
    "algo__p": [1, 2]
}
model = GridSearchCV(pipeline, param_grid=parameter, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [32]:
print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

{'algo__n_neighbors': np.int64(19), 'algo__p': 1, 'algo__weights': 'uniform'}
0.8003820439350525 0.7908309455587391 0.8015267175572519


In [33]:
X.iloc[0:1]

Unnamed: 0,pclass,sex,sibsp,parch,fare,embarked
0,1,female,0,0,211.3375,S


In [34]:
data = [
    [1, "female", 1, 1, 80,"s"],
    [3, "male", 0, 0, 5,"s"]
]

X_pred = pd.DataFrame(data, index=["Rose", "Jack"], columns=X.columns)
X_pred

Unnamed: 0,pclass,sex,sibsp,parch,fare,embarked
Rose,1,female,1,1,80,s
Jack,3,male,0,0,5,s


In [35]:
model.predict(X_pred)

array([1, 0])

In [36]:
X_pred["Survived"] = model.predict(X_pred)
X_pred

Unnamed: 0,pclass,sex,sibsp,parch,fare,embarked,Survived
Rose,1,female,1,1,80,s,1
Jack,3,male,0,0,5,s,0
