In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as TSP
from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.compose import ColumnTransformer as CT
from sklearn.impute import SimpleImputer as SI
from sklearn.preprocessing import MinMaxScaler as MMS
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.tree import DecisionTreeClassifier as DTC

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [4]:
x_train, x_test, y_train, y_test = TSP(df.drop(columns=['Survived']), 
                                       df['Survived'],
                                       test_size = 0.2,
                                       random_state = 42) 

In [5]:
x_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


In [6]:
trf1 = CT([ 
    ('impute_age',SI(),[2]),
    ('imput_embarked', SI(strategy='most_frequent'),[6])
],remainder = 'passthrough')

In [7]:
trf2 = CT([
    ( 'ohe_sex_embarked',OHE(sparse=False, handle_unknown='ignore'),[1,6] )
],remainder='passthrough')

In [14]:
trf3 =CT([
    ('scale',MMS(),slice(0,10))
])

In [15]:
trf4=SelectKBest(score_func=chi2,k=8)

In [16]:
trf5=DTC()

In [17]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])
# or
# pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [19]:
pipe.fit(x_train,y_train)

In [22]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('imput_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x7fb8c31e5a60>),
 'trf5': DecisionTreeClassifier()}

In [23]:
y_pred = pipe.predict(x_test)

In [25]:
from sklearn.metrics import accuracy_score as acc

In [26]:
acc(y_test, y_pred)

0.6256983240223464

In [27]:
import pickle

In [28]:
pickle.dump(pipe, open('pipe.pkl','wb'))