In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [3]:
df=pd.read_csv('train.csv')
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [6]:
x_train,x_test,y_train,y_test=train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2,random_state=42)

# impute transform


In [38]:
trf1=ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('imput_Embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')


In [39]:
# onehot encoding
trf2=ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [40]:
# scaling
trf3=ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])

In [41]:
# featue Selection
trf4=SelectKBest(score_func=chi2,k=8)

In [42]:
# train the model
trf5=DecisionTreeClassifier()


In [43]:
# pipelining
pipe=make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [44]:
pipe.fit(x_train,y_train)

In [37]:
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('imput_Embarked',
                                  SimpleImputer(strategy='most frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'selectkbest': SelectKBest(k=8, score_func=<function chi2 at 0x000002608FFC3B00>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [47]:
# display pipeline
from sklearn import set_config
set_config(display='diagram')

In [48]:
y_pred=pipe.predict(x_test)

In [49]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6256983240223464

# cross validation using pipeline

In [50]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,x_train,y_train,cv=5,scoring='accuracy').mean()

0.6391214419383433

In [56]:
# Grid Search using pipeline
params = {
    'decisiontreeclassifier__max_depth':[1,2,3,4,5,None]
}

In [57]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)

In [58]:
grid.best_score_

0.6391214419383433

In [59]:
grid.best_params_

{'decisiontreeclassifier__max_depth': 2}

In [60]:
# Exporting the pipeline
import pickle 
pickle.dump(pipe,open('pipe.pkl','wb'))