In [390]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
import pickle

In [391]:
df = pd.read_csv('/content/train.csv')
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [392]:
df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
df.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S


In [393]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Survived']), df['Survived'], test_size=0.2, random_state=0)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 7), (179, 7), (712,), (179,))

In [394]:
X_train['Embarked'].value_counts()

Unnamed: 0_level_0,count
Embarked,Unnamed: 1_level_1
S,517
C,129
Q,64


In [395]:
y_train.head()

Unnamed: 0,Survived
140,0
439,0
817,0
378,0
491,0


<h1>Column Transformer</h1>

In [396]:
# Imputation transformer
tr1 = ColumnTransformer([
    ('impute_age', SimpleImputer(), [2]),
    ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])
], remainder='passthrough')

In [397]:
# OneHotEncoder
tr2 = ColumnTransformer([
    ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore',dtype=np.float64), [1,6])
], remainder='passthrough')

In [398]:
# Scaling
tr3 = ColumnTransformer([
    ('scale', MinMaxScaler(),slice(0,10))
])

In [399]:
# Feature Selection
# tr4 = SelectKBest(score_func=chi2, k=1)
# Keeping all the column as k=10 because of accuracy

In [400]:
# Train the model
tr5 = DecisionTreeClassifier()

<h1>Create Pipeline</h1>

In [401]:
pipe = Pipeline([
    ('tr1',tr1),
    ('tr2',tr2),
    ('tr3',tr3),
    # ('tr4',tr4),
    ('tr5',tr5)
])

In [402]:
# Alternate Syntax
# pipe = make_pipeline(tr1,tr2,tr3,tr4,tr5)

In [403]:
# Train
pipe.fit(X_train,y_train)


#Remeber

# WE dont call pipe.fit when we are not training the model in the pipeline
# WE call pipe.fit_transform there

In [404]:
# Predict

y_pred = pipe.predict(X_test)

In [405]:
y_pred

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0])

In [406]:
accuracy_score(y_test,y_pred)*100

67.59776536312849

Cross Validation

In [407]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,X_train,y_train,cv=5,scoring='accuracy').mean()

0.626484782822811

GridSearch using Pipeline

In [408]:
params = {
    'tr5__max_depth':[1,2,3,4,5,None]
}

In [409]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe,params,cv=5,scoring='accuracy')
grid.fit(X_train,y_train)

In [410]:
grid.best_score_

0.626484782822811

In [411]:
grid.best_params_

{'tr5__max_depth': None}

<h1>Export the pipeline</h1>

In [412]:
pickle.dump(pipe,open('pipe.pkl','wb'))