# **ML Pipelines**  
* Automating processes in an ordered way.  
* Output of one step can be input to the next.

In [22]:
import numpy as np
import pandas as pd

In [23]:
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.pipeline import Pipeline, make_pipeline

In [24]:
df = pd.read_csv("assets/Titanic-Dataset.csv")
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Plan
|ImputeMissingValues|--->|OHE|--->|Scaling|--->|FeatureSelection|--->|ModelTraining|--->Output

In [25]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [26]:
X_train,X_test,y_train,y_test = train_test_split(df.drop('Survived',axis=1),
                                                df['Survived'],
                                                test_size=0.2,
                                                random_state=42)
X_train.shape,X_test.shape

((712, 7), (179, 7))

In [27]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [28]:
#first step: imputation
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]), # passing the column through index is good strategy because the output of columntransformer is numpy array. .
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6]) # the next steps in pipeline might tweak if we pass the column name because np-arrays dont have column names
],remainder='passthrough')

In [29]:
#second step: one hot encoding
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6])# no need to do drop first because we are using dicision tree not any linear algorightm which uses euqlidian distance.
],remainder='passthrough')

In [30]:
# step 3: Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))# why (0,10)? see following md cell
])

In [54]:
type(trf3)

sklearn.compose._column_transformer.ColumnTransformer

after step 2:OHE, two columns got removed (sex,emarked) and inplace of sex, two new columns got added (sex_male,sex_female) and same for embarked, three new got added (Embarked_S,Embarked_C,Embarked_Q)  
initially we had 7. 2 got removed: 7-2=5 and then 3+2=5 got added totally making 5+5=10 columns

In [31]:
# step 4: Feature Selection (filhal purr krao isy)
# trf4 = SelectKBest(score_func=chi2,k=8)

In [32]:
# step 5: Model Training
trf5 = DecisionTreeClassifier()

## Creating Pipeline


In [33]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

### Pipeline vs make_pipeline:
make_pipline does't require names or transformations.  
same aplies to ColumnTransformer and make_column_transformer

In [34]:
# Alternative syntax
pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

If there is no model training step at the end of pipeline, you call pipe.fit_transform(), otherwise only pipe.fit() because it allowes calling .predict() afterwards

In [35]:
#train
pipe.fit(X_train,y_train)

In [36]:
from sklearn import set_config
set_config(display='diagram')

## Explore the pipeline

In [37]:
pipe.named_steps # here if I had gone with alternative syntax of defining pipe... I wouldn't have gotten the keys in the dictionary. so we got improved readability

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf5': DecisionTreeClassifier()}

In [38]:
# we can get a lot of information from this dict up there
pipe.named_steps['trf1'].transformers_[0][1].statistics_ # age was imputed with this value.. 

array([29.49884615])

In [39]:
#lets find out how embarked imputation went
pipe.named_steps['trf1'].transformers_[1][1].statistics_

array(['S'], dtype=object)

In [40]:
y_pred = pipe.predict(X_test)

In [41]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)
#feature selection is causing the reduce in percentage here... that wasn't even needed we just did that to get the idea of intergrating it into pipeline.

0.6256983240223464

## Cross Validation using Pipeline
_will do in detail in future, now including because its related to pipelines_

In [42]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,X_train,y_train,cv=5,scoring='accuracy').mean()

np.float64(0.6391214419383433)

## Grid Search Using Pipeline
_will do in detail in future, now including because its related to pipelines_

In [43]:
params = {
'trf5__max_depth':[1,2,3,4,5,None]
}

In [45]:
from sklearn.model_selection import GridSearchCV
grid= GridSearchCV(pipe,params,cv=5,scoring='accuracy')
grid.fit(X_train,y_train)

In [46]:
grid.best_score_

np.float64(0.6391214419383433)

In [47]:
grid.best_params_

{'trf5__max_depth': 2}

## Exporting the Pipeline

In [51]:
import pickle
pickle.dump(pipe,(open('output/pipe.pkl','wb')))