In [3]:
import numpy as np
import pandas as pd

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

In [5]:
df = pd.read_csv("C:\\ML and DL\\learning_pipelines\\Titanic-Dataset.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## LETS BEGIN

In [6]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2, random_state=42)

In [8]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [9]:
# imputation transformer
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),   # good practice to call the cols with index value(like in code as [2]) rather than there name as we pass this to next step
                                          # also another reason is the output of this step will be a numpy array not a dataframe
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6]) 
],remainder='passthrough')   # using 'passthrough' so that the other cols got ignored and don't got dropped 

In [10]:
# one hot encoder
trf2 = ColumnTransformer([
    ('OHE_sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [11]:
# Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(), slice(0,8))  # (0,10) means applying scaling on all values 
                                          # also (0,10) is because the sex col divied into 2 cols after OHE similarly embarked after OHE got 3 col so total cols now is 2 + 3 + 5(remaining) = 10
])

In [12]:
# feature selection
trf4 = SelectKBest(score_func=chi2, k=8)          # selecting best K cols from the dataset

In [13]:
# train the model
trf5 = DecisionTreeClassifier()

## Now the steps/chains have been created it's time to make a pipeline

In [14]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5),
])

### Pipeline require naming the steps whereas make_pipeline does not require naming 

In [15]:
# Alternate make_pipeline
mp_pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)      # this is mostly used 

In [16]:
pipe.fit(X_train,y_train)  # we can .fit() or .fit_transform()     # .fit() is used when we have included a model in the pipeline
                                                                   # .fit_transform() is used when we have only done data transformation

0,1,2
,steps,"[('trf1', ...), ('trf2', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('impute_age', ...), ('impute_embarked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('OHE_sex_embarked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('scale', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,score_func,<function chi...0020E0007E840>
,k,8

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


## Display pipeline

In [17]:
from sklearn import set_config
set_config(display='diagram')

In [18]:
# chedking some values of the pipelines steps
pipe.named_steps['trf1'].transformers_[0][1].statistics_    # here i m checking the mean value of mean of 'age' column

array([29.49884615])

In [19]:
pipe.named_steps['trf3'].transformers_[0][1]

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [20]:
# prediction
y_pred = pipe.predict(X_test)

In [21]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0])

In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)

0.6256983240223464

# Cross Validation using Pipeline

In [26]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

np.float64(0.6391214419383433)

# GridSearch using Pipeline

In [55]:
# gridsearchcv
arr = np.arange(1,16,1)
arr = np.append(arr,None)
params = {
    'trf5__max_depth': arr   # here trf5 is model , max_depth is parameter in decision tree(and we want to check max_depth at which cv is best for higher accuracy) and format is 'trf5__max_depth' is performed for cv1,2,3,4,5(cv means the corss validation dataset during train test split in cv1 , cv2, cv3 , cv4, cv5)
}

In [56]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV( pipe, params, cv=15, scoring='accuracy')
grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'trf5__max_depth': array([1, 2, ... dtype=object)}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,15
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('impute_age', ...), ('impute_embarked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('OHE_sex_embarked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('scale', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,score_func,<function chi...0020E0007E840>
,k,8

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [57]:
grid.best_score_

np.float64(0.6391548463356974)

In [58]:
grid.best_params_

{'trf5__max_depth': 1}

# Exporting Pipeline 

In [60]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))  

# Conclusion

### Now in codefile"without pipeline" i need to export 'OHE_sex.pkl','OHE_embarked.pkl','clf.pkl' 
### But here using pipeline i just need to make a pipeline and export only pipeline as 'pipe.pkl'