## Machine Learning Pipeline:
   **_It is a sequence of data processing and modeling steps organized in a systematic workflow._**

In [None]:
Imputation ---> Encoding ---> Feature Scaling ---> Feature Selection ---> ML Algorithm
   |                                                                             |
   ↓                                                                             ↓
  Data                                                                         Output

In [14]:
#import necessary packages
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import set_config
set_config(display='diagram')
import seaborn as sns

In [2]:
#load data
df = sns.load_dataset('titanic')
df.drop(columns=['adult_male','alone','alive','deck','embark_town','class','who'],inplace=True)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [3]:
df.isnull().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:, 0], test_size=0.3, random_state=2)

In [5]:
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
451,3,male,,1,0,19.9667,S
345,2,female,24.0,0,0,13.0,S
687,3,male,19.0,0,0,10.1708,S
279,3,female,35.0,1,1,20.25,S
742,1,female,21.0,2,2,262.375,C


In [None]:
1. Imputation ---> age, embark_town
2. encoding (OHE) ---> sex, embrak_town, class
3. Feature Scaling ---> every column
4. model selection ----> 7


In [6]:
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
451,3,male,,1,0,19.9667,S
345,2,female,24.0,0,0,13.0,S
687,3,male,19.0,0,0,10.1708,S
279,3,female,35.0,1,1,20.25,S
742,1,female,21.0,2,2,262.375,C


In [7]:
trf1 = ColumnTransformer([
    ('age_imput', SimpleImputer(), [2]),
    ('embark_town impute', SimpleImputer(strategy='most_frequent'),[6])
], remainder='passthrough')

In [8]:
trf2 = ColumnTransformer([
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore', dtype=int), [1,6])
], remainder='passthrough')

In [9]:
trf3 = ColumnTransformer([
    ('scaling', MinMaxScaler(), slice(0,10))
])

In [10]:
trf4 = SelectKBest(score_func=chi2, k=8)


In [15]:
trf5 = DecisionTreeClassifier(random_state=2)

In [12]:
pipeline = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf4', trf4),
    ('trf5', trf5)
])

In [17]:
pipe = make_pipeline(trf1, trf2, trf3, trf4, trf5)

In [18]:
pipe.fit(X_train, y_train)



In [21]:
y_pred = pipe.predict(X_test)

In [22]:
accuracy_score(y_test, y_pred)

0.6417910447761194

# Explore the Pipeline

In [38]:
pipe.named_steps['columntransformer-1'].transformers_[0][1].statistics_

array([29.5005123])

In [42]:
pipe.named_steps['columntransformer-2'].transformers_[0][1].categories_


[array(['C', 'Q', 'S'], dtype=object),
 array([0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.75, 6.8583, 6.95, 6.975,
        7.0458, 7.05, 7.125, 7.1417, 7.225, 7.2292, 7.25, 7.3125, 7.4958,
        7.5208, 7.55, 7.6292, 7.65, 7.7292, 7.7333, 7.7375, 7.7417, 7.75,
        7.775, 7.7875, 7.7958, 7.8, 7.8292, 7.8542, 7.875, 7.8792, 7.8875,
        7.8958, 7.925, 8.05, 8.1375, 8.3, 8.3625, 8.4583, 8.5167, 8.6542,
        8.6625, 8.6833, 8.7125, 9.2167, 9.225, 9.35, 9.475, 9.4833, 9.5,
        9.5875, 9.825, 9.8375, 10.1708, 10.5, 10.5167, 11.1333, 11.2417,
        11.5, 12.0, 12.275, 12.2875, 12.35, 12.475, 12.525, 12.65, 12.875,
        13.0, 13.4167, 13.5, 13.8583, 13.8625, 14.4, 14.4542, 14.4583,
        14.5, 15.0, 15.0458, 15.1, 15.2458, 15.5, 15.55, 15.7417, 15.75,
        15.85, 15.9, 16.1, 17.4, 17.8, 18.0, 18.75, 18.7875, 19.2583, 19.5,
        19.9667, 20.2125, 20.25, 20.525, 20.575, 21.0, 21.075, 21.6792,
        22.025, 22.3583, 23.0, 23.25, 23.45, 24.0, 24.15, 25.4667, 25.5875,
      