In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder , OrdinalEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  accuracy_score
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv("Datasets/train.csv")

In [3]:
df

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Pipelining using Column Transform Method

Fitted pipeline on full data :

In [4]:
df.drop(["name" , "ticket" , "cabin" , "embarked"] , axis = 1 , inplace = True)

In [5]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare
0,0,3,male,22.0,1,0,7.2500
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.9250
3,1,1,female,35.0,1,0,53.1000
4,0,3,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000
887,1,1,female,19.0,0,0,30.0000
888,0,3,female,,1,2,23.4500
889,1,1,male,26.0,0,0,30.0000


In [6]:
preprocessor = ColumnTransformer([
    ("o" , OrdinalEncoder(categories = [["male" , "female"]]),[2]),
    #("ohe" , OneHotEncoder(drop = "first" , sparse_output = False) , (2)),
    ("simple" , SimpleImputer(strategy = "mean"),[3]),
    ("s", StandardScaler(), slice(3,4))
], remainder = "passthrough")

In [7]:
pipeline = Pipeline([
    ('preprocessing', preprocessor)
])

In [8]:
pipeline

In [9]:
X_transformed = pipeline.fit_transform(df)

In [10]:
X_transformed

array([[ 0.        , 22.        , -0.53037664, ...,  1.        ,
         0.        ,  7.25      ],
       [ 1.        , 38.        ,  0.57183099, ...,  1.        ,
         0.        , 71.2833    ],
       [ 1.        , 26.        , -0.25482473, ...,  0.        ,
         0.        ,  7.925     ],
       ...,
       [ 1.        , 29.69911765,         nan, ...,  1.        ,
         2.        , 23.45      ],
       [ 0.        , 26.        , -0.25482473, ...,  0.        ,
         0.        , 30.        ],
       [ 0.        , 32.        ,  0.15850313, ...,  0.        ,
         0.        ,  7.75      ]])

Train/test split pipeline :

In [11]:
x_train , x_test , y_train , y_test = train_test_split(df.drop(["survived"],axis=1),df["survived"],test_size=0.3,random_state=21)

In [12]:
x_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare
241,3,female,,1,0,15.5000
734,2,male,23.0,0,0,13.0000
581,1,female,39.0,1,1,110.8833
456,1,male,65.0,0,0,26.5500
118,1,male,24.0,0,1,247.5208
...,...,...,...,...,...,...
368,3,female,,0,0,7.7500
48,3,male,,2,0,21.6792
772,2,female,57.0,0,0,10.5000
824,3,male,2.0,4,1,39.6875


In [13]:
c1 = ColumnTransformer([("simple" , SimpleImputer(strategy = "mean"),[2])] , remainder = "passthrough")

In [14]:
c2 = ColumnTransformer([("o" , OneHotEncoder(drop = "first" , sparse_output = False) , (1))] , remainder = "passthrough")

In [15]:
c3 = ColumnTransformer([("s", StandardScaler(), slice(2,3))] , remainder = "passthrough")

In [16]:
c4 = LogisticRegression()

In [17]:
q= make_pipeline([c2,c1,c3,c4])

In [18]:
q

NOTE: q.fit_transform(x_train) -You can't use fit_transform here.

WHY: You can't use fit_transform in this case because you're applying multiple transformers (like SimpleImputer, OneHotEncoder, and StandardScaler) in
     sequence within a pipeline

   : Each of these transformers requires data in specific formats, and the output from one step is passed to the next. Using fit_transform on the  
    pipeline would apply all transformations at once, and since the transformers in the pipeline operate on different columns, fit_transform won't 
    handle the sequential transformations correctly.

SOLUTION: Use Approach as used in "Fitted pipeline on full data".

Another Example of Pipelining


In [19]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare
0,0,3,male,22.0,1,0,7.2500
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.9250
3,1,1,female,35.0,1,0,53.1000
4,0,3,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000
887,1,1,female,19.0,0,0,30.0000
888,0,3,female,,1,2,23.4500
889,1,1,male,26.0,0,0,30.0000


In [20]:
a1 = SimpleImputer()

In [21]:
a2 = OneHotEncoder(drop="first" , sparse_output = False)

In [22]:
a3 = StandardScaler()

In [23]:
b = make_pipeline(a2,a1,a3)

In [24]:
b

In [25]:
b.fit_transform(df)

array([[-0.78927234, -0.51015154,  0.90258736, ..., -0.0474312 ,
        -0.06715343, -0.05812382],
       [ 1.2669898 , -0.51015154, -1.10792599, ..., -0.0474312 ,
        -0.06715343, -0.05812382],
       [ 1.2669898 , -0.51015154,  0.90258736, ..., -0.0474312 ,
        -0.06715343, -0.05812382],
       ...,
       [-0.78927234, -0.51015154,  0.90258736, ..., -0.0474312 ,
        -0.06715343, -0.05812382],
       [ 1.2669898 , -0.51015154, -1.10792599, ..., -0.0474312 ,
        -0.06715343, -0.05812382],
       [-0.78927234, -0.51015154,  0.90258736, ..., -0.0474312 ,
        -0.06715343, -0.05812382]])

# Pipelining for Matrix Method 

In [26]:
x_train , x_test , y_train , y_test= train_test_split(df.drop(["survived"],axis=1),df["survived"],test_size=0.3,random_state=21)

In [27]:
a1 = SimpleImputer()

In [28]:
a2 = OneHotEncoder(drop="first" , sparse_output = False)

In [29]:
a3 = StandardScaler()

In [30]:
a4 = LogisticRegression()

In [31]:
k = make_pipeline(a2,a1,a3 , a4)

In [32]:
k.fit(x_train , y_train)

In [33]:
k.fit(x_train , y_train)
k.fit(x_test , y_test)

In [34]:
k = k.predict(x_test)

In [35]:
accuracy_score(y_test , k)

0.9813432835820896