# **Import necessary Modules**

In [None]:
from sklearn.pipeline import Pipeline
##feature Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# **Define steps**

In [None]:
# Ordered (name, estimator) pairs: the scaler runs first, the classifier last.
steps = [
    ("standard_scaler", StandardScaler()),
    ("classifier", LogisticRegression()),
]

In [None]:
# Show the list of (name, estimator) tuples that will make up the pipeline.
steps

[('standard_scaler', StandardScaler()), ('classifier', LogisticRegression())]

# **Convert these steps into a pipeline**

# **Choosing Between Pipeline and make_pipeline**
**Use Pipeline when:**
- You need explicit control over step names, particularly if you are going to access or modify specific steps or parameters after the pipeline has been created.
- You want more readable code that explicitly shows what each step in the pipeline does, which can be important for code maintenance or collaboration.

# **EXAMPLE:**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Each step is an explicit (name, estimator) pair, listed in execution order:
# scale the features, project onto 2 principal components, then classify.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('logistic', LogisticRegression()),
])


**Use make_pipeline when:**
- You need a quick and easy way to create a pipeline and do not need to refer back to its components by name.
- The pipeline is straightforward, and there is no need to access individual steps or modify parameters after the initial setup.

# **EXAMPLE:**

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Only the estimators are passed, in execution order; no step names are
# supplied by the caller here.
pipe = make_pipeline(StandardScaler(), PCA(n_components=2), LogisticRegression())

# **convert into pipeline**

In [None]:
# Build the pipeline from the (name, estimator) list held in `steps`.
# This rebinds `pipe`, replacing the object assigned in the example cells above.
pipe=Pipeline(steps)

In [None]:
##visualize Pipeline
from sklearn import set_config

In [None]:
# Select the "diagram" display mode so estimators are visualised when shown.
set_config(display="diagram")

In [None]:
# Display the pipeline object built from `steps`.
pipe

In [None]:
##creating a dataset
from sklearn.datasets import make_classification

# Generate 1000 synthetic samples for a classification problem.
# random_state is fixed so that the data -- and therefore every fitted model
# and prediction further down -- is identical on each Restart & Run All.
X,y=make_classification(n_samples=1000, random_state=42)

In [None]:
# Dimensions of the generated feature matrix.
X.shape

(1000, 20)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for testing; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Inspect the training features that will be fed to the pipeline.
X_train

array([[-0.04273089,  0.62733708,  0.73184038, ...,  0.3161405 ,
        -0.69896992,  1.64448249],
       [ 1.21701213,  0.91859103, -0.61566102, ...,  0.47032105,
        -0.69394003,  0.78492765],
       [ 0.12829608,  0.66924697, -0.02577067, ..., -0.11017591,
         1.56790864, -0.03463564],
       ...,
       [-1.00451509, -1.5852426 , -0.66870921, ...,  0.32671781,
         0.49407756,  0.00962666],
       [ 0.51858508,  1.74562552,  0.45465545, ...,  1.11683758,
         0.35944106, -0.13306576],
       [ 0.16536114, -0.98364616,  1.02300279, ...,  2.03273883,
         0.66776423, -0.47030654]])

In [None]:
# Fitting the pipeline fit_transforms X_train through the scaling step
# automatically, then fits the classifier on the transformed data.
pipe.fit(X_train, y_train)

In [None]:
# At prediction time X_test is only transformed (never re-fitted), using the
# parameters calculated from the training data, before the classifier predicts.
y_pred=pipe.predict(X_test)

In [None]:
# Show the predicted class labels for the test set.
y_pred

array([0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,

## **Example 2**
**Displaying a pipeline with a standard scaler, dimensionality reduction, and then an estimator**

In [None]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC



# **Define steps**

In [None]:
# Steps for Example 2: scale, reduce to 3 principal components, then an SVC.
# Note: this rebinds the `steps` name that the first example also used.
steps = [
    ("scaling", StandardScaler()),
    ("PCA", PCA(n_components=3)),
    ("SVC", SVC()),
]

# **Create pipeline**

In [None]:
# Build the second pipeline from the scaling -> PCA -> SVC step list.
pipe2=Pipeline(steps)

# **What if I want to check whether each step is working correctly?**
**I can index the pipeline by a step's name to run and inspect that step on its own.**

In [None]:
# Indexing the pipeline by a step name gives access to that single step, so
# its output can be inspected in isolation -- here, the scaled training data.
# NOTE(review): this presumably fits the scaler held inside pipe2 as a side
# effect; pipe2.fit() in the next cell fits the whole pipeline anyway.
pipe2['scaling'].fit_transform(X_train)

In [None]:
# Fit every step of the second pipeline on the training split.
pipe2.fit(X_train,y_train)

In [None]:
# Predict class labels for the held-out test split with the fitted pipeline.
pipe2.predict(X_test)

array([0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,

## **A more complex example using ColumnTransformer**

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
## numerical processing pipeline
import numpy as np

# Replace NaN entries with the column mean, then standardise the result.
numeric_processor = Pipeline(steps=[
    ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
    ("scaler", StandardScaler()),
])

In [None]:
# Display the numeric preprocessing pipeline.
numeric_processor

In [None]:
## categorical processing pipeline

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Fill missing entries with the constant "missing", then one-hot encode.
# The encoder is configured with handle_unknown="ignore".
categorical_processor = Pipeline(steps=[
    ("imputation_consatnt", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

In [None]:
# Display the categorical preprocessing pipeline.
categorical_processor

In [None]:
## combine processing techniques
from sklearn.compose import ColumnTransformer

In [None]:
# Route each named column group through its own sub-pipeline:
# "gender"/"City" go to the categorical processor, "age"/"height" to the numeric one.
preprocessor = ColumnTransformer(transformers=[
    ("categorical", categorical_processor, ["gender", "City"]),
    ("numerical", numeric_processor, ["age", "height"]),
])

In [None]:
# Display the combined column-wise preprocessor.
preprocessor

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
# Chain the column preprocessor with a logistic-regression classifier.
# This rebinds `pipe`, which earlier cells used for the first example.
pipe=make_pipeline(preprocessor,LogisticRegression())

In [None]:
# Display the full preprocessing + classifier pipeline.
pipe

# **EXAMPLE WORKING**

In [17]:
# End-to-end worked example: build a mixed numeric/categorical dataset, wrap the
# preprocessing and a classifier in a single pipeline, evaluate it, and use it
# to score previously unseen rows.
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One seed shared by every stochastic step so that a fresh Restart & Run All
# reproduces the same data, split, and accuracy.
RANDOM_STATE = 42
rng = np.random.default_rng(RANDOM_STATE)

# Create a synthetic dataset
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=RANDOM_STATE,
)
X = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(20)])
# Adding a categorical feature. It is drawn from the seeded generator rather
# than the unseeded global NumPy state, which made this column (and hence the
# reported accuracy) differ between runs.
X['gender'] = rng.choice(['male', 'female'], size=1000)
y = pd.Series(y, name='target')

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Numeric pipeline: mean-impute NaN values, then standardise.
num_col_pipeline = Pipeline([
    ('mean_imputation', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('standard_scaler', StandardScaler())
])

# Categorical pipeline: fill gaps with the constant 'missing', then one-hot encode.
cat_col_pipeline = Pipeline([
    ('const_imputation', SimpleImputer(fill_value='missing', strategy='constant')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing techniques using ColumnTransformer.
# Every column except 'gender' is treated as numeric.
preprocessor = ColumnTransformer([
    ('categorical', cat_col_pipeline, ['gender']),
    ('numerical', num_col_pipeline, [col for col in X.columns if col != 'gender'])
])

# Full pipeline with preprocessing and logistic regression
pipe = make_pipeline(preprocessor, LogisticRegression())

# Train the pipeline on the training data
pipe.fit(X_train, y_train)

# Predict on the test data
y_pred = pipe.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy:.2f}')

# Simulate new data: five rows where most features are 1.2, with a few
# overridden. Column order (feature_0..feature_19, then gender) matches the
# hand-written version this replaces.
new_values = {f'feature_{i}': [1.2] * 5 for i in range(20)}
new_values['feature_0'] = [0.5] * 5
new_values['feature_18'] = [0.1] * 5
new_values['feature_19'] = [0.3] * 5
new_values['gender'] = ['male', 'female', 'male', 'female', 'male']
new_data = pd.DataFrame(new_values)

# Use the trained pipeline to make predictions on new data
new_predictions = pipe.predict(new_data)
print(f'Predictions for new data: {new_predictions}')

Test Accuracy: 0.82
Predictions for new data: [0 0 0 0 0]
