# Renaming Columns
**Use Case:** Shortening column names, fixing typos, removing spaces, etc.

In [None]:
import pandas as pd
# Map each existing column label to its replacement and rebind df to the result.
df = df.rename(
    columns={
        'old_name': 'new_name',
    },
)

# Dropping Columns
**Use Case:**
Removing unnecessary columns.

In [None]:
import pandas as pd
# Remove the listed columns and rebind df to the resulting frame.
df = df.drop(
    columns=[
        'column_to_drop',
    ],
)

# Pipelines
**Use Case:** \
To increase code simplicity, Pipelines are used. \
Pipelines allow you to conduct many steps such as preprocessing \
in minimal amounts of code.

**Code Logic:** \
First, define the steps of your pipeline (ensure each step is compatible with pipelines). \
Second, create the pipeline with the aforementioned steps. \
Thirdly, use the pipeline. Common methods are fit, transform, fit_transform, and predict.

**Actionable Next Steps:** \
Feature selection \
Model Evaluation \
Feature Engineering

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Define the steps of the pipeline as (name, estimator) pairs
steps = [
    ('scaler', StandardScaler()),  # Transformer
    ('model', LogisticRegression()),  # Estimator
]

# Create the pipeline
pipeline = Pipeline(steps)

# Hold out 20% of the rows for evaluation; X (features) and y (target) must
# already be defined before this cell runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit every step on the training split only, then predict on the held-out split
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

# Custom Transformer Pipelines
**Use Case:** \
Create your own preprocessing pipelines when you have a transformation \
unique to your dataset (e.g. string transformations)

**Code Logic:** \
First, define a class name and pass BaseEstimator (for parameter tuning) \
and TransformerMixin (for transform & fit methods) \
Second, define the constructor (\_\_init\_\_) \
Third, define the fit method \
Fourth, define the transform method

Afterwards, your custom transformer class can be added to the pipeline steps, as in the previous code cell.

**Actionable Next Steps:** \
Feature selection \
Model Evaluation \
Feature Engineering

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Template transformer whose example operation adds ``param1`` to its input."""

    def __init__(self, param1):
        # Keep the constructor argument unchanged under the same attribute name.
        self.param1 = param1

    def fit(self, X, y=None):
        """Learn nothing from the data and return the transformer itself.

        Return something other than a no-op fit only if the transformation
        must behave differently on train and test data.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``param1`` added to it.

        The input is copied first so the caller's data is not modified.
        Depending on the type of ``X``, the result may be a DataFrame, a
        Series or a numpy array.
        """
        return X.copy() + self.param1  # an example operation using 'param1'

# `value` is a placeholder: define it before running this cell.
pipeline = Pipeline(
    steps=[
        ('custom', CustomTransformer(param1=value)),
        # ... other steps in the pipeline ...
    ],
)

# GridSearchCV with Pipelines
**Use Case:** \
This lets you combine cross-validation with preprocessing in a way that prevents data leakage. \
Moreover, you can tune the hyperparameters of each step of the Pipeline.

**Code Logic:** \
First, initialize a dictionary with the name of the parameters you want to tune. \
Second, pass the pipeline into GridSearchCV.

**Actionable Next Steps:** \
Feature selection \
Model Evaluation \
Feature Engineering

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Pipeline steps as (name, estimator) pairs; the names are the prefixes used in the grid below
steps = [
    ('scaler', StandardScaler()),  # Transformer
    ('model', LogisticRegression()),  # Estimator
]

# Build the pipeline from those steps
pipeline = Pipeline(steps=steps)

# Parameter grid: each key is '<step name>__<parameter name>'
param_grid = dict(
    scaler__with_mean=[True, False],
    model__C=[0.1, 1.0, 10.0],
)

# Grid search over the whole pipeline with 5-fold cross-validation
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)

# KBinsDiscretizer
**Use Case:** \
Good for reducing noise, handling outliers, or improving model performance/simplicity.

**Code Logic:**
* columns_to_bin should be initialized with a list of column names you want to discretize.
* adjust the parameters of KBinsDiscretizer as needed
* perform fit_transform on training data and transform on test data
* lastly, you can print the bin edges learned during fitting; this is optional and not required for the discretization itself

**Actionable Next Steps:** \
EDA \
Feature Engineering \
Modeling 

**Notes:** \
[Full documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer

# Work on copies so df1 and df2 themselves are left untouched
X1 = df1.copy()
X2 = df2.copy()

# Define the columns you want to bin
columns_to_bin = ['Age']

# Define the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('discretizer', KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform'), columns_to_bin)
    ])),
])

# Fit the pipeline on the first frame, then reuse the learned bins on the second
X1_transformed = pipeline.fit_transform(X1)
X2_transformed = pipeline.transform(X2)

# Write the bin indices back into the binned columns.
# The raw arrays are assigned positionally: wrapping them in a new DataFrame
# would align on a fresh 0..n-1 index and produce NaNs whenever X1/X2 carry a
# different index (for example after a train/test split).
X1[columns_to_bin] = X1_transformed
X2[columns_to_bin] = X2_transformed

# Access the 'preprocessor' step in the pipeline
preprocessor = pipeline.named_steps['preprocessor']

# Access the 'discretizer' transformer in the preprocessor
discretizer = preprocessor.named_transformers_['discretizer']

# Print the bin edges of the first binned column
print(discretizer.bin_edges_[0])

[23.  31.8 40.6 49.4 58.2 67. ]


# Pipeline Example
**Use Case:** \
In this example, we are using a Pipeline to preprocess our data, model, and hyperparameter tune. 

**Code Logic:** \
During the preprocessing, we handle features differently based on the feature's datatype. The preprocessing pipeline is then neatly wrapped up and fed into a modeling pipeline, where it utilizes GridSearchCV to tune the hyperparameters.

**Next Steps**: \
Model Evaluation

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# Step 1: Identify the columns for each feature type.
# The lists are derived from X_train, the frame that is actually fitted below,
# so they cannot name a column (such as the target) that the pipeline never receives.
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns
boolean_features = X_train.select_dtypes(include=['bool']).columns
datetime_features = X_train.select_dtypes(include=['datetime']).columns

# Step 2: Create preprocessing pipelines for each feature type
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

boolean_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

# NOTE: DateFeatures is not defined or imported in this cell; it must be a
# user-defined transformer that is already available in the session.
datetime_transformer = Pipeline(steps=[
    ('date_features', DateFeatures()),
    ('imputer', SimpleImputer(strategy='most_frequent'))])

# Step 3: Use ColumnTransformer to apply the appropriate preprocessing to each subset of columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('bool', boolean_transformer, boolean_features),
        ('date', datetime_transformer, datetime_features)])

# Step 4: Add a model to the pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier())])

# Now you can fit and predict using the model pipeline
# (X_train, y_train and X_test must already be defined)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

from sklearn.model_selection import GridSearchCV

# Parameter grid: keys walk the nested pipeline as '<step>__<sub-step>__<parameter>'
param_grid = dict(
    preprocessor__num__imputer__strategy=['mean', 'median'],
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[None, 5, 10],
)

# Grid search over the full model pipeline with 5-fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

# Run the search on the training data
grid_search.fit(X_train, y_train)

# Best parameter combination and the pipeline refit with it
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_

# OrdinalEncoder
**Use Case:** \
Converts categorical data with ordinality into numerical data through ordinal encoding.

**Code Logic:**
1. Define a list of categories you want to encode (Ascending), match it with the order of the columns you are encoding.
2. Set the parameters for your ordinal encoder.
3. Define a list of columns you want to encode.
4. Use OrdinalEncoder in your pipeline.
5. Output the results

**Actionable Next Steps:** \
EDA \
Feature Engineering \
Modeling 

**Notes:** \
`remainder=passthrough` ensures that features that aren't transformed are still used. \
`min_frequency` requires a minimum version of sklearn 1.3. At the time of writing this (June 17th 2024), Kaggle notebooks use sklearn 1.2.2. I could not find any workarounds, so I switched to running the notebook locally in Jupyter Notebook. A workaround for Google Colab can be found in [this reddit post](https://www.reddit.com/r/GoogleColab/s/XxRXtUekR5)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Columns the OrdinalEncoder is applied to
columns_to_encode = ['Gender', 'Gender2']

# One category list per encoded column, in the same order as columns_to_encode;
# the position of a value in its list is the code it is encoded to
feature_categories = [
    ['M', 'F'],
    ['F', 'M'],
]

# OrdinalEncoder with explicit category orders (min_frequency needs sklearn >= 1.3)
ordinal_encoder = OrdinalEncoder(categories=feature_categories, min_frequency=10)

# Encode the selected columns and pass every other column through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', ordinal_encoder, columns_to_encode),
    ],
    remainder='passthrough',
)

# Wrap the preprocessor in a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform, then label the output columns with the generated feature names
output = pipeline.fit_transform(data2)
pd.DataFrame(output, columns=pipeline.named_steps['preprocessor'].get_feature_names_out())

Unnamed: 0,ord__Gender,ord__Gender2,remainder__Respondent,remainder__Age,remainder__Systolic Blood Pressure,remainder__Diastolic Blood Pressure,remainder__Height (cm),remainder__Weight (kg),remainder__BMI (kg/m2),remainder__Waist Circumference (cm),remainder__Fasting Glucose (mg/dL),remainder__Total Cholesterol (mg/dL),remainder__Triglycerides (mg/dL),remainder__Fat,remainder__Visceral Fat,remainder__Working Period
0,0.0,1.0,1.0,46.0,110.0,80.0,157.5,65.00,26.40,86.0,83.0,138.0,266.0,27.4,13.0,24.0
1,0.0,1.0,2.0,45.0,120.0,80.0,174.5,76.55,25.13,91.0,99.0,207.0,268.0,25.1,11.0,22.0
2,0.0,1.0,3.0,42.0,130.0,80.0,163.0,59.25,23.30,75.0,104.0,246.0,121.0,28.2,4.0,22.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,0.0,1.0,363.0,51.0,110.0,70.0,158.5,65.50,26.20,81.0,93.0,211.0,160.0,28.6,13.5,12.0
363,0.0,1.0,364.0,52.0,120.0,80.0,154.5,55.00,23.20,74.0,75.0,176.0,83.0,24.6,9.5,24.0
364,0.0,1.0,365.0,45.0,120.0,90.0,157.0,67.10,27.20,83.0,84.0,239.0,129.0,28.8,14.0,14.0


# Target Encoder
**Use Case:** \
This is useful when your categorical data has ordinality but you do not know the correct order. \
`TargetEncoder` implements K-Fold Target encoding to minimize data leakage.

**Code Logic:**
1. Set the parameters for TargetEncoder
2. Incorporate it into your pipeline
3. Output results

**Actionable Next Steps:** \
EDA \
Feature Engineering \
Modeling 

**Notes:** \
Target Encoding is prone to overfitting if your training data isn't a good representation of your testing data. \
Be careful when you have multiple preprocessing steps for a single column, you will need to create a separate pipeline to prevent duplicate columns from being generated. \
OrdinalEncoder isn't strictly necessary, it is just for example purposes.

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Columns that go through the ordinal -> target encoding chain
columns_to_encode = ['Gender']

# Define the OrdinalEncoder (example only; not strictly required before TargetEncoder)
ordinal_encoder = OrdinalEncoder(min_frequency=10)

# Define the TargetEncoder for a continuous target, with a fixed seed
target_encoder = TargetEncoder(target_type='continuous', random_state=314)

# Chain both encoders so the column is transformed once instead of being duplicated
inner_pipeline = Pipeline(steps=[
    ('ordinal', ordinal_encoder),
    ('target', target_encoder),
])

# Apply the chain to the selected columns and pass every other column through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('pipe', inner_pipeline, columns_to_encode),
    ],
    remainder='passthrough',
)

# Wrap the preprocessor in a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# NOTE(review): the target column is still part of data2, so remainder='passthrough'
# carries it into the output; drop it before using the result as model features.
output = pipeline.fit_transform(data2, data2['Total Cholesterol (mg/dL)'])
pd.DataFrame(output, columns=pipeline.named_steps['preprocessor'].get_feature_names_out())

Unnamed: 0,pipe__Gender,remainder__Respondent,remainder__Age,remainder__Systolic Blood Pressure,remainder__Diastolic Blood Pressure,remainder__Height (cm),remainder__Weight (kg),remainder__BMI (kg/m2),remainder__Waist Circumference (cm),remainder__Fasting Glucose (mg/dL),remainder__Total Cholesterol (mg/dL),remainder__Triglycerides (mg/dL),remainder__Fat,remainder__Visceral Fat,remainder__Working Period
0,200.295974,1.0,46.0,110.0,80.0,157.5,65.00,26.40,86.0,83.0,138.0,266.0,27.4,13.0,24.0
1,198.020538,2.0,45.0,120.0,80.0,174.5,76.55,25.13,91.0,99.0,207.0,268.0,25.1,11.0,22.0
2,198.020538,3.0,42.0,130.0,80.0,163.0,59.25,23.30,75.0,104.0,246.0,121.0,28.2,4.0,22.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,198.020538,363.0,51.0,110.0,70.0,158.5,65.50,26.20,81.0,93.0,211.0,160.0,28.6,13.5,12.0
363,199.475641,364.0,52.0,120.0,80.0,154.5,55.00,23.20,74.0,75.0,176.0,83.0,24.6,9.5,24.0
364,199.475641,365.0,45.0,120.0,90.0,157.0,67.10,27.20,83.0,84.0,239.0,129.0,28.8,14.0,14.0
