# Renaming Columns
**Use Case:** Shortening column names, fixing typos, removing spaces, etc.

In [None]:
import pandas as pd
# Relabel columns using an {old label: new label} mapping. The result is
# assigned back to `df` because rename is not applied in place here.
df = df.rename({'old_name': 'new_name'}, axis='columns')

# Dropping Columns
**Use Case:**
Removing unnecessary columns.

In [None]:
import pandas as pd
# Remove the listed column labels. The result is assigned back to `df`
# because drop is not applied in place here.
df = df.drop(['column_to_drop'], axis='columns')

# Pipelines
**Use Case:** \
To increase code simplicity, Pipelines are used. \
Pipelines allow you to conduct many steps such as preprocessing \
in minimal amounts of code.

**Code Logic:** \
First, define the steps of your pipeline (ensure each step is compatible with pipelines). \
Second, create the pipeline with the aforementioned steps. \
Thirdly, use the pipeline. Common methods are fit, transform, fit_transform, and predict.

**Actionable Next Steps:** \
Feature selection \
Model Evaluation \
Feature Engineering

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Define the steps of the pipeline as (name, estimator) pairs.
# Every step before the last is a transformer; the last is the estimator.
steps = [
    ('scaler', StandardScaler()),  # Transformer
    ('model', LogisticRegression()),  # Estimator
]

# Create the pipeline
pipeline = Pipeline(steps)

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible. X (features) and y (target) are expected to be
# defined before this cell runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit every step on the training split only, then predict on the held-out split.
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

# Custom Transformer Pipelines
**Use Case:** \
Create your own preprocessing pipelines when you have a transformation \
unique to your dataset (e.g. string transformations)

**Code Logic:** \
First, define a class that inherits from BaseEstimator (provides get_params/set_params, needed for parameter tuning) \
and TransformerMixin (provides fit_transform, built from your fit and transform methods) \
Second, define the constructor (\_\_init\_\_) \
Third, define the fit method \
Fourth, define the transform method

Afterwards, your custom transformer class can be added to the pipeline's steps, just like the built-in steps in the previous code cell.

**Actionable Next Steps:** \
Feature selection \
Model Evaluation \
Feature Engineering

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Template transformer that adds ``param1`` to its input."""

    def __init__(self, param1):
        # Keep the constructor argument as an attribute of the same name.
        self.param1 = param1

    def fit(self, X, y=None):
        """Return ``self`` without learning anything from ``X`` or ``y``.

        Only do more here when something must be computed on the training
        data and reused on the test data.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``param1`` added (example operation).

        The returned type follows ``X``; depending on the data you may need
        to return a DataFrame, a Series or a numpy array.
        """
        # Copy first so the caller's dataset is never modified.
        return X.copy() + self.param1

# Register the custom transformer as a named pipeline step.
# `value` is a placeholder: replace it with the argument you want for param1.
pipeline = Pipeline(steps=[
    ('custom', CustomTransformer(param1=value)),
    # ... other steps in the pipeline ...
])

# GridSearchCV with Pipelines
**Use Case:** \
This lets you combine cross-validation with preprocessing: the pipeline is refit inside each fold, which prevents data leakage. \
Moreover, you can tune the hyperparameters of every step of the Pipeline.

**Code Logic:** \
First, initialize a dictionary with the name of the parameters you want to tune. \
Second, pass the pipeline into GridSearchCV. Parameter names follow the pattern `<step name>__<parameter name>` (e.g. `model__C`).

**Actionable Next Steps:** \
Feature selection \
Model Evaluation \
Feature Engineering

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Pipeline steps as (name, estimator) pairs: a scaler followed by the model.
steps = [
    ('scaler', StandardScaler()),  # Transformer
    ('model', LogisticRegression()),  # Estimator
]

# Assemble the pipeline from the named steps
pipeline = Pipeline(steps=steps)

# Candidate values to search, keyed as '<step name>__<parameter name>'
param_grid = {
    'scaler__with_mean': [True, False],
    'model__C': [0.1, 1.0, 10.0],
}

# Grid search over the whole pipeline with 5-fold cross-validation.
# Nothing is fitted in this cell; it only constructs the search object.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)

# KBinsDiscretizer
**Use Case:** \
Good for reducing noise, handling outliers, or improving model performance/simplicity.

**Code Logic:**
* columns_to_bin should be initialized with a list of column names you want to discretize.
* adjust the parameters of KBinsDiscretizer as needed
* perform fit_transform on training data and transform on test data
* lastly, you can print the bin edges learned by the fitted discretizer (its `bin_edges_` attribute). However, this is not strictly needed to perform discretization

**Actionable Next Steps:** \
EDA \
Feature Engineering \
Modeling 

**Notes:** \
[Full documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer

# Work on copies so the source frames are left untouched.
# X1 is used for fitting (training data) and X2 is only transformed (test data).
X1 = df1.copy()
X2 = df2.copy()

# Define the columns you want to bin
columns_to_bin = ['Age']

# Define the pipeline: the ColumnTransformer applies the discretizer to
# columns_to_bin only, so its output holds just the binned columns.
pipeline = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('discretizer', KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform'), columns_to_bin)
    ])),
])

# Learn the bin edges from X1 only, then reuse those same edges on X2.
X1_transformed = pipeline.fit_transform(X1)
X2_transformed = pipeline.transform(X2)

# Write the bin indices back into the frames. The transformed arrays are
# assigned directly (row by row, in order) rather than wrapped in a new
# DataFrame: a new DataFrame would get a fresh 0..n-1 index and pandas would
# align it on index labels, producing NaN / misplaced values whenever X1 or X2
# does not have a default index (e.g. after a train/test split).
X1[columns_to_bin] = X1_transformed
X2[columns_to_bin] = X2_transformed

# Access the 'preprocessor' step in the pipeline
preprocessor = pipeline.named_steps['preprocessor']

# Access the fitted 'discretizer' transformer in the preprocessor
discretizer = preprocessor.named_transformers_['discretizer']

# Print the bin edges of the first binned column
print(discretizer.bin_edges_[0])

[23.  31.8 40.6 49.4 58.2 67. ]
