### Ensuring Feature Consistency Between Training & InferencePipelines:

**Task 1**: Consistent Feature Preparation
- Step 1: Write a function for data preprocessing and imputation shared by both training and inference pipelines.
- Step 2: Demonstrate consistent application on both datasets.

In [5]:
import pandas as pd
from sklearn.impute import SimpleImputer

def preprocess_data(df, num_imputer=None, cat_imputer=None):
    num_cols = df.select_dtypes(include=["float64", "int64"]).columns
    cat_cols = df.select_dtypes(include=["object"]).columns
    if num_imputer is None:
        num_imputer = SimpleImputer(strategy="mean")
        df[num_cols] = num_imputer.fit_transform(df[num_cols])
    else:
        df[num_cols] = num_imputer.transform(df[num_cols])
    if cat_imputer is None:
        cat_imputer = SimpleImputer(strategy="most_frequent")
        df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])
    else:
        df[cat_cols] = cat_imputer.transform(df[cat_cols])
    return df, num_imputer, cat_imputer

train_df = pd.DataFrame({
    "age": [25, None, 35, 40],
    "gender": ["M", "F", None, "F"]
})

test_df = pd.DataFrame({
    "age": [30, None],
    "gender": ["F", None]
})

train_processed, num_imp, cat_imp = preprocess_data(train_df.copy())
test_processed, _, _ = preprocess_data(test_df.copy(), num_imputer=num_imp, cat_imputer=cat_imp)

(train_processed, test_processed)

(         age gender
 0  25.000000      M
 1  33.333333      F
 2  35.000000   None
 3  40.000000      F,
          age gender
 0  30.000000      F
 1  33.333333   None)

**Task 2**: Pipeline Integration
- Step 1: Use sklearn pipelines to encapsulate the preprocessing steps.
- Step 2: Configure identical pipelines for both training and building inference models.

In [6]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

train_df = pd.DataFrame({
    "age": [25, None, 35, 40],
    "gender": ["M", "F", None, "F"]
})

test_df = pd.DataFrame({
    "age": [30, None],
    "gender": ["F", None]
})

num_cols = train_df.select_dtypes(include=["float64", "int64"]).columns
cat_cols = train_df.select_dtypes(include=["object"]).columns

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean"))
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])

train_processed = pd.DataFrame(preprocessor.fit_transform(train_df), columns=num_cols.append(cat_cols))
test_processed = pd.DataFrame(preprocessor.transform(test_df), columns=num_cols.append(cat_cols))

(train_processed, test_processed)


(         age gender
 0       25.0      M
 1  33.333333      F
 2       35.0   None
 3       40.0      F,
          age gender
 0       30.0      F
 1  33.333333   None)

**Task 3**: Saving and Loading Preprocessing Models
- Step 1: Save the transformation model after fitting it to the training data.
- Step 2: Load and apply the saved model during inference.

In [7]:
import joblib

joblib.dump(preprocessor, "preprocessor.joblib")

loaded_preprocessor = joblib.load("preprocessor.joblib")

test_processed_loaded = pd.DataFrame(loaded_preprocessor.transform(test_df), columns=num_cols.append(cat_cols))

test_processed_loaded


Unnamed: 0,age,gender
0,30.0,F
1,33.333333,
