### Ensuring Feature Consistency Between Training & Inference Pipelines:

**Task 1**: Consistent Feature Preparation
- Step 1: Write a function for data preprocessing and imputation shared by both training and inference pipelines.
- Step 2: Demonstrate consistent application on both datasets.

In [None]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
def preprocess_data(df, is_training=True, imputer=None, scaler=None):
    """Impute and standardise the numeric columns of ``df``.

    One function serves both pipelines. In training mode it fits a fresh
    mean ``SimpleImputer`` followed by a ``StandardScaler``; in inference
    mode it only applies the already-fitted transformers that are passed in.

    Args:
        df: Input DataFrame. It is copied; the caller's frame is not modified.
        is_training: When True, new transformers are fitted on ``df`` and any
            ``imputer``/``scaler`` arguments are ignored. When False, both
            transformers must be supplied.
        imputer: Fitted imputer exposing ``transform`` (inference only).
        scaler: Fitted scaler exposing ``transform`` (inference only).

    Returns:
        Tuple ``(df_processed, imputer, scaler)``: the transformed copy of
        ``df`` and the transformers that were fitted or applied.

    Raises:
        ValueError: In inference mode, if ``imputer`` or ``scaler`` is None,
            or if ``df`` lacks a column the imputer recorded during fitting.
    """
    if not is_training:
        # Validate up front so nothing is computed before a bad call fails.
        if imputer is None:
            raise ValueError("Imputer must be provided for inference.")
        if scaler is None:
            raise ValueError("Scaler must be provided for inference.")

    df_processed = df.copy()

    if is_training:
        numerical_cols = df_processed.select_dtypes(include=np.number).columns
        imputer = SimpleImputer(strategy='mean')
        df_processed[numerical_cols] = imputer.fit_transform(df_processed[numerical_cols])
        # The scaler is fitted on the imputed values, which is also what it
        # receives at inference time.
        scaler = StandardScaler()
        df_processed[numerical_cols] = scaler.fit_transform(df_processed[numerical_cols])
        return df_processed, imputer, scaler

    # Prefer the columns recorded at fit time over re-deriving them from the
    # inference frame's dtypes: this keeps the column set and order identical
    # to training even if the inference frame has extra or reordered columns.
    fitted_cols = getattr(imputer, "feature_names_in_", None)
    if fitted_cols is None:
        # Transformer did not record feature names; fall back to dtype selection.
        numerical_cols = df_processed.select_dtypes(include=np.number).columns
    else:
        numerical_cols = list(fitted_cols)
        missing = [col for col in numerical_cols if col not in df_processed.columns]
        if missing:
            raise ValueError(
                f"Inference data is missing columns seen during training: {missing}"
            )

    df_processed[numerical_cols] = imputer.transform(df_processed[numerical_cols])
    df_processed[numerical_cols] = scaler.transform(df_processed[numerical_cols])
    return df_processed, imputer, scaler
# ---- Training side: fit the imputer and scaler on the first 1000 rows ----
housing_train = fetch_california_housing(as_frame=True)
df_train = housing_train.frame.copy().iloc[:1000]
y_train = df_train['MedHouseVal']
X_train = df_train.drop(columns='MedHouseVal')
X_train_processed, imputer, scaler = preprocess_data(X_train)
print("Processed Training Data:")
print(X_train_processed.head())
print("\nTraining Data Imputer:", imputer)
print("Training Data Scaler:", scaler)

# ---- Inference side: reuse the fitted transformers on the next 100 rows ----
housing_inference = fetch_california_housing(as_frame=True)
df_inference = housing_inference.frame.copy().iloc[1000:1100].copy()
# Blank out two values in the first column so the imputer has work to do.
df_inference.iloc[[5, 15], 0] = np.nan
df_inference = df_inference.drop(columns='MedHouseVal', errors='ignore')
X_inference_processed, _, _ = preprocess_data(
    df_inference,
    is_training=False,
    imputer=imputer,
    scaler=scaler,
)
print("\nProcessed Inference Data:")
print(X_inference_processed.head())

**Task 2**: Pipeline Integration
- Step 1: Use sklearn pipelines to encapsulate the preprocessing steps.
- Step 2: Configure identical preprocessing pipelines for both training and inference.

In [None]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
# ---- Load the data and keep the 80% training split ----
housing_train = fetch_california_housing(as_frame=True)
df_train = housing_train.frame.copy()
y_train = df_train['MedHouseVal']
X_train = df_train.drop(columns='MedHouseVal')
X_train_sample, _, y_train_sample, _ = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)
# Names of the numeric feature columns (not referenced again in the code shown here).
numerical_features = X_train_sample.select_dtypes(include=np.number).columns

# ---- Preprocessing (mean imputation, then standardisation) as one pipeline ----
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# ---- Full training pipeline: preprocessing followed by the regressor ----
train_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])
train_pipeline.fit(X_train_sample, y_train_sample)
# ---- Inference data: first 50 rows, with two values blanked out ----
housing_inference = fetch_california_housing(as_frame=True)
df_inference = housing_inference.frame.copy()
# Take an explicit copy of the slice before assigning into it; writing into a
# bare .iloc slice triggers pandas' SettingWithCopyWarning.
X_inference = df_inference.drop('MedHouseVal', axis=1, errors='ignore').iloc[:50].copy()
X_inference.iloc[[5, 15], 0] = np.nan

# The inference pipeline reuses the preprocessor object that was fitted as
# part of train_pipeline, so both sides apply the same learned statistics.
inference_pipeline = Pipeline(steps=[
    ('preprocessor', train_pipeline.named_steps['preprocessor'])
])
X_inference_processed = inference_pipeline.transform(X_inference)
print("Processed Inference Data:")
print(X_inference_processed[:5])

# ---- Training-set error of the full pipeline ----
y_pred_train = train_pipeline.predict(X_train_sample)
mse_train = mean_squared_error(y_train_sample, y_pred_train)
print(f"\nMean Squared Error on Training Data: {mse_train:.2f}")

**Task 3**: Saving and Loading Preprocessing Models
- Step 1: Save the transformation model after fitting it to the training data.
- Step 2: Load and apply the saved model during inference.

In [None]:
# write your code from here

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
# ---- Load the data and keep the 80% training split ----
housing_train = fetch_california_housing(as_frame=True)
df_train = housing_train.frame.copy()
X_train = df_train.drop('MedHouseVal', axis=1)
y_train = df_train['MedHouseVal']
X_train_sample, _, _, _ = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# ---- Fit the transformers in the same order they are applied at inference ----
imputer = SimpleImputer(strategy='mean')
# The scaler is applied to *imputed* data at inference time, so it is fitted
# on imputed training data too. Re-wrapping in a DataFrame keeps the column
# names so the fitted scaler matches a named inference frame.
X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train_sample),
    columns=X_train_sample.columns,
    index=X_train_sample.index,
)
scaler = StandardScaler()
scaler.fit(X_train_imputed)

# ---- Persist both fitted transformers for the inference step ----
joblib.dump(imputer, 'imputer.joblib')
joblib.dump(scaler, 'scaler.joblib')
print("Preprocessing models (imputer.joblib, scaler.joblib) saved successfully.")
# ---- Inference data: first 50 rows, with two values blanked out ----
housing_inference = fetch_california_housing(as_frame=True)
df_inference = housing_inference.frame.copy()
# Take an explicit copy of the slice before assigning into it; writing into a
# bare .iloc slice triggers pandas' SettingWithCopyWarning.
X_inference = df_inference.drop('MedHouseVal', axis=1, errors='ignore').iloc[:50].copy()
X_inference.iloc[[5, 15], 0] = np.nan

# ---- Load the transformers that were saved after fitting on training data ----
loaded_imputer = joblib.load('imputer.joblib')
loaded_scaler = joblib.load('scaler.joblib')
print("\nPreprocessing models loaded successfully.")

# ---- Apply them in order: impute first, then scale ----
numerical_cols_inference = X_inference.select_dtypes(include=np.number).columns
# Each result is wrapped back into a DataFrame with the original column
# names and row index.
X_inference_imputed = pd.DataFrame(
    loaded_imputer.transform(X_inference[numerical_cols_inference]),
    columns=numerical_cols_inference,
    index=X_inference.index,
)
X_inference_scaled = pd.DataFrame(
    loaded_scaler.transform(X_inference_imputed),
    columns=numerical_cols_inference,
    index=X_inference_imputed.index,
)

print("\nProcessed Inference Data:")
print(X_inference_scaled.head())