### Ensuring Feature Consistency Between Training & Inference Pipelines:

**Task 1**: Consistent Feature Preparation
- Step 1: Write a function for data preprocessing and imputation shared by both training and inference pipelines.
- Step 2: Demonstrate consistent application on both datasets.

In [None]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

def preprocess_data(df, is_training=True, imputer=None, scaler=None):
    """Impute and scale the numeric columns of *df* for training or inference.

    The same function serves both pipelines so that inference data goes
    through exactly the transformations that were fitted on training data.

    Args:
        df: Input DataFrame. It is not modified; a copy is processed.
        is_training: When True, a new mean ``SimpleImputer`` and a new
            ``StandardScaler`` are created and fitted on ``df`` (any
            ``imputer``/``scaler`` arguments are ignored). When False, the
            supplied, already-fitted transformers are applied unchanged.
        imputer: Fitted imputer to reuse; required when ``is_training`` is False.
        scaler: Fitted scaler to reuse; required when ``is_training`` is False.

    Returns:
        A tuple ``(df_processed, imputer, scaler)``: the transformed copy of
        ``df`` plus the transformers that were fitted or reused.

    Raises:
        ValueError: If ``is_training`` is False and ``imputer`` or ``scaler``
            is missing.
    """
    if not is_training:
        # Fail fast: check both transformers before doing any work, so a
        # missing scaler is reported directly instead of after (or masked by
        # an error from) the imputation step.
        if imputer is None:
            raise ValueError("Imputer must be provided for inference.")
        if scaler is None:
            raise ValueError("Scaler must be provided for inference.")

    df_processed = df.copy()
    # Only numeric columns are transformed; other columns pass through as-is.
    numerical_cols = df_processed.select_dtypes(include=np.number).columns

    if is_training:
        imputer = SimpleImputer(strategy='mean')
        scaler = StandardScaler()
        impute, scale = imputer.fit_transform, scaler.fit_transform
    else:
        impute, scale = imputer.transform, scaler.transform

    # Write the imputed values back before scaling so the scaler receives a
    # DataFrame slice (with column names), exactly like the imputer does.
    df_processed[numerical_cols] = impute(df_processed[numerical_cols])
    df_processed[numerical_cols] = scale(df_processed[numerical_cols])

    return df_processed, imputer, scaler
# ---- Training side: fit the imputer and scaler on the first 1000 rows ----
housing_train = fetch_california_housing(as_frame=True)
df_train = housing_train.frame.copy().iloc[:1000]
y_train = df_train['MedHouseVal']
X_train = df_train.drop(columns='MedHouseVal')

# Fitting happens here; the returned transformers are reused for inference.
X_train_processed, imputer, scaler = preprocess_data(X_train, is_training=True)
print("Processed Training Data:")
print(X_train_processed.head())
print("\nTraining Data Imputer:", imputer)
print("Training Data Scaler:", scaler)

# ---- Inference side: a different 100-row slice of the same dataset ----
housing_inference = fetch_california_housing(as_frame=True)
df_inference = housing_inference.frame.iloc[1000:1100].copy()
# Blank out two values in the first column so the imputer has work to do.
df_inference.iloc[[5, 15], 0] = np.nan
df_inference = df_inference.drop(columns='MedHouseVal', errors='ignore')

# Apply (not refit) the training-time imputer and scaler.
X_inference_processed = preprocess_data(
    df_inference, is_training=False, imputer=imputer, scaler=scaler
)[0]
print("\nProcessed Inference Data:")
print(X_inference_processed.head())

Processed Training Data:
     MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  2.475751  0.266201  1.500473  -0.266636   -0.900449 -0.219674  1.441017   
1  2.462605 -1.251479  0.863503  -0.751121    1.186500 -0.830714  1.235934   
2  1.885951  1.100925  2.613913   0.196463   -0.725783  0.118538  1.133393   
3  0.994291  1.100925  0.504211   0.192853   -0.663546 -0.230108  1.133393   
4  0.001771  1.100925  0.900830   0.267694   -0.656520 -0.732521  1.133393   

   Longitude  
0  -0.640477  
1  -0.554449  
2  -0.726504  
3  -0.812532  
4  -0.812532  

Training Data Imputer: SimpleImputer()
Training Data Scaler: StandardScaler()

Processed Inference Data:
        MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
1000 -0.253470 -0.644407 -0.328576  -0.462455    0.702657 -0.012564 -0.507273   
1001 -0.160068 -0.113219 -0.590783  -0.488773   -0.270048 -0.222131 -0.609815   
1002 -0.265788 -0.264987 -0.095526   0.260036   -0.062256 -0.282455

**Task 2**: Pipeline Integration
- Step 1: Use sklearn pipelines to encapsulate the preprocessing steps.
- Step 2: Configure identical preprocessing pipelines for both training and inference.

In [10]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error

# Load data for training and keep the 80% training split (fixed seed).
housing_train = fetch_california_housing(as_frame=True)
df_train = housing_train.frame.copy()
X_train = df_train.drop('MedHouseVal', axis=1)
y_train = df_train['MedHouseVal']
X_train_sample, _, y_train_sample, _ = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Pin the feature set (names and order) used at training time. The same
# selection is applied at fit, inference and predict time below, so every
# stage sees the same columns.
numerical_features = X_train_sample.select_dtypes(include=np.number).columns

# Define the preprocessing pipeline: mean imputation followed by standardisation.
preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Define the full training pipeline: preprocessing + model.
train_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train the pipeline (this fits the imputer, the scaler and the regressor).
train_pipeline.fit(X_train_sample[numerical_features], y_train_sample)

# Simulate inference data.
housing_inference = fetch_california_housing(as_frame=True)
df_inference = housing_inference.frame.copy()
# .copy() makes the slice an independent frame, so the NaN assignment below
# does not depend on pandas' view-vs-copy behaviour.
X_inference = df_inference.drop('MedHouseVal', axis=1, errors='ignore').iloc[:50].copy()
# Introduce some missing values in inference data
X_inference.iloc[[5, 15], 0] = np.nan

# Create an inference pipeline using the *same*, already-fitted preprocessor
# object; it is only applied here, never refitted.
inference_pipeline = Pipeline(steps=[
    ('preprocessor', train_pipeline.named_steps['preprocessor'])
])

# Process the inference data with the training-time feature selection.
X_inference_processed = inference_pipeline.transform(X_inference[numerical_features])
print("Processed Inference Data:")
print(X_inference_processed[:5])

# Evaluate the training pipeline (optional, for demonstration)
y_pred_train = train_pipeline.predict(X_train_sample[numerical_features])
mse_train = mean_squared_error(y_train_sample, y_pred_train)
print(f"\nMean Squared Error on Training Data: {mse_train:.2f}")

Processed Inference Data:
[[ 2.33397811  0.98330419  0.64880421 -0.1682248  -0.97135592 -0.04676
   1.04692041 -1.32016306]
 [ 2.32147966 -0.60373066  0.33632187 -0.28809752  0.85710482 -0.0852553
   1.03755975 -1.31517701]
 [ 1.77322836  1.85617335  1.19503099 -0.05364345 -0.81832458 -0.02545269
   1.03287942 -1.32514912]
 [ 0.92548692  1.85617335  0.16006206 -0.05453673 -0.76379617 -0.04741729
   1.03287942 -1.33013518]
 [-0.01814599  1.85617335  0.35463383 -0.03601943 -0.75763973 -0.07906917
   1.03287942 -1.33013518]]

Mean Squared Error on Training Data: 0.52


**Task 3**: Saving and Loading Preprocessing Models
- Step 1: Save the transformation model after fitting it to the training data.
- Step 2: Load and apply the saved model during inference.

In [11]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

# Load data for training and keep the 80% training split (fixed seed).
housing_train = fetch_california_housing(as_frame=True)
df_train = housing_train.frame.copy()
X_train = df_train.drop('MedHouseVal', axis=1)
y_train = df_train['MedHouseVal']
X_train_sample, _, _, _ = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit the preprocessing steps in the same order they are applied at inference:
# the imputer on the raw training data, then the scaler on the *imputed* data.
# Fitting the scaler on the raw frame would base its statistics on different
# values than the ones it later transforms whenever the training data has NaNs.
imputer = SimpleImputer(strategy='mean')
# Wrapped back into a DataFrame so the scaler is fitted with column names,
# matching the DataFrame it receives at inference time.
# NOTE(review): assumes the imputer keeps every column (no all-NaN column) - confirm for other datasets.
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train_sample),
                               columns=X_train_sample.columns, index=X_train_sample.index)
scaler = StandardScaler()
scaler.fit(X_train_imputed)

# Save the fitted preprocessors
joblib.dump(imputer, 'imputer.joblib')
joblib.dump(scaler, 'scaler.joblib')
print("Preprocessing models (imputer.joblib, scaler.joblib) saved successfully.")

# Simulate inference data
housing_inference = fetch_california_housing(as_frame=True)
df_inference = housing_inference.frame.copy()
# .copy() makes the slice an independent frame, so the NaN assignment below
# does not depend on pandas' view-vs-copy behaviour.
X_inference = df_inference.drop('MedHouseVal', axis=1, errors='ignore').iloc[:50].copy()
# Introduce some missing values in inference data
X_inference.iloc[[5, 15], 0] = np.nan

# Load the saved preprocessors.
# NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
loaded_imputer = joblib.load('imputer.joblib')
loaded_scaler = joblib.load('scaler.joblib')
print("\nPreprocessing models loaded successfully.")

# Apply the loaded preprocessors to the inference data (impute, then scale),
# keeping column names and the original row index.
numerical_cols_inference = X_inference.select_dtypes(include=np.number).columns
X_inference_imputed = pd.DataFrame(loaded_imputer.transform(X_inference[numerical_cols_inference]),
                                   columns=numerical_cols_inference, index=X_inference.index)
X_inference_scaled = pd.DataFrame(loaded_scaler.transform(X_inference_imputed),
                                  columns=numerical_cols_inference, index=X_inference_imputed.index)

print("\nProcessed Inference Data:")
print(X_inference_scaled.head())

Preprocessing models (imputer.joblib, scaler.joblib) saved successfully.

Preprocessing models loaded successfully.

Processed Inference Data:
     MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  2.333978  0.983304  0.648804  -0.168225   -0.971356 -0.046760  1.046920   
1  2.321480 -0.603731  0.336322  -0.288098    0.857105 -0.085255  1.037560   
2  1.773228  1.856173  1.195031  -0.053643   -0.818325 -0.025453  1.032879   
3  0.925487  1.856173  0.160062  -0.054537   -0.763796 -0.047417  1.032879   
4 -0.018146  1.856173  0.354634  -0.036019   -0.757640 -0.079069  1.032879   

   Longitude  
0  -1.320163  
1  -1.315177  
2  -1.325149  
3  -1.330135  
4  -1.330135  
