### Ensuring Consistency Across Training & Inference Datasets: Feature Scaling
**Question**: Load a dataset (e.g., Boston Housing) and perform feature scaling. Ensure the
same scaling is applied during model inference with new data.

In [3]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import unittest

# Load California Housing dataset
housing = fetch_california_housing(as_frame=True)
X = housing.data
y = housing.target

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Error Handling & Validation ---
def validate_data(df, name="Data"):
    if df.isnull().any().any():
        raise ValueError(f"{name} contains null values.")
    if not df.applymap(lambda x: np.isfinite(x)).all().all():
        raise ValueError(f"{name} contains non-finite (inf or NaN) values.")

validate_data(X_train, "X_train")
validate_data(X_test, "X_test")

# --- Feature Scaling ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Train Model ---
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# --- Inference on New Data ---
new_data = pd.DataFrame([X.iloc[0]])  # Simulate new input
validate_data(new_data, "new_data")  # Validate before inference
new_data_scaled = scaler.transform(new_data)
predicted_value = model.predict(new_data_scaled)

print("Predicted value for new data:", predicted_value[0])

# --- Unit Testing ---
class TestScalingInference(unittest.TestCase):
    def test_scaled_mean_std(self):
        mean = np.mean(X_train_scaled, axis=0)
        std = np.std(X_train_scaled, axis=0)
        self.assertTrue(np.allclose(mean, 0, atol=1e-1), "Mean not ~0")
        self.assertTrue(np.allclose(std, 1, atol=1e-1), "Std not ~1")

    def test_prediction_shape(self):
        self.assertEqual(predicted_value.shape, (1,), "Prediction shape mismatch")

    def test_no_null_in_scaled_data(self):
        self.assertFalse(np.isnan(X_train_scaled).any(), "Nulls found in scaled data")

if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)


  if not df.applymap(lambda x: np.isfinite(x)).all().all():
  if not df.applymap(lambda x: np.isfinite(x)).all().all():
  if not df.applymap(lambda x: np.isfinite(x)).all().all():
...
----------------------------------------------------------------------
Ran 3 tests in 0.005s

OK


Predicted value for new data: 4.151942685752971
