In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset as a DataFrame, then separate the
# 'MedHouseVal' target column from the remaining feature columns.
data = fetch_california_housing(as_frame=True)
df = data.frame
y = df['MedHouseVal']
X = df.drop(columns='MedHouseVal')

In [None]:
# Preview the first rows of the feature frame (every column except 'MedHouseVal').
X.head()

In [None]:
# Preview the first values of the target series ('MedHouseVal').
y.head()

##### Add a synthetic categorical feature for the encoding step

In [None]:
# Derive a two-level 'Region' label from latitude ('North' above 35, else 'South')
# so the categorical branch of the preprocessing has a column to encode.
region_labels = np.where(X['Latitude'] > 35, 'North', 'South')
X = X.assign(Region=region_labels)


##### Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

##### Identify column types

In [None]:


# Split the feature columns by dtype so each group gets its own preprocessing.
# Categorical columns are defined as "everything that is not numeric" rather than
# "object dtype only": that also picks up `category` columns and string columns
# stored with a dedicated string dtype, which `include=[object]` would skip and
# the ColumnTransformer would then silently drop.
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(exclude=[np.number]).columns.tolist()



##### Preprocessing Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric branch: fill missing values with the column median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)


In [None]:
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode, dropping the first level of each feature.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

# Route each column group through its matching pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, num_features),
        ('cat', categorical_pipeline, cat_features),
    ]
)

##### Models Dictionary

In [None]:
# Requires the xgboost package; if it is not installed, run `%pip install xgboost` in its own cell.

In [None]:
import sklearn
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor



# Candidate regressors, keyed by the name used in the results table.
# Estimators with internal randomness get a fixed random_state so that a
# Restart-and-Run-All reproduces the same scores (matching the seeded
# RandomForestRegressor used in the MinMax experiment further down).
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'XGBRegressor': XGBRegressor(random_state=42),
    'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor()
}

##### Model Training and Evaluation with Cross-validation

In [None]:

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

# Evaluate every candidate model behind the same preprocessing:
#   1. 5-fold cross-validation on the training split,
#   2. a final fit on the full training split, scored on the held-out test split.
results = {}
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('model', model)
    ])
    # Score each fold directly as (negated) RMSE so the mean and std below are
    # both statistics of per-fold RMSE. Previously the std was taken over
    # negative MSE values and the mean was sqrt(mean MSE), so the two columns
    # were not on the same scale and did not match the RMSE reported by the
    # Ridge grid search.
    cv_scores = cross_val_score(
        pipeline, X_train, y_train,
        scoring='neg_root_mean_squared_error', cv=5
    )
    cv_rmse = -cv_scores  # scorer returns negated RMSE (higher is better)

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    test_mse = mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE

    results[name] = {
        'CV RMSE Mean': cv_rmse.mean(),
        'CV RMSE Std': cv_rmse.std(),
        'Test MSE': test_mse,
        'Test RMSE': np.sqrt(test_mse),
        'Test R2': r2_score(y_test, y_pred)
    }



##### Results DataFrame

In [None]:
# One row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
print('\n======= REGRESSION RESULTS =======', results_df, sep='\n')

##### Hyperparameter Tuning Example (e.g., Ridge alpha)

In [None]:
from sklearn.model_selection import GridSearchCV

print('\nHyperparameter tuning for Ridge Regression:')

# Ridge behind the shared preprocessing; the step name 'ridge' is what the
# 'ridge__alpha' grid key refers to.
ridge_pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('ridge', Ridge()),
])
param_grid = {'ridge__alpha': [0.1, 1.0, 10.0, 100.0]}

# 5-fold grid search scored by negated RMSE (so the best score is the largest).
grid = GridSearchCV(
    ridge_pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
grid.fit(X_train, y_train)

print(f"Best Ridge alpha: {grid.best_params_['ridge__alpha']}")
print(f"Best Ridge CV RMSE: {-grid.best_score_:.4f}")

##### Try MinMax scaling

In [None]:

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import numpy as np

# Same median imputation as before, but numeric features are rescaled with
# MinMaxScaler instead of StandardScaler.
numeric_pipeline_minmax = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# The categorical branch is reused unchanged.
preprocessor_minmax = ColumnTransformer(transformers=[
    ('num', numeric_pipeline_minmax, num_features),
    ('cat', categorical_pipeline, cat_features),
])

pipeline_mm = Pipeline(steps=[
    ('preprocess', preprocessor_minmax),
    ('model', RandomForestRegressor(random_state=42, n_jobs=-1)),
])

# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred_mm = pipeline_mm.fit(X_train, y_train).predict(X_test)

mse_mm = mean_squared_error(y_test, y_pred_mm)
rmse_mm = np.sqrt(mse_mm)
print(f"\nRandomForest (MinMaxScaler) Test RMSE: {rmse_mm:.4f}")

print("\nAll regression models completed.")

##### View prediction distribution

In [None]:
import matplotlib.pyplot as plt

# Plot actual test predictions of the linear baseline against the true values.
# The previous version plotted a constant (the test RMSE repeated for every
# sample), which produced a flat line and was mislabeled as predictions.
# The linear pipeline is refit here so this cell owns the predictions it plots.
linear_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', LinearRegression())
])
linear_pipeline.fit(X_train, y_train)
y_pred_linear = linear_pipeline.predict(X_test)

fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_linear, alpha=0.1, label='Test Predictions (linear)')

# y = x reference line: points on it are perfect predictions.
axis_min = min(y_test.min(), y_pred_linear.min())
axis_max = max(y_test.max(), y_pred_linear.max())
ax.plot([axis_min, axis_max], [axis_min, axis_max],
        color='red', linestyle='--', label='Perfect prediction')

ax.set_xlabel('True Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Test Prediction Distribution - Example')
ax.legend()
plt.show()