# In [None]:  (notebook cell prompt left over from export; commented out so the file parses)
# model_training.py

import pandas as pd
import numpy as np
import pickle
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# ------------------------------
# 1. Loading and Preparing the Dataset
# ------------------------------
# Load the California Housing dataset. With as_frame=True, `housing.frame`
# already contains the feature columns together with the 'MedHouseVal'
# target column, so the target does not need to be attached separately.
housing = fetch_california_housing(as_frame=True)
df = housing.frame.copy()

# Separate features (X) and target (y). The target is deliberately left
# unscaled so the error metrics reported later are in the target's own units
# and the trained models predict house values rather than z-scores.
X = df.drop("MedHouseVal", axis=1)
y = df["MedHouseVal"]

# 2. Splitting the Data into Training and Testing Sets
# 80/20 split of the data. The split happens BEFORE scaling so that no
# statistics computed from the test rows leak into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features using StandardScaler: fit on the training features
# only, then apply that same fitted transform to the test features.
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
scaler = StandardScaler()
# Work on explicit copies so the column assignment below never writes into
# (or warns about) a view of X.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# ------------------------------
# 3. Train a Baseline Regression Model: Linear Regression
# ------------------------------
# A plain linear fit serves as the reference point for the tuned model below.
lin_reg = LinearRegression().fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)

# Compute the hold-out metrics first, then report them in one place.
lin_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lin))
lin_mae = mean_absolute_error(y_test, y_pred_lin)
lin_r2 = r2_score(y_test, y_pred_lin)

print("Baseline Model: Linear Regression")
print("RMSE:", lin_rmse)
print("MAE:", lin_mae)
print("R²:", lin_r2)

# 4. Training an Advanced Model: Random Forest with Hyperparameter Tuning

# Base estimator; the fixed seed keeps the forest reproducible across runs.
rf = RandomForestRegressor(random_state=42)

# Candidate values for each hyperparameter (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search over the grid with 5-fold cross-validation, ranking
# candidates by negated MSE and using every available CPU core.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination.
print("\nBest Hyperparameters from GridSearchCV:")
print(grid_search.best_params_)

# Score the best estimator found by the search on the held-out test set.
best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)

rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)

print("\nRandom Forest Model Performance (Optimized):")
print("RMSE:", rf_rmse)
print("MAE:", rf_mae)
print("R²:", rf_r2)

# 5. Saving the Trained Model Using Pickle
with open("trained_house_price_model.pkl", "wb") as file:
    pickle.dump(best_rf, file)

print("\nTrained model saved as 'trained_house_price_model.pkl'")

# Persist the fitted scaler as well. The model was trained on standardized
# data, so anything that loads the model must apply this exact transform to
# its inputs; without the scaler the pickled model cannot be used on raw data.
with open("feature_scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)

print("Fitted scaler saved as 'feature_scaler.pkl'")
