# House Price Prediction - MODEL TRAINING

In [None]:
####################
# IMPORT LIBRARIES
####################

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import mlflow
import mlflow.sklearn
from urllib.parse import urlparse

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" theme to subsequent plots.
sns.set(style="whitegrid")

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation/future warnings from pandas and sklearn.
warnings.filterwarnings("ignore")

In [None]:
import sys
from pathlib import Path
from dotenv import load_dotenv

# Hard-coded location of the project checkout and its helper scripts folder.
project_root =  Path('/home/jovyan/work/housing-mlops')
scripts_dir = project_root / 'scripts'
# Load the project's .env file so the os.getenv() lookups below can see
# the values it defines.
load_dotenv(project_root / '.env')

# Prepend both directories to the import search path; this must happen
# before the `scripts.s3_utils` import that follows.
sys.path.insert(0, str(project_root))
sys.path.insert(0, str(scripts_dir))

# NOTE(review): wildcard import -- `create_bucket` used below is presumably
# provided by s3_utils; confirm and consider importing it explicitly.
from scripts.s3_utils import *

# Presumably ensures the S3 bucket for MLflow artifacts exists -- verify in
# s3_utils. os.getenv() returns None if S3_MLFLOW_BUCKET is not set.
create_bucket(os.getenv('S3_MLFLOW_BUCKET'))

# Point MLflow at the tracking URI from the environment and select the
# experiment, defaulting its name to 'HousingPricePrediction' when
# PROJECT_NAME is not set.
mlflow.set_tracking_uri(os.getenv('MLFLOW_TRACKING_URI'))
mlflow.set_experiment(os.getenv('PROJECT_NAME', 'HousingPricePrediction'))

In [None]:
############
# READ DATA
############

# Load the housing CSV into a DataFrame and report its dimensions.
data_path = '/home/jovyan/work/housing-mlops/data/housing.csv'
df = pd.read_csv(data_path)
print("Data shape:", df.shape)
# Last expression of the cell, so the notebook displays the first rows.
df.head()

In [None]:
#######################
# CHECK MISSING VALUES
#######################

# Count the null entries in every column and print them under a heading.
missing_counts = df.isnull().sum()
print("Missing Values:", missing_counts, sep="\n")

In [None]:
###################
# DATA PREPARATION
###################

# Fill missing `total_bedrooms` values with the column median.
# The result is assigned back to the column rather than calling
# `fillna(..., inplace=True)` on the selected Series: that chained in-place
# form emits a FutureWarning in pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is enabled.
# NOTE(review): the median is computed on the full dataset, i.e. before the
# train/test split below -- confirm this is acceptable, since it lets test
# rows influence the imputed value.
median_bedrooms = df['total_bedrooms'].median()
df['total_bedrooms'] = df['total_bedrooms'].fillna(median_bedrooms)

# Separate the feature matrix from the regression target.
X = df.drop(columns=['median_house_value'])
y = df['median_house_value']

# `ocean_proximity` is treated as categorical; every other feature column
# is treated as numerical.
categorical_features = ['ocean_proximity']
numerical_features = [col for col in X.columns if col not in categorical_features]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
####################
# EVALUATION METRICS
####################

def eval_metrics(actual, pred):
    """Compute regression error metrics for a set of predictions.

    Args:
        actual: Ground-truth target values.
        pred: Predicted target values to compare against ``actual``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: the square root of the mean squared
        error, the mean absolute error, and the R^2 score.
    """
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

In [None]:
#########################
# PREPROCESSING PIPELINE
#########################

# Numerical columns: impute gaps with the median, then standardise.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Categorical columns: impute gaps with the most frequent value, then
# one-hot encode, ignoring categories not seen during fitting.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
#############################
# MODEL 1: LINEAR REGRESSION
#############################

with mlflow.start_run(run_name="LinearRegression"):
    # Chain the shared preprocessing step with a plain linear regressor.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())])

    # Fit on the training split.
    model_pipeline.fit(X_train, y_train)

    # Predict on the held-out test split and score the predictions.
    y_pred = model_pipeline.predict(X_test)
    rmse, mae, r2 = eval_metrics(y_test, y_pred)

    # Record the model type and the evaluation metrics on the active run.
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Log the fitted pipeline as a run artifact. No branching on the
    # tracking URI scheme is needed here: the model is logged the same way
    # for file-based and remote tracking stores.
    mlflow.sklearn.log_model(model_pipeline, "linear_regression_model")

    print(f"Linear Regression - RMSE: {rmse}, MAE: {mae}, R2: {r2}")

In [None]:
##########################
# MODEL 2: RIDGE REGRESSION
##########################

with mlflow.start_run(run_name="RidgeRegression"):
    # Hyperparameters
    alpha = 1.0

    # Shared preprocessing followed by a Ridge regressor.
    model_pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('regressor', Ridge(alpha=alpha)),
        ]
    )

    # Train on the training split, then predict the held-out test split.
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    # Score the test-set predictions.
    rmse, mae, r2 = eval_metrics(y_test, y_pred)

    # Record parameters and metrics on the active MLflow run.
    mlflow.log_params({"model_type": "RidgeRegression", "alpha": alpha})
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

    # Store the fitted pipeline as a run artifact.
    mlflow.sklearn.log_model(model_pipeline, "ridge_model")

    print(f"Ridge Regression - RMSE: {rmse}, MAE: {mae}, R2: {r2}")

In [None]:
####################
# MODEL 3: ELASTICNET
####################

with mlflow.start_run(run_name="ElasticNet"):
    # Hyperparameters
    alpha = 0.5
    l1_ratio = 0.5

    # Shared preprocessing followed by an ElasticNet regressor.
    model_pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('regressor', ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)),
        ]
    )

    # Train on the training split, then predict the held-out test split.
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    # Score the test-set predictions.
    rmse, mae, r2 = eval_metrics(y_test, y_pred)

    # Record parameters and metrics on the active MLflow run.
    mlflow.log_params(
        {"model_type": "ElasticNet", "alpha": alpha, "l1_ratio": l1_ratio}
    )
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

    # Store the fitted pipeline as a run artifact and register it in the
    # model registry under the name "HousingPricePredictor".
    mlflow.sklearn.log_model(
        model_pipeline,
        "elasticnet_model",
        registered_model_name="HousingPricePredictor",
    )

    print(f"ElasticNet - RMSE: {rmse}, MAE: {mae}, R2: {r2}")