In [23]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_squared_log_error
import joblib


In [24]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

In [25]:
def build_model(data: pd.DataFrame) -> dict[str, float]:
    """Orchestrates the different steps for the model building phase and returns a dictionary with the model performances."""
    
    # Splitting data into features and target variable
    X_train = data.drop(["Id", "SalePrice"], axis=1)
    y_train = data["SalePrice"]
    
    # Splitting data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    # Defining continuous and categorical features
    continuous_features = ["LotArea", "GrLivArea"]
    categorical_features = ["MSZoning", "Neighborhood"] 
    
    # Scaling continuous features and encoding categorical features
    scaler = StandardScaler()
    encoder = OneHotEncoder(handle_unknown="ignore")
    scaler.fit(X_train[continuous_features])
    X_train[continuous_features] = scaler.transform(X_train[continuous_features])
    encoder.fit(X_train[categorical_features])
    X_train_processed = encoder.transform(X_train[categorical_features])
    
    # Fitting the model on the preprocessed training data
    model = LinearRegression()
    model.fit(np.hstack((X_train[continuous_features], X_train_processed.toarray())), y_train)
    
    # Preprocessing the test data and generating predictions
    X_test[continuous_features] = scaler.transform(X_test[continuous_features])
    X_test_processed = encoder.transform(X_test[categorical_features])
    y_pred = model.predict(np.hstack((X_test[continuous_features], X_test_processed.toarray())))
    
    # Computing model performances and returning them in a dictionary
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    rmsle = compute_rmsle(np.log(y_test), np.log(y_pred))
    return {"rmse": rmse, "rmsle": rmsle}




In [26]:
def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    """Orchestrates the different steps of the inference phase and returns the model predictions."""
    
    # Loading the trained model and data preparation objects
    model = joblib.load('../models/model.joblib')
    encoder = joblib.load('../models/encoder.joblib')
    scaler = joblib.load('../models/scaler.joblib')
    
    # Preprocessing the input data and generating predictions
    continuous_features = ["LotArea", "GrLivArea"]
    categorical_features = ["MSZoning", "Neighborhood"] 
    input_data[continuous_features] = scaler.transform(input_data[continuous_features])
    input_data_processed = encoder.transform(input_data[categorical_features])
    predictions = model.predict(np.hstack((input_data[continuous_features], input_data_processed.toarray())))
    
    return predictions


In [27]:
build_model(pd.read_csv('/Users/admin-20218/Downloads/house-prices-advanced-regression-techniques/train.csv'))

{'rmse': 42645.40610193754, 'rmsle': 0.02}

In [28]:
make_predictions(pd.read_csv('/Users/admin-20218/Downloads/house-prices-advanced-regression-techniques/test.csv'))

array([ 88264.41248655, 150444.44970655, 195167.82402368, ...,
       154777.70406753, 130722.18402166, 206036.98027394])