# Librairies

In [47]:
# reload modules before executing user code.
%reload_ext autoreload
%autoreload 2

import sys
from pathlib import Path
from typing import Dict, Union

import mlflow
import numpy as np
import pandas as pd
import pendulum
from loguru import logger
from mlflow.models import infer_signature
from sklearn import set_config
from sklearn.compose import ColumnTransformer, make_column_selector, TransformedTargetRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, max_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder

sys.path.append(str(Path.cwd().parent))
from settings.params import (DATA_DIR_INPUT,
                             DATA_DIR_OUTPUT,
                             MODEL_DIR,
                             MODEL_NAME,
                             MODEL_PARAMS,
                             REPORT_DIR,
                             TIMEZONE,
                             SEED
                            )
from src.make_dataset import load_data
# from src.trainer import Trainer
from src.utils import save_object_with_dill


set_config(display="diagram", print_changed_only=False)
pd.set_option("display.max_columns", None)

# Settings

In [2]:
EXECUTION_DATE = pendulum.now(tz=TIMEZONE)

logger.info(f"Execution date: {EXECUTION_DATE}")

logger.info(f"\nData input directory : {DATA_DIR_INPUT}\nData output directory: {DATA_DIR_OUTPUT}")

# model parameters
FEATURES = MODEL_PARAMS.get("FEATURES")
TARGET_NAME = MODEL_PARAMS["TARGET"]

[32m2023-07-20 04:38:44.288[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 3>[0m:[36m3[0m - [1mExecution date: 2023-07-20T02:38:44.288102+00:00[0m
[32m2023-07-20 04:38:44.289[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 5>[0m:[36m5[0m - [1m
Data input directory : /Users/mouslydiaw/Downloads/sen-ia/house_price/data/input
Data output directory: /Users/mouslydiaw/Downloads/sen-ia/house_price/data/output[0m


# Data collection

In [3]:
data = load_data(dataset_name="house_prices", column_to_lower=column_to_lower)

[32m2023-07-20 04:38:44.373[0m | [1mINFO    [0m | [36msrc.make_dataset[0m:[36mload_data[0m:[36m24[0m - [1m
Args: dataset name: house_prices 
column to lower: True
[32m2023-07-20 04:38:44.591[0m | [1mINFO    [0m | [36msrc.make_dataset[0m:[36mload_data[0m:[36m30[0m - [1mShape of raw input features: (1460, 81)[0m
[32m2023-07-20 04:38:44.592[0m | [1mINFO    [0m | [36msrc.make_dataset[0m:[36mload_data[0m:[36m31[0m - [1mFull description of the dataset
Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.

With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.

MSSubClass: Identifies the type of

In [4]:
data.head()

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,landslope,neighborhood,condition1,condition2,bldgtype,housestyle,overallqual,overallcond,yearbuilt,yearremodadd,roofstyle,roofmatl,exterior1st,exterior2nd,masvnrtype,masvnrarea,exterqual,extercond,foundation,bsmtqual,bsmtcond,bsmtexposure,bsmtfintype1,bsmtfinsf1,bsmtfintype2,bsmtfinsf2,bsmtunfsf,totalbsmtsf,heating,heatingqc,centralair,electrical,1stflrsf,2ndflrsf,lowqualfinsf,grlivarea,bsmtfullbath,bsmthalfbath,fullbath,halfbath,bedroomabvgr,kitchenabvgr,kitchenqual,totrmsabvgrd,functional,fireplaces,fireplacequ,garagetype,garageyrblt,garagefinish,garagecars,garagearea,garagequal,garagecond,paveddrive,wooddecksf,openporchsf,enclosedporch,3ssnporch,screenporch,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice,building_age,remodel_age
0,1.0,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7.0,5.0,2003.0,2003.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,SBrkr,856.0,854.0,0.0,1710.0,1.0,0.0,2.0,1.0,3.0,1.0,Gd,8.0,Typ,0.0,,Attchd,2003.0,RFn,2.0,548.0,TA,TA,Y,0.0,61.0,0.0,0.0,0.0,0.0,,,,0.0,2.0,2008.0,WD,Normal,208500.0,5.0,5.0
1,2.0,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6.0,8.0,1976.0,1976.0,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,SBrkr,1262.0,0.0,0.0,1262.0,0.0,1.0,2.0,0.0,3.0,1.0,TA,6.0,Typ,1.0,TA,Attchd,1976.0,RFn,2.0,460.0,TA,TA,Y,298.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,5.0,2007.0,WD,Normal,181500.0,31.0,31.0
2,3.0,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7.0,5.0,2001.0,2002.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,SBrkr,920.0,866.0,0.0,1786.0,1.0,0.0,2.0,1.0,3.0,1.0,Gd,6.0,Typ,1.0,TA,Attchd,2001.0,RFn,2.0,608.0,TA,TA,Y,0.0,42.0,0.0,0.0,0.0,0.0,,,,0.0,9.0,2008.0,WD,Normal,223500.0,7.0,6.0
3,4.0,70.0,RL,60.0,9550.0,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7.0,5.0,1915.0,1970.0,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,SBrkr,961.0,756.0,0.0,1717.0,1.0,0.0,1.0,0.0,3.0,1.0,Gd,7.0,Typ,1.0,Gd,Detchd,1998.0,Unf,3.0,642.0,TA,TA,Y,0.0,35.0,272.0,0.0,0.0,0.0,,,,0.0,2.0,2006.0,WD,Abnorml,140000.0,91.0,36.0
4,5.0,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8.0,5.0,2000.0,2000.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,SBrkr,1145.0,1053.0,0.0,2198.0,1.0,0.0,2.0,1.0,4.0,1.0,Gd,9.0,Typ,1.0,TA,Attchd,2000.0,RFn,3.0,836.0,TA,TA,Y,192.0,84.0,0.0,0.0,0.0,0.0,,,,0.0,12.0,2008.0,WD,Normal,250000.0,8.0,8.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 83 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1460 non-null   float64
 1   mssubclass     1460 non-null   float64
 2   mszoning       1460 non-null   object 
 3   lotfrontage    1201 non-null   float64
 4   lotarea        1460 non-null   float64
 5   street         1460 non-null   object 
 6   alley          91 non-null     object 
 7   lotshape       1460 non-null   object 
 8   landcontour    1460 non-null   object 
 9   utilities      1460 non-null   object 
 10  lotconfig      1460 non-null   object 
 11  landslope      1460 non-null   object 
 12  neighborhood   1460 non-null   object 
 13  condition1     1460 non-null   object 
 14  condition2     1460 non-null   object 
 15  bldgtype       1460 non-null   object 
 16  housestyle     1460 non-null   object 
 17  overallqual    1460 non-null   float64
 18  overallc

# Hold out

In [6]:
x_train, x_test, y_train, y_test = train_test_split(data.loc[:, FEATURES],  # list(FEATURES.intersection(data.columns))
                                                    data.loc[:, TARGET_NAME],
                                                    test_size=MODEL_PARAMS["TEST_SIZE"],
                                                    random_state=SEED,
                                                   )

# Modeling

In [7]:
def eval_metrics(y_actual: Union[pd.DataFrame, pd.Series, np.ndarray],
                 y_pred: Union[pd.DataFrame, pd.Series, np.ndarray]
                 ) -> Dict[str, float]:
    """Compute evaluation metrics.

    Args:
        y_actual: Ground truth (correct) target values
        y_pred: Estimated target values.

    Returns:
        Dict[str, float]: dictionary of evaluation metrics.
            Expected keys are: "rmse", "mae", "r2", "max_error"

    """
    # Root mean squared error
    rmse = mean_squared_error(y_actual, y_pred, squared=False)
    # mean absolute error
    mae = mean_absolute_error(y_actual, y_pred)
    # R-squared: coefficient of determination
    r2 = r2_score(y_actual, y_pred)
    # max error: maximum value of absolute error (y_actual - y_pred)
    maxerror = max_error(y_actual, y_pred)
    return {"rmse": rmse,
            "mae": mae,
            "r2": r2,
            "max_error": maxerror
           }

In [8]:
mlflow.get_tracking_uri()

'file:///Users/mouslydiaw/Downloads/sen-ia/house_price/notebooks/mlruns'

In [9]:
def define_pipeline(numerical_transformer: list,
                    categorical_transformer: list,
                    target_transformer,
                    estimator: Pipeline,
                    **kwargs: dict) -> Pipeline:
    """ Define pipeline for modeling

    Args:
        **kwargs:

    Returns:
        Pipeline: sklearn pipeline
    """
    numerical_transformer = make_pipeline(*numerical_transformer)

    categorical_transformer = make_pipeline(*categorical_transformer)

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numerical_transformer, make_column_selector(dtype_include=["number"])),
            ("cat", categorical_transformer, make_column_selector(dtype_include=["object", "bool"])),
        ],
        remainder="drop",  # non-specified columns are dropped
        verbose_feature_names_out=False,  # will not prefix any feature names with the name of the transformer
    )
    # Append regressor to preprocessing pipeline.
    # Now we have a full prediction pipeline.
    if target_transformer:
        model_pipe1 = Pipeline(steps=[("preprocessor", preprocessor),
                                     ("estimator", estimator)])
        model_pipe = TransformedTargetRegressor(regressor=model_pipe1,
                                                func=np.log,
                                                inverse_func=np.exp)
    
    
    else:
        model_pipe = Pipeline(steps=[("preprocessor", preprocessor), ("estimator", estimator)])
        
    logger.info(f"{model_pipe}")
    return model_pipe

In [40]:
# Model definition
reg = define_pipeline(numerical_transformer=[SimpleImputer(strategy="median"),
                                             RobustScaler()],
                      categorical_transformer=[SimpleImputer(strategy="constant", fill_value="undefined"),
                                               OneHotEncoder(drop="if_binary", handle_unknown="ignore")],
                      target_transformer=False,
                      estimator=RandomForestClassifier(n_estimators=30)
                 )

reg

[32m2023-07-20 17:43:40.302[0m | [1mINFO    [0m | [36m__main__[0m:[36mdefine_pipeline[0m:[36m28[0m - [1mPipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                        

In [41]:
reg.fit(x_train, y_train)

# Evaluate Metrics
y_train_pred = reg.predict(x_train)
y_test_pred = reg.predict(x_test)
train_metrics = eval_metrics(y_train , y_train_pred)
test_metrics = eval_metrics(y_test , y_test_pred)

# log out metrics
logger.info(f"Train: {train_metrics}")
logger.info(f"Test: {test_metrics}")

[32m2023-07-20 17:44:05.991[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 10>[0m:[36m10[0m - [1mTrain: {'rmse': 338.07301905068624, 'mae': 14.526027397260274, 'r2': 0.9999815860240235, 'max_error': 8300.0}[0m
[32m2023-07-20 17:44:05.992[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 11>[0m:[36m11[0m - [1mTest: {'rmse': 46378.121549825744, 'mae': 31498.939726027398, 'r2': 0.6740322248856983, 'max_error': 237500.0}[0m


In [None]:
# Create an experiment if not exists
exp_name = "house-price"
experiment = mlflow.get_experiment_by_name(exp_name)
if not experiment:
    experiment_id = mlflow.create_experiment(exp_name)
else:
    experiment_id = experiment.experiment_id

In [44]:
# Useful for multiple runs (only doing one run in this sample notebook)
with mlflow.start_run(run_name=f"{EXECUTION_DATE.strftime('%Y%m%d_%H%m%S')}-house_price",
                      experiment_id=experiment_id,
                      tags={"version": "v1", "priority": "P1"},
                      description="house price modeling",
                     ) as mlf_run:
    print(f"run_id: {mlf_run.info.run_id}")
    print(f"version tag value: {mlf_run.data.tags.get('version')}")
    print("--")

    # Select number of estimator
    n_estimators = 10  # int(input("Estimator(s): "))
    mlflow.log_param("n_estimators", n_estimators)
    # Model definition
    reg = define_pipeline(numerical_transformer=[SimpleImputer(strategy="median"),
                                                 RobustScaler()],
                          categorical_transformer=[SimpleImputer(strategy="constant", fill_value="undefined"),
                                                   OneHotEncoder(drop="if_binary", handle_unknown="ignore")],
                          target_transformer=False,
                          estimator=RandomForestClassifier(n_estimators=n_estimators)
                     )

    reg.fit(x_train, y_train)

    # Evaluate Metrics
    y_train_pred = reg.predict(x_train)
    y_test_pred = reg.predict(x_test)
    train_metrics = eval_metrics(y_train , y_train_pred)
    test_metrics = eval_metrics(y_test , y_test_pred)

    # log out metrics
    logger.info(f"Train: {train_metrics}")
    logger.info(f"Test: {test_metrics}")
    
    # Infer model signature
    predictions = reg.predict(x_train)
    signature = infer_signature(x_train, predictions)

    # Log parameter, metrics, and model to MLflow
    for group_name, set_metrics in [("train", train_metrics),
                                    ("test", test_metrics),
                                   ]:
        for metric_name, metric_value in set_metrics.items():
            mlflow.log_metric(f"{group_name}_{metric_name}", metric_value)
    # mlflow.sklearn.log_model(reg, "model", signature=signature)

run_id: b618e9722e7e4f2eb1ea531ecce148af
version tag value: v1
--


Estimator(s):  30


[32m2023-07-20 18:41:16.537[0m | [1mINFO    [0m | [36m__main__[0m:[36mdefine_pipeline[0m:[36m28[0m - [1mPipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                        

## Performance analysis

In [None]:
# todo: optimiser, performances, features importances, ...

# Save model

In [51]:
reg

In [50]:
# sauvargade du modèle en local: dill, joblib (sklearn), pickle, pycaret
model_path_name = Path(MODEL_DIR, f'{EXECUTION_DATE.strftime("%Y%m%d_%H%m")}-{MODEL_NAME}')
save_object_with_dill(object_to_save=reg, object_path=model_path_name)

[32m2023-07-20 19:04:07.634[0m | [1mINFO    [0m | [36msrc.utils[0m:[36msave_object_with_dill[0m:[36m22[0m - [1mStarting object record in /Users/mouslydiaw/Downloads/sen-ia/house_price/models/20230720_0207-model_house_price.dill[0m
[32m2023-07-20 19:04:12.083[0m | [1mINFO    [0m | [36msrc.utils[0m:[36msave_object_with_dill[0m:[36m28[0m - [1mDone object record successfully[0m


In [None]:
# auto_log