In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Common imports
import json
import numpy as np
import pandas as pd
from pathlib import Path
from pickle import dump, load
import os

# Ignore useless warnings 
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [3]:
# set the repo root directory to use as base path
os.chdir((Path.cwd() / "").parents[0]) # to change with base path
root_path = os.getcwd()

In [4]:
# Import project specific libraries
from src.config import read_config
from src.input_collector import InputArguments, DataAccessor
from src.sk_pipeline_modeler import sklearn_dataflow, CustomModelTrainer, CustomTrainTestSplit

# Configuration Settings

In [5]:
# identify the path to the configurations folder
config_path = str(root_path.replace("\\", "/") +"/"+"conf")
args = InputArguments(pathConfFile=config_path)

def prepare_config_path(args) -> dict:
    config_path = f"{args.pathConfFile}/{{}}"
    print("Training configurations filepath: {}".format(config_path.format("config.json")))
    return {"config": config_path.format("config.json")}

file_paths = prepare_config_path(args)

# Read config.py
print("Reading Housing model configuration ........")
config_path = file_paths["config"]
cfg = read_config(config_path=config_path)

Training configurations filepath: C:/Users/mdetomaso/Desktop/K_Code/custom-ml-pipelines/conf/config.json
Reading Housing model configuration ........


# Training

## Collect Housing Data

In [7]:
# Access housing data
housing = DataAccessor(**cfg.input_config).load_housing_data()
housing.head()

Housing data loaded successfully.


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Train / Test split

In [8]:
# Apply train / test split stratified on median income attribute
train_set, test_set, train_target, test_target = CustomTrainTestSplit(**cfg.testing_config).transform(housing)

# DUMP artitioned data
paths = [cfg.trainset_path, cfg.testset_path, cfg.train_target_path, cfg.test_target_path]
partitions = [train_set, test_set, train_target, test_target]

# save partitioned data
if not os.path.isdir(cfg.partitions_path):
    os.makedirs(cfg.partitions_path)    
for data, path in zip(partitions, paths):
    data.to_csv(path, index=False)
    print("Wrote {}".format(path))

Stratified Train / Test split.
Train set dimesions: (16512, 11)
Test set dimesions: (4128, 11)
Train target dimesions: (16512,)
Test target dimesions: (4128,)
Wrote data/outputs/partitions/train_set.csv
Wrote data/outputs/partitions/test_set.csv
Wrote data/outputs/partitions/train_target.csv
Wrote data/outputs/partitions/test_target.csv


## Apply standard sklearn Pipeline

In [9]:
# Fit the numerical pipeline to save the scaler across all data
num_pipeline, cat_pipeline, features_pipe, housing_full_pipeline = sklearn_dataflow(config=cfg)
num_pipeline.fit(housing)

# Save fitted scaler
if not os.path.isdir(cfg.artifacts_path):
    os.makedirs(cfg.artifacts_path)
    
scaler = num_pipeline.named_steps['std_scaler']
f = open(cfg.scaler_path, 'wb')
dump(scaler, f)
f.close()

Sklearn Housing Prediction Pipeline created.
Numerical missing values imputed.
New numerical attributes created.


In [10]:
# Load scaler across all housing data
scaler = load(open(cfg.scaler_path, 'rb'))

# Process the training set
num_pipeline, cat_pipeline, features_pipe, housing_full_pipeline = sklearn_dataflow(config=cfg, scaler = scaler)
prepared_trainset = features_pipe.fit_transform(train_set)
prepared_trainset

Sklearn Housing Prediction Pipeline created.
Numerical missing values imputed.
New numerical attributes created.
Categorical missing values imputed.
Categorical data encoded.


array([[-121.89,   37.29,   38.  , ...,    0.  ,    0.  ,    0.  ],
       [-121.93,   37.05,   14.  , ...,    0.  ,    0.  ,    0.  ],
       [-117.2 ,   32.77,   31.  , ...,    0.  ,    0.  ,    1.  ],
       ...,
       [-116.4 ,   34.09,    9.  , ...,    0.  ,    0.  ,    0.  ],
       [-118.01,   33.82,   31.  , ...,    0.  ,    0.  ,    0.  ],
       [-122.45,   37.77,   52.  , ...,    0.  ,    1.  ,    0.  ]])

## Train the model and save the best one

In [11]:
# OPTION 1: pass transformed data and ML estimator separately
estimator1 = RandomForestRegressor()
trainer = CustomModelTrainer(estimator1, **cfg.training_config).train(
    X=prepared_trainset, y=train_target
)

final_model_1 = trainer.best_estimator_

# OPTION 2: pass full pipeline with the estimator included
# NOTE: need to modify params_grid consistently
# estimator2 = housing_full_pipeline
# trainer = CustomModelTrainer(estimator2, **cfg.training_config).train(
#    X=train_set, y=train_target
# )

# final_model_2 = trainer.best_estimator_



In [12]:
# SAVE the best estimator
if not os.path.isdir(cfg.artifacts_path):
    os.makedirs(cfg.artifacts_path)
dump(final_model_1, open(cfg.model_path, 'wb'))
print("House model saved.")

House model saved.


In [13]:
# OPTIONAL: Explore used hyperparamters in each run
# cvres = rnd_search.cv_results_
# for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
#    print(np.sqrt(-mean_score), params)

In [14]:
# OPTIONAL: Inspect Pipeline paramters
# for param in housing_full_pipeline.get_params().keys():
#    print(param)

# Testing

## Collect Data to predict

In [15]:
# LOAD DATA
test_set = pd.read_csv(cfg.testset_path)
test_target = pd.read_csv(cfg.test_target_path)

print("Test data loaded.")
test_set.head()

Test data loaded.


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-118.39,34.12,29.0,6447.0,1012.0,2184.0,960.0,8.2816,<1H OCEAN
1,-117.86,33.77,39.0,4159.0,655.0,1669.0,651.0,4.6111,<1H OCEAN
2,-119.05,34.21,27.0,4357.0,926.0,2110.0,876.0,3.0119,<1H OCEAN
3,-118.15,34.2,52.0,1786.0,306.0,1018.0,322.0,4.1518,INLAND
4,-117.68,34.07,32.0,1775.0,314.0,1067.0,302.0,4.0375,INLAND


## Get feature engineering pipeline 

In [16]:
num_pipe, cat_pipe, feature_pipe, housing_pipeline = sklearn_dataflow(config=cfg, scaler=scaler)
processed_test = feature_pipe.fit_transform(test_set)
processed_test

Sklearn Housing Prediction Pipeline created.
Numerical missing values imputed.
New numerical attributes created.
Categorical missing values imputed.
Categorical data encoded.


array([[-118.39,   34.12,   29.  , ...,    0.  ,    0.  ,    0.  ],
       [-117.86,   33.77,   39.  , ...,    0.  ,    0.  ,    0.  ],
       [-119.05,   34.21,   27.  , ...,    0.  ,    0.  ,    0.  ],
       ...,
       [-118.49,   34.18,   31.  , ...,    0.  ,    0.  ,    0.  ],
       [-117.32,   33.99,   27.  , ...,    0.  ,    0.  ,    0.  ],
       [-118.91,   36.79,   19.  , ...,    0.  ,    0.  ,    0.  ]])

## Make predictions on test data

In [17]:
# LOAD trained model
final_model = load(open(cfg.model_path, 'rb'))
print("Fitted housing pipeline loaded.")
final_predictions = final_model.predict(processed_test)
print("Predictions are: {}".format(final_predictions))

Fitted housing pipeline loaded.
Predictions are: [487546.82 271099.   230383.01 ... 340668.05 157825.   127998.01]


## Evaluate Predictions

In [19]:
final_rmse = np.sqrt(mean_squared_error(test_target, final_predictions))
print("Prediction error is {} $".format(round(final_rmse,2)))

Prediction error is 47220.85 $
