In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
# NOTE(review): this compares version strings lexicographically, not numerically
# (e.g. "0.9" would compare as newer than "0.20") — confirm that is acceptable.
assert sklearn.__version__ >= "0.20"
from sklearn.ensemble import RandomForestRegressor

# Common imports
import argparse
import json
import numpy as np
import pandas as pd
from pathlib import Path
from pickle import dump, load
import os

# Databolt imports
import d6tflow
import luigi
from luigi.util import inherits

# Ignore useless warnings
import warnings
# With no category or message given, this filter suppresses all warnings.
warnings.filterwarnings(action="ignore")

# The cells below import from the `src` package and read the `conf` folder, both
# of which are expected one level above the folder the notebook is started from.
# Move up only when `src` is not already visible: an unconditional chdir to the
# parent would climb one more level on every re-run of this cell and break the
# later imports and relative paths.
if not (Path.cwd() / "src").is_dir():
    os.chdir(Path.cwd().parent)
base_path = os.getcwd()
print(base_path)

Loading postgres module without psycopg2 installed. Will crash at runtime if postgres functionality is used.
Loading S3 module without the python package boto3. Will crash at runtime if S3 functionality is used.


Welcome to d6tflow!
C:\Users\mdetomaso\Desktop\K_Code\custom-ml-pipelines


In [3]:
from src.config import read_config
from src.input_collector import InputArguments, DataAccessor
from src.databolt_pipeline_modeler import (
    databolt_training_dataflow, 
    databolt_prediction_dataflow,
    get_model_path
)

# Configuration Settings

In [4]:
# Build the location of the configuration folder ("conf" under the base path),
# using forward slashes regardless of the platform's native separator.
config_path = "/".join([base_path.replace("\\", "/"), "conf"])
args = InputArguments(pathConfFile=config_path)

def prepare_config_path(args) -> dict:
    """Prepare the paths to the config file (and eventual other files).

    Parameters
    ----------
    args
        Object exposing a ``pathConfFile`` attribute holding the path of the
        configuration folder.

    Returns
    -------
    dict
        Mapping with a single ``"config"`` key whose value is
        ``"<pathConfFile>/config.json"``.
    """
    # Build the file path directly instead of turning the folder path into a
    # ``str.format`` template: a folder name containing "{" or "}" would be
    # parsed as a replacement field and raise KeyError / IndexError.
    config_file = f"{args.pathConfFile}/config.json"
    print("Training configurations filepath: {}".format(config_file))
    return {"config": config_file}

# Resolve the configuration file location, then load it into `cfg`.
file_paths = prepare_config_path(args)
config_path = file_paths["config"]

print("Reading Housing model configuration ........")
cfg = read_config(config_path=config_path)

Training configurations filepath: C:/Users/mdetomaso/Desktop/K_Code/custom-ml-pipelines/conf/config.json
Reading Housing model configuration ........


In [5]:
# Databolt related settings
# Log level is set to 'ERROR'; the other accepted values are listed alongside.
d6tflow.settings.log_level = 'ERROR' # 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
# set the repo for each task output (relative path, so it resolves against the
# current working directory)
d6tflow.set_dir(dir='data/processed') 
# folder where workflow outputs are saved; left as the last expression so the
# notebook displays it
d6tflow.settings.dirpath 

WindowsPath('data/processed')

# Collect Housing data

In [6]:
# Build the data accessor from the input section of the configuration,
# load the housing data through it and preview the first rows.
data_accessor = DataAccessor(**cfg.input_config)
housing = data_accessor.load_housing_data()
housing.head()

Housing data loaded successfully.


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# Training 

In [7]:
print("Execute Training pipeline.")

# Launch the training dataflow with a default RandomForestRegressor; the flags
# request a train/test split, model training and evaluation.
databolt_training_dataflow(
    config=cfg,
    input_kwargs=cfg.input_config,
    training_kwargs=cfg.training_config,
    testing_kwargs=cfg.testing_config,
    do_split=True,
    train=True,
    estimator=RandomForestRegressor(),
    evaluation=True,
)

Execute Training pipeline.
Housing data loaded successfully.
Stratified Train / Test split.
Train set dimesions: (16512, 11)
Test set dimesions: (4128, 11)
Train target dimesions: (16512,)
Test target dimesions: (4128,)
Numerical missing values imputed.
New numerical attributes created.
Numerical missing values imputed.
New numerical attributes created.
Categorical missing values imputed.
Categorical data encoded.
All features processed.
Doing model training and prediction.
Model scored.


True

# Testing

In [9]:
# Resolve the model path for this configuration and estimator.
model_path = get_model_path(
    config=cfg,
    estimator=RandomForestRegressor(),
    input_kwargs=cfg.input_config,
    training_kwargs=cfg.training_config,
    testing_kwargs=cfg.testing_config,
)

print("Execute Prediction pipeline on test data.")

# Run the prediction dataflow with the split, test and evaluation flags enabled.
databolt_prediction_dataflow(
    config=cfg,
    input_kwargs=cfg.input_config,
    training_kwargs=cfg.training_config,
    testing_kwargs=cfg.testing_config,
    do_split=True,
    test=True,
    model_path=model_path,
    evaluation=True,
)

Execute Prediction pipeline on test data.
Model scored.


True

# Production

In [10]:
# Resolve the model path for this configuration and estimator.
model_path = get_model_path(
    config=cfg,
    estimator=RandomForestRegressor(),
    input_kwargs=cfg.input_config,
    training_kwargs=cfg.training_config,
    testing_kwargs=cfg.testing_config,
)

print("Execute Prediction pipeline on housing data.")

# Run the prediction dataflow with both the split and evaluation flags disabled.
databolt_prediction_dataflow(
    config=cfg,
    input_kwargs=cfg.input_config,
    training_kwargs=cfg.training_config,
    testing_kwargs=cfg.testing_config,
    do_split=False,
    model_path=model_path,
    evaluation=False,
)

Execute Prediction pipeline on housing data.
Categorical missing values imputed.
Categorical data encoded.
All features processed.
Using trained model and doing predictions.


True