## Importing Libraries

In [1]:
import os
import tarfile
import urllib
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer


## Download the dataset 

In [2]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

fetch_housing_data()


## Load the dataset  

In [3]:
def load_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing=load_data()

In [4]:

col_names = "total_rooms", "total_bedrooms", "population", "households"
rooms_ix, bedrooms_ix, population_ix, households_ix = [
    housing.columns.get_loc(c) for c in col_names]

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

def train_test(data):
    housing=data
    train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
    housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(housing, housing["income_cat"]):
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]
    
    return strat_train_set,strat_test_set
    
    
    
def data_prep(data):
    housing=data
    
    housing_labels = housing["median_house_value"].copy()
    housing = housing.drop("median_house_value", axis=1)
    
    
    housing_num = housing.drop("ocean_proximity", axis=1)
    
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    
    
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]

    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
    
    housing_prepared = full_pipeline.fit_transform(housing)
    
    return housing_prepared,housing_labels
    
    
    

In [5]:
# mlflow server --backend-store-uri mlruns/ --default-artifact-root mlruns/ --host 0.0.0.0 --port 5000

remote_server_uri = "http://0.0.0.0:5000" # set to your server URI
mlflow.set_tracking_uri(remote_server_uri)  # or set the MLFLOW_TRACKING_URI in the env

In [6]:
mlflow.tracking.get_tracking_uri()

'http://0.0.0.0:5000'

In [7]:
exp_name = "Housing"
mlflow.set_experiment(exp_name)

<Experiment: artifact_location='mlruns/2', experiment_id='2', lifecycle_stage='active', name='Housing', tags={}>

In [8]:
def eval_metrics(forest_reg, actual, pred):
    # compute relevant metrics
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2


In [9]:
def train(max_features=8, n_estimators=30):
    # train a model with given parameters
    warnings.filterwarnings("ignore")
    

    data_path = "datasets/housing/housing.csv"
    

    # Useful for multiple runs (only doing one run in this sample notebook)    
    with mlflow.start_run(run_name='PARENT_housing') as parent_run:
        mlflow.log_param("parent", "yes")
        with mlflow.start_run(run_name='CHILD_DATA_PREP', nested=True) as child_run:
            mlflow.log_param("child", "yes")
            data = load_data()
    
            train_data, test_data = train_test(data)
    
            train_x, train_y = data_prep(train_data)
    
            test_x, test_y = data_prep(test_data)
        
        with mlflow.start_run(run_name='CHILD_TRAIN_MODEL', nested=True) as child_run:
            mlflow.log_param("child", "yes")
        
            forest_reg=RandomForestRegressor(max_features=max_features, n_estimators=n_estimators, random_state=42)
       
            forest_reg.fit(train_x, train_y)
        
        with mlflow.start_run(run_name='CHILD_SCORING', nested=True) as child_run:
            mlflow.log_param("child", "yes")
        
            # Evaluate Metrics
            predicted = forest_reg.predict(test_x)
            (rmse, mae, r2) = eval_metrics(forest_reg, test_y, predicted)

        # Print out metrics
        print("Random Forest model (max_features=%f, n_estimators=%f):" % (max_features, n_estimators))
        print("  rmse: %s" % rmse)
        print("  mae: %s" % mae)
        print("  r2_score: %s" % r2)

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param(key="max_features", value=max_features)
        mlflow.log_param(key="n_estimators", value=n_estimators)
        mlflow.log_metric(key="rmse", value=rmse)
        mlflow.log_metrics({"mae": mae, "r2_score": r2})
        mlflow.log_artifact(data_path)
        print("Save to: {}".format(mlflow.get_artifact_uri()))
        
        mlflow.sklearn.log_model(forest_reg, "model")

In [10]:
train(8,30)

Random Forest model (max_features=8.000000, n_estimators=30.000000):
  rmse: 70466.35579577027
  mae: 52196.55785691215
  r2_score: 0.619009430380601
Save to: mlruns/2/37217c37ecbd4556871aa5c9760c8a2d/artifacts


In [11]:
train(7,25)

Random Forest model (max_features=7.000000, n_estimators=25.000000):
  rmse: 67489.71449387146
  mae: 49968.91187984496
  r2_score: 0.6505172187496944
Save to: mlruns/2/dca4a931b94a4aa5a08033999d5475f3/artifacts


In [12]:
train(5,10)

Random Forest model (max_features=5.000000, n_estimators=10.000000):
  rmse: 66118.25234969275
  mae: 48755.6238372093
  r2_score: 0.6645766185566665
Save to: mlruns/2/c29dc5355aec44d5b23c063f3ec7fb95/artifacts


In [13]:
train(2,5)

Random Forest model (max_features=2.000000, n_estimators=5.000000):
  rmse: 67249.17211282428
  mae: 49138.40993217054
  r2_score: 0.6530039861262631
Save to: mlruns/2/65f6f31380504c0fbd05971569ab85b4/artifacts


In [14]:
train(6,20)

Random Forest model (max_features=6.000000, n_estimators=20.000000):
  rmse: 66732.50340117687
  mae: 49607.764643895345
  r2_score: 0.6583153758426179
Save to: mlruns/2/c3e56c9783124e64ad78f08d39ecf16b/artifacts
