# <font color='red'> LESSON 4C: Production Deployment</font>

In [126]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [127]:
data = pd.read_csv('./data/train.csv')

## Scikit pipelines

Pipeline can be used to chain multiple estimators into one. This is useful as there is often a fixed sequence of steps in processing the data, for example feature selection, normalization and classification. Pipeline serves multiple purposes here:

**Convenience and encapsulation**

You only have to call fit and predict once on your data to fit a whole sequence of estimators.

**Joint parameter selection**

You can grid search over parameters of all estimators in the pipeline at once.

**Safety**

Pipelines help avoid leaking statistics from your test data into the trained model in cross-validation, by ensuring that the same samples are used to train the transformers and predictors.

All estimators in a pipeline, except the last one, must be transformers (i.e. must have a transform method). The last estimator may be any type (transformer, classifier, etc.).

## Numeric vs. Categorical Features

At the beginning, we have to be sure that we are able to distinguish between continuous and categorical features. 

The reason is simple. The work and processing both type of features are different!!!

  - **Continuous(numeric) feature**: Continuous variables are numeric variables that have an infinite number of values between any two values. A continuous variable can be numeric or date/time. *For example, the length of a part or the date and time a payment is received.*
  
  - **Categorical feature**: Categorical variables contain a finite number of categories or distinct groups. Categorical data might not have a logical order. *For example, categorical predictors include gender, material type, and payment method.*

In general, type of object is not satisfactory and always correct. 

In [128]:
from pandas.api.types import is_numeric_dtype

unique_count_ratio = 0.05
empty_ratio = 0.9
min_distinct_values = 2
columns = data.columns
# define label columns
label = 'SalePrice'
date_features = ['YearBuilt', 'GarageYrBlt', 'YearRemodAdd', 'YrSold']
#manually remove features
drop_features = ['PriceNtile', 'Neighbourhood_Rank']
# define empty list of categorical features which we will fill
cat_features = []
# define empty list of numerical features which we will fill
num_features = []


#consider feature categorical if ratio is below given value and number of unique records is >= 2
for var in columns:
    if \
      1.*data[var].nunique()/data[var].count() < unique_count_ratio \
      and data[var].nunique() >= min_distinct_values \
      and data[var].isna().sum() / data[var].count() < empty_ratio:
        cat_features.append(var)
    
#consider feature numerical if ratio is greater than or equal to given value and feature has numerival data type
for var in columns:
    if \
      1.*data[var].nunique()/data[var].count() >= unique_count_ratio \
      and is_numeric_dtype(data[var]) \
      and data[var].isna().sum() / data[var].count() < empty_ratio:
        num_features.append(var)
        
# remove data features from feature lists
cat_features = [f for f in cat_features if f not in date_features + drop_features ]
num_features = [f for f in num_features if f not in date_features + drop_features ]
leftover_features = [c for c in columns if c not in num_features + cat_features + date_features]

## OutlierRemover
Create a custom object for outlier handling

In [129]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class OutlierRemover(BaseEstimator, TransformerMixin):
    """Categorical data missing value imputer."""

    def __init__(self, variables=None) -> None:
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X: pd.DataFrame, y: pd.Series = None
            ) -> 'OutlierRemover':
        """Fit statement to accomodate the sklearn pipeline."""
        self.var_iqrs = {}
        for var in self.variables:
            Q3 = np.quantile(X[var], 0.75)
            Q1 = np.quantile(X[var], 0.25)
            IQR = Q3 - Q1
            lower_range = Q1 - 1.5 * IQR
            upper_range = Q3 + 1.5 * IQR
            self.var_iqrs[var] = {
                'lw_rng': lower_range
                , 'up_rng': upper_range}
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the transforms to the dataframe."""

        X = X.copy()
        for var in self.variables:
            lower_range = self.var_iqrs[var]['lw_rng']
            upper_range = self.var_iqrs[var]['up_rng']
            if not lower_range == upper_range == 0:
                X[var] = np.where(X[var] > upper_range, upper_range, X[var])
                X[var] = np.where(X[var] < lower_range, lower_range, X[var])

        return X

## DateProcessor
Custom date processor

In [130]:
class DateProcessor(BaseEstimator, TransformerMixin):
    """Categorical data missing value imputer."""

    def __init__(self, variables=None, date_treshold=1900) -> None:
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables
        self.date_treshold = date_treshold

    def fit(self, X: pd.DataFrame, y: pd.Series = None
            ) -> 'DateProcessor':
        """Fit statement to accomodate the sklearn pipeline."""
        self.max_dates = {}
        self.quantiles = {}
        for var in self.variables:
            self.quantiles[var] = X[var].quantile([0, .2, .4, .6, .8, 1]).unique()
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the transforms to the dataframe."""

        X = X.copy()
        for var in self.variables:
            X[var] = np.where(X[var] <= self.date_treshold, np.nan, X[var]).astype(float)
            X[var] = pd.cut(X[var], bins=self.quantiles[var], labels=False)
            X[var] = X[var].replace(np.nan, 'UNKNOWN').astype(str)
        return X

## CategoryMerger

Custom category merger for categories with low counts

In [131]:
class CategoryMerger(BaseEstimator, TransformerMixin):
    """Categorical data missing value imputer."""

    def __init__(self, variables=None ) -> None:
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables
        self.merge_treshold = .05

    def fit(self, X: pd.DataFrame, y: pd.Series = None
            ) -> 'CategoryMerger':
        """Fit statement to accomodate the sklearn pipeline."""
        self.merge_cats = {}
        df_size = X.shape[0]
        for var in self.variables:
            #group by category value and get count
            cat_distribution = pd.DataFrame(X[[var]].groupby(var).size() / df_size).reset_index().rename(columns={0: 'count'}).sort_values(by='count')
            #get those values with count <= that minimum value
            insufficient_count_data = cat_distribution[cat_distribution['count'] <= self.merge_treshold]
            #if there are more of those per category
            if insufficient_count_data.shape[0] > 1:
                self.merge_cats[var] = insufficient_count_data[var].tolist()
            else:
                self.merge_cats[var] = None

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the transforms to the dataframe."""

        X = X.copy()
        for var in self.variables:
            if self.merge_cats[var] is not None:
                X[var] = X[var].replace(self.merge_cats[var], 'OTHER')
        return X

## CategoryEncoder

Custom category encoder

In [132]:
class CategoryEncoder(BaseEstimator, TransformerMixin):
    """Categorical data missing value imputer."""

    def __init__(self, variables=None ) -> None:
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables
        self.merge_treshold = .05

    def fit(self, X: pd.DataFrame, y: pd.Series = None
            ) -> 'CategoryEncoder':
        """Fit statement to accomodate the sklearn pipeline."""
        
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the transforms to the dataframe."""

        X = X.copy()
        X = pd.get_dummies(X , columns = self.variables, prefix = self.variables) \
        
        return X

## Scaler & Imputer

In [133]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

class ScalerImputer(BaseEstimator, TransformerMixin):
    """Categorical data missing value imputer."""

    def __init__(self, variables=None ) -> None:
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables
        self.merge_treshold = .05
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        #self.scaler = StandardScaler()

    def fit(self, X: pd.DataFrame, y: pd.Series = None
            ) -> 'ScalerImputer':
        """Fit statement to accomodate the sklearn pipeline."""
        self.imputer.fit(X)
        #self.scaler.fit(X)
        
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the transforms to the dataframe."""

        X = X.copy()
        X_c = X.copy()
        X = pd.DataFrame(self.imputer.transform(X))
        #X = pd.DataFrame(self.scaler.transform(X))
        X.columns = X_c.columns
        X.index = X_c.index
        
        return X

## Cleaner
Final step top format the data frame

In [134]:
class Cleaner(BaseEstimator, TransformerMixin):
    """Categorical data missing value imputer."""

    def __init__(self, variables=None ) -> None:
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables
        self.merge_treshold = .05

    def fit(self, X: pd.DataFrame, y: pd.Series = None
            ) -> 'Cleaner':
        """Fit statement to accomodate the sklearn pipeline."""
        
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the transforms to the dataframe."""

        X = X.copy()
        X = X \
            .set_index('Id') \
            .rename(columns={'SalePrice': 'label'}) \
            .drop(columns=self.variables)
        
        return X

## Assembling the pipeline

Sklearn pipelines allows us to combine multiple estimators into a single pipeline

In [135]:
from sklearn.pipeline import Pipeline

Our pipeline has following steps:
  - outlierRemover
  - dateProcessor
  - categoryMerger
  - categoryEncoder
  - cleaner
  - scaler

In [141]:
data_engineering_pipe = Pipeline([
      ('outlierRemover', OutlierRemover(variables=num_features))
    , ('dateProcessor', DateProcessor(variables=date_features))
    , ('categoryMerger', CategoryMerger(variables=cat_features + date_features))
    , ('categoryEncoder', CategoryEncoder(variables=cat_features + date_features))
    , ('cleaner', Cleaner(variables=leftover_features))
    , ('scaler', ScalerImputer())
])

We can call `.fit` on whole pipeline at once

In [142]:
data_engineering_pipe.fit(data)

Pipeline(steps=[('outlierRemover',
                 OutlierRemover(variables=['Id', 'LotFrontage', 'LotArea',
                                           'MasVnrArea', 'BsmtFinSF1',
                                           'BsmtFinSF2', 'BsmtUnfSF',
                                           'TotalBsmtSF', '1stFlrSF',
                                           '2ndFlrSF', 'GrLivArea',
                                           'GarageArea', 'WoodDeckSF',
                                           'OpenPorchSF', 'EnclosedPorch',
                                           'ScreenPorch', 'SalePrice'])),
                ('dateProcessor',
                 DateProcessor(variables=['YearBuilt', 'GarageYrBlt',
                                          'YearRemo...
                                            'BldgType', 'HouseStyle',
                                            'OverallQual', 'OverallCond',
                                            'RoofStyle', 'RoofMatl',
                   

Also, we can transform data using this pipeline

In [143]:
processed_data = data_engineering_pipe.transform(data)

In [144]:
processed_data

Unnamed: 0_level_0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,...,YearRemodAdd_1.0,YearRemodAdd_2.0,YearRemodAdd_3.0,YearRemodAdd_4.0,YearRemodAdd_UNKNOWN,YrSold_0.0,YrSold_1.0,YrSold_2.0,YrSold_3.0,YrSold_UNKNOWN
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,65.0,8450.0,196.0,706.0,0.0,150.0,856.0,856.0,854.0,1710.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2.0,80.0,9600.0,0.0,978.0,0.0,284.0,1262.0,1262.0,0.0,1262.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3.0,68.0,11250.0,162.0,486.0,0.0,434.0,920.0,920.0,866.0,1786.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4.0,60.0,9550.0,0.0,216.0,0.0,540.0,756.0,961.0,756.0,1717.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5.0,84.0,14260.0,350.0,655.0,0.0,490.0,1145.0,1145.0,1053.0,2198.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456.0,62.0,7917.0,0.0,0.0,0.0,953.0,953.0,953.0,694.0,1647.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1457.0,85.0,13175.0,119.0,790.0,163.0,589.0,1542.0,2073.0,0.0,2073.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1458.0,66.0,9042.0,0.0,275.0,0.0,877.0,1152.0,1188.0,1152.0,2340.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1459.0,68.0,9717.0,0.0,49.0,1029.0,0.0,1078.0,1078.0,0.0,1078.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## MLflow

MLflow is an open source platform to manage the ML lifecycle, including experimentation, reproducibility, deployment, and a central model registry.

Firstly, we will log into ML Flow (the link is just for demonstration, will be shut down after the class)

In [146]:
# Import mlflow
import mlflow
import mlflow.sklearn
mlflow.set_tracking_uri("http://20.52.18.106:5000/")
mlflow.set_experiment("house-price-experiment")

#### MLflow UI

http://20.52.18.106:5000/

In [147]:
import os
os.environ["AWS_ACCESS_KEY_ID"] = "admin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "sample_key"
os.environ["MLFLOW_S3_ENDPOINT_URL"]= "http://20.52.18.106:9000"
os.environ["MLFLOW_TRACKING_URI"] = "http://20.52.18.106:5000"

Next we will train the model

In [148]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    processed_data.drop(columns=['label'])
    , processed_data[['label']]
    , test_size = 0.3, random_state=42)

In [149]:
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import lasso_path, enet_path
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Evaluate metrics
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

alpha = 0.05
l1_ratio = 0.05

# Run ElasticNet
elnModel = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
elnModel.fit(x_train, y_train)
predicted_qualities = elnModel.predict(x_test)
(rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)

# Print out ElasticNet model metrics
print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
print("  RMSE: %s" % rmse)
print("  MAE: %s" % mae)
print("  R2: %s" % r2)

Elasticnet model (alpha=0.050000, l1_ratio=0.050000):
  RMSE: 21233.556000833283
  MAE: 15443.970517176716
  R2: 0.9035227604636454


Finally we can save and log model into MLflow.

In [165]:
# Log mlflow attributes for mlflow UI
from mlflow.models.signature import infer_signature
signature = infer_signature(x_train, elnModel.predict(x_train))

with mlflow.start_run():
    mlflow.log_param("alpha", alpha)
    mlflow.log_param("l1_ratio", l1_ratio)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.sklearn.log_model(sk_model = elnModel, artifact_path = "elnModel", registered_model_name="elnModel")

Registered model 'elnModel' already exists. Creating a new version of this model...
Created version '17' of model 'elnModel'.


In [None]:
mlflow.sklearn.save_model(
    elnModel, "elnModel/",
    serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE)

In [154]:
!curl -X POST -H "Content-Type:application/json; format=pandas-split" --data '{"columns":["LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "ScreenPorch", "MSSubClass_20", "MSSubClass_50", "MSSubClass_60", "MSSubClass_120", "MSSubClass_OTHER", "MSZoning_OTHER", "MSZoning_RL", "MSZoning_RM", "Street_Grvl", "Street_Pave", "LotShape_IR1", "LotShape_OTHER", "LotShape_Reg", "LandContour_Lvl", "LandContour_OTHER", "Utilities_AllPub", "Utilities_NoSeWa", "LotConfig_Corner", "LotConfig_CulDSac", "LotConfig_Inside", "LotConfig_OTHER", "LandSlope_Gtl", "LandSlope_OTHER", "Neighborhood_CollgCr", "Neighborhood_Edwards", "Neighborhood_Gilbert", "Neighborhood_NAmes", "Neighborhood_NridgHt", "Neighborhood_OTHER", "Neighborhood_OldTown", "Neighborhood_Sawyer", "Neighborhood_Somerst", "Condition1_Feedr", "Condition1_Norm", "Condition1_OTHER", "Condition2_Norm", "Condition2_OTHER", "BldgType_1Fam", "BldgType_OTHER", "BldgType_TwnhsE", "HouseStyle_1.5Fin", "HouseStyle_1Story", "HouseStyle_2Story", "HouseStyle_OTHER", "OverallQual_4", "OverallQual_5", "OverallQual_6", "OverallQual_7", "OverallQual_8", "OverallQual_OTHER", "OverallCond_5", "OverallCond_6", "OverallCond_7", "OverallCond_OTHER", "RoofStyle_Gable", "RoofStyle_Hip", "RoofStyle_OTHER", "RoofMatl_CompShg", "RoofMatl_OTHER", "Exterior1st_HdBoard", "Exterior1st_MetalSd", "Exterior1st_OTHER", "Exterior1st_Plywood", "Exterior1st_VinylSd", "Exterior1st_Wd Sdng", "Exterior2nd_HdBoard", "Exterior2nd_MetalSd", "Exterior2nd_OTHER", "Exterior2nd_Plywood", "Exterior2nd_VinylSd", "Exterior2nd_Wd Sdng", "MasVnrType_BrkCmn", "MasVnrType_BrkFace", "MasVnrType_None", "MasVnrType_Stone", "ExterQual_Gd", "ExterQual_OTHER", "ExterQual_TA", "ExterCond_Gd", "ExterCond_OTHER", "ExterCond_TA", "Foundation_BrkTil", "Foundation_CBlock", "Foundation_OTHER", "Foundation_PConc", "BsmtQual_Ex", "BsmtQual_Fa", "BsmtQual_Gd", "BsmtQual_TA", "BsmtCond_OTHER", "BsmtCond_TA", "BsmtExposure_Av", "BsmtExposure_Gd", "BsmtExposure_Mn", "BsmtExposure_No", "BsmtFinType1_ALQ", "BsmtFinType1_BLQ", "BsmtFinType1_GLQ", "BsmtFinType1_LwQ", "BsmtFinType1_Rec", "BsmtFinType1_Unf", "BsmtFinType2_OTHER", "BsmtFinType2_Unf", "Heating_GasA", "Heating_OTHER", "HeatingQC_Ex", "HeatingQC_Gd", "HeatingQC_OTHER", "HeatingQC_TA", "CentralAir_N", "CentralAir_Y", "Electrical_FuseA", "Electrical_OTHER", "Electrical_SBrkr", "LowQualFinSF_0", "LowQualFinSF_OTHER", "BsmtFullBath_0", "BsmtFullBath_1", "BsmtFullBath_OTHER", "BsmtHalfBath_0", "BsmtHalfBath_1", "BsmtHalfBath_2", "FullBath_1", "FullBath_2", "FullBath_OTHER", "HalfBath_0", "HalfBath_1", "HalfBath_2", "BedroomAbvGr_2", "BedroomAbvGr_3", "BedroomAbvGr_4", "BedroomAbvGr_OTHER", "KitchenAbvGr_1", "KitchenAbvGr_OTHER", "KitchenQual_Ex", "KitchenQual_Fa", "KitchenQual_Gd", "KitchenQual_TA", "TotRmsAbvGrd_4", "TotRmsAbvGrd_5", "TotRmsAbvGrd_6", "TotRmsAbvGrd_7", "TotRmsAbvGrd_8", "TotRmsAbvGrd_9", "TotRmsAbvGrd_OTHER", "Functional_OTHER", "Functional_Typ", "Fireplaces_0", "Fireplaces_1", "Fireplaces_2", "Fireplaces_3", "FireplaceQu_Gd", "FireplaceQu_OTHER", "FireplaceQu_TA", "GarageType_Attchd", "GarageType_BuiltIn", "GarageType_Detchd", "GarageType_OTHER", "GarageFinish_Fin", "GarageFinish_RFn", "GarageFinish_Unf", "GarageCars_0", "GarageCars_1", "GarageCars_2", "GarageCars_3", "GarageCars_4", "GarageQual_OTHER", "GarageQual_TA", "GarageCond_OTHER", "GarageCond_TA", "PavedDrive_N", "PavedDrive_P", "PavedDrive_Y", "3SsnPorch_0", "3SsnPorch_OTHER", "PoolArea_0", "PoolArea_OTHER", "MiscVal_0", "MiscVal_OTHER", "MoSold_3", "MoSold_4", "MoSold_5", "MoSold_6", "MoSold_7", "MoSold_8", "MoSold_10", "MoSold_11", "MoSold_OTHER", "SaleType_New", "SaleType_OTHER", "SaleType_WD", "SaleCondition_Abnorml", "SaleCondition_Normal", "SaleCondition_OTHER", "SaleCondition_Partial", "YearBuilt_0.0", "YearBuilt_1.0", "YearBuilt_2.0", "YearBuilt_3.0", "YearBuilt_4.0", "YearBuilt_UNKNOWN", "GarageYrBlt_0.0", "GarageYrBlt_1.0", "GarageYrBlt_2.0", "GarageYrBlt_3.0", "GarageYrBlt_4.0", "GarageYrBlt_UNKNOWN", "YearRemodAdd_0.0", "YearRemodAdd_1.0", "YearRemodAdd_2.0", "YearRemodAdd_3.0", "YearRemodAdd_4.0", "YearRemodAdd_UNKNOWN", "YrSold_0.0", "YrSold_1.0", "YrSold_2.0", "YrSold_3.0", "YrSold_UNKNOWNh"],"data":[[70.0, 8414.0, 0.0, 663.0, 0.0, 396.0, 1059.0, 1068.0, 0.0, 1068.0, 264.0, 192.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]}' http://20.52.18.106:5001/invocations

[136204.8692541481]

And another model for comparison


In [166]:
from sklearn.linear_model import LinearRegression

lin_model = LinearRegression()
lin_model.fit(x_train, y_train)
predicted_qualities = lin_model.predict(x_test)
(rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)

# Print out ElasticNet model metrics
print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
print("  RMSE: %s" % rmse)
print("  MAE: %s" % mae)
print("  R2: %s" % r2)

with mlflow.start_run():
    mlflow.log_param("alpha", alpha)
    mlflow.log_param("l1_ratio", l1_ratio)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.sklearn.log_model(lin_model, "linModel", registered_model_name="linModel")

Elasticnet model (alpha=0.050000, l1_ratio=0.050000):
  RMSE: 23304.75284704685
  MAE: 16730.87883107138
  R2: 0.8837833351995802


Registered model 'linModel' already exists. Creating a new version of this model...
Created version '2' of model 'linModel'.


In [None]:
mlflow.sklearn.save_model(
    lin_model, "lin_model/",
    serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE)

### Load model from MlFlow

In [170]:
import mlflow.sklearn
logged_model = 'elnModel'

# Load model as a PyFuncModel.
loaded_model = mlflow.sklearn.load_model("elnModel")

# Predict on a Pandas DataFrame.
import pandas as pd
loaded_model.predict(pd.DataFrame(x_test))

array([136204.86925415, 320881.17423756, 104018.53295819, 158049.47254768,
       294449.97900112,  80426.81761941, 216871.81518044, 138870.51575027,
        69144.90869622, 148021.65109409, 139370.89433189, 113650.4710205 ,
       155408.13264919, 204941.6841277 , 167959.22149492, 139776.2008711 ,
       199511.0543025 , 137493.51219702, 120035.08158085, 216529.97968087,
       168445.144816  , 205818.91912456, 176553.96596628, 131837.10109003,
       202093.16975854, 167296.54176832, 196713.90978979, 110439.62705418,
       174027.58606126, 197378.66887531, 113642.76217657, 265098.89453759,
       230012.34507223, 117113.91713803, 257987.61454103, 150238.44160755,
       145635.0679262 , 203949.70904346, 301555.76348303,  99810.62071827,
       117817.96384557, 245034.47251784, 111720.89657031, 328524.13659129,
       128355.48391422, 165640.03161004, 101073.14982563, 125648.76953245,
       346071.70726269, 122629.4844828 , 121619.15971645, 233019.27018101,
       130547.84755377, 3

### Set model as production version

In [171]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

In [172]:
client = MlflowClient()
client.transition_model_version_stage(
    name="elnModel",
    version=11,
    stage="Production"
)

<ModelVersion: creation_timestamp=1652177507347, current_stage='Production', description='', last_updated_timestamp=1652177756622, name='elnModel', run_id='e1efa5db5e0c451d99dc6f908ad130b8', source='s3://mlflow/1/e1efa5db5e0c451d99dc6f908ad130b8/artifacts/elnModel', status='READY', status_message='', user_id='', version='11'>