In [1]:
import json
import boto3
import pickle
import pyarrow

import numpy as np
import pandas as pd

from datetime import datetime
from dateutil.relativedelta import relativedelta

from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction import DictVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor

# import xgboost as xgb
from xgboost import XGBRegressor

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
from hyperopt.pyll import scope

MLFLOW_TRACKING_URI = 'sqlite:///mlops-project.db'
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

def read_file(key, bucket='kkr-mlops-zoomcamp'):

    session = boto3.session.Session()
    s3 = session.client(
        service_name='s3',
        endpoint_url='https://storage.yandexcloud.net',
        region_name='ru-central1',
        # aws_access_key_id = "id",
        # aws_secret_access_key = "key")
    )
    obj = s3.get_object(Bucket=bucket, Key=key)

    data = pd.read_csv(obj['Body'], sep=",")

    return data

def load_data(current_date = "2015-5-17", periods = 1):
    
    dt_current = datetime.strptime(current_date, "%Y-%m-%d")
    
    if periods == 1:
        date_file = dt_current + relativedelta(months = - 1)
        print(f"Getting TEST data for {date_file.year}-{date_file.month} period")
        test_data = read_file(key = f"datasets/car-prices-{date_file.year}-{date_file.month}.csv")

        return test_data

    else:
        train_data = pd.DataFrame()
        for i in range(periods+1, 1, -1):
            date_file = dt_current + relativedelta(months = - i)
            try:
                data = read_file(key = f"datasets/car-prices-{date_file.year}-{date_file.month}.csv")
                print(f"Getting TRAIN data for {date_file.year}-{date_file.month} period")
            except:
                print(f"Cannot find file car-prices-{date_file.year}-{date_file.month}.csv",
                    "using blank")
                data = None
                
            train_data = pd.concat([train_data, data])
        
        return train_data


def na_filter(data):
    work_data = data.copy()
    non_type = work_data[data['make'].isna() | data['model'].isna() | data['trim'].isna()].index
    work_data.drop(non_type, axis=0, inplace=True)

    y = work_data.pop('sellingprice')

    return work_data, y


class FeaturesModifier:
    def __init__(self, columns):
        self.columns = columns

    def fit(self, work_data, _ = None):
        return self

    def transform(self, work_data, _ = None):

        work_data = pd.DataFrame(work_data, columns = self.columns)
        work_data['make_model_trim'] = work_data['make'] + '_'  + work_data['model'] + '_' + work_data['trim']
        work_data['year'] = work_data['year'].astype('str')
        
        cat_cols = ['year', 'make_model_trim', 'body', 'transmission', 'color', 'interior']
        num_cols = ['condition', 'odometer', 'mmr']

        X = work_data[cat_cols + num_cols].copy()
        X_dict = X.to_dict(orient = 'records')

        return X_dict

    def fit_transform(self, work_data, _ = None):
        return self.transform(work_data)


def prepare_features(work_data, preprocessor = None):

    num_2_impute = ['condition', 'odometer', 'mmr']
    cat_2_impute = ['body', 'transmission']
    constant_2_impute = ['color', 'interior']
    others = ['year', 'make', 'model', 'trim']
    
    if not preprocessor:
        features_filler = ColumnTransformer([
            ('num_imputer', SimpleImputer(missing_values=np.nan, strategy='mean'), num_2_impute),
            ('cat_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), cat_2_impute),
            ('cat_constant', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), constant_2_impute),
            ('others', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='-1'), others )
            ]
        )

        fm = FeaturesModifier(columns = num_2_impute + cat_2_impute + constant_2_impute + others)

        dv = DictVectorizer() 

        preprocessor = Pipeline(steps = [
            ('filler', features_filler),
            ('modifier', fm),
            ('dict_vectorizer', dv)

        ])
        
        X = preprocessor.fit_transform(work_data)

    else:
        X = preprocessor.transform(work_data)

    return X, preprocessor


def params_search(train, valid, y_train, y_valid, train_data_file, models):
    
    best_models = []

    for baseline in models:
        
        mlflow.set_experiment(f"{baseline.__name__}-models")
        search_space = models[baseline]

        def objective(params):

            with mlflow.start_run():
                mlflow.set_tag("baseline", f"{baseline.__name__}")
                mlflow.log_param("training-data", train_data_file)
                mlflow.log_param("parameters", params)
                
                print('Serching for the best parameters...')

                training_model = baseline(**params)
                training_model.fit(train, y_train)

                print('Predicting on the valid dataset...')
                prediction_valid = training_model.predict(valid)
                rmse_valid = mean_squared_error(y_valid, prediction_valid, squared = False)

                print('RMSE on valid', 
                    rmse_valid
                    )
                mlflow.log_metric('rmse', rmse_valid)
            

            return {'loss': rmse_valid, 'status': STATUS_OK}
        
        best_result = fmin(fn = objective,
                    space = search_space,
                    algo = tpe.suggest,
                    max_evals = 10, # int(2**(len(models[baseline].items())-2)), #3,
                    trials = Trials(),
                    ) 
        
        print("Best model", baseline(**space_eval(search_space, best_result)))
        best_models.append(baseline(**space_eval(search_space, best_result)))

        mlflow.end_run()
    
    return best_models #ML_model(**space_eval(search_space, best_result))


def train_best_models(train, y_train, X_valid, y_valid, X_test, y_test, preprocessor, models):

    mlflow_client = MlflowClient(tracking_uri = MLFLOW_TRACKING_URI)

    best_pipelines = []

    mlflow.autolog()
    for model in models:
    
        experiment = mlflow.set_experiment(f"{model.__name__}-models")

        best_run = mlflow_client.search_runs(
                experiment_ids = experiment.experiment_id,
                run_view_type=ViewType.ACTIVE_ONLY,
                max_results = 2,
                order_by = ['metrics.rmse ASC']
            )
        
        print(f"Training {model.__name__} with best params")

        mlflow.set_experiment("Auction-car-prices-best-models")
    
        with mlflow.start_run():

            best_params = json.loads(best_run[0].data.params['parameters'].replace("'", "\""))
            staged_model = model(**best_params).fit(train, y_train)
            
            pipeline = Pipeline(
                steps = [
                    ('preprocessor', preprocessor),
                    ('model', staged_model)
                ]
            )
            predict_valid = pipeline.predict(X_valid)
            rmse_valid = mean_squared_error(y_valid, predict_valid, squared = False)

            predict_test = pipeline.predict(X_test)
            rmse_test = mean_squared_error(y_test, predict_test, squared = False)

            mlflow.log_metric("rmse_valid", rmse_valid)
            mlflow.log_metric("rmse_test", rmse_test)
            mlflow.sklearn.log_model(pipeline, artifact_path='full-pipeline')
            
            best_pipelines.append((model.__name__, pipeline))

            print("{:} MODEL was saved with RUN".format(model.__name__))

            mlflow.end_run()

    return best_pipelines

def model_to_registry(MLFLOW_TRACKING_URI, model_name):

    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow_client = MlflowClient(tracking_uri = MLFLOW_TRACKING_URI)
    
    experiment = mlflow.set_experiment('Auction-car-prices-best-models')

    best_model_run = mlflow_client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.rmse_test ASC"]
            
        )
    RUN_ID = best_model_run[0].info.run_id
    model_uri = "runs:/{:}/full-pipeline".format(RUN_ID)

    # try: 
    #     mlflow_client.get_registered_model("MLOps-project-models")
    # except:
    #     print("no such one, creating")
    #     mlflow_client.create_registered_model("MLOps-project-models")

    # timestamp = datetime.strftime(datetime.today(), "%y-%m-%d_%H:%M")
   
    print("Registering model", model_name)
    mlflow.register_model(
            model_uri=model_uri,
            name = model_name
        )

def model_promotion(current_date, version_to_production, to_stage):

    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow_client = MlflowClient(tracking_uri = MLFLOW_TRACKING_URI)
    model_name = "Auction-car-prices-prediction"

    versions = mlflow_client.get_latest_versions(
        model_name,
        # stages=['Production']
        )

    for version in versions:
        print(f"Model: {version.name:40s} Version: {version.version:3} Stage: {version.current_stage:10s} run_id: {version.run_id:30s}")

    mlflow_client.transition_model_version_stage(
        name = model_name,
        version = version_to_production,
        stage = to_stage,
        archive_existing_versions=False
        )

    mlflow_client.update_model_version(
        name = model_name,
        version = version_to_production,
        description=f'The model was promoted to Production {current_date}'
        )

def main(current_date = "2015-6-20", periods = 5):
 
    train_data = load_data(current_date = current_date, periods = periods)
    X, y = na_filter(train_data)

    test_data = load_data(current_date = current_date)
    X_test, y_test = na_filter(test_data)

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)

    print("Training preprocessor...")
    train, preprocessor = prepare_features(X_train, preprocessor = None )
    valid, _  = prepare_features(X_valid, preprocessor)

    print("Initializing parameters for baseline models")
    models = {
        LinearRegression: {
            "fit_intercept": hp.choice("fit_intercept", ('True', 'False'))
            },
        Ridge: {"alpha": hp.loguniform("alpha", -5, 5),
                "fit_intercept": hp.choice("fit_intercept", ('True', 'False'))
            },
        RandomForestRegressor: {
                'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
                'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
                'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
                'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
                'random_state': 42
                },
        XGBRegressor: {
                'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
                'learning_rate': hp.loguniform('learning_rate', -3, 0),
                'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
                'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
                'max_child_weight': hp.loguniform('max_child_weight', -1, 3),
                'num_boost_rounds': 100,
                # 'early_stopping_rounds': 20,
                'objective': 'reg:squarederror',
                'seed': 42,
                }
        }

    best_models = params_search(train, valid, y_train, y_valid, current_date, models)

    train_best_models(
        train, y_train,
        X_valid, y_valid,
        X_test, y_test,
        preprocessor,
        models = models
        )

    model_name = "Auction-car-prices-prediction"
    model_to_registry(MLFLOW_TRACKING_URI, model_name)

    model_promotion(current_date, version_to_production=3, to_stage = "Production")


In [2]:
main()

Getting TRAIN data for 2014-12 period
Getting TRAIN data for 2015-1 period
Getting TRAIN data for 2015-2 period
Getting TRAIN data for 2015-3 period
Getting TRAIN data for 2015-4 period
Getting TEST data for 2015-5 period
Training preprocessor...
Initializing parameters for baseline models
Serching for the best parameters...                   
Predicting on the valid dataset...                    
RMSE on valid                                         
1618.6086765949942                                    
Serching for the best parameters...                                             
Predicting on the valid dataset...                                              
RMSE on valid                                                                   
1618.6086765949942                                                              
Serching for the best parameters...                                             
Predicting on the valid dataset...                                              
RMS

2022/08/20 22:00:41 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/08/20 22:00:41 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


Training LinearRegression with best params




LinearRegression MODEL was saved with RUN
Training Ridge with best params
Ridge MODEL was saved with RUN
Training RandomForestRegressor with best params
RandomForestRegressor MODEL was saved with RUN
Training XGBRegressor with best params
Parameters: { "max_child_weight", "num_boost_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




Registered model 'Auction-car-prices-prediction' already exists. Creating a new version of this model...
2022/08/20 22:04:42 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Auction-car-prices-prediction, version 3


XGBRegressor MODEL was saved with RUN
Registering model Auction-car-prices-prediction
Model: Auction-car-prices-prediction            Version:   2 Stage: Production run_id: a5117c5652d54554bfa3b81463ada9b8


Created version '3' of model 'Auction-car-prices-prediction'.
