In [91]:
import json
import boto3
import pickle
import pyarrow

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction import DictVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor

# import xgboost as xgb
from xgboost import XGBRegressor

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
from hyperopt.pyll import scope

# SQLite tracking URI; passed explicitly to MlflowClient(...) further down.
MLFLOW_TRACKING_URI = 'sqlite:///mlops-project.db'
# The fluent mlflow API is pointed at an HTTP tracking server here, not at
# MLFLOW_TRACKING_URI.
# NOTE(review): presumably the server at 127.0.0.1:5000 is backed by the same
# SQLite file, otherwise the two URIs address different stores — confirm.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
# mlflow.set_experiment("mlops-project-model-training")


def read_data(key, bucket='kkr-mlops-zoomcamp'):
    """Fetch a CSV object from S3-compatible storage and parse it.

    Args:
        key: Object key of the CSV file inside the bucket.
        bucket: Name of the bucket to read from.

    Returns:
        A pandas DataFrame parsed from the object's body.
    """
    # No access keys are passed here, so boto3 has to pick credentials up
    # from its own configuration.
    s3_client = boto3.session.Session().client(
        service_name='s3',
        endpoint_url='https://storage.yandexcloud.net',
        region_name='ru-central1',
    )
    response = s3_client.get_object(Bucket=bucket, Key=key)

    # The streaming body is handed directly to pandas.
    return pd.read_csv(response['Body'])


def na_filter(data):
    """Drop rows without a make, model or trim and split off the target.

    Rows are selected with a positional boolean mask rather than dropped by
    index label, so a complete row is kept even when it shares an index
    label with an incomplete one (e.g. after concatenating frames).

    Args:
        data: DataFrame containing at least the columns ``make``, ``model``,
            ``trim`` and ``sellingprice``. It is not modified.

    Returns:
        A ``(features, target)`` tuple: a copy of the retained rows without
        the ``sellingprice`` column, and the ``sellingprice`` Series for
        those same rows.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    incomplete = data[['make', 'model', 'trim']].isna().any(axis=1)

    # Explicit copy so popping the target never touches the caller's frame.
    work_data = data.loc[~incomplete].copy()
    y = work_data.pop('sellingprice')

    return work_data, y


class FeaturesModifier:
    """Stateless pipeline step that turns imputed rows into feature dicts.

    The incoming rows are labelled with ``columns``, a combined
    ``make_model_trim`` feature is built, ``year`` is converted to a string,
    and the selected features are returned as one dict per row.
    """

    # Output features, in the order they appear in every record.
    _CATEGORICAL = ('year', 'make_model_trim', 'body', 'transmission', 'color', 'interior')
    _NUMERIC = ('condition', 'odometer', 'mmr')

    def __init__(self, columns):
        # Column labels, in order, for the data handed to transform().
        self.columns = columns

    def fit(self, work_data, _=None):
        """Nothing is learned from the data; return the instance itself."""
        return self

    def transform(self, work_data, _=None):
        """Return a list of ``{feature: value}`` dicts, one per input row."""
        frame = pd.DataFrame(work_data, columns=self.columns)

        combined = frame['make'] + '_' + frame['model'] + '_' + frame['trim']
        frame['make_model_trim'] = combined
        # Store the year as text rather than as a number.
        frame['year'] = frame['year'].astype('str')

        wanted = list(self._CATEGORICAL + self._NUMERIC)
        records = frame[wanted].copy().to_dict(orient='records')
        return records

    def fit_transform(self, work_data, _=None):
        """Equivalent to ``transform`` because ``fit`` does nothing."""
        return self.transform(work_data)


def prepare_features(work_data, preprocessor=None):
    """Vectorise features, fitting a new preprocessing pipeline if needed.

    Args:
        work_data: Data containing the columns listed below.
        preprocessor: A previously fitted pipeline. When a truthy value is
            given it is only applied; otherwise a new pipeline is built and
            fitted on ``work_data``.

    Returns:
        A ``(X, preprocessor)`` tuple: the transformed features and the
        pipeline that produced them.
    """
    if preprocessor:
        # Reuse the supplied pipeline without refitting it.
        return preprocessor.transform(work_data), preprocessor

    mean_filled = ['condition', 'odometer', 'mmr']
    mode_filled = ['body', 'transmission']
    # NOTE(review): this step is named 'cat_constant' but uses the
    # 'most_frequent' strategy — confirm that is intended.
    colour_filled = ['color', 'interior']
    constant_filled = ['year', 'make', 'model', 'trim']

    imputers = ColumnTransformer([
        ('num_imputer',
         SimpleImputer(missing_values=np.nan, strategy='mean'),
         mean_filled),
        ('cat_imputer',
         SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
         mode_filled),
        ('cat_constant',
         SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
         colour_filled),
        ('others',
         SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='-1'),
         constant_filled),
    ])

    # The modifier is given the column names in the same order the imputers
    # are declared above.
    modifier = FeaturesModifier(
        columns=mean_filled + mode_filled + colour_filled + constant_filled
    )

    preprocessor = Pipeline(steps=[
        ('filler', imputers),
        ('modifier', modifier),
        ('dict_vectorizer', DictVectorizer()),
    ])

    return preprocessor.fit_transform(work_data), preprocessor


def _json_default(value):
    """``json.dumps`` fallback: unwrap numpy scalars into plain Python values."""
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def params_search(train, valid, y_train, y_valid, train_data_file, models, max_evals=3):
    """Tune every model class with hyperopt and log each trial to MLflow.

    For each estimator class an experiment named ``"<ClassName>-models"`` is
    selected, and every trial is logged as its own run with the validation
    RMSE under the metric ``rmse``.

    Args:
        train: Training features passed to ``fit``.
        valid: Validation features passed to ``predict``.
        y_train: Training target.
        y_valid: Validation target.
        train_data_file: Identifier of the training data, logged as the
            ``training-data`` param.
        models: Mapping of estimator class -> hyperopt search space.
        max_evals: Number of hyperopt evaluations per estimator class.

    Returns:
        A list with one unfitted estimator per class, constructed with the
        best parameters found.
    """
    best_models = []

    for baseline, search_space in models.items():

        mlflow.set_experiment(f"{baseline.__name__}-models")

        def objective(params):

            with mlflow.start_run():
                mlflow.set_tag("baseline", f"{baseline.__name__}")
                mlflow.log_param("training-data", train_data_file)
                # Logged as JSON rather than a Python dict repr so the value
                # can be parsed back without quote rewriting.
                mlflow.log_param(
                    "parameters", json.dumps(params, default=_json_default)
                )

                print('Serching for the best parameters...')

                training_model = baseline(**params)
                training_model.fit(train, y_train)

                print('Predicting on the valid dataset...')
                prediction_valid = training_model.predict(valid)
                # sqrt(MSE) instead of squared=False, which newer
                # scikit-learn releases no longer accept.
                rmse_valid = float(
                    np.sqrt(mean_squared_error(y_valid, prediction_valid))
                )

                print('RMSE on valid', rmse_valid)
                mlflow.log_metric('rmse', rmse_valid)

            return {'loss': rmse_valid, 'status': STATUS_OK}

        best_result = fmin(
            fn=objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=Trials(),
        )

        # fmin returns indices for hp.choice nodes; space_eval maps them
        # back to the actual parameter values.
        best_model = baseline(**space_eval(search_space, best_result))
        print("Best model", best_model)
        best_models.append(best_model)

    return best_models


def _parse_logged_params(raw):
    """Decode the ``parameters`` value stored on a hyper-parameter search run.

    Accepts JSON as well as a Python dict repr (single-quoted keys, ``True``
    / ``False`` literals), so runs logged in either format can be reused.
    """
    import ast  # local import keeps this block independent of the file header

    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # A dict repr is not valid JSON; evaluate it as a literal instead.
        return ast.literal_eval(raw)


def train_best_models(train, y_train, X_valid, y_valid, X_test, y_test, preprocessor, models):
    """Refit each model class with its best logged parameters and log it.

    For every class in ``models`` the run with the lowest ``rmse`` metric is
    looked up in the ``"<ClassName>-models"`` experiment. The class is
    refitted with that run's parameters, wrapped with ``preprocessor`` in a
    pipeline, scored on the validation and test sets, and logged to the
    ``MLOps-project-best-models-test`` experiment.

    Args:
        train: Already preprocessed training features.
        y_train: Training target.
        X_valid: Raw validation features (the pipeline preprocesses them).
        y_valid: Validation target.
        X_test: Raw test features (the pipeline preprocesses them).
        y_test: Test target.
        preprocessor: Fitted preprocessing pipeline matching ``train``.
        models: Iterable of estimator classes (a dict keyed by class works).

    Returns:
        A list of ``(class name, pipeline)`` tuples.

    Raises:
        IndexError: If a model's experiment contains no runs.
    """
    # Use the same store as the fluent API: the experiment ids obtained
    # below are only meaningful in the store that issued them.
    mlflow_client = MlflowClient(tracking_uri=mlflow.get_tracking_uri())

    best_pipelines = []

    mlflow.autolog()
    for model in models:

        experiment = mlflow.set_experiment(f"{model.__name__}-models")

        best_runs = mlflow_client.search_runs(
            experiment_ids=[experiment.experiment_id],
            run_view_type=ViewType.ACTIVE_ONLY,
            max_results=1,
            order_by=['metrics.rmse ASC'],
        )
        if not best_runs:
            raise IndexError(
                f"No runs found in experiment '{model.__name__}-models'; "
                "run the parameter search first"
            )
        # Parsed before a run is opened, so a malformed value does not
        # leave a failed run behind.
        best_params = _parse_logged_params(best_runs[0].data.params['parameters'])

        print(f"Training {model.__name__} with best params")

        mlflow.set_experiment("MLOps-project-best-models-test")

        with mlflow.start_run():

            staged_model = model(**best_params).fit(train, y_train)

            # The preprocessor is already fitted, so the pipeline can
            # predict on raw frames without being fitted itself.
            pipeline = Pipeline(
                steps=[
                    ('preprocessor', preprocessor),
                    ('model', staged_model),
                ]
            )

            # sqrt(MSE) instead of squared=False, which newer scikit-learn
            # releases no longer accept.
            predict_valid = pipeline.predict(X_valid)
            rmse_valid = float(np.sqrt(mean_squared_error(y_valid, predict_valid)))

            predict_test = pipeline.predict(X_test)
            rmse_test = float(np.sqrt(mean_squared_error(y_test, predict_test)))

            mlflow.log_metric("rmse_valid", rmse_valid)
            mlflow.log_metric("rmse_test", rmse_test)
            mlflow.sklearn.log_model(pipeline, artifact_path='full-pipeline')

            best_pipelines.append((model.__name__, pipeline))

            print("{:} MODEL was saved with RUN".format(model.__name__))

    return best_pipelines

   # def main():


In [89]:
# Object keys of the train / test CSV files inside the bucket used by read_data.
train_data_file = 'datasets/car-prices-train.csv'
test_data_file = 'datasets/car-prices-test.csv'

# Load the training data, drop rows without make/model/trim, split off the target.
train_data = read_data(key=train_data_file)
X, y = na_filter(train_data)

# Same treatment for the test set.
test_data = read_data(key = test_data_file)
X_test, y_test = na_filter(test_data)

# Hold out 25% of the training rows for validation; seeded for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit the preprocessing pipeline on the training split only, then reuse the
# fitted pipeline for the validation split.
train, preprocessor = prepare_features(X_train, preprocessor = None )
valid, _  = prepare_features(X_valid, preprocessor)

# Estimator class -> hyperopt search space.
# NOTE(review): the fit_intercept choices are the strings 'True' / 'False',
# not booleans, and both strings are truthy. Presumably they are strings so
# that the logged `parameters` value can be parsed back as JSON — confirm
# before switching to real booleans.
models = {
    LinearRegression: {
        "fit_intercept": hp.choice("fit_intercept", ('True', 'False'))
        },
    Ridge: {"alpha": hp.loguniform("alpha", -5, 5),
            "fit_intercept": hp.choice("fit_intercept", ('True', 'False'))
        },
    # RandomForestRegressor: {
    #         'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
    #         'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
    #         'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    #         'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
    #         'random_state': 42
    #         },
    # XGBRegressor: {
    #         'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    #         'learning_rate': hp.loguniform('learning_rate', -3, 0),
    #         'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    #         'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    #         'max_child_weight': hp.loguniform('max_child_weight', -1, 3),
    #         'num_boost_rounds': 100,
    #         # 'early_stopping_rounds': 20,
    #         'objective': 'reg:squarederror',
    #         'seed': 42,
    #         }
    }

# The hyper-parameter search is disabled in this cell.
# best_models = params_search(train, valid, y_train, y_valid, train_data_file, models)

# print(best_models)


2022/08/18 00:20:18 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '606e8f143a47406981992e013c621f6e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
                                 ['condition', 'odometer', 'mmr']),
                                ('cat_imputer',
                                 SimpleImputer(strategy='most_frequent'),
                                 ['body', 'transmission']),
                                ('cat_constant',
                                 SimpleImputer(strategy='most_frequent'),
                                 ['color', 'int...`
                                 ['condition', 'odometer', 'mmr']),
                                ('cat_imputer',
                                 SimpleImputer(strategy='most_frequent'),
                                 ['body', 'transmission']),
                                ('cat_constant',
               

In [92]:
# Refit every model class with its best logged parameters. The training
# features are already preprocessed; validation and test frames are passed
# raw because the resulting pipelines preprocess them.
train_best_models(
    train, y_train, X_valid, y_valid, X_test, y_test, preprocessor, models=models,
)



2022/08/18 00:27:21 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2022/08/18 00:27:21 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/08/18 00:27:21 INFO mlflow.tracking.fluent: Experiment with name 'MLOps-project-best-models-test' does not exist. Creating a new experiment.


Training LinearRegression with best params




EndpointConnectionError: Could not connect to the endpoint URL: "https://kkr-mlops-zoomcamp.s3.ru-central1.amazonaws.com/Project-artifacts/8/2baa9d90bd304f388ff9d0e29cb7cf26/artifacts/full-pipeline/python_env.yaml"

In [82]:
# Point both the fluent API and the client at the same SQLite store.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow_client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
experiment = mlflow.set_experiment('MLOps-project-best-models')

# Pick the run with the lowest test RMSE in that experiment.
best_model_run = mlflow_client.search_runs(
    experiment_ids=[experiment.experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.rmse_test ASC"],
)
if not best_model_run:
    raise RuntimeError(
        "No active runs found in experiment 'MLOps-project-best-models'"
    )
RUN_ID = best_model_run[0].info.run_id
# NOTE(review): the artifact path is spelled 'full-pipline' here, while
# train_best_models logs under 'full-pipeline'. The recorded output of this
# cell shows the selected run using 'full-pipline' — confirm which spelling
# the runs in this experiment actually carry.
# model_uri = "runs:/{:}/full-pipeline".format(RUN_ID) #!!! pipeline
model_uri = "runs:/{:}/full-pipline".format(RUN_ID)

# from datetime import datetime
# timestamp = datetime.strftime(datetime.today(), "%y-%m-%d_%H:%M")

model_name = "Auction-car-prices-prediction"

# Check and create the registered model under the name that is actually
# registered below. Only MLflow errors are treated as "not found".
try:
    mlflow_client.get_registered_model(model_name)
except mlflow.exceptions.MlflowException:
    print("no such one, creating")
    mlflow_client.create_registered_model(model_name)

print("Registering model", model_name)
# Kept as the cell's last expression so the notebook displays the new
# ModelVersion.
mlflow.register_model(
    model_uri=model_uri,
    name=model_name,
)


Registered model 'Auction-car-prices-prediction' already exists. Creating a new version of this model...
2022/08/18 00:03:04 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Auction-car-prices-prediction, version 2


Registering model Auction-car-prices-prediction


Created version '2' of model 'Auction-car-prices-prediction'.


<ModelVersion: creation_timestamp=1660780984049, current_stage='None', description=None, last_updated_timestamp=1660780984049, name='Auction-car-prices-prediction', run_id='f513ab6b127945a89dcd9077b99158b0', run_link=None, source='./project-artifacts/5/f513ab6b127945a89dcd9077b99158b0/artifacts/full-pipline', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [95]:
# mlflow_client.delete_registered_model("Auction-car-prices-prediction")
# One-off cleanup: deletes the experiment with the hard-coded id 4 from the
# currently configured tracking store.
# NOTE(review): the id is specific to one store — verify it before re-running.
mlflow.delete_experiment(experiment_id=4)



In [45]:
# Load the model artifact referenced by `model_uri` through the generic
# pyfunc interface.
loaded_model = mlflow.pyfunc.load_model(model_uri=model_uri)

print(loaded_model)

# Predict on the raw test features; as the cell's last expression, the
# result is what the notebook displays.
loaded_model.predict(X_test)


mlflow.pyfunc.loaded_model:
  artifact_path: full-pipline
  flavor: mlflow.sklearn
  run_id: f513ab6b127945a89dcd9077b99158b0