In [17]:
from re import S
import boto3
import pyarrow
import pickle

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction import DictVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge
from xgboost import XGBRegressor

import mlflow

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
from hyperopt.pyll import scope

mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("mlops-project-model-training")

def read_data(key, bucket='kkr-mlops-zoomcamp'):

    session = boto3.session.Session()
    s3 = session.client(
        service_name='s3',
        endpoint_url='https://storage.yandexcloud.net',
        region_name='ru-central1',
        # aws_access_key_id = "id",
        # aws_secret_access_key = "key")
    )
    obj = s3.get_object(Bucket=bucket, Key=key)

    data = pd.read_csv(obj['Body'])

    return data
    
def na_filter(data):
    work_data = data.copy()
    non_type = work_data[data['make'].isna() | data['model'].isna() | data['trim'].isna()].index
    work_data.drop(non_type, axis=0, inplace=True)

    y = work_data.pop('sellingprice')

    return work_data, y

class FeaturesModifier:
    def __init__(self, columns):
        self.columns = columns

    def fit(self, work_data, _ = None):
        return self

    def transform(self, work_data, _ = None):

        work_data = pd.DataFrame(work_data, columns = self.columns)
        work_data['make_model_trim'] = work_data['make'] + '_'  + work_data['model'] + '_' + work_data['trim']
        work_data['year'] = work_data['year'].astype('str')
        
        cat_cols = ['year', 'make_model_trim', 'body', 'transmission', 'color', 'interior']
        num_cols = ['condition', 'odometer', 'mmr']

        X = work_data[cat_cols + num_cols].copy()
        X_dict = X.to_dict(orient = 'records')

        return X_dict

    def fit_transform(self, work_data, _ = None):
        return self.transform(work_data)

def prepare_features(work_data, preprocessor = None):

    num_2_impute = ['condition', 'odometer', 'mmr']
    cat_2_impute = ['body', 'transmission']
    constant_2_impute = ['color', 'interior']
    others = ['year', 'make', 'model', 'trim']
    
    if not preprocessor:
        features_filler = ColumnTransformer([
            ('num_imputer', SimpleImputer(missing_values=np.nan, strategy='mean'), num_2_impute),
            ('cat_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), cat_2_impute),
            ('cat_constant', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), constant_2_impute),
            ('others', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='-1'), others )
            ]
        )

        fm = FeaturesModifier(columns = num_2_impute + cat_2_impute + constant_2_impute + others)

        dv = DictVectorizer() 

        preprocessor = Pipeline(steps = [
            ('filler', features_filler),
            ('modifier', fm),
            ('dict_vectorizer', dv)

        ])
        
        X = preprocessor.fit_transform(work_data)

    else:
        X = preprocessor.transform(work_data)

    return X, preprocessor

def main():
    training_data = 'datasets/car-prices-train.csv'
    data = read_data(key=training_data)
    
    work_data, y = na_filter(data)
    X_train, X_valid, y_train, y_valid = train_test_split(work_data, y, test_size=0.25, random_state=42)

    train, preprocessor = prepare_features(X_train, preprocessor = None )
    # valid, _  = prepare_features(X_valid, preprocessor)
    
    models = {'xgboost-regressor': XGBRegressor(), 'lin-reg': LinearRegression()}
    
    
    for baseline in models:
        with mlflow.start_run():
            mlflow.set_tag("baseline", baseline)

            mlflow.log_param('training-data', training_data)
            training_model = models[baseline]
            training_model.fit(train, y_train)
            
            pipeline_model = Pipeline(steps =[
                    ('preprocess', preprocessor),
                    ('XGBoost', training_model)
                    ]
                )

            prediction_valid = pipeline_model.predict(X_valid)
            rmse_valid = mean_squared_error(y_valid, prediction_valid, squared = False)
            print(f'{baseline:s} RMSE on valid', 
                rmse_valid
                )
            mlflow.log_metric('rmse', rmse_valid)


# mean_squared_error(y_valid, lr.predict(valid), squared=False)
    # model = Pipeline(steps =[
    #         ('preprocess', preprocessor),
    #         ('XGBoost', xgb)
    #         ]
    #     )
# print ('RMSE on train', 
#     mean_squared_error(y_train, model.predict(X_train), squared = False)
#     )
    
    return pipeline_model


In [18]:
model = main()


xgboost-regressor RMSE on valid 1438.312342063069
lin-reg RMSE on valid 1546.8247863218944


In [19]:
with open('model.bin', 'wb') as f_out:
    pickle.dump(model, f_out)

In [20]:
with open('model.bin', 'rb') as f_in:
    loaded_model = pickle.load(f_in)


In [21]:
data_test = read_data(key = 'datasets/car-prices-test.csv')
test, y_test = na_filter(data_test)

test_prediction = loaded_model.predict(test)

mean_squared_error(y_test, test_prediction, squared=False)


AttributeError: 'NoneType' object has no attribute 'predict'

In [23]:
model