In [1]:
import boto3
import pyarrow
import pickle

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction import DictVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge
from xgboost import XGBRegressor

def read_data(key, bucket='kkr-mlops-zoomcamp'):
    """Fetch a CSV object from S3-compatible storage and load it as a DataFrame.

    Args:
        key: Object key of the CSV file inside the bucket.
        bucket: Name of the bucket to read from.

    Returns:
        pandas.DataFrame parsed from the object's body with ``pd.read_csv``.
    """
    # No access keys are passed here; boto3 has to find credentials through
    # its own configuration.
    storage = boto3.session.Session().client(
        service_name='s3',
        endpoint_url='https://storage.yandexcloud.net',
        region_name='ru-central1',
    )

    # The response body is a file-like stream, so it can be parsed directly.
    response = storage.get_object(Bucket=bucket, Key=key)
    return pd.read_csv(response['Body'])
    
def na_filter(data):
    """Drop rows with an unknown vehicle type and split off the target.

    Rows where any of ``make``, ``model`` or ``trim`` is missing are removed.
    The filtering is done per row (not by index label), so it stays correct
    even when the index of ``data`` contains duplicate labels.

    Args:
        data: DataFrame with at least the columns ``make``, ``model``,
            ``trim`` and ``sellingprice``. It is not modified.

    Returns:
        tuple: ``(work_data, y)`` where ``work_data`` is the filtered frame
        without the ``sellingprice`` column and ``y`` is that column as a
        Series aligned with ``work_data``.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    # dropna returns a new frame, so popping the target below leaves `data` intact.
    work_data = data.dropna(subset=['make', 'model', 'trim'])

    y = work_data.pop('sellingprice')

    return work_data, y

def prepare_features(work_data, features_filler=None, dv=None):
    """Impute missing values and vectorize the modelling features.

    Args:
        work_data: DataFrame with the columns ``year``, ``make``, ``model``,
            ``trim``, ``body``, ``transmission``, ``color``, ``interior``,
            ``condition``, ``odometer`` and ``mmr``. The frame is not
            modified; all work happens on a copy.
        features_filler: Already-fitted imputing transformer to reuse. When
            ``None`` a new ``ColumnTransformer`` is created and fitted on
            ``work_data``.
        dv: Already-fitted ``DictVectorizer`` to reuse. When ``None`` a new
            one is created and fitted on ``work_data``.

    Returns:
        tuple: ``(X, features_filler, dv)`` - the vectorized feature matrix
        plus the imputer and vectorizer that produced it, so they can be
        passed back in for validation / test data.
    """
    num_2_impute = ['condition', 'odometer', 'mmr']
    cat_2_impute = ['body', 'transmission']
    constant_2_impute = ['color', 'interior']
    # Column order must match the transformer order below, because the
    # transformer output is written back positionally.
    impute_cols = num_2_impute + cat_2_impute + constant_2_impute

    # Work on a private copy: the caller's frame (often a slice produced by
    # train_test_split) must not be altered as a side effect.
    work_data = work_data.copy()

    if features_filler is None:
        features_filler = ColumnTransformer([
            ('num_imputer', SimpleImputer(missing_values=np.nan, strategy='mean'), num_2_impute),
            ('cat_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), cat_2_impute),
            # NOTE(review): named "constant" but configured as 'most_frequent';
            # left unchanged to keep the model's behaviour - confirm intent.
            ('cat_constant', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), constant_2_impute)
            ]
        )
        work_data[impute_cols] = features_filler.fit_transform(work_data[impute_cols])
    else:
        work_data[impute_cols] = features_filler.transform(work_data[impute_cols])

    cat_cols = ['year', 'make_model_trim', 'body', 'transmission', 'color', 'interior']
    num_cols = ['condition', 'odometer', 'mmr']

    # Combine make/model/trim into a single categorical key; cast year to a
    # string so that it is emitted as a string value rather than a number.
    work_data['make_model_trim'] = work_data['make'] + '_' + work_data['model'] + '_' + work_data['trim']
    work_data['year'] = work_data['year'].astype('str')

    X_dict = work_data[cat_cols + num_cols].to_dict(orient='records')

    if dv is None:
        dv = DictVectorizer()
        X = dv.fit_transform(X_dict)
    else:
        X = dv.transform(X_dict)

    return X, features_filler, dv

def main():
    """Train an XGBoost price regressor and print its validation RMSE.

    Loads the training CSV with ``read_data``, removes incomplete rows with
    ``na_filter``, holds out 25% of the rows for validation
    (``random_state=42``), fits the preprocessing on the training part only,
    then fits an ``XGBRegressor`` with default parameters.

    Returns:
        tuple: ``(features_filler, dv, xgb)`` - the fitted imputing
        transformer, the fitted vectorizer and the trained regressor.
    """
    data = read_data(key='datasets/car-prices-train.csv')

    work_data, y = na_filter(data)
    X_train, X_valid, y_train, y_valid = train_test_split(work_data, y, test_size=0.25, random_state=42)

    # The imputer and vectorizer are fitted on the training split and only
    # applied to the validation split.
    train, features_filler, dv = prepare_features(X_train, None, None)
    valid, _, _ = prepare_features(X_valid, features_filler, dv)

    xgb = XGBRegressor()
    xgb.fit(train, y_train)

    prediction = xgb.predict(valid)

    # RMSE is computed explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    print(float(np.sqrt(mean_squared_error(y_valid, prediction))))

    return features_filler, dv, xgb


In [2]:
# Run the training pipeline; the fitted imputer, vectorizer and model are reused below to score the test set.
features_filler, dv, xgb = main()

1438.312342063069


In [4]:
# Score the trained model on the held-out test file, reusing the imputer and
# vectorizer fitted during training (nothing is re-fitted here).
data_test = read_data(key = 'datasets/car-prices-test.csv')
test, y_test = na_filter(data_test)

X_test, _, _ = prepare_features(test, features_filler, dv)

test_prediction = xgb.predict(X_test)

# RMSE is computed explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
float(np.sqrt(mean_squared_error(y_test, test_prediction)))


1602.0459517002184

In [None]:
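# Disabled linear-regression baseline, kept for reference.
# NOTE(review): `train`, `valid`, `y_train` and `y_valid` are local to main(),
# so this cell cannot run as-is on a fresh kernel; the recorded output below
# presumably comes from an earlier session - confirm before relying on it.
# lr = LinearRegression()
# lr.fit(train, y_train)
# lr_pred = lr.predict(valid)

# mean_squared_error(y_valid, lr.predict(valid), squared=False)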

1522.575548333558