In [1]:
import boto3
import pyarrow
import pickle

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction import DictVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge
from xgboost import XGBRegressor

def read_data(key, bucket='kkr-mlops-zoomcamp'):
    """Fetch a CSV object from the Yandex Cloud S3-compatible store.

    Parameters
    ----------
    key : str
        Object key inside ``bucket``.
    bucket : str, optional
        Bucket to read from; defaults to ``'kkr-mlops-zoomcamp'``.

    Returns
    -------
    pandas.DataFrame
        The object's body parsed by ``pd.read_csv``.

    Notes
    -----
    No access key or secret is passed to the client here, so credentials
    are left to boto3's own default resolution.
    """
    client_options = {
        'service_name': 's3',
        'endpoint_url': 'https://storage.yandexcloud.net',
        'region_name': 'ru-central1',
    }
    s3_client = boto3.session.Session().client(**client_options)

    response = s3_client.get_object(Bucket=bucket, Key=key)

    # The streaming body is file-like, so pandas can parse it directly.
    return pd.read_csv(response['Body'])
    
def na_filter(data):
    """Remove rows lacking make/model/trim and separate the target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing at least ``make``, ``model``, ``trim`` and
        ``sellingprice``. It is not modified.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.Series]
        The remaining rows without the ``sellingprice`` column, and the
        ``sellingprice`` values for those rows.
    """
    # A row is unusable as soon as any one of the three identifying fields is NaN.
    incomplete = data[['make', 'model', 'trim']].isna().any(axis=1)
    bad_labels = data.index[incomplete.to_numpy()]

    # drop() returns a new frame, so popping the target below leaves `data` intact.
    features = data.drop(index=bad_labels)
    target = features.pop('sellingprice')

    return features, target

def prepare_features(work_data, features_filler=None, dv=None):
    """Impute missing values and vectorize the model features.

    Parameters
    ----------
    work_data : pandas.DataFrame
        Frame holding ``year``, ``make``, ``model``, ``trim``, ``body``,
        ``transmission``, ``color``, ``interior``, ``condition``,
        ``odometer`` and ``mmr``. It is not modified; all work happens on
        a copy.
    features_filler : fitted transformer, optional
        Object exposing ``transform`` for the imputed columns. When
        ``None`` a new ``ColumnTransformer`` of ``SimpleImputer`` steps is
        created and fitted on ``work_data``.
    dv : fitted vectorizer, optional
        Object exposing ``transform`` for a list of record dicts. When
        ``None`` a new ``DictVectorizer`` is created and fitted.

    Returns
    -------
    tuple
        ``(X, features_filler, dv)``: the vectorized feature matrix plus
        the (possibly newly fitted) imputer and vectorizer, so they can be
        reused on validation/test data.
    """
    num_2_impute = ['condition', 'odometer', 'mmr']
    cat_2_impute = ['body', 'transmission']
    constant_2_impute = ['color', 'interior']
    impute_cols = num_2_impute + cat_2_impute + constant_2_impute

    cat_cols = ['year', 'make_model_trim', 'body', 'transmission', 'color', 'interior']
    num_cols = ['condition', 'odometer', 'mmr']

    # Work on a private copy: the caller's frame (often a slice produced by
    # train_test_split) must not gain columns or change dtypes as a side effect.
    frame = work_data.copy()

    # Compare against None explicitly rather than relying on the truthiness
    # of estimator objects.
    if features_filler is None:
        features_filler = ColumnTransformer([
            ('num_imputer', SimpleImputer(missing_values=np.nan, strategy='mean'), num_2_impute),
            ('cat_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), cat_2_impute),
            # NOTE(review): named "constant" but uses 'most_frequent'; kept as-is
            # to preserve behaviour - confirm whether strategy='constant' was intended.
            ('cat_constant', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), constant_2_impute),
        ])
        # The transformer emits columns in transformer order, which is exactly
        # the order of `impute_cols`, so the result can be assigned back directly.
        frame[impute_cols] = features_filler.fit_transform(frame[impute_cols])
    else:
        frame[impute_cols] = features_filler.transform(frame[impute_cols])

    frame['make_model_trim'] = frame['make'] + '_' + frame['model'] + '_' + frame['trim']
    # Cast to str so the vectorizer one-hot encodes the year instead of
    # treating it as a numeric feature.
    frame['year'] = frame['year'].astype('str')

    X_dict = frame[cat_cols + num_cols].to_dict(orient='records')

    if dv is None:
        dv = DictVectorizer()
        X = dv.fit_transform(X_dict)
    else:
        X = dv.transform(X_dict)

    return X, features_filler, dv

def main():
    """Train an XGBoost regressor on the 2015-6 car-price file.

    Loads ``datasets/car-prices-2015-6.csv``, drops rows without
    make/model/trim, holds out 25% of the rows for validation
    (``random_state=42``), fits the imputer and vectorizer on the training
    part only, trains ``XGBRegressor`` with default parameters and prints
    the validation RMSE.

    Returns
    -------
    tuple
        ``(features_filler, dv, xgb)``: the fitted imputer, the fitted
        vectorizer and the trained model.
    """
    data = read_data(key='datasets/car-prices-2015-6.csv')

    work_data, y = na_filter(data)
    X_train, X_valid, y_train, y_valid = train_test_split(
        work_data, y, test_size=0.25, random_state=42
    )

    # Fit the preprocessing on the training split only, then reuse it for
    # validation so no validation statistics leak into the imputer/vectorizer.
    train, features_filler, dv = prepare_features(X_train, None, None)
    valid, _, _ = prepare_features(X_valid, features_filler, dv)

    xgb = XGBRegressor()
    xgb.fit(train, y_train)

    prediction = xgb.predict(valid)

    # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so it is avoided here.
    print(np.sqrt(mean_squared_error(y_valid, prediction)))

    return features_filler, dv, xgb


In [2]:
# Run the training pipeline and keep the fitted imputer, vectorizer and model
# at notebook scope so later cells can reuse them.
features_filler, dv, xgb = main()

1438.312342063069


In [5]:
# Score the trained model on the separate 2015-7 file, reusing the imputer and
# vectorizer fitted during training (nothing is re-fitted here).
data_test = read_data(key = 'datasets/car-prices-2015-7.csv')
test, y_test = na_filter(data_test)

X_test, _, _ = prepare_features(test, features_filler, dv)

test_prediction = xgb.predict(X_test)

# RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is avoided here.
print(np.sqrt(mean_squared_error(y_test, test_prediction)))


Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Chevrolet,Impala,LTZ,Sedan,automatic,2g1165s30f9103921,ca,1.9,14538.0,silver,black,enterprise vehicle exchange / tra / rental / t...,24300,7200,2015-07-07 09:30:00
1,2014,Dodge,Grand Caravan,SXT,Minivan,automatic,2c4rdgcgxer263526,ca,3.0,26199.0,—,black,enterprise holdings/gdp,29400,15800,2015-07-01 09:30:00
2,2010,Mercedes-Benz,E-Class,E350,Sedan,automatic,wddhf5gb2aa236009,ca,4.3,43888.0,white,off-white,mercedes-benz usa,21500,23500,2015-07-07 09:30:00
3,2013,Ford,Fusion,SE,Sedan,automatic,3fa6p0h71dr349266,ca,2.6,1.0,white,black,fox rent a car tra,10000,3500,2015-07-02 09:30:00
4,2014,Infiniti,QX70,Base,SUV,,jn8cs1mu3em452152,ca,4.2,31864.0,black,gray,enterprise veh exchange/rental,31800,29500,2015-07-07 09:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1295,2011,BMW,5 Series,528i,Sedan,automatic,wbafr1c53bc744672,fl,3.9,66403.0,white,brown,lauderdale imports ltd bmw pembrok pines,20300,22800,2015-07-07 06:15:00
1296,2015,Kia,K900,Luxury,Sedan,,knalw4d4xf6019304,in,4.5,18255.0,silver,black,avis corporation,35300,33000,2015-07-09 07:00:00
1297,2012,Ram,2500,Power Wagon,Crew Cab,automatic,3c6td5et6cg112407,wa,5.0,54393.0,white,black,i -5 uhlmann rv,30200,30800,2015-07-08 09:30:00
1298,2012,BMW,X5,xDrive35d,SUV,automatic,5uxzw0c58cl668465,ca,4.8,50561.0,black,black,financial services remarketing (lease),29800,34000,2015-07-08 09:30:00


In [7]:
# Show the raw test row at position 122 as a plain dict.
data_test.iloc[122].to_dict()

{'year': 2012,
 'make': 'Mercedes-Benz',
 'model': 'E-Class',
 'trim': 'E350 Luxury',
 'body': 'Sedan',
 'transmission': 'automatic',
 'vin': 'wddhf5kb5ca633577',
 'state': 'nv',
 'condition': 4.6,
 'odometer': 34915.0,
 'color': 'black',
 'interior': 'off-white',
 'seller': 'mercedes-benz financial services',
 'mmr': 24800,
 'sellingprice': 29000,
 'saledate': '2015-07-01 08:45:00'}

In [None]:
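# Disabled linear-regression baseline.
# NOTE: `train`, `valid`, `y_train` and `y_valid` are locals of main(), so they
# must be recreated at notebook scope before this can be re-enabled; recent
# scikit-learn releases also no longer accept `squared=False`.
# lr = LinearRegression()
# lr.fit(train, y_train)
# lr_pred = lr.predict(valid)

# mean_squared_error(y_valid, lr.predict(valid), squared=False)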

1522.575548333558