In [21]:
import boto3
import pyarrow
import pickle

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction import DictVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge
from xgboost import XGBRegressor

# Build an S3-compatible client that talks to the storage.yandexcloud.net endpoint.
session = boto3.session.Session()
s3 = session.client(
    service_name='s3',
    endpoint_url='https://storage.yandexcloud.net',
    region_name='ru-central1',
    # No keys are passed explicitly, so boto3 has to locate credentials itself
    # (presumably environment variables or a shared credentials file -- TODO confirm).
    # Explicit kwargs would look like the two lines below; never commit real keys.
    # aws_access_key_id="id",
    # aws_secret_access_key="key",
)
# Download the training CSV object from the bucket.
obj = s3.get_object(Bucket='kkr-mlops-zoomcamp', Key='datasets/car-prices-train.csv')

# The response body is passed straight to pandas, which reads it as CSV.
data = pd.read_csv(obj['Body'])
data.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2013,Nissan,Altima,2.5,Sedan,,1n4al3ap7dc292314,sc,4.2,37240.0,burgundy,beige,enterprise veh exchange/rental,13700,12600,2014-01-01 09:15:00
1,2006,Chrysler,300,C,Sedan,automatic,2c3la63h46h291271,sc,3.0,94901.0,silver,gray,bethpage fcu,7275,6300,2014-01-01 09:15:00
2,2014,Chevrolet,Malibu,LT,Sedan,automatic,1g11e5slxef160832,sc,2.8,36385.0,black,gray,enterprise veh exchange/rental,14200,13300,2014-01-01 09:15:00
3,2014,Chevrolet,Malibu,LT,Sedan,automatic,1g11e5sl4ef137451,sc,2.7,27495.0,black,black,enterprise veh exchange/rental,14800,13800,2014-01-01 09:15:00
4,2000,jeep,wrangler,4x4 4c se,,manual,1j4fa29p0yp701279,sc,1.9,161099.0,black,tan,enterprise car sales,5025,4000,2014-01-01 09:15:00


In [22]:
def na_filter(data):
    """Drop rows with an incomplete car identity and split off the target.

    A row is removed when any of 'make', 'model' or 'trim' is missing.
    The input frame is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'make', 'model', 'trim' and 'sellingprice'.

    Returns
    -------
    work_data : pandas.DataFrame
        The remaining rows, without the 'sellingprice' column.
    y : pandas.Series
        The 'sellingprice' values of the remaining rows.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    required = ['make', 'model', 'trim']

    # Filter row-wise rather than dropping by index label: label-based drop
    # would also remove complete rows that merely share an index label with
    # an incomplete one when the index is not unique.
    work_data = data.dropna(subset=required)

    # dropna returned a new frame, so popping the target does not affect `data`.
    y = work_data.pop('sellingprice')

    return work_data, y

def preprocessor(work_data, features_filler=None, DV=None):
    """Impute missing values and turn the car features into a model matrix.

    When `features_filler` / `DV` are None they are created and fitted on
    `work_data`; otherwise the supplied (already fitted) objects are only
    applied. The caller's frame is not modified.

    Parameters
    ----------
    work_data : pandas.DataFrame
        Must contain 'year', 'make', 'model', 'trim', 'body', 'transmission',
        'color', 'interior', 'condition', 'odometer' and 'mmr'.
    features_filler : object with ``transform`` or None
        Imputer for the numeric and categorical columns listed below.
    DV : object with ``transform`` or None
        Vectorizer applied to the per-row feature dicts.

    Returns
    -------
    X
        Output of the vectorizer for the rows of `work_data`.
    features_filler
        The imputer that was used (newly fitted or the one passed in).
    DV
        The vectorizer that was used (newly fitted or the one passed in).
    """
    num_2_impute = ['condition', 'odometer', 'mmr']
    cat_2_impute = ['body', 'transmission']
    constant_2_impute = ['color', 'interior']
    # Same order as the transformers below, so the transformed array's columns
    # line up with these names when it is assigned back.
    impute_cols = num_2_impute + cat_2_impute + constant_2_impute

    # Work on a private copy: the steps below overwrite columns and add a new
    # one, which must not leak into the frame owned by the caller.
    work_data = work_data.copy()

    # Compare against None explicitly instead of relying on the truthiness
    # of an estimator object.
    if features_filler is None:
        features_filler = ColumnTransformer([
            ('num_imputer', SimpleImputer(missing_values=np.nan, strategy='mean'), num_2_impute),
            ('cat_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), cat_2_impute),
            # NOTE(review): named 'cat_constant' but configured with
            # strategy='most_frequent', same as 'cat_imputer'. Left unchanged
            # so fitted results stay the same -- confirm the intent.
            ('cat_constant', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), constant_2_impute),
        ])
        work_data[impute_cols] = features_filler.fit_transform(work_data[impute_cols])
    else:
        work_data[impute_cols] = features_filler.transform(work_data[impute_cols])

    cat_cols = ['year', 'make_model_trim', 'body', 'transmission', 'color', 'interior']
    num_cols = ['condition', 'odometer', 'mmr']

    # Collapse make/model/trim into a single categorical key.
    work_data['make_model_trim'] = work_data['make'] + '_' + work_data['model'] + '_' + work_data['trim']
    # Cast year to str so it appears as a string value in the row dicts.
    work_data['year'] = work_data['year'].astype('str')

    X_dict = work_data[cat_cols + num_cols].to_dict(orient='records')

    if DV is None:
        DV = DictVectorizer()
        X = DV.fit_transform(X_dict)
    else:
        X = DV.transform(X_dict)

    return X, features_filler, DV

In [23]:
# Remove rows lacking make/model/trim and separate the 'sellingprice' target.
work_data, y = na_filter(data)
# Hold out 25% of the rows for validation; random_state=42 pins the split.
X_train, X_valid, y_train, y_valid = train_test_split(work_data, y, test_size=0.25, random_state=42)

In [24]:
# Both helpers are passed as None, so preprocessor creates and fits a new
# imputer and DictVectorizer on the training rows and returns them for reuse.
train, features_filler, DV = preprocessor(X_train, None, None)

In [25]:
# Apply the imputer and vectorizer fitted on the training rows; the helper
# objects returned by the call are discarded.
valid, _, _ = preprocessor(X_valid, features_filler, DV)

In [26]:
# XGBoost regressor constructed with no arguments, i.e. library defaults.
xgb = XGBRegressor()

# fit() is the last expression, so its return value is what the cell displays.
xgb.fit(train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [27]:
# Predictions of the fitted XGBoost model for the validation matrix.
prediction = xgb.predict(valid)

In [28]:
# Validation RMSE of the XGBoost model.
# The root is taken explicitly because the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6; sqrt(MSE) gives the same
# number on old and new releases alike.
np.sqrt(mean_squared_error(y_valid, prediction))

1438.312342063069

In [29]:
# Baseline: ordinary least squares on the same training matrix.
lr = LinearRegression()
lr.fit(train, y_train)
lr_pred = lr.predict(valid)

# Validation RMSE of the baseline. Reuses lr_pred instead of predicting a
# second time, and takes the root explicitly because `squared=False` was
# deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_valid, lr_pred))

1522.575548333558

In [30]:
# Download the test CSV from the same bucket, using the s3 client created
# in the data-loading cell at the top of the notebook.
test_obj = s3.get_object(Bucket='kkr-mlops-zoomcamp', Key='datasets/car-prices-test.csv')

# The response body is passed straight to pandas, which reads it as CSV.
test_data = pd.read_csv(test_obj['Body'])
test_data.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2007,Hyundai,Elantra,GLS,Sedan,automatic,kmhdu46d37u028705,pa,2.0,141415.0,red,beige,1 cochran of monroeville,3425,2600,2015-04-01 02:00:00
1,2012,Nissan,Sentra,2.0 SR,Sedan,automatic,3n1ab6ap4cl748676,fl,4.8,31856.0,black,gray,nissan-infiniti lt,11200,12600,2015-04-01 02:00:00
2,2012,Nissan,Altima,2.5 S,Sedan,automatic,1n4al2ap8cn505834,fl,4.4,34642.0,gray,black,nissan-infiniti lt,12650,17600,2015-04-01 02:00:00
3,2013,Chevrolet,Equinox,LS,SUV,automatic,2gnalbek1d6349853,fl,4.4,56184.0,black,black,ge fleet services for itself/servicer,14550,16400,2015-04-01 02:00:00
4,1998,BMW,3 Series,323is,Coupe,automatic,wbabf8328weh62085,va,1.0,155648.0,silver,gray,select remarketing group llc/anderson financial,1650,400,2015-04-01 02:05:00


In [31]:
# Same row filter and target split as for the training data.
test, y_test = na_filter(test_data)
# Transform only: the imputer and vectorizer fitted on the training rows are
# passed in, and the helper objects returned by the call are discarded.
X_test, _, _ = preprocessor(test, features_filler, DV)

In [32]:
# Predictions of the fitted XGBoost model for the test matrix.
y_test_predictions = xgb.predict(X_test)
# Test RMSE. The root is taken explicitly because the `squared=False` keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_test, y_test_predictions))

1602.0459517002184