In [69]:
import pickle

import boto3
import numpy as np
import pandas as pd
import pyarrow

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBRegressor

# The training CSV lives in an S3-compatible bucket behind the
# storage.yandexcloud.net endpoint.
session = boto3.session.Session()
s3 = session.client(
    's3',
    region_name='ru-central1',
    endpoint_url='https://storage.yandexcloud.net',
    # No aws_access_key_id / aws_secret_access_key are passed here on purpose;
    # presumably boto3 resolves them from the environment or a shared
    # credentials file -- confirm for your setup, and never commit real keys.
)
obj = s3.get_object(
    Bucket='kkr-mlops-zoomcamp',
    Key='datasets/car-prices-train.csv',
)

# The response body is handed directly to pandas for parsing.
data = pd.read_csv(obj['Body'])
data.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2013,Nissan,Altima,2.5,Sedan,,1n4al3ap7dc292314,sc,4.2,37240.0,burgundy,beige,enterprise veh exchange/rental,13700,12600,2014-01-01 09:15:00
1,2006,Chrysler,300,C,Sedan,automatic,2c3la63h46h291271,sc,3.0,94901.0,silver,gray,bethpage fcu,7275,6300,2014-01-01 09:15:00
2,2014,Chevrolet,Malibu,LT,Sedan,automatic,1g11e5slxef160832,sc,2.8,36385.0,black,gray,enterprise veh exchange/rental,14200,13300,2014-01-01 09:15:00
3,2014,Chevrolet,Malibu,LT,Sedan,automatic,1g11e5sl4ef137451,sc,2.7,27495.0,black,black,enterprise veh exchange/rental,14800,13800,2014-01-01 09:15:00
4,2000,jeep,wrangler,4x4 4c se,,manual,1j4fa29p0yp701279,sc,1.9,161099.0,black,tan,enterprise car sales,5025,4000,2014-01-01 09:15:00


In [70]:
def na_filter(data):
    """Drop rows with an unidentified car and split off the target.

    A row is discarded when any of ``make``, ``model`` or ``trim`` is missing.
    The input frame is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``make``, ``model``, ``trim`` and
        ``sellingprice``.

    Returns
    -------
    work_data : pandas.DataFrame
        The surviving rows without the ``sellingprice`` column.
    y : pandas.Series
        The ``sellingprice`` values of the surviving rows.
    """
    # dropna filters row by row. Dropping by index *label* (as done before)
    # would also remove valid rows that merely share a label with a bad row
    # whenever the index is not unique.
    work_data = data.dropna(subset=['make', 'model', 'trim'])

    # dropna returned a new frame, so popping the target does not touch `data`.
    y = work_data.pop('sellingprice')

    return work_data, y

def preprocessor(work_data, features_filler=None, DV=None):
    """Impute missing values and turn the car features into a model matrix.

    The function runs in two modes, chosen independently for each helper:

    * fit mode (argument left as ``None``): a new imputer / ``DictVectorizer``
      is created and fitted on ``work_data``;
    * transform mode (fitted object passed in): only ``transform`` is called,
      so new data is encoded consistently with the training data.

    Parameters
    ----------
    work_data : pandas.DataFrame
        Must contain ``year``, ``make``, ``model``, ``trim``, ``body``,
        ``transmission``, ``color``, ``interior``, ``condition``, ``odometer``
        and ``mmr``. The frame is copied, never modified.
    features_filler : object with ``transform``, optional
        Fitted imputer for the columns listed in ``impute_cols`` below.
    DV : object with ``transform``, optional
        Fitted vectorizer that accepts a list of record dicts.

    Returns
    -------
    X
        Whatever ``DV.fit_transform`` / ``DV.transform`` returns.
    features_filler
        The imputer that was used (newly fitted or the one passed in).
    DV
        The vectorizer that was used (newly fitted or the one passed in).
    """
    # Work on a private copy: the steps below overwrite and add columns, and
    # the caller's frame must stay as it was.
    work_data = work_data.copy()

    num_2_impute = ['condition', 'odometer', 'mmr']
    cat_2_impute = ['body', 'transmission']
    constant_2_impute = ['color', 'interior']
    # Order matters: it matches the column order the ColumnTransformer emits.
    impute_cols = num_2_impute + cat_2_impute + constant_2_impute

    # Compare against None explicitly rather than relying on the truthiness
    # of an estimator object.
    if features_filler is None:
        features_filler = ColumnTransformer([
            ('num_imputer', SimpleImputer(missing_values=np.nan, strategy='mean'), num_2_impute),
            ('cat_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), cat_2_impute),
            ('cat_constant', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='unknown'), constant_2_impute)
            ]
        )
        work_data[impute_cols] = features_filler.fit_transform(work_data[impute_cols])
    else:
        work_data[impute_cols] = features_filler.transform(work_data[impute_cols])

    cat_cols = ['year', 'make_model_trim', 'body', 'transmission', 'color', 'interior']
    num_cols = ['condition', 'odometer', 'mmr']

    # One combined categorical identifies the exact car variant.
    work_data['make_model_trim'] = work_data['make'] + '_'  + work_data['model'] + '_' + work_data['trim']
    # Year is listed among the categorical columns, so it is passed as text.
    work_data['year'] = work_data['year'].astype('str')

    # Build the records before branching: both the fit and the transform path
    # need them (previously only the fit path defined X_dict, so passing a
    # fitted DV raised NameError).
    X_dict = work_data[cat_cols + num_cols].to_dict(orient='records')

    if DV is None:
        DV = DictVectorizer()
        X = DV.fit_transform(X_dict)
    else:
        X = DV.transform(X_dict)

    return X, features_filler, DV

In [71]:
# Remove rows without make/model/trim and split off the target, then fit a
# fresh imputer and DictVectorizer (both default to None) on what is left.
work_data, y = na_filter(data)
X, features_filler, DV = preprocessor(work_data)

In [74]:
# Display the vectorizer returned by preprocessor().
DV

DictVectorizer()

In [64]:
# Count the missing values in each column of work_data.
work_data.isna().sum()

year                0
make                0
model               0
trim                0
body             2542
transmission    43141
vin                 0
state               0
condition       11634
odometer           80
color             542
interior          542
seller              0
mmr                 0
saledate            0
dtype: int64

In [65]:
# Display the target series (sellingprice) returned by na_filter().
y

0         12600
1          6300
2         13300
3         13800
4          4000
          ...  
403661    17000
403662    13500
403663      225
403664      350
403665    11000
Name: sellingprice, Length: 396085, dtype: int64

In [6]:
# Hold out a quarter of the rows for validation; the fixed seed makes the
# split reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [16]:


# preprocessor() has already imputed and vectorized X before the split, so
# X_train / X_valid are the matrices the models consume directly. The old
# fillna / to_dict / DV.transform steps expected a DataFrame plus global
# cat_cols / num_cols, none of which exist at this point, and `train` was
# never assigned even though every model cell below uses it.
# NOTE(review): the imputer and DictVectorizer were fitted on all rows before
# train_test_split, so validation rows influence the fitted statistics.
# Consider splitting first and calling preprocessor() on the validation part
# with the fitted features_filler / DV.
train, valid = X_train, X_valid

In [20]:

# XGBoost regressor created with no arguments, i.e. library defaults.
# Expects `train` (feature matrix aligned with y_train) to be defined by an
# earlier cell.
xgb = XGBRegressor()

# fit() is the last expression so the notebook shows the fitted model's repr.
xgb.fit(train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [21]:
# Predict with the fitted XGBoost model on the validation matrix.
prediction = xgb.predict(valid)

In [25]:

# Validation RMSE of the XGBoost model. The square root is taken explicitly
# because the `squared=False` flag was deprecated and later removed from
# mean_squared_error; this form works on old and new scikit-learn alike.
np.sqrt(mean_squared_error(y_valid, prediction))


1428.5883189665612

In [29]:
# Baseline: ordinary least squares on the same feature matrix.
# (LinearRegression / Lasso / Ridge are imported in the notebook's import cell.)
lr = LinearRegression()
lr.fit(train, y_train)
lr_pred = lr.predict(valid)

# Validation RMSE, reusing the predictions computed above instead of calling
# predict() a second time. The explicit square root avoids the `squared=False`
# flag, which newer scikit-learn releases no longer accept.
np.sqrt(mean_squared_error(y_valid, lr_pred))

1490.6557620587303

In [32]:
# Lasso with default settings. Note that `lr` is rebound here, so from this
# cell on it refers to the Lasso model, not the LinearRegression above.
lr = Lasso()
lr.fit(train, y_train)
lr_pred = lr.predict(valid)

# Training RMSE (compare with the validation RMSE in the next cell). The
# explicit square root avoids the `squared=False` flag, which newer
# scikit-learn releases no longer accept.
np.sqrt(mean_squared_error(y_train, lr.predict(train)))

1555.9927051420846

In [33]:
# Validation RMSE of the model currently bound to `lr`. The explicit square
# root avoids the `squared=False` flag, which newer scikit-learn releases no
# longer accept.
np.sqrt(mean_squared_error(y_valid, lr.predict(valid)))

1495.9684971730578