## Выполним предобработку полученного ранее тренировочного датасета

In [19]:
!pip install catboost
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor

Collecting catboost
  Downloading catboost-0.24.4-cp38-none-win_amd64.whl (65.4 MB)
Collecting graphviz
  Downloading graphviz-0.16-py2.py3-none-any.whl (19 kB)
Collecting plotly
  Downloading plotly-4.14.3-py2.py3-none-any.whl (13.2 MB)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py): started
  Building wheel for retrying (setup.py): finished with status 'done'
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11435 sha256=610ba1afcaac84f3901d6e2f766ab6d713b1c88e07ed2c5a6241364b3b458a1a
  Stored in directory: c:\users\user-pc\appdata\local\pip\cache\wheels\c4\a7\48\0a434133f6d56e878ca511c0e6c38326907c0792f67b476e56
Successfully built retrying
Installing collected packages: graphviz, retrying, plotly, catboost
Successfully installed catboost-0.24.4 graphviz-0.16 plotly-4.14.3 retrying-1.3.3


In [26]:
RANDOM_SEED = 42
VAL_SIZE = 0.20   #20%
train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv')

In [27]:
print(f"{train.shape=}")
print(f"{test.shape=}")

train.shape=(37079, 35)
test.shape=(34686, 32)


### Для удобства работы объединим train и test в один набор

In [28]:
train.drop(['index',
            'location'], axis=1, inplace=True)
train['is_train'] = True
test['is_train'] = False
test['price'] = 0

# чтобы индексы теста не потерялись, а в трейне продолжилась нумерация
train.index = range(test.shape[0], test.shape[0]+train.shape[0]) 

df = test.append(train, verify_integrity=True)

In [29]:
df.shape

(71765, 34)

### Простая предобработка данных

In [30]:
def fill_na(df):
    df.loc[df.engineDisplacement == ' LTR', 'engineDisplacement'] = None
    df.loc[df.ПТС.isna(),'ПТС'] = 'Оригинал'
    df.loc[df.Владельцы.isna(),'Владельцы'] = 'Неизвестно'
    
def change_column_dtypes(df):
    df.engineDisplacement = df.engineDisplacement.str.replace('LTR','').astype(float)
    df.enginePower = df.enginePower.str.replace('N12','').astype(float)

In [31]:
fill_na(df)
change_column_dtypes(df)

### Выведу полученный датасет

In [32]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df.head(3))

Unnamed: 0,bodyType,brand,car_url,color,complectation_dict,description,engineDisplacement,enginePower,equipment_dict,fuelType,image,mileage,modelDate,model_info,model_name,name,numberOfDoors,parsing_unixtime,priceCurrency,productionDate,sell_id,super_gen,vehicleConfiguration,vehicleTransmission,vendor,Владельцы,Владение,ПТС,Привод,Руль,Состояние,Таможня,is_train,price
0,лифтбек,SKODA,https://auto.ru/cars/used/sale/skoda/octavia/1...,синий,,"Все автомобили, представленные в продаже, прох...",1.2,105.0,"{""engine-proof"":true,""tinted-glass"":true,""airb...",бензин,https://autoru.naydex.net/o9DBXQ270/5ac010hAY0...,74000,2013.0,"{""code"":""OCTAVIA"",""name"":""Octavia"",""ru_name"":""...",OCTAVIA,1.2 AMT (105 л.с.),5.0,1603226273,RUB,2014,1100575026,"{""id"":""10373605"",""displacement"":1197,""engine_t...",LIFTBACK ROBOT 1.2,роботизированная,EUROPEAN,3 или более,,Оригинал,передний,Левый,Не требует ремонта,Растаможен,False,0
1,лифтбек,SKODA,https://auto.ru/cars/used/sale/skoda/octavia/1...,чёрный,,ЛОТ: 01217195\nАвтопрага Север\nДанный автомоб...,1.6,110.0,"{""cruise-control"":true,""asr"":true,""esp"":true,""...",бензин,https://autoru.naydex.net/o9DBXQ270/5ac010hAY0...,60563,2017.0,"{""code"":""OCTAVIA"",""name"":""Octavia"",""ru_name"":""...",OCTAVIA,1.6 MT (110 л.с.),5.0,1603226277,RUB,2017,1100549428,"{""id"":""20913311"",""displacement"":1598,""engine_t...",LIFTBACK MECHANICAL 1.6,механическая,EUROPEAN,1 владелец,,Оригинал,передний,Левый,Не требует ремонта,Растаможен,False,0
2,лифтбек,SKODA,https://auto.ru/cars/used/sale/skoda/superb/11...,серый,"{""id"":""20026336"",""name"":""Ambition"",""available_...","Все автомобили, представленные в продаже, прох...",1.8,152.0,"{""cruise-control"":true,""tinted-glass"":true,""es...",бензин,https://avatars.mds.yandex.net/get-autoru-vos/...,88000,2013.0,"{""code"":""SUPERB"",""name"":""Superb"",""ru_name"":""Су...",SUPERB,DSG 1.8 AMT (152 л.с.),5.0,1603226280,RUB,2014,1100658222,"{""id"":""20026323"",""nameplate"":""DSG"",""displaceme...",LIFTBACK ROBOT 1.8,роботизированная,EUROPEAN,1 владелец,,Оригинал,передний,Левый,Не требует ремонта,Растаможен,False,0


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71765 entries, 0 to 71764
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   bodyType              71765 non-null  object 
 1   brand                 71765 non-null  object 
 2   car_url               71765 non-null  object 
 3   color                 71765 non-null  object 
 4   complectation_dict    43497 non-null  object 
 5   description           71033 non-null  object 
 6   engineDisplacement    71710 non-null  float64
 7   enginePower           71765 non-null  float64
 8   equipment_dict        61769 non-null  object 
 9   fuelType              71765 non-null  object 
 10  image                 71765 non-null  object 
 11  mileage               71765 non-null  int64  
 12  modelDate             71765 non-null  float64
 13  model_info            71765 non-null  object 
 14  model_name            71765 non-null  object 
 15  name               

In [34]:
df.isna().sum()

bodyType                    0
brand                       0
car_url                     0
color                       0
complectation_dict      28268
description               732
engineDisplacement         55
enginePower                 0
equipment_dict           9996
fuelType                    0
image                       0
mileage                     0
modelDate                   0
model_info                  0
model_name                  0
name                        0
numberOfDoors               0
parsing_unixtime            0
priceCurrency               0
productionDate              0
sell_id                     0
super_gen                   0
vehicleConfiguration        0
vehicleTransmission         0
vendor                      0
Владельцы                   0
Владение                52125
ПТС                         0
Привод                      0
Руль                        0
Состояние                   0
Таможня                     0
is_train                    0
price     

### Сохраним объединенный датасет

In [35]:
df.to_csv('data1.csv', index=False)

### Отбирем признаки для модели

In [36]:
cat_cols = ['bodyType', 'brand', 'fuelType', 'color', 'model_name', 'vehicleTransmission', 'vendor', 'ПТС', 'Владельцы', 'Привод', 'Руль']
num_cols = ['productionDate', 'numberOfDoors', 'engineDisplacement', 'enginePower', 'mileage', ]

### Обучаем

In [71]:
X = df[df.is_train].drop('price', axis=1).loc[:, cat_cols+num_cols]
y = df[df.is_train].price

In [73]:
X.isna().sum()

bodyType               0
brand                  0
fuelType               0
color                  0
model_name             0
vehicleTransmission    0
vendor                 0
ПТС                    0
Владельцы              0
Привод                 0
Руль                   0
productionDate         0
numberOfDoors          0
engineDisplacement     0
enginePower            0
mileage                0
dtype: int64

In [74]:
model = CatBoostRegressor(cat_features=cat_cols)
model.fit(X, y)

Learning rate set to 0.074252
0:	learn: 2311856.4089907	total: 45.6ms	remaining: 45.5s
1:	learn: 2181750.4426858	total: 92.7ms	remaining: 46.2s
2:	learn: 2060089.8465552	total: 139ms	remaining: 46.1s
3:	learn: 1944154.5782870	total: 188ms	remaining: 46.8s
4:	learn: 1840733.8316647	total: 238ms	remaining: 47.3s
5:	learn: 1747162.4599959	total: 288ms	remaining: 47.6s
6:	learn: 1657847.2222015	total: 339ms	remaining: 48.1s
7:	learn: 1577594.0086594	total: 389ms	remaining: 48.2s
8:	learn: 1502955.0967622	total: 438ms	remaining: 48.3s
9:	learn: 1436011.4340142	total: 486ms	remaining: 48.1s
10:	learn: 1376470.4431867	total: 534ms	remaining: 48s
11:	learn: 1317510.7208631	total: 584ms	remaining: 48s
12:	learn: 1263264.1099800	total: 633ms	remaining: 48.1s
13:	learn: 1213338.5949501	total: 683ms	remaining: 48.1s
14:	learn: 1169829.0323612	total: 734ms	remaining: 48.2s
15:	learn: 1130572.7754838	total: 783ms	remaining: 48.2s
16:	learn: 1093453.0572720	total: 833ms	remaining: 48.1s
17:	learn: 10

146:	learn: 545562.6973927	total: 7.08s	remaining: 41.1s
147:	learn: 545240.7941020	total: 7.12s	remaining: 41s
148:	learn: 544916.0574205	total: 7.17s	remaining: 41s
149:	learn: 544371.5362683	total: 7.22s	remaining: 40.9s
150:	learn: 543068.9485200	total: 7.27s	remaining: 40.9s
151:	learn: 542970.8547234	total: 7.32s	remaining: 40.8s
152:	learn: 542869.2717837	total: 7.36s	remaining: 40.7s
153:	learn: 542584.3606141	total: 7.41s	remaining: 40.7s
154:	learn: 541246.4893391	total: 7.46s	remaining: 40.7s
155:	learn: 540939.7722651	total: 7.51s	remaining: 40.7s
156:	learn: 540440.1985564	total: 7.56s	remaining: 40.6s
157:	learn: 540223.1640413	total: 7.62s	remaining: 40.6s
158:	learn: 540151.8809597	total: 7.67s	remaining: 40.6s
159:	learn: 539757.4473240	total: 7.72s	remaining: 40.5s
160:	learn: 539397.3564744	total: 7.77s	remaining: 40.5s
161:	learn: 539100.7805901	total: 7.82s	remaining: 40.4s
162:	learn: 539040.6232867	total: 7.87s	remaining: 40.4s
163:	learn: 538895.0044687	total: 7

291:	learn: 452425.7852501	total: 14.3s	remaining: 34.6s
292:	learn: 452189.5681705	total: 14.3s	remaining: 34.6s
293:	learn: 451818.9736773	total: 14.4s	remaining: 34.5s
294:	learn: 451471.8016588	total: 14.4s	remaining: 34.5s
295:	learn: 451123.5341102	total: 14.5s	remaining: 34.4s
296:	learn: 451029.9432759	total: 14.5s	remaining: 34.4s
297:	learn: 450545.9575490	total: 14.6s	remaining: 34.3s
298:	learn: 450089.3952351	total: 14.6s	remaining: 34.3s
299:	learn: 449620.0109315	total: 14.7s	remaining: 34.2s
300:	learn: 449298.5102184	total: 14.7s	remaining: 34.2s
301:	learn: 448931.1988721	total: 14.8s	remaining: 34.2s
302:	learn: 448741.6416796	total: 14.8s	remaining: 34.1s
303:	learn: 448627.6755807	total: 14.9s	remaining: 34s
304:	learn: 448621.1983959	total: 14.9s	remaining: 34s
305:	learn: 447833.1333432	total: 15s	remaining: 33.9s
306:	learn: 447800.2600801	total: 15s	remaining: 33.9s
307:	learn: 447546.5507535	total: 15.1s	remaining: 33.8s
308:	learn: 447456.3599712	total: 15.1s

436:	learn: 411490.2386260	total: 21.6s	remaining: 27.8s
437:	learn: 411252.6340566	total: 21.6s	remaining: 27.7s
438:	learn: 411240.7981453	total: 21.7s	remaining: 27.7s
439:	learn: 410990.5823194	total: 21.7s	remaining: 27.6s
440:	learn: 410524.8634322	total: 21.8s	remaining: 27.6s
441:	learn: 410510.3856400	total: 21.8s	remaining: 27.5s
442:	learn: 410131.6761091	total: 21.9s	remaining: 27.5s
443:	learn: 410117.8716827	total: 21.9s	remaining: 27.4s
444:	learn: 410107.0084628	total: 22s	remaining: 27.4s
445:	learn: 410078.2755106	total: 22s	remaining: 27.3s
446:	learn: 409728.3027054	total: 22.1s	remaining: 27.3s
447:	learn: 409565.1994414	total: 22.1s	remaining: 27.2s
448:	learn: 409438.7767726	total: 22.2s	remaining: 27.2s
449:	learn: 409258.4470204	total: 22.2s	remaining: 27.1s
450:	learn: 408985.3900385	total: 22.3s	remaining: 27.1s
451:	learn: 408968.9210704	total: 22.3s	remaining: 27s
452:	learn: 408913.0908958	total: 22.3s	remaining: 27s
453:	learn: 408901.1684474	total: 22.4s

585:	learn: 379176.4965284	total: 29.1s	remaining: 20.6s
586:	learn: 379006.1774257	total: 29.2s	remaining: 20.5s
587:	learn: 378732.9726916	total: 29.2s	remaining: 20.5s
588:	learn: 378496.0874774	total: 29.3s	remaining: 20.4s
589:	learn: 378186.4635637	total: 29.3s	remaining: 20.4s
590:	learn: 377983.2986044	total: 29.4s	remaining: 20.3s
591:	learn: 377738.7591868	total: 29.4s	remaining: 20.3s
592:	learn: 376808.4403563	total: 29.5s	remaining: 20.2s
593:	learn: 376581.8537450	total: 29.5s	remaining: 20.2s
594:	learn: 376054.4634969	total: 29.6s	remaining: 20.1s
595:	learn: 375649.9706870	total: 29.6s	remaining: 20.1s
596:	learn: 375253.9342755	total: 29.7s	remaining: 20s
597:	learn: 375177.1022713	total: 29.7s	remaining: 20s
598:	learn: 375050.5762859	total: 29.8s	remaining: 19.9s
599:	learn: 374869.2056574	total: 29.8s	remaining: 19.9s
600:	learn: 374602.1780907	total: 29.9s	remaining: 19.8s
601:	learn: 374529.0962954	total: 29.9s	remaining: 19.8s
602:	learn: 374396.0520091	total: 3

732:	learn: 357636.3341497	total: 36.6s	remaining: 13.3s
733:	learn: 357395.1216942	total: 36.7s	remaining: 13.3s
734:	learn: 357380.5521248	total: 36.7s	remaining: 13.2s
735:	learn: 357372.0664602	total: 36.8s	remaining: 13.2s
736:	learn: 357308.7950693	total: 36.8s	remaining: 13.1s
737:	learn: 357268.2190838	total: 36.9s	remaining: 13.1s
738:	learn: 357177.6182885	total: 36.9s	remaining: 13s
739:	learn: 357124.1352674	total: 37s	remaining: 13s
740:	learn: 356928.9653790	total: 37s	remaining: 12.9s
741:	learn: 356758.2311106	total: 37.1s	remaining: 12.9s
742:	learn: 356673.3700134	total: 37.1s	remaining: 12.8s
743:	learn: 356651.5662063	total: 37.1s	remaining: 12.8s
744:	learn: 356645.7188310	total: 37.2s	remaining: 12.7s
745:	learn: 356245.8622225	total: 37.3s	remaining: 12.7s
746:	learn: 356197.2094183	total: 37.3s	remaining: 12.6s
747:	learn: 356057.7437129	total: 37.4s	remaining: 12.6s
748:	learn: 355641.1154934	total: 37.4s	remaining: 12.5s
749:	learn: 355451.8784753	total: 37.5s

877:	learn: 343925.7698019	total: 44.1s	remaining: 6.12s
878:	learn: 343902.6781744	total: 44.1s	remaining: 6.08s
879:	learn: 343742.6819852	total: 44.2s	remaining: 6.03s
880:	learn: 343694.2740890	total: 44.3s	remaining: 5.98s
881:	learn: 343690.5724340	total: 44.3s	remaining: 5.93s
882:	learn: 343658.0700681	total: 44.4s	remaining: 5.88s
883:	learn: 343646.8498035	total: 44.4s	remaining: 5.83s
884:	learn: 343643.5276636	total: 44.5s	remaining: 5.78s
885:	learn: 343629.7174688	total: 44.5s	remaining: 5.73s
886:	learn: 343442.4061513	total: 44.6s	remaining: 5.68s
887:	learn: 343337.5508588	total: 44.6s	remaining: 5.63s
888:	learn: 343303.7556911	total: 44.7s	remaining: 5.58s
889:	learn: 343273.8772959	total: 44.7s	remaining: 5.53s
890:	learn: 343263.0701848	total: 44.8s	remaining: 5.48s
891:	learn: 343226.8090926	total: 44.8s	remaining: 5.43s
892:	learn: 343120.8560631	total: 44.9s	remaining: 5.38s
893:	learn: 342642.6611433	total: 44.9s	remaining: 5.33s
894:	learn: 342365.6088450	tota

<catboost.core.CatBoostRegressor at 0x21fd605dbb0>

In [75]:
model.get_best_score()

{'learn': {'RMSE': 334486.2854228151}}

### Признаки отобранные моделью

In [76]:
pd.Series(model.get_feature_importance(), index=model.feature_names_).sort_values(ascending=False)

enginePower            20.979788
productionDate         19.854027
mileage                17.603432
engineDisplacement     16.110220
brand                  12.098046
fuelType                3.265645
Привод                  2.974489
model_name              2.780468
bodyType                2.386751
Владельцы               0.653954
color                   0.495515
vendor                  0.411316
vehicleTransmission     0.259277
numberOfDoors           0.103316
ПТС                     0.023757
Руль                    0.000000
dtype: float64

### Сделаем сабмит на kaggle

In [77]:
X_test = df[~df.is_train].drop('price', axis=1).loc[:, cat_cols+num_cols]

In [78]:
def make_submit(model, X_test, submit_file, index=None):
    predict_test = np.round(model.predict(X_test))
    answer = pd.Series(predict_test, index=index, name='price')
    answer.to_csv(submit_file, index_label='sell_id')

In [79]:
make_submit(model, X_test, submit_file='predict.csv', index=df[~df.is_train].sell_id)

### Итоги

* Построили простую модель на основе CatBoostRegressor с дефолтными настройками
* Самыми важными признаками для модели оказались: enginePower, productionDate, mileage