# Определение стоимости автомобилей

Сервис по продаже автомобилей с пробегом разрабатывает приложение для привлечения новых клиентов. В нём можно быстро узнать рыночную стоимость своего автомобиля. В вашем распоряжении исторические данные: технические характеристики, комплектации и цены автомобилей. Необходимо построить модель для определения стоимости.

Заказчику важны:

- качество предсказания;
- скорость предсказания;
- время обучения.

## Подготовка данных

In [76]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [77]:
# Load the raw car listings from CSV.
df = pd.read_csv('/datasets/autos.csv')

# Quick overview: dtypes and non-null counts, overall shape, first rows.
df.info()
display(df.shape)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
DateCrawled          354369 non-null object
Price                354369 non-null int64
VehicleType          316879 non-null object
RegistrationYear     354369 non-null int64
Gearbox              334536 non-null object
Power                354369 non-null int64
Model                334664 non-null object
Kilometer            354369 non-null int64
RegistrationMonth    354369 non-null int64
FuelType             321474 non-null object
Brand                354369 non-null object
NotRepaired          283215 non-null object
DateCreated          354369 non-null object
NumberOfPictures     354369 non-null int64
PostalCode           354369 non-null int64
LastSeen             354369 non-null object
dtypes: int64(7), object(9)
memory usage: 43.3+ MB


(354369, 16)

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,NotRepaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,2016-03-24 11:52:17,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


Удалим ненужные для анализа столбцы. Условиями, определяющими цену, на наш взгляд, являются: год выпуска, пробег, марка и модель, а также подвергалась ли машина ремонту. Моделей существует большое множество, но у нас есть данные о мощности автомобилей. Чтобы ускорить работу моделей, предположим, что у одной марки может существовать только одна модель заданной мощности, что также определяет тип автомобиля, вид коробки передач и тип топлива.

In [78]:
# Keep only the target (Price) and the columns the analysis treats as price
# drivers: registration year, power, mileage, brand and the repair flag.
columns_to_drop = [
    'DateCrawled', 'VehicleType', 'RegistrationMonth', 'DateCreated',
    'NumberOfPictures', 'PostalCode', 'LastSeen', 'Model', 'Gearbox',
    'FuelType',
]
df = df.drop(columns=columns_to_drop)

# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
df.info()
display(df.shape)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 6 columns):
Price               354369 non-null int64
RegistrationYear    354369 non-null int64
Power               354369 non-null int64
Kilometer           354369 non-null int64
Brand               354369 non-null object
NotRepaired         283215 non-null object
dtypes: int64(4), object(2)
memory usage: 16.2+ MB


None

(354369, 6)

Unnamed: 0,Price,RegistrationYear,Power,Kilometer,Brand,NotRepaired
0,480,1993,0,150000,volkswagen,
1,18300,2011,190,125000,audi,yes
2,9800,2004,163,125000,jeep,
3,1500,2001,75,150000,volkswagen,no
4,3600,2008,69,90000,skoda,no


In [79]:
# One-hot encode the categorical (object) columns Brand and NotRepaired;
# drop_first=True removes one level per column.
# NOTE(review): NotRepaired has missing values (283215 non-null of 354369)
# and get_dummies is called without dummy_na, so a missing value is encoded
# the same way as the dropped level — confirm this is intended.
df_ohe = pd.get_dummies(df, drop_first=True)
display(df_ohe.shape)
display(df_ohe.head())

(354369, 44)

Unnamed: 0,Price,RegistrationYear,Power,Kilometer,Brand_audi,Brand_bmw,Brand_chevrolet,Brand_chrysler,Brand_citroen,Brand_dacia,...,Brand_skoda,Brand_smart,Brand_sonstige_autos,Brand_subaru,Brand_suzuki,Brand_toyota,Brand_trabant,Brand_volkswagen,Brand_volvo,NotRepaired_yes
0,480,1993,0,150000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,18300,2011,190,125000,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,9800,2004,163,125000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1500,2001,75,150000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,3600,2008,69,90000,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [80]:
# Separate the feature matrix from the target column.
features = df_ohe.drop(columns='Price')
target = df_ohe['Price']

# Two-stage split: hold out 20% of all rows as the validation set, then take
# 25% of the remaining 80% (i.e. 20% of all rows) as the test set, which
# leaves 60% for training.
features_train_1, features_valid, target_train_1, target_valid = train_test_split(
    features, target, test_size=0.20, random_state=12345
)
features_train, features_test, target_train, target_test = train_test_split(
    features_train_1, target_train_1, test_size=0.25, random_state=12345
)

# Sanity check: show each subset's shape, then its share of all rows.
subsets = (features_train, features_valid, features_test)
for subset in subsets:
    display(subset.shape)

total_rows = sum(len(subset) for subset in subsets)
for subset in subsets:
    display(len(subset) / total_rows)


(212621, 43)

(70874, 43)

(70874, 43)

0.5999988712330931

0.2000005643834534

0.2000005643834534

## Обучение моделей

Подберем оптимальные гиперпараметры для различных моделей

In [65]:
%%time
# Baseline: linear regression fitted on the training split and scored by
# RMSE (square root of the mean squared error) on the validation split.
model = LinearRegression()
model.fit(features_train, target_train)
predictions = model.predict(features_valid) 
result = mean_squared_error(target_valid, predictions) ** 0.5 
print("LinearRegression", result)

LinearRegression 3852.607989423627
CPU times: user 1.03 s, sys: 387 ms, total: 1.42 s
Wall time: 1.41 s


In [66]:
# Pick the decision-tree depth (10..19) with the lowest RMSE on the
# validation split.
# Start from +inf instead of a hand-picked bound so the first candidate is
# always accepted, regardless of the scale of the target.
best_result = float('inf')
best_depth = None
for depth in range(10, 20):
    model = DecisionTreeRegressor(max_depth=depth, random_state=12345)
    model.fit(features_train, target_train)
    predictions_valid = model.predict(features_valid)
    result = mean_squared_error(target_valid, predictions_valid) ** 0.5
    if result < best_result:
        best_result = result
        best_depth = depth

print(best_depth, best_result)

14 2145.6947851677387


In [67]:
%%time
# Refit a single decision tree on the training split and report its RMSE on
# the validation split.
# NOTE(review): the depth search above printed 14 as the best depth, but
# max_depth=13 is used here — confirm which value is intended.
model = DecisionTreeRegressor(random_state=12345, max_depth=13) 
model.fit(features_train, target_train)
predictions = model.predict(features_valid)
result = mean_squared_error(target_valid, predictions) ** 0.5
print('DecisionTreeRegressor', result)

DecisionTreeRegressor 2147.1326066776414
CPU times: user 1.17 s, sys: 28.9 ms, total: 1.2 s
Wall time: 1.31 s


In [68]:
# Grid search for the random forest: n_estimators in {70, 80, 90} and
# max_depth in {9, 10}, scored by RMSE on the validation split.
# Start from +inf instead of a hand-picked bound so the first candidate is
# always accepted, regardless of the scale of the target.
best_result = float('inf')
best_est = None
best_depth = None
for est in range(70, 91, 10):
    for depth in range(9, 11):
        model = RandomForestRegressor(random_state=12345, n_estimators=est, max_depth=depth)
        model.fit(features_train, target_train)
        predictions_valid = model.predict(features_valid)
        result = mean_squared_error(target_valid, predictions_valid) ** 0.5
        if result < best_result:
            best_result = result
            best_est = est
            best_depth = depth

print(best_est, best_depth, best_result)

80 10 2128.51862315896


In [69]:
%%time
# Refit the random forest with the hyperparameters printed by the grid
# search (80 trees, depth 10) and report its RMSE on the validation split.
model = RandomForestRegressor(random_state=12345, n_estimators=80, max_depth=10)
model.fit(features_train, target_train) 
predictions = model.predict(features_valid) 
result = mean_squared_error(target_valid, predictions)**0.5 
print('RandomForestRegressor', result)

RandomForestRegressor 2128.51862315896
CPU times: user 48.7 s, sys: 23 ms, total: 48.7 s
Wall time: 49 s


In [70]:
# Grid search for LightGBM: n_estimators in {2000, 2100, 2200} and
# max_depth in {9, 10}, scored by RMSE on the validation split.
# Start from +inf instead of a hand-picked bound so the first candidate is
# always accepted, regardless of the scale of the target.
best_result = float('inf')
best_est = None
best_depth = None
for est in range(2000, 2201, 100):
    for depth in range(9, 11):
        model = LGBMRegressor(n_estimators=est, max_depth=depth)
        model.fit(features_train, target_train)
        predictions_valid = model.predict(features_valid)
        result = mean_squared_error(target_valid, predictions_valid) ** 0.5
        if result < best_result:
            best_result = result
            best_est = est
            best_depth = depth

print(best_est, best_depth, best_result)

2000 10 1929.3007566272267


In [71]:
%%time
# Refit LightGBM on the training split and report its RMSE on the
# validation split.
# NOTE(review): the grid search above printed n_estimators=2000 as the best
# value, but 2200 is used here — confirm which value is intended.
model = LGBMRegressor(n_estimators=2200, max_depth=10)
model.fit(features_train, target_train) 
predictions_valid = model.predict(features_valid) 
result = mean_squared_error(target_valid, predictions_valid)**0.5 
print('LGBMRegressor', result)

LGBMRegressor 1929.718654917422
CPU times: user 1min 51s, sys: 681 ms, total: 1min 52s
Wall time: 1min 53s


## Анализ моделей

Проверим работу различных моделей на тестовой выборке и посмотрим, какое время требуется для их обучения

In [81]:
%%time
# Fit the linear regression on the training split; fitting is kept in its
# own timed cell so training time is measured separately from prediction.
model = LinearRegression()
model.fit(features_train, target_train)

CPU times: user 1.04 s, sys: 495 ms, total: 1.53 s
Wall time: 1.54 s


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [82]:
%%time
# Predict on the test split with the model fitted in the previous cell and
# report the RMSE.
predictions = model.predict(features_test) 
result = mean_squared_error(target_test, predictions) ** 0.5 
print("LinearRegression", result)

LinearRegression 3841.780370810574
CPU times: user 33.8 ms, sys: 4.26 ms, total: 38.1 ms
Wall time: 20.3 ms


In [83]:
%%time
# Fit the decision tree on the training split (timed separately from
# prediction).
# NOTE(review): max_depth=13 is used although the depth search printed 14
# as the best depth — confirm which value is intended.
model = DecisionTreeRegressor(random_state=12345, max_depth=13) 
model.fit(features_train, target_train)

CPU times: user 1.25 s, sys: 24.8 ms, total: 1.27 s
Wall time: 1.29 s


DecisionTreeRegressor(criterion='mse', max_depth=13, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=12345, splitter='best')

In [84]:
%%time
# Predict on the test split with the tree fitted in the previous cell and
# report the RMSE.
predictions = model.predict(features_test)
result = mean_squared_error(target_test, predictions) ** 0.5
print('DecisionTreeRegressor', result)

DecisionTreeRegressor 2096.399540937694
CPU times: user 27.4 ms, sys: 8.25 ms, total: 35.7 ms
Wall time: 33.2 ms


In [85]:
%%time
# Fit the random forest (80 trees, depth 10) on the training split (timed
# separately from prediction).
model = RandomForestRegressor(random_state=12345, n_estimators=80, max_depth=10)
model.fit(features_train, target_train) 

CPU times: user 55 s, sys: 20.8 ms, total: 55.1 s
Wall time: 55.8 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=80,
                      n_jobs=None, oob_score=False, random_state=12345,
                      verbose=0, warm_start=False)

In [86]:
%%time
# Predict on the test split with the forest fitted in the previous cell and
# report the RMSE.
predictions = model.predict(features_test) 
result = mean_squared_error(target_test, predictions)**0.5 
print('RandomForestRegressor', result)

RandomForestRegressor 2104.6924403219837
CPU times: user 687 ms, sys: 0 ns, total: 687 ms
Wall time: 698 ms


In [87]:
%%time
# Fit LightGBM on the training split (timed separately from prediction).
# NOTE(review): n_estimators=2200 is used although the grid search printed
# 2000 as the best value — confirm which value is intended.
model = LGBMRegressor(n_estimators=2200, max_depth=10)
model.fit(features_train, target_train) 

CPU times: user 10min 48s, sys: 3.9 s, total: 10min 52s
Wall time: 10min 57s


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=10,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=2200, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [88]:
%%time
# Predict on the test split with the LightGBM model fitted in the previous
# cell and report the RMSE.
predictions = model.predict(features_test) 
result = mean_squared_error(target_test, predictions)**0.5 
print('LGBMRegressor', result)

LGBMRegressor 1900.7876821950622
CPU times: user 20.8 s, sys: 13.3 ms, total: 20.8 s
Wall time: 20.8 s


**Вывод**

Мы видим, что наилучшее качество (наименьшее значение метрики RMSE) показала модель LGBMRegressor, при этом у данной модели самое значительное время обучения — более 10 минут — и время предсказания — более 20 секунд. Представим полученные результаты в виде таблицы:

In [94]:
# Summary of the measurements printed by the cells above: wall-clock fit
# time, wall-clock prediction time on the test split, and rounded test RMSE.
model_names = ['LinearRegression', 'DecisionTreeRegressor', 'RandomForestRegressor', 'LGBMRegressor']
metric_names = ["Время обучения", "Время предсказания", "RMSE"]
measurements = [
    ['1.54 s', '20.3 ms', '3842'],
    ['1.29 s', '33.2 ms', '2096'],
    ['55.8 s', '698 ms', '2105'],
    ['10min 57s', '20.8 s', '1901'],
]
summary = pd.DataFrame(measurements, columns=metric_names, index=model_names)
display(summary)

Unnamed: 0,Время обучения,Время предсказания,RMSE
LinearRegression,1.54 s,20.3 ms,3842
DecisionTreeRegressor,1.29 s,33.2 ms,2096
RandomForestRegressor,55.8 s,698 ms,2105
LGBMRegressor,10min 57s,20.8 s,1901
