# Базовое решение
## Импорт данных

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error 

import numpy as np

In [25]:
df=pd.read_csv('cars_train.csv')
df

Unnamed: 0,Price,Make,Model,Year,Style,Fuel,Engine,Distance,Cylinders,Transmission,Drive,Wheel,Color,ID
0,16621,CHEVROLET,Equinox,2011,Jeep,Petrol,3,192000 km,6.0,Tiptronic,4x4,Left wheel,Black,0
1,8467,HONDA,FIT,2006,Hatchback,Petrol,1.3,200000 km,4.0,Variator,Front,Right-hand drive,Black,1
2,11726,HONDA,FIT,2014,Hatchback,Petrol,1.3,91901 km,4.0,Automatic,Front,Left wheel,Silver,2
3,26657,LEXUS,RX 350,2007,Jeep,Petrol,,128500 km,6.0,Automatic,4x4,Left wheel,Silver,3
4,8781,FORD,Transit,1999,Microbus,CNG,4,0 km,8.0,Manual,Rear,Left wheel,Blue,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9614,5802,MERCEDES-BENZ,E 350,2013,Sedan,Diesel,3.5,107800 km,6.0,Automatic,Rear,Left wheel,Grey,9614
9615,8467,MERCEDES-BENZ,CLK 200,1999,Coupe,CNG,2.0 Turbo,300000 km,4.0,Manual,Rear,Left wheel,Silver,9615
9616,15681,HYUNDAI,Sonata,2011,Sedan,Petrol,2.4,161600 km,4.0,Tiptronic,Front,Left wheel,Red,9616
9617,26108,HYUNDAI,Tucson,2010,Jeep,Diesel,2,116365 km,4.0,Automatic,Front,Left wheel,Grey,9617


## Базовая очистка данных

In [26]:
def clear_df(df):
    df['Engine'] = df['Engine'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    df['Distance'] = df['Distance'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    df['Wheel'] = df['Wheel'].map({'Left wheel':0,'Right-hand drive':1})
    df['Transmission'] = (df['Transmission'] == 'Manual').astype(int)
    df['Drive'] = (df['Drive'] == '4x4').astype(int)
    return df

ID тоже пока не нужно

In [27]:
df = clear_df(df).drop(columns=['ID'])

In [28]:
df.describe()

Unnamed: 0,Price,Year,Engine,Distance,Cylinders,Transmission,Drive,Wheel
count,9619.0,9619.0,9523.0,9619.0,9523.0,9619.0,9619.0,9619.0
mean,19664.54,2010.955921,2.308548,1873337.0,4.566418,0.09502,0.207194,0.077243
std,268668.2,5.666966,0.872758,55888370.0,1.18037,0.293258,0.405317,0.26699
min,1.0,1939.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,5331.0,2009.0,1.8,69407.5,4.0,0.0,0.0,0.0
50%,13000.0,2012.0,2.0,125000.0,4.0,0.0,0.0,0.0
75%,22063.0,2015.0,2.5,187417.0,4.0,0.0,0.0,0.0
max,26307500.0,2020.0,20.0,2147484000.0,16.0,1.0,1.0,1.0


## EDA должно быть тут

...

## Выбор колонок для обучения модели

X - входные данные, y - целевая колонка

In [29]:
numeric_df = df.select_dtypes(include='number')

numeric_df = numeric_df.dropna()

X = numeric_df.drop('Price', axis=1)
y = numeric_df['Price']

X.head(3)

Unnamed: 0,Year,Engine,Distance,Cylinders,Transmission,Drive,Wheel
0,2011,3.0,192000.0,6.0,0,1,0
1,2006,1.3,200000.0,4.0,0,0,1
2,2014,1.3,91901.0,4.0,0,0,0


## Обучение модели

In [18]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

model = RandomForestRegressor(n_estimators=10, random_state=42)
model.fit(X_train, y_train)



In [19]:
# Предсказание
y_pred = model.predict(X_train)


MAE = mean_absolute_error(y_train, y_pred)
MAPE = mean_absolute_percentage_error(y_train, y_pred)  

print("MAE:", MAE)
print("MAPE, %:", MAPE )

MAE: 3381.558765629821
MAPE, %: 4.09844113134101


## Предсказание на валидационной выборке для оценки точности

In [20]:
# Предсказание
y_pred = model.predict(X_test)


MAE = mean_absolute_error(y_test, y_pred)
MAPE = mean_absolute_percentage_error(y_test, y_pred)  

print("MAE:", MAE)
print("MAPE, %:", MAPE )

MAE: 35359.18003770443
MAPE, %: 15.137822580840846


## Тест

### Сформируем результат для тестового набора 

In [21]:
df_test=pd.read_csv('cars_test.csv')

df_test = clear_df(df_test) 

df_test = df_test.dropna().drop_duplicates()

X_test = df_test[X.columns]

y_pred = model.predict(X_test)

df_submit = df_test[['ID']].copy()

df_submit['Predict'] = y_pred

df_submit.shape

(9618, 2)

### Сохраним `Submit.csv` (для отправки)

In [577]:
df_submit.to_csv('submit.csv', index = False)
df_submit.head(3)

Unnamed: 0,ID,Predict
0,0,12890.233333
1,1,4864.3
2,2,4602.1


# Как будет проходить проверка (Для справки)

In [22]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error 

In [579]:
df_submit=pd.read_csv('submit.csv')
df_test=pd.read_csv('cars_test_full.csv')


df_merged = pd.merge(df_submit, df_test, on='ID', how='left').drop_duplicates()

y_true = df_merged['Price']
y_pred = df_merged['Predict']

MAE = mean_absolute_error(y_true, y_pred)
MAPE = mean_absolute_percentage_error(y_true, y_pred)  

print("MAE:", MAE)
print("MAPE, %:", MAPE )

MAE: 8479.287384826877
MAPE, %: 17.05526244051365
