# Easy

In [12]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, mean_absolute_error,  make_scorer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("data/audi.csv")
print(df.head())

  model  year  price transmission  mileage fuelType  tax   mpg  engineSize
0    A1  2017  12500       Manual    15735   Petrol  150  55.4         1.4
1    A6  2016  16500    Automatic    36203   Diesel   20  64.2         2.0
2    A1  2016  11000       Manual    29946   Petrol   30  55.4         1.4
3    A4  2017  16800    Automatic    25952   Diesel  145  67.3         2.0
4    A3  2019  17300       Manual     1998   Petrol  145  49.6         1.0


In [3]:
df = df.drop(columns=['model', 'transmission', 'fuelType'])
print(df.head())

   year  price  mileage  tax   mpg  engineSize
0  2017  12500    15735  150  55.4         1.4
1  2016  16500    36203   20  64.2         2.0
2  2016  11000    29946   30  55.4         1.4
3  2017  16800    25952  145  67.3         2.0
4  2019  17300     1998  145  49.6         1.0


In [4]:
X = df.drop(columns='price')
Y = df['price']

x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0)

In [5]:
model = DecisionTreeClassifier(random_state=10)
model.fit(x_train, y_train)
prediction = model.predict(x_test)

In [6]:
r2 = r2_score(y_test, prediction)
print(f'r2_score = {r2}')

r2_score = 0.8652360700494247


In [7]:
print(pd.DataFrame(zip(X.columns, model.feature_importances_)))

            0         1
0        year  0.067918
1     mileage  0.651864
2         tax  0.068616
3         mpg  0.182659
4  engineSize  0.028942


# Вывод:

Самым важным признаком оказался пробег, на втором месте идёт потребление топлива, остальные признаки имеют сильно меньший вклад.

# Medium

In [8]:
data_folder = 'data'
all_df = pd.DataFrame()
common_columns = None # Оказалось, что столбцы в разных файлах иногда немного отличаются, так что придется оставить только общие для всех

for file_name in os.listdir(data_folder):
    if file_name.endswith('.csv'):  
        brand_name = os.path.splitext(file_name)[0]
        file_path = os.path.join(data_folder, file_name)
        # Данных очень много, поэтому возьмём четверть
        df = pd.read_csv(file_path, skiprows=lambda x: x % 4 != 0)
        if common_columns is None:
            common_columns = set(df.columns)
        else:
            common_columns = common_columns.intersection(df.columns)
        df['brand'] = brand_name
        all_df = pd.concat([all_df, df], ignore_index=True)


common_columns = list(common_columns)
all_df = all_df[common_columns + ['brand']]

all_df

Unnamed: 0,model,mileage,price,fuelType,transmission,engineSize,year,brand
0,Corsa,25796,8500,Petrol,Manual,1.4,2016,vauxhall
1,Corsa,3953,9990,Petrol,Manual,1.4,2019,vauxhall
2,Corsa,52243,4350,Petrol,Manual,1.2,2013,vauxhall
3,Corsa,40584,5995,Petrol,Manual,1.2,2014,vauxhall
4,Corsa,7905,9495,Petrol,Manual,1.4,2018,vauxhall
...,...,...,...,...,...,...,...,...
27126,C Class,12794,23994,Petrol,Automatic,1.5,2019,merc
27127,C Class,10284,23444,Diesel,Automatic,2.0,2019,merc
27128,C Class,6064,24999,Diesel,Automatic,2.0,2019,merc
27129,B Class,15257,19344,Petrol,Automatic,1.3,2019,merc


In [9]:
df_encoded = pd.get_dummies(df, columns=['model', 'fuelType', 'transmission', 'brand'])
df_encoded

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,model_ A Class,model_ B Class,model_ C Class,model_ CL Class,...,model_220,model_230,fuelType_Diesel,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,transmission_Automatic,transmission_Manual,transmission_Semi-Auto,brand_merc
0,2016,61948,16000,325,30.4,4.0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1
1,2012,10948,107000,265,36.7,3.5,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1
2,2020,26980,1000,145,62.8,1.5,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1
3,2017,15890,24841,150,68.9,1.5,0,1,0,0,...,0,0,1,0,0,0,1,0,0,1
4,2014,15701,20498,20,64.2,2.1,0,0,1,0,...,0,0,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3274,2019,23994,12794,145,46.3,1.5,0,0,1,0,...,0,0,0,0,0,1,1,0,0,1
3275,2019,23444,10284,145,61.4,2.0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,1
3276,2019,24999,6064,145,61.4,2.0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,1
3277,2019,19344,15257,145,45.6,1.3,0,1,0,0,...,0,0,0,0,0,1,1,0,0,1


In [14]:
X = df_encoded.drop(columns='price')
Y = df_encoded['price']
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=10)
params = {
    'n_estimators': [100, 500],
    'max_depth': [100, 150],
    'criterion': ['gini', 'entropy']
}

search = GridSearchCV(RandomForestClassifier(), params, scoring= make_scorer(r2_score))
search.fit(x_train, y_train)
search.best_estimator_



In [18]:
results_df = pd.DataFrame(search.cv_results_)
results_df = results_df[['param_max_depth', 'param_n_estimators', 'param_criterion', 'mean_test_score', 'rank_test_score']]
# Вывод всех результатов поиска
print("Все результаты GridSearchCV:")
results_df

Все результаты GridSearchCV:


Unnamed: 0,param_max_depth,param_n_estimators,param_criterion,mean_test_score,rank_test_score
0,100,100,gini,0.843986,3
1,100,500,gini,0.837787,4
2,150,100,gini,0.854462,2
3,150,500,gini,0.856497,1
4,100,100,entropy,0.819692,5
5,100,500,entropy,0.800938,7
6,150,100,entropy,0.793461,8
7,150,500,entropy,0.815372,6


# Вывод:

Лучшие результаты показывают модели с критерием Джини, среди них лучего всего себя показывают модели с более высокой глубиной, количество деревьев дает улучшение только при более высокой глубине, иначе ухудшает результат