In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
import time

In [None]:
# Load the dataset from the working directory, falling back to the shared
# /datasets location when no local copy exists. Only a missing file triggers
# the fallback; any other failure (e.g. a parse error) is surfaced as-is.
try:
    data = pd.read_csv("autos.csv")
except FileNotFoundError:
    data = pd.read_csv("/datasets/autos.csv")

In [None]:
# Print a concise summary of the loaded frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Row count before any cleaning; compared with `after` later on to report
# the share of rows removed.
before = len(data)

In [None]:
# Keep only the listed columns, in this explicit order (Price last).
data = data[
    [
        'Brand',
        'Model',
        'RegistrationYear',
        'RegistrationMonth',
        'Kilometer',
        'Power',
        'VehicleType',
        'Gearbox',
        'FuelType',
        'Repaired',
        'NumberOfPictures',
        'LastSeen',
        'DateCreated',
        'PostalCode',
        'DateCrawled',
        'Price',
    ]
]

In [None]:
# Preview the first 10 rows.
data.head(10)

In [None]:
# Summary statistics for the frame's columns.
data.describe()

In [None]:
# Report how many rows are exact duplicates of an earlier row (the printed
# label reads "Number of duplicates"). Summing the boolean mask counts rows
# directly; counting the non-null values of a single column would miss any
# duplicate whose value in that column is NaN.
print('Количество дубликатов', data.duplicated().sum())

In [None]:
# Remove rows that are exact duplicates of an earlier row.
data = data.drop_duplicates()

In [None]:
# Box plot of the Kilometer column to inspect its spread and outliers.
plt.boxplot(data["Kilometer"])
plt.title("Kilometer")

In [None]:
# Box plot of the Power column before any filtering is applied to it.
plt.boxplot(data["Power"])
plt.title("Power")

In [None]:
# Show per-column non-null counts for the rows whose Power exceeds 1500.
print(data.loc[data["Power"] > 1500].count())
# Discard rows with Power of 700 or more, then rows with Power of 20 or less.
# Rows are removed by index label, so rows with a missing Power are kept.
data = data.drop(index=data[data["Power"] >= 700].index)
data = data.drop(index=data[data["Power"] <= 20].index)

In [None]:
# Box plot of Price, drawn before the low-price rows are removed below.
plt.boxplot(data["Price"])
plt.title("Price")
# Show per-column non-null counts for rows priced under 150, then drop them
# by index label.
print(data.loc[data["Price"] < 150].count())
data = data.drop(index=data[data["Price"] < 150].index)

In [None]:
# Box plot of the Power column again, to view it after the rows filtered
# out in the preceding cells have been removed.
plt.boxplot(data["Power"])
plt.title("Power")

In [None]:
# Parse DateCreated into datetime values so the .dt accessor can be used on it.
data["DateCreated"] = pd.to_datetime(data["DateCreated"])

In [None]:
# Replace the DateCreated timestamp with two derived columns (year and month),
# appended at the end of the frame in that order.
data = data.assign(
    DateCreated_Year=data["DateCreated"].dt.year,
    DateCreated_Month=data["DateCreated"].dt.month,
).drop(columns="DateCreated")

In [None]:
# Inspect the distinct VehicleType values (including NaN, if present).
data["VehicleType"].unique()

In [None]:
# Inspect the distinct Gearbox values (including NaN, if present).
data["Gearbox"].unique()

In [None]:
# Inspect the distinct Model values (including NaN, if present).
data["Model"].unique()

In [None]:
# Inspect the distinct FuelType values (including NaN, if present).
data["FuelType"].unique()

In [None]:
# Fold the 'gasoline' label into 'petrol' so both are treated as one category.
# A single .loc call on the frame is used because chained assignment
# (data["FuelType"].loc[...] = ...) writes to an intermediate object: it raises
# a chained-assignment warning and has no effect under pandas Copy-on-Write.
data.loc[data["FuelType"] == 'gasoline', "FuelType"] = "petrol"

In [None]:
# Inspect the distinct Brand values (including NaN, if present).
data["Brand"].unique()

In [None]:
# Inspect the distinct Repaired values (including NaN, if present).
data["Repaired"].unique()

In [None]:
# Treat a missing Repaired value as its own explicit category, "unknown".
data["Repaired"] = data["Repaired"].fillna("unknown")

In [None]:
# Replace missing values in each categorical column with the explicit
# placeholder "unknown"; all other columns are left untouched.
data = data.fillna(
    {
        column: "unknown"
        for column in ("VehicleType", "Gearbox", "FuelType", "Brand", "Repaired", "Model")
    }
)

In [None]:
# Row count at this point, i.e. after the cleaning steps above.
after = len(data)

In [None]:
# Percentage of rows removed between recording `before` and `after`.
print(100 - after/before*100)

In [None]:
# Names of the categorical columns; these are the columns passed to the
# one-hot encoder further down.
categories_list = ["VehicleType", "Gearbox", "FuelType", "Brand", "Repaired", "Model"]

In [None]:
# Price is the regression target; the feature matrix is everything else minus
# the LastSeen, PostalCode and DateCrawled columns.
target = data["Price"]
features = data.drop(columns=["Price", "LastSeen", "PostalCode", "DateCrawled"])

In [None]:
# First split: hold out 33% of the rows from the training set.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.33,
    shuffle=True,
    random_state=12345,
)
# Second split: carve 33% of that hold-out into the validation set; the
# remaining 67% of the hold-out stays as the test set.
features_test, features_valid, target_test, target_valid = train_test_split(
    features_test,
    target_test,
    test_size=0.33,
    shuffle=True,
    random_state=12345,
)

In [None]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# split only; categories unseen at fit time are encoded as all zeros, and the
# first level of each feature is dropped.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version before upgrading.
ohe = OneHotEncoder(sparse=False, handle_unknown="ignore", drop='first') # addressed

X_train = ohe.fit_transform(features_train[categories_list])
# The encoded array has no row labels of its own. Reuse the index of
# features_train so that the index-based join below attaches every encoded row
# to the row it was computed from; with a default 0..n-1 index the join would
# pair rows by unrelated labels and leave NaN where no label matches.
ohe_train = pd.DataFrame(
    X_train,
    columns=ohe.get_feature_names_out(categories_list),
    index=features_train.index,
)

# Swap the raw categorical columns for their one-hot encoded counterparts.
features_train = features_train.drop(categories_list, axis=1)
features_train = features_train.join(ohe_train)

In [None]:
# Encode the test split with the already-fitted encoder (transform only).
# The names X_train / ohe_train are reused here as scratch variables and hold
# test-split data after this cell runs.
X_train = ohe.transform(features_test[categories_list])
# Carry over the index of features_test so the index-based join below pairs
# each encoded row with its source row instead of with an unrelated label.
ohe_train = pd.DataFrame(
    X_train,
    columns=ohe.get_feature_names_out(categories_list),
    index=features_test.index,
)

# Swap the raw categorical columns for their one-hot encoded counterparts.
features_test = features_test.drop(categories_list, axis=1)
features_test = features_test.join(ohe_train)

In [None]:
# Encode the validation split with the already-fitted encoder (transform only).
# The names X_train / ohe_train are reused here as scratch variables and hold
# validation-split data after this cell runs.
X_train = ohe.transform(features_valid[categories_list])
# Carry over the index of features_valid so the index-based join below pairs
# each encoded row with its source row instead of with an unrelated label.
ohe_train = pd.DataFrame(
    X_train,
    columns=ohe.get_feature_names_out(categories_list),
    index=features_valid.index,
)

# Swap the raw categorical columns for their one-hot encoded counterparts.
features_valid = features_valid.drop(categories_list, axis=1)
features_valid = features_valid.join(ohe_train)

In [None]:
# Replace any NaN left in the three feature matrices with 0 before modelling.
features_train = features_train.fillna(0)
features_test = features_test.fillna(0)
features_valid = features_valid.fillna(0)

In [None]:
# State for the CatBoost depth search that follows.
depth = [5, 10, 16]  # candidate tree depths
best_catboost_model = None  # fitted model with the lowest validation RMSE so far
best_rmse_catboost = 10 ** 10  # large sentinel so the first candidate always wins
best_depth = None  # depth of the best model
time_1 = None  # fit time of the best model, in milliseconds
pred_time = None  # validation-set prediction time of the best model, in milliseconds

In [None]:
# Try each candidate depth, fit CatBoost on the training split with early
# stopping against the validation split, and keep the model with the lowest
# validation RMSE together with its fit and prediction times.
for i in depth:
    model = CatBoostRegressor(iterations=2_000, random_seed=12345, depth=i, devices="0:1",
                               task_type="GPU", loss_function="RMSE", early_stopping_rounds=10, silent=True)
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    model.fit(features_train, target_train, use_best_model=True, eval_set=(features_valid, target_valid))
    end = time.perf_counter()
    pred_start = time.perf_counter()
    prediction = model.predict(features_valid)
    pred_end = time.perf_counter()
    # Arguments follow the documented (y_true, y_pred) order.
    rmse = mean_squared_error(target_valid, prediction, squared=False)

    if rmse < best_rmse_catboost:
        best_catboost_model = model
        best_rmse_catboost = rmse
        best_depth = i
        # Convert seconds to milliseconds.
        time_1 = (end - start) * 10**3
        pred_time = (pred_end - pred_start) * 10**3

In [None]:
# Report the best CatBoost model: RMSE on the test split, the chosen depth,
# and the fit / prediction times recorded during the search (milliseconds).
print(
    f"Catboost RMSE = {mean_squared_error(best_catboost_model.predict(features_test), target_test, squared=False):0.2f}\n"
    f"Best depth = {best_depth}\n"
    f"Time ms = {time_1:0.2f}\n"
    f"Prediction time = {pred_time:0.2f}"
)

In [None]:
# State for the RandomForest grid search that follows. Note that `depth`,
# `best_depth` and `pred_time` are the same names used by the CatBoost search
# and are reset here, so the CatBoost values are no longer available afterwards.
depth = [5, 10, 16]  # candidate max_depth values
best_rfc_model = None  # fitted model with the lowest validation RMSE so far
best_rmse_rfc = 10 ** 10  # large sentinel so the first candidate always wins
best_depth = None  # max_depth of the best model
ensembles = [25, 50, 75, 100]  # candidate n_estimators values
max_ensembles = None  # n_estimators of the best model
time_2 = None  # fit time of the best model, in milliseconds
pred_time = None  # validation-set prediction time of the best model, in milliseconds

In [None]:
# Grid search over (max_depth, n_estimators): fit a RandomForest on the
# training split for every combination and keep the model with the lowest
# validation RMSE together with its fit and prediction times.
for i in depth:
    for g in ensembles:
        model = RandomForestRegressor(random_state=12345, max_depth=i, n_estimators=g, n_jobs=4)
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock is adjusted.
        start = time.perf_counter()
        model.fit(features_train, target_train)
        end = time.perf_counter()
        pred_start = time.perf_counter()
        prediction = model.predict(features_valid)
        pred_end = time.perf_counter()
        # Arguments follow the documented (y_true, y_pred) order.
        rmse = mean_squared_error(target_valid, prediction, squared=False)

        if rmse < best_rmse_rfc:
            best_rfc_model = model
            best_rmse_rfc = rmse
            best_depth = i
            max_ensembles = g
            # Convert seconds to milliseconds.
            time_2 = (end - start) * 10**3
            pred_time = (pred_end - pred_start) * 10**3

In [None]:
# Report the best RandomForest model: RMSE on the test split, the chosen
# depth and ensemble size, and the fit / prediction times recorded during the
# search (milliseconds).
print(
    f"RandomForest RMSE = {mean_squared_error(best_rfc_model.predict(features_test), target_test, squared=False):0.2f}\n"
    f"Best depth = {best_depth}\n"
    f"Best ensembles = {max_ensembles}\n"
    f"Time ms = {time_2:0.2f}\n"
    f"Prediction time = {pred_time:0.2f}"
)