<a href="https://colab.research.google.com/github/Mihail-Chr/projects/blob/main/ML/car_cost/car_deep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install catboost
! pip install gensim

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost


In [None]:
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge, LinearRegression, SGDRegressor, PassiveAggressiveRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
import numpy as np

# Seed passed to the estimators below that accept one, so repeated runs match.
RANDOM_STATE = 42

def cv_model(clf, X, y):
    """Cross-validate a single regressor and summarise its RMSE.

    Estimators whose class name is ``SVR`` or ``Ridge`` are tuned with a
    3-fold ``GridSearchCV`` over a small fixed grid. Any other estimator is
    scored with a 3-fold ``cross_val_score`` and then fitted on all of
    ``X``/``y``. Both paths score with ``neg_root_mean_squared_error`` and
    negate the result, so the reported RMSE is positive.

    Args:
        clf: Estimator instance to evaluate.
        X: Feature matrix, handed to scikit-learn unchanged.
        y: Target values, handed to scikit-learn unchanged.

    Returns:
        dict with keys ``model`` (class name), ``best_params``, ``rmse``,
        ``fit_time`` and ``score_time``. The two timing entries are the mean
        of the search's ``mean_fit_time`` / ``mean_score_time`` over all
        candidates, or ``None`` when no grid search was run.
    """
    rmse_scoring = 'neg_root_mean_squared_error'
    model_name = type(clf).__name__

    # Search spaces exist only for the estimators listed here.
    search_spaces = {
        'SVR': {'C': np.arange(0.1, 1.2, 0.4)},
        'Ridge': {'alpha': np.arange(0.3, 1, 0.1)},
    }
    search_space = search_spaces.get(model_name, {})

    if not search_space:
        # Nothing to tune: plain cross-validation, then fit on the full data.
        fold_scores = cross_val_score(
            clf, X, y, cv=3, scoring=rmse_scoring, n_jobs=-1
        )
        clf.fit(X, y)
        chosen_params = {}
        rmse = -np.mean(fold_scores)
        mean_fit = None
        mean_score = None
    else:
        searcher = GridSearchCV(
            clf,
            param_grid=search_space,
            cv=3,
            scoring=rmse_scoring,
            n_jobs=-1,
            verbose=0,
        )
        searcher.fit(X, y)
        timings = searcher.cv_results_
        chosen_params = searcher.best_params_
        rmse = -searcher.best_score_
        mean_fit = np.mean(timings['mean_fit_time'])
        mean_score = np.mean(timings['mean_score_time'])

    print(f"\n{model_name}")
    print(f"Best params: {chosen_params}")
    print(f"Best RMSE: {rmse:.4f}")
    if mean_fit is not None:
        print(f"Avg fit time: {mean_fit:.3f} sec, avg predict time: {mean_score:.3f} sec")

    return {
        'model': model_name,
        'best_params': chosen_params,
        'rmse': rmse,
        'fit_time': mean_fit,
        'score_time': mean_score,
    }

def cross_validate_all_models(X, y):
    """Run ``cv_model`` on a fixed list of regressors and collect the results.

    Args:
        X: Feature matrix forwarded to ``cv_model``.
        y: Target values forwarded to ``cv_model``.

    Returns:
        list of the dicts returned by ``cv_model``, one per model, in the
        order the models are listed below.
    """
    results = []
    models = [
        DummyRegressor(),
        Ridge(max_iter=300, random_state=RANDOM_STATE),
        KNeighborsRegressor(n_neighbors=80, n_jobs=-1),
        LinearRegression(),
        RandomForestRegressor(n_estimators=50, warm_start=True, n_jobs=-1, random_state=RANDOM_STATE),
        SVR(max_iter=200),
        # Seeded like the other stochastic models so repeated runs are comparable.
        SGDRegressor(alpha=1e-4, n_iter_no_change=3, early_stopping=True,
                     random_state=RANDOM_STATE),
        PassiveAggressiveRegressor(random_state=RANDOM_STATE),
        CatBoostRegressor(iterations=1000, learning_rate=0.001, depth=6,
                          loss_function='RMSE', random_seed=RANDOM_STATE, early_stopping_rounds=400,
                          allow_writing_files=False, verbose=False),
        # LightGBM has its own metric names; 'neg_root_mean_squared_error' is a
        # scikit-learn scorer name and is not a valid LightGBM metric.
        LGBMRegressor(metric='rmse', random_state=RANDOM_STATE, verbose=0),
    ]

    for clf in models:
        print("=" * 80)
        res = cv_model(clf, X, y)
        results.append(res)
    return results

# --- Пример использования ---
# X, y — подготовленные признаки и целевая переменная (например, из подготовленного polars -> pandas -> sklearn pipeline)
# results = cross_validate_all_models(X, y)


In [None]:
import time
import polars as pl
import cupy as cp
import numba
from numba import njit, prange
from cuml.ensemble import RandomForestRegressor
from cuml.preprocessing import LabelEncoder
from cuml.model_selection import train_test_split
from cuml.metrics import mean_squared_error

# Data loading and preprocessing
def load_and_preprocess_data():
    """Load ``autos.csv`` with Polars and return an all-numeric DataFrame.

    Processing steps:
      1. numeric columns: nulls replaced by the column mean;
      2. string / categorical columns: nulls replaced by ``"unknown"`` and
         the values encoded as unsigned integer category codes;
      3. the original numeric columns standardised to zero mean and unit
         standard deviation (constant columns become all zeros).

    NOTE(review): step 3 standardises every numeric column, which includes
    the target if it is numeric. That matches the previous intent of this
    function; confirm it is what the caller wants.

    Returns:
        polars.DataFrame with the transformed columns.
    """
    # Polars already defaults to a comma separator and "." as decimal mark;
    # `sep` / `decimal` are pandas keyword names.
    df = pl.read_csv('autos.csv')

    # Plain Python on purpose: numba cannot compile code that manipulates a
    # Polars DataFrame, and Polars expressions already run in native code.
    def preprocess(df):
        schema = df.schema
        # Decide the column roles before encoding, so the integer category
        # codes produced below are not standardised as if they were numeric.
        num_cols = [name for name, dtype in schema.items() if dtype.is_numeric()]
        cat_cols = [
            name for name, dtype in schema.items()
            if dtype in (pl.Utf8, pl.Categorical)
        ]

        def standardise(name):
            column = pl.col(name)
            spread = column.std()
            # A zero (or undefined) spread would otherwise yield NaN/inf.
            return (
                pl.when(spread > 0)
                .then((column - column.mean()) / spread)
                .otherwise(0.0)
                .alias(name)
            )

        # Missing values
        if num_cols:
            df = df.with_columns(pl.col(num_cols).fill_null(strategy="mean"))

        # Categorical encoding
        if cat_cols:
            df = df.with_columns([
                pl.col(name)
                .cast(pl.Utf8)
                .fill_null("unknown")
                .cast(pl.Categorical)
                .to_physical()
                .alias(name)
                for name in cat_cols
            ])

        # Standardisation of the numeric columns
        if num_cols:
            df = df.with_columns([standardise(name) for name in num_cols])

        return df

    df = preprocess(df)
    return df

# Entry point
def main():
    """Train a cuML random forest on the preprocessed data and print timings.

    Loads the data through ``load_and_preprocess_data``, moves features and
    the ``Price`` target to the GPU as CuPy arrays, makes an 80/20 split,
    fits a 100-tree depth-10 forest and prints the total, training and
    prediction wall-clock times together with the test MSE.
    """
    start_time = time.time()

    # Load and preprocess
    df = load_and_preprocess_data()

    # Features / target (Polars spells this `drop`, not `drop_columns`)
    X = df.drop('Price')
    y = df['Price']

    # Move to the GPU
    X_cupy = cp.array(X.to_numpy())
    y_cupy = cp.array(y.to_numpy())

    # Train / test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_cupy, y_cupy, test_size=0.2, random_state=42
    )

    model = RandomForestRegressor(n_estimators=100, max_depth=10)

    # Training
    train_start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - train_start

    # Prediction
    predict_start = time.time()
    y_pred = model.predict(X_test)
    predict_time = time.time() - predict_start

    # Evaluation; float() so the value formats the same whether the metric
    # comes back as a Python float or as a 0-d device array.
    mse = float(mean_squared_error(y_test, y_pred))

    # Report
    total_time = time.time() - start_time
    print(f"Общее время выполнения: {total_time:.2f} секунд")
    print(f"Время обучения: {train_time:.2f} секунд")
    print(f"Время предсказания: {predict_time:.2f} секунд")
    print(f"MSE: {mse:.2f}")

if __name__ == "__main__":
    main()


In [1]:
!pip install polars cuml-cu11 cupy-cuda11x catboost cudf-cu11 numba
# lightgbm scikit-learn matplotlib seaborn

Collecting cuml-cu11
  Downloading cuml_cu11-25.6.0.tar.gz (2.5 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting cupy-cuda11x
  Downloading cupy_cuda11x-13.5.1-cp311-cp311-manylinux2014_x86_64.whl.metadata (2.4 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting cudf-cu11
  Downloading cudf_cu11-25.6.0.tar.gz (2.7 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting cuda-python<12.0a0,>=11.8.5 (from cuml-cu11)
  Downloading cuda_python-11.8.7-py3-none-any.whl.metadata (14 kB)
Collecting cuvs-cu11==25.6.* (from cuml-cu11)
  Downloading cuvs_cu11-25.6.1.tar.gz (1.0 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build whee

In [2]:
!pip install cudf-cu11 cuml-cu11 cugraph-cu11 --extra-index-url=https://pypi.nvidia.com
!pip install cupy-cuda11x

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cugraph-cu11
  Downloading https://pypi.nvidia.com/cugraph-cu11/cugraph_cu11-25.6.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
Collecting libcugraph-cu11==25.6.* (from cugraph-cu11)
  Downloading https://pypi.nvidia.com/libcugraph-cu11/libcugraph_cu11-25.6.0-py3-none-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (1029.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 GB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pylibcugraph-cu11==25.6.* (from cugraph-cu11)
  Downloading https://pypi.nvidia.com/pylibcugraph-cu11/pylibcugraph_cu11-25.6.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m90.4 MB/s[0m eta [36m0:00:00

In [3]:
# Quick check that PyTorch can see a CUDA device in this runtime.
import torch
print(torch.cuda.is_available())


True


In [1]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 597, done.[K
remote: Counting objects: 100% (163/163), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 597 (delta 128), reused 82 (delta 82), pack-reused 434 (from 3)[K
Receiving objects: 100% (597/597), 196.59 KiB | 448.00 KiB/s, done.
Resolving deltas: 100% (302/302), done.
Installing RAPIDS remaining 25.04 libraries
Using Python 3.11.13 environment at: /usr
Resolved 175 packages in 11.52s
Downloading cugraph-cu12 (3.0MiB)
Downloading rmm-cu12 (1.5MiB)
Downloading ucx-py-cu12 (2.2MiB)
Downloading libcuvs-cu12 (1.1GiB)
Downloading pylibcudf-cu12 (26.4MiB)
Downloading nvidia-nvcomp-cu12 (44.1MiB)
Downloading cudf-cu12 (1.7MiB)
Downloading libcudf-cu12 (538.8MiB)
Downloading libcugraph-cu12 (1.4GiB)
Downloading cuspatial-cu12 (4.1MiB)
Downloading cuml-cu12 (9.4MiB)
Downloading raft-dask-cu12 (274.9MiB)
Downloading pylibcugraph-cu12 (2.0MiB)
Downloading cuproj-cu12 (1.1MiB)
Downloading libkviki

In [5]:
import time
import polars as pl
import cudf
import cupy as cp
from numba import njit
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from cuml.linear_model import LinearRegression as cuLinearRegression, Ridge as cuRidge
from cuml.neighbors import KNeighborsRegressor as cuKNeighborsRegressor
from cuml.ensemble import RandomForestRegressor as cuRandomForestRegressor
from cuml.preprocessing import StandardScaler as cuStandardScaler
from catboost import CatBoostRegressor
import lightgbm as lgb
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Seed constant for this part of the notebook (replaces the earlier value 42).
RANDOM_STATE = 255

# Mount Google Drive at /content/drive; this import only exists on Colab.
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'cuda.bindings.runtime'

In [None]:
@njit
def transform_bool(data, col, true_val, false_val):
    """Compare ``data`` with ``true_val`` and cast the result to ``np.int32``.

    ``col`` and ``false_val`` are accepted but never read by the body.
    Presumably ``data`` is a NumPy array (the result must offer ``.astype``)
    - TODO confirm against callers.
    """
    is_true = data == true_val
    return is_true.astype(np.int32)

In [None]:
def preprocess_data(df):
    """Clean the raw listings table.

    Steps, in order:
      * fill nulls in ``VehicleType``, ``Gearbox``, ``Model``, ``FuelType``
        and ``Repaired`` with the string ``"unknown"``;
      * keep rows with ``RegistrationYear`` in [1950, 2023] and ``Price`` in
        [100, 150000] (both ranges inclusive);
      * turn ``Repaired`` into an Int32 flag: 1 for ``"yes"``, 0 for anything
        else (including ``"unknown"``);
      * drop ``DateCrawled``, ``DateCreated``, ``LastSeen``,
        ``NumberOfPictures`` and ``PostalCode``.

    Args:
        df: polars.DataFrame containing the columns named above.

    Returns:
        polars.DataFrame with the cleaned rows and remaining columns.
    """
    # Missing categorical values
    nullable_cols = ["VehicleType", "Gearbox", "Model", "FuelType", "Repaired"]
    df = df.with_columns([
        pl.col(name).fill_null("unknown") for name in nullable_cols
    ])

    # Registration-year and price bounds (is_between is inclusive on both ends)
    df = df.filter(
        pl.col("RegistrationYear").is_between(1950, 2023)
        & pl.col("Price").is_between(100, 150000)
    )

    # Boolean flag. Done as a native Polars expression: a numba-compiled
    # helper cannot take a lazy `pl.col(...)` expression as input.
    df = df.with_columns([
        (pl.col("Repaired") == "yes").cast(pl.Int32).alias("Repaired")
    ])

    # Columns that are not used as features
    df = df.drop(["DateCrawled", "DateCreated", "LastSeen", "NumberOfPictures", "PostalCode"])

    return df

In [None]:
# Read the CSV stored on the mounted Google Drive into a Polars DataFrame.
df = pl.read_csv('/content/drive/MyDrive/car_data.csv')

In [None]:
# phik is not imported anywhere else in the visible notebook; importing the
# package is what attaches `phik_matrix` to pandas DataFrames.
import phik  # noqa: F401
from phik.report import plot_correlation_matrix

df = preprocess_data(df)

cat_col = ['Brand','Model','FuelType','VehicleType','Gearbox','RegistrationMonth','Repaired']
num_col = ['Price','RegistrationYear','Power','Kilometer']

col_auto_df = ['Price','Brand','Model','RegistrationYear','RegistrationMonth',
           'FuelType','VehicleType','Gearbox','Power','Kilometer','Repaired']

interval_cols = num_col
# phik, seaborn and the `.astype('category')` call below need a pandas frame;
# `df` itself stays a Polars DataFrame.
avto_df = df.to_pandas()
avto_df[cat_col] = avto_df[cat_col].astype('category')

# phik correlation matrix
display ('PHIK матрица ')

phik_overview = avto_df.phik_matrix(interval_cols=interval_cols)
plot_correlation_matrix(
    phik_overview.values,
    x_labels=phik_overview.columns,
    y_labels=phik_overview.index,
    vmin=0, vmax=1, color_map='Greens',
    title=r'correlation $\phi_K$',
    fontsize_factor=1.3,
    figsize=(10,9))

# Count plots for the low-cardinality categorical columns only
for i in cat_col:
    if avto_df[i].nunique() < 20:
        sns.countplot(y=i, data=avto_df)
        plt.show()

# Box plot above a histogram, sharing the x axis, for every numeric column
for n in num_col:
    display(n)
    fig, (ax_box, ax_hist) = plt.subplots(
        2, sharex=True, gridspec_kw={'height_ratios': (.20, .80)}
    )
    sns.boxplot(x=avto_df[n], ax=ax_box)
    # Draw on the histogram axes explicitly instead of the implicit current axes.
    ax_hist.hist(avto_df[n], bins=100)
    ax_box.set(xlabel='')
    ax_hist.set(xlabel=n)
    ax_hist.set(ylabel='count')
    plt.show()

In [None]:
# Split into features and target. Everything downstream (the scikit-learn
# ColumnTransformer addressed by column name, cudf.DataFrame.from_pandas,
# `.values`) expects pandas objects, so leave Polars here.
data_pd = df.to_pandas()
X = data_pd.drop(columns='Price')
y = data_pd['Price']

# Train / test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# CPU models keep using the pandas objects above; GPU (cuML) models get cuDF
# copies. The index is reset so the feature frames and the target series,
# which is built from raw values, share the same default index.
X_train_cudf = cudf.DataFrame.from_pandas(X_train).reset_index(drop=True)
X_test_cudf = cudf.DataFrame.from_pandas(X_test).reset_index(drop=True)
y_train_cudf = cudf.Series(y_train.values)
y_test_cudf = cudf.Series(y_test.values)

In [None]:
# Numeric and categorical feature columns
numerical_cols = ["RegistrationYear", "Power", "Kilometer", "RegistrationMonth"]
categorical_cols = ["VehicleType", "Gearbox", "Model", "FuelType", "Brand", "Repaired"]

# CPU preprocessor: scale the numeric columns, one-hot encode the rest
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='drop'
)

# CPU features
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# GPU scaling
scaler = cuStandardScaler()
X_train_cudf[numerical_cols] = scaler.fit_transform(X_train_cudf[numerical_cols])
X_test_cudf[numerical_cols] = scaler.transform(X_test_cudf[numerical_cols])

# GPU one-hot encoding. cudf.get_dummies replaces the listed columns with
# indicator columns in a single call.
X_train_cudf = cudf.get_dummies(X_train_cudf, columns=categorical_cols, dtype='float32')
X_test_cudf = cudf.get_dummies(X_test_cudf, columns=categorical_cols, dtype='float32')

# Train and test are encoded separately, so force the test frame onto the
# training layout: categories missing from the test split become all-zero
# columns, and categories seen only in the test split are dropped (the same
# effect as handle_unknown='ignore' on the CPU side).
train_columns = list(X_train_cudf.columns)
test_columns = set(X_test_cudf.columns)
for name in train_columns:
    if name not in test_columns:
        X_test_cudf[name] = 0.0
X_test_cudf = X_test_cudf[train_columns].astype('float32')
X_train_cudf = X_train_cudf.astype('float32')

In [None]:
# Models fitted with scikit-learn on the CPU. Every estimator that accepts a
# seed gets RANDOM_STATE so that reruns of the comparison are repeatable.
models_cpu = {
    "Dummy": DummyRegressor(),
    "SVR": SVR(),
    "PassiveAggressive": PassiveAggressiveRegressor(random_state=RANDOM_STATE),
    "SGD": SGDRegressor(random_state=RANDOM_STATE),
}
# Models run on the GPU through cuML
models_gpu_ml = {
    "cuLinearRegression": cuLinearRegression(),
    "cuRidge": cuRidge(),
    "cuKNeighbors": cuKNeighborsRegressor(),
    "cuRandomForest": cuRandomForestRegressor(random_state=RANDOM_STATE),
}
# Models that use the GPU through their own implementations (CatBoost, LGBM)
models_gpu_other = {
    "CatBoost": CatBoostRegressor(verbose=0, allow_writing_files=False, task_type='GPU',
                                  random_seed=RANDOM_STATE),
    "LGBM": LGBMRegressor(device='gpu', random_state=RANDOM_STATE),
}

In [None]:
 results = []
  # 1. Обучение моделей на CPU (используем X_train, y_train в формате pandas)
 for name, model in models_cpu.items():
        start = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start
        start = time.time()
        y_pred = model.predict(X_test)
        predict_time = time.time() - start
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        results.append({
            'model': name,
            'train_time': train_time,
            'predict_time': predict_time,
            'rmse': rmse,
            'mae': mae,
            'type': 'CPU'
        })

# 2. Обучение моделей на GPU через cuML (используем X_train_cudf, y_train_cudf)
for name, model in models_gpu_ml.items():
     start = time.time()
     model.fit(X_train_cudf, y_train_cudf)
     train_time = time.time() - start
     start = time.time()
     y_pred = model.predict(X_test_cudf)
     predict_time = time.time() - start
     # Переведем предсказания в numpy для метрик
     y_pred_np = y_pred.to_array()  # для cuML
     rmse = np.sqrt(mean_squared_error(y_test, y_pred_np))
     mae = mean_absolute_error(y_test, y_pred_np)
     results.append({
         'model': name,
         'train_time': train_time,
         'predict_time': predict_time,
         'rmse': rmse,
         'mae': mae,
         'type': 'GPU (cuML)'
     })
# 3. Обучение CatBoost и LGBM на GPU (но данные в pandas)
for name, model in models_gpu_other.items():
     start = time.time()
     model.fit(X_train, y_train)
     train_time = time.time() - start
     start = time.time()
     y_pred = model.predict(X_test)
     predict_time = time.time() - start
     rmse = np.sqrt(mean_squared_error(y_test, y_pred))
     mae = mean_absolute_error(y_test, y_pred)
     results.append({
         'model': name,
         'train_time': train_time,
         'predict_time': predict_time,
         'rmse': rmse,
         'mae': mae,
         'type': 'GPU (other)'
     })
# Соберем результаты в DataFrame
results_df = pl.DataFrame(results)

In [None]:
def train_evaluate_models(models, X_train, y_train, X_test, y_test, device):
    """Fit every model in ``models`` and measure its speed and accuracy.

    Args:
        models: dict mapping a display name to an unfitted estimator.
        X_train, y_train: training features and target.
        X_test, y_test: held-out features and target; may live on the host
            or on the GPU.
        device: label stored in each result row under the ``device`` key.

    Returns:
        list of dicts, one per model, with keys ``model``, ``device``,
        ``train_time``, ``predict_time``, ``rmse`` and ``mae``.
    """
    def to_host(values):
        # pandas/cuDF containers expose to_numpy(); CuPy arrays expose get().
        if hasattr(values, "to_numpy"):
            return values.to_numpy()
        if hasattr(values, "get"):
            return values.get()
        return np.asarray(values)

    y_true = to_host(y_test)
    rows = []
    for name, model in models.items():
        start = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start

        start = time.time()
        raw_pred = model.predict(X_test)
        predict_time = time.time() - start

        # Metrics are computed on host arrays, outside the timed section.
        y_hat = to_host(raw_pred)
        rows.append({
            'model': name,
            'device': device,
            'train_time': train_time,
            'predict_time': predict_time,
            'rmse': float(np.sqrt(mean_squared_error(y_true, y_hat))),
            'mae': float(mean_absolute_error(y_true, y_hat)),
        })
    return rows


results = []

# CPU models
cpu_results = train_evaluate_models(
    models_cpu, X_train_processed, y_train, X_test_processed, y_test, 'CPU'
)
results.extend(cpu_results)

# cuML GPU models
gpu_ml_results = train_evaluate_models(
    models_gpu_ml, X_train_cudf, y_train_cudf, X_test_cudf, y_test_cudf, 'GPU (cuML)'
)
results.extend(gpu_ml_results)

# Other GPU models
gpu_other_results = train_evaluate_models(
    models_gpu_other, X_train_processed, y_train, X_test_processed, y_test, 'GPU (other)'
)
results.extend(gpu_other_results)

# Build the results DataFrame
results_df = pl.DataFrame(results)

In [None]:
# Plot the comparison; seaborn needs pandas, so convert once and reuse.
results_pd = results_df.to_pandas()

plt.figure(figsize=(14, 8))
sns.barplot(x='rmse', y='model', data=results_pd, hue='device')
plt.title('RMSE by Model and Device')
plt.xlabel('RMSE')
plt.ylabel('Model')
plt.show()

plt.figure(figsize=(14, 8))
sns.barplot(x='train_time', y='model', data=results_pd, hue='device')
plt.title('Training Time by Model and Device')
plt.xlabel('Training Time (s)')
plt.ylabel('Model')
plt.xscale('log')
plt.show()

# Pick the model with the lowest RMSE. `named=True` makes Polars return a
# dict; the default return value is a tuple, which cannot be indexed by
# column name.
best_model_info = results_df.sort('rmse').row(0, named=True)
best_model_name = best_model_info['model']
best_model_type = best_model_info['device']

print(f"Best model: {best_model_name} ({best_model_type})")
print(f"RMSE: {best_model_info['rmse']:.2f}")
print(f"MAE: {best_model_info['mae']:.2f}")
print(f"Training time: {best_model_info['train_time']:.2f}s")

In [None]:
 Оценка важности признаков для лучшей модели
if best_model_name == "cuRandomForest":
    model = models_gpu_ml["cuRandomForest"]
    importances = model.feature_importances_
    features = X_train_cudf.columns.to_arrow().to_pylist()
elif best_model_name == "RandomForest":
    model = models_cpu["RandomForest"]
    importances = model.feature_importances_
    features = preprocessor.get_feature_names_out()
elif best_model_name == "CatBoost":
    model = models_gpu_other["CatBoost"]
    importances = model.get_feature_importance()
    features = preprocessor.get_feature_names_out()
else:
    print("Feature importance not available for this model")
    importances = None

if importances is not None:
    # Сортировка важностей
    sorted_idx = np.argsort(importances)[::-1][:20]
    sorted_features = [features[i] for i in sorted_idx]
    sorted_importances = importances[sorted_idx]

    # Визуализация
    plt.figure(figsize=(12, 8))
    sns.barplot(x=sorted_importances, y=sorted_features)
    plt.title(f'Top 20 Feature Importances ({best_model_name})')
    plt.xlabel('Importance')
    plt.ylabel('Features')
    plt.show()

In [None]:
# Predictions of the best model. The model is looked up by dictionary
# membership rather than by a "cu" substring in its name, so a renamed or
# newly added model cannot be routed to the wrong feature set.
if best_model_name in models_gpu_ml:
    model = models_gpu_ml[best_model_name]
    # cuDF results are copied to the host with to_numpy().
    y_pred = model.predict(X_test_cudf).to_numpy()
elif best_model_name in models_gpu_other:
    model = models_gpu_other[best_model_name]
    y_pred = model.predict(X_test_processed)
else:
    model = models_cpu[best_model_name]
    y_pred = model.predict(X_test_processed)

# Actual vs predicted values; the dashed red diagonal marks a perfect prediction
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.3)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title(f'Actual vs Predicted Prices ({best_model_name})')
plt.show()


In [None]:
# Histogram (with a KDE curve) of the residuals: actual minus predicted
errors = y_test - y_pred
plt.figure(figsize=(10, 6))
sns.histplot(errors, kde=True).set(
    xlabel='Prediction Error',
    title='Distribution of Prediction Errors',
)
plt.show()