In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from typing import List, Tuple

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import optuna
from sklearn.ensemble import VotingClassifier

In [2]:
def check_missing_values(df):
    # Вычисляем количество пропущенных значений для каждого столбца
    missing_values = df.isnull().sum()

    # Вычисляем процент пропущенных значений для каждого столбца
    total_values = len(df)
    missing_percent = ((missing_values / total_values) * 100).round(2)

    # Для каждого столбца вычисляем количество строк с пропущенными значениями
    missing_rows = df.isnull().sum(axis=0) 

    # Создаем датафрейм с результатами
    missing_data = pd.DataFrame({
        'Столбец': missing_values.index, 
        'Пропущено в %': missing_percent.values, 
        'Строк пропущено': missing_rows.values  # Используем подсчитанные значения
    })

    # Сортируем датафрейм по убыванию процента пропущенных значений
    missing_data = missing_data.sort_values(by='Пропущено в %', ascending=False)

    return missing_data

In [3]:
car_train = pd.read_csv('https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/car_train.csv')
print(f'размер: {car_train.shape}')
car_train.sample(5)

размер: (2337, 10)


Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class
1263,S45472418H,VW Tiguan,economy,petrol,3.64,2017,133098,2017,32.26,wheel_shake
667,U57863590w,Renault Sandero,standart,petrol,3.52,2015,85666,2016,44.57,gear_stick
561,i93529341g,VW Polo,economy,petrol,3.62,2014,55979,2020,59.83,engine_check
1049,m-1183503n,Smart ForFour,economy,petrol,4.34,2015,83508,2019,47.66,engine_check
102,C38293729H,Smart ForTwo,economy,petrol,4.72,2016,111683,2016,43.06,engine_fuel


In [4]:
rides_info = pd.read_csv('https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/rides_info.csv')
print(f'размер: {rides_info.shape}')
rides_info.sample(5)

размер: (739500, 14)


Unnamed: 0,user_id,car_id,ride_id,ride_date,rating,ride_duration,ride_cost,speed_avg,speed_max,stop_times,distance,refueling,user_ride_quality,deviation_normal
66091,e89938736n,E15850759P,O1w,2020-03-17,1.16,25,270,46,140.0,1,776.9975,0,13.323046,-29.12
318847,M11342554P,W16150857p,S1d,2020-02-11,3.05,43,339,40,82.0,3,92.53413,0,9.924616,12.87
55553,g67369412k,D44517833b,W1I,2020-01-26,3.76,115,1260,34,49.0,1,1673.296,0,-5.749485,27.646
679587,G18233768V,v53440686O,r1C,2020-03-02,2.05,14874,178482,78,104.81308,0,1170087.0,0,-2.19574,-5.779
245190,K13410024u,R-1701492N,P1i,2020-01-13,5.52,84,668,34,105.167259,0,591.7658,0,-2.822354,-0.0


In [5]:
driver_info = pd.read_csv('https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/driver_info.csv')
print(f'размер: {driver_info.shape}')
driver_info.sample(5)

размер: (15153, 7)


Unnamed: 0,age,user_rating,user_rides,user_time_accident,user_id,sex,first_ride_date
6339,38,7.2,472,5.0,n10467710X,0,2020-6-15
10110,49,8.3,537,13.0,L99796268U,0,2020-6-30
14728,41,7.7,859,9.0,k77432002o,0,2019-3-30
11375,27,9.6,784,16.0,y16371035s,1,2020-9-7
10859,35,9.3,78,19.0,i34450975W,1,2019-12-7


In [6]:
fix_info = pd.read_csv('https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/fix_info.csv')
print(f'размер: {fix_info.shape}')
fix_info.sample(5)

размер: (146000, 6)


Unnamed: 0,car_id,worker_id,fix_date,work_type,destroy_degree,work_duration
125580,o-2117199c,US,2020-10-30 0:26,reparking,1.0,21
46133,l79825102V,JF,2020-1-29 18:26,reparking,1.0,28
65631,j19148530u,DS,2020-7-11 10:14,repair,5.0,18
56569,L-1227525r,GV,2020-11-19 17:28,reparking,1.0,29
81948,y-1806571O,VS,2020-3-1 6:19,repair,10.0,65


In [7]:
# Группировка данных по каждой машине в датасете rides_info
grouped_rides = rides_info.groupby('car_id').agg(
    min_rating=('rating', 'min'),
    avg_rating=('rating', 'mean'),
    total_distance=('distance', 'sum'),
    max_speed=('speed_max', 'max'),
    total_rides=('ride_id', 'count')
).reset_index()


grouped_rides.head(1)

Unnamed: 0,car_id,min_rating,avg_rating,total_distance,max_speed,total_rides
0,A-1049127W,0.1,4.255172,11257530.0,179.729652,174


In [8]:
# Добавление трех дополнительных признаков:
# 1. Частота остановок на 100 км
grouped_rides['stops_per_100km'] = rides_info.groupby('car_id').apply(lambda x: (x['stop_times'].sum() / x['distance'].sum()) * 100).reset_index(level=0, drop=True)

# 2. Процент поездок с превышением средней скорости выше 45 км/ч
grouped_rides['percentage_speed_over_45'] = rides_info.groupby('car_id').apply(lambda x: ((x['speed_avg'] > 45).mean()) * 100).reset_index(level=0, drop=True)

# 3. Среднее отклонение скорости от средней по автопарку
grouped_rides['speed_deviation_from_avg'] = rides_info.groupby('car_id').apply(lambda x: abs(x['speed_avg'] - rides_info['speed_avg'].mean()).mean()).reset_index(level=0, drop=True)

In [9]:
# Объединение grouped_rides с car_train
full_car_info = pd.merge(car_train, grouped_rides, on='car_id', how='left')

full_car_info.head(1)

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class,min_rating,avg_rating,total_distance,max_speed,total_rides,stops_per_100km,percentage_speed_over_45,speed_deviation_from_avg
0,y13744087j,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,108.53,another_bug,0.1,4.737759,12141310.0,180.855726,174,0.005189,45.402299,8.088746


In [10]:
# Агрегация данных из driver_info
aggregated_driver_info = driver_info.groupby('user_id').agg(
    avg_age=('age', 'mean'),
    avg_user_rating=('user_rating', 'mean'),
    user_time_accidents_total=('user_time_accident', 'sum')
).reset_index()

# Агрегация данных из fix_info
aggregated_fix_info = fix_info.groupby('car_id').agg(
    last_fix_date=('fix_date', 'max'),
    total_destroy_degree=('destroy_degree', 'sum'),
    fix_count=('car_id', 'count')
).reset_index()

# Добавляем информацию из driver_info к full_car_info через промежуточное объединение с rides_info для получения user_id
rides_with_user_info = pd.merge(rides_info[['car_id', 'user_id']].drop_duplicates(), aggregated_driver_info, on='user_id', how='left')

# Объединяем full_car_info с информацией о водителях и ремонтах
full_car_info_with_drivers = pd.merge(full_car_info, rides_with_user_info, on='car_id', how='left')
final_df = pd.merge(full_car_info_with_drivers, aggregated_fix_info, on='car_id', how='left')

print(f"Размер итогового датасета: {final_df.shape}")
final_df.head(1)

Размер итогового датасета: (401086, 25)


Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class,...,stops_per_100km,percentage_speed_over_45,speed_deviation_from_avg,user_id,avg_age,avg_user_rating,user_time_accidents_total,last_fix_date,total_destroy_degree,fix_count
0,y13744087j,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,108.53,another_bug,...,0.005189,45.402299,8.088746,n14703870u,38.0,7.4,2.0,2021-12-1 11:33,106.7,35


In [11]:
final_df = final_df.drop_duplicates(subset=['car_id'])

In [12]:
## Определение столбцов с константными значениями
constant_columns = [col for col in final_df.columns if final_df[col].nunique() == 1]

# Определение столбцов с уникальными значениями (исключая NaN)
unique_value_columns = [col for col in final_df.columns if final_df[col].nunique() == final_df.shape[0]]

# Уникальные значения
id_columns = ['user_id', 'car_id']

# Объединение списков столбцов для удаления
columns_to_drop = list(set(constant_columns + unique_value_columns + id_columns))

# Удаление столбцов из final_df
final_df_cleaned = final_df.drop(columns=columns_to_drop)

print(f"Удалены столбцы: {columns_to_drop}")

pd.set_option('display.max_columns', None)
final_df_cleaned.head(1)

Удалены столбцы: ['total_distance', 'car_id', 'user_id', 'stops_per_100km', 'total_rides']


Unnamed: 0,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class,min_rating,avg_rating,max_speed,percentage_speed_over_45,speed_deviation_from_avg,avg_age,avg_user_rating,user_time_accidents_total,last_fix_date,total_destroy_degree,fix_count
0,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,108.53,another_bug,0.1,4.737759,180.855726,45.402299,8.088746,38.0,7.4,2.0,2021-12-1 11:33,106.7,35


In [13]:
features2drop = ['year_to_start', 'riders', 'year_to_work', 'avg_age', 'last_fix_date','fix_count'] # то, что надо выбросить
targets = ['target_class', 'target_reg'] # таргеты 
cat_features = ['model', 'car_type', 'fuel_type', 'car_rating'] 

num_features = ['avg_user_rating', 'max_speed', 'min_rating','total_destroy_degree',
               'avg_rating', 'user_time_accidents_total', 'speed_deviation_from_avg',
               'percentage_speed_over_45'] # числовые признаки

# Отбираем итоговый набор признаков для использования моделью
filtered_features = [i for i in final_df_cleaned.columns if (i not in targets and i not in features2drop)]
num_features = [i for i in filtered_features if i not in cat_features]
print("cat_features", cat_features)
print("num_features", num_features)
print("targets", targets)

cat_features ['model', 'car_type', 'fuel_type', 'car_rating']
num_features ['min_rating', 'avg_rating', 'max_speed', 'percentage_speed_over_45', 'speed_deviation_from_avg', 'avg_user_rating', 'user_time_accidents_total', 'total_destroy_degree']
targets ['target_class', 'target_reg']


In [14]:
# Объединение категориальных и числовых признаков для создания списка всех признаков X
all_features_X = cat_features + num_features

# Выборка признаков для X из final_df_cleaned
X = final_df_cleaned[all_features_X]

# Выборка целевой переменной 'target_class' для y
y = final_df_cleaned['target_class']

In [15]:
def train_models(X: pd.DataFrame, y: pd.Series, cat_features: List[str]) -> pd.DataFrame:
    # Преобразование категориальных признаков в строки
    X[cat_features] = X[cat_features].astype(str)
    
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    
    # Разбиение данных на обучающую и валидационную выборки
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Создание моделей с базовыми параметрами
    models = {
        "CatBoostClassifier": CatBoostClassifier(loss_function='MultiClass', cat_features=cat_features, verbose=False),
        "LGBMClassifier": LGBMClassifier(boosting_type='goss', objective='multiclass'),
        "XGBClassifier": XGBClassifier(booster='dart', objective='multi:softprob', enable_categorical=True)
    }
    
    results = []

    # Обучение каждой модели и оценка ее точности на валидационной выборке
    for name, model in models.items():
        if name == "LGBMClassifier":
            # Для LGBM преобразовываем категориальные признаки в 'category' после разделения на выборки
            X_train[cat_features] = X_train[cat_features].apply(lambda x: x.astype('category'))
            X_val[cat_features] = X_val[cat_features].apply(lambda x: x.astype('category'))
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        results.append({"Model": name, "Accuracy": accuracy})
    
    # Возвращение результатов в виде DataFrame
    return pd.DataFrame(results)

# Замените X и y на ваши реальные данные и cat_features на список ваших категориальных признаков
results_df = train_models(X, y, cat_features)
results_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cat_features] = X[cat_features].astype(str)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000243 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1434
[LightGBM] [Info] Number of data points in the train set: 1869, number of used features: 12
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score -2.134996
[LightGBM] [Info] Start training from score -2.167183
[LightGBM] [Info] Start training from score -2.244892
[LightGBM] [Info] Start training from score -2.134996
[LightGBM] [Info] Start training from score -2.215039
[LightGBM] [Info] Start training from score -2.181301
[LightGBM] [Info] Start training from score -2.048362
[LightGBM] [Info] Start training from score -2.082120
[LightGBM] [Info] Start training from score -2.696877












Unnamed: 0,Model,Accuracy
0,CatBoostClassifier,0.878205
1,LGBMClassifier,0.865385
2,XGBClassifier,0.882479


In [16]:
def preprocess_data(X, y, cat_features):
    # Преобразование категориальных признаков из типа object в category
    for feature in cat_features:
        X[feature] = X[feature].astype('category')
    
    # Кодирование целевой переменной
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    
    # Разбиение данных на обучающую и валидационную выборки
    X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
    
    return X_train, X_val, y_train, y_val

def objective_catboost(trial):
    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-8, 10.0),
        'task_type': 'GPU',
        'loss_function': 'MultiClass'
    }
    model = CatBoostClassifier(**param, cat_features=cat_features, verbose=False)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=False)
    preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, preds)
    return accuracy

def objective_lgbm(trial):
    param = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': len(np.unique(y)),
        'boosting_type': 'goss',
        'num_leaves': trial.suggest_int('num_leaves', 31, 256),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0),
    }
    model = LGBMClassifier(**param)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, preds)
    return accuracy

def objective_xgb(trial):
    param = {
        'booster': 'dart',
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 10.0),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'eta': trial.suggest_loguniform('eta', 1e-8, 1.0),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'tree_method': 'gpu_hist',
        'enable_categorical': True,
        'n_estimators': 1000
    }
    model = XGBClassifier(**param)
    model.fit(X_train, y_train, early_stopping_rounds=50, eval_set=[(X_val, y_val)], verbose=False)
    preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, preds)
    return accuracy

In [None]:
# Определите X_train, X_val, y_train, y_val с использованием вашей функции preprocess_data
X_train, X_val, y_train, y_val = preprocess_data(X, y, cat_features)

# Создание и оптимизация моделей
study_cat = optuna.create_study(direction='maximize')
study_cat.optimize(objective_catboost, n_trials=15)

study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, n_trials=15)

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[feature] = X[feature].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[feature] = X[feature].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[feature] = X[feature].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

[I 2024-04-01 13:10:24,869] Trial 7 finished with value: 0.811965811965812 and parameters: {'iterations': 644, 'depth': 7, 'learning_rate': 0.14870790126284367, 'l2_leaf_reg': 8.329298932520695e-08}. Best is trial 3 with value: 0.8952991452991453.
  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-8, 10.0),
[I 2024-04-01 13:10:26,390] Trial 8 finished with value: 0.8760683760683761 and parameters: {'iterations': 188, 'depth': 5, 'learning_rate': 0.20629366621309125, 'l2_leaf_reg': 0.0015664745993884638}. Best is trial 3 with value: 0.8952991452991453.
  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-8, 10.0),
[I 2024-04-01 13:10:28,037] Trial 9 finished with value: 0.8867521367521367 and parameters: {'iterations': 925, 'depth': 4, 'learning_rate': 0.19082054271218646, 'l2_leaf_reg': 0.0001761941247845925}. Best is trial 3 with v

In [None]:
# Создание моделей с лучшими параметрами
model_cat = CatBoostClassifier(**study_cat.best_params, cat_features=cat_features, verbose=False)
model_lgbm = LGBMClassifier(**study_lgbm.best_params)
model_xgb = XGBClassifier(**study_xgb.best_params)

# Добавление моделей в список для ансамблирования
models = [
    ('catboost', model_cat),
    ('lgbm', model_lgbm),
    ('xgb', model_xgb)
]

# Создание и обучение ансамбля
voting_clf = VotingClassifier(estimators=models, voting='hard')
voting_clf.fit(X_train, y_train)

# Оценка ансамбля
y_pred = voting_clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy of the voting ensemble: {accuracy}")