In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, Pool
from math import sqrt
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoCV

In [2]:
train = pd.read_csv('https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/quickstart_train.csv')

In [11]:
cat_features = ['model', 'car_type', 'fuel_type']
targets = ['target_reg']
features2drop = ['car_id', 'target_class'] 

# for c in cat_features:
#     train[c] = train[c].astype(str)

filtered_features = [i for i in train.columns if (i not in targets and i not in features2drop and i not in cat_features)]

In [12]:
n_splits = 3

X = train[filtered_features].drop(targets, axis=1, errors='ignore')
y = train['target_reg']

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

In [104]:
def GetPreds(model, X, y, n_fold=5):
    folds = KFold(n_splits=n_fold)
    preds = np.empty((0,1), float)

    for train_indices, val_indices in folds.split(X, y):
        X_train, X_val = X.loc[train_indices], X.loc[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]
        
        model.fit(X_train, y_train)
        model_preds = model.predict(X_val).reshape(-1, 1)
        
        preds = np.concatenate((preds, model_preds))

    return preds.reshape(-1, 1)

In [118]:
def GetPreds(model, X, y, X_test, n_fold=5):
    """
    model - имеет методы .fit(X, y) и .predict(X)
    X - pd.DataFrame
    y - np.array
    X_test - pd.DataFrame, для которого нужно сделать предсказания
    """
    folds = KFold(n_splits=n_fold)
    preds = np.empty((0,1),float)

    for train_indices, val_indices in folds.split(X, y):
        X_train, X_val = X.loc[train_indices], X.loc[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]
        model.fit(X_train, y_train)
        preds = np.append(preds, model.predict(X_val))
    
    model.fit(X, y)
    
    test_preds = model.predict(X_test)
    
    return preds.reshape(len(X), 1), test_preds.reshape(len(X_test), 1)

In [119]:
print(GetPreds(LinearRegression(), X, y, X_test))

(array([[57.61582641],
       [49.66752675],
       [26.08321644],
       ...,
       [27.89215704],
       [59.73402582],
       [47.07680995]]), array([[49.34528318],
       [44.10805364],
       [41.51927392],
       ...,
       [31.29720144],
       [41.61108103],
       [43.05477689]]))


In [1691]:
# Убираем шумные записи для всех моделей

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
y_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

source_model = CatBoostRegressor(random_seed=42, thread_count=-1,
                          cat_features=cat_features, eval_metric='RMSE')

source_model.fit(
    train_pool,
    eval_set=eval_pool,
    verbose=0,
    use_best_model=True,
    plot=False,
    early_stopping_rounds=100)

# Используем object importance и удаляем шумные записи
idxs, scores = source_model.get_object_importance(eval_pool, train_pool, 
                                           importance_values_sign="Positive", 
                                           thread_count=-1)
X = X.reset_index(drop=True).drop(idxs[:250])
y = y.reset_index(drop=True).drop(idxs[:250])

In [1692]:
# Обучаем CatBoost модели

catboost_models = []
catboost_scores = []

for train_index, test_index in kf.split(X):
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    eval_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    
    model = CatBoostRegressor(random_seed=42, thread_count=-1,
                              cat_features=cat_features, eval_metric='RMSE')

    catboost_models.append(model)

    model.fit(
        train_pool,
        eval_set=eval_pool,
        verbose=0,
        use_best_model=True,
        plot=False,
        early_stopping_rounds=100)

    catboost_scores.append(np.mean([v for k, v in model.best_score_["validation"].items() if 'RMSE' in k], dtype="float16"))
    print(model.best_score_["validation"]['RMSE'])

11.875335706574408
11.166326420800113
11.699860019558951


In [1693]:
# Смотрим скоры и сохраняем лучшую модель в переменную best_catboost

best_catboost = catboost_models[catboost_scores.index(min(catboost_scores))]
print(catboost_scores)

[11.875, 11.164, 11.7]


In [1694]:
# Обучаем LightGBM модели

lgb_models = []
lgb_scores = []

X_copy = X.copy()

le = LabelEncoder()

for col in cat_features:
    X_copy[col] = le.fit_transform(X_copy[col])

for train_index, test_index in kf.split(X_copy):
    
    X_train, X_test = X_copy.iloc[train_index], X_copy.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    train_data = lgb.Dataset(
        X_train,
        y_train,
        categorical_feature=cat_features,
        free_raw_data=False)

    val_data = lgb.Dataset(
        X_test,
        y_test,
        categorical_feature=cat_features,
        free_raw_data=False)

    params = {
        'metric': 'rmse',
        'bagging_fraction': 0.8,
        'reg_alpha': 0.5,
        'reg_lambda': 0.3,
        'seed': 42,
        'max_depth': 30,
        'boosting_type': 'goss',
        'early_stopping_round': 20,
        'max_bin': 100,
        'num_leaves': 10,
        'verbose': -1}

    booster = lgb.train(
        params,
        train_set=train_data,
        valid_sets=(val_data,),
        categorical_feature=cat_features,
        num_boost_round=100,
        verbose_eval=False)
    
    lgb_models.append(booster)
    lgb_scores.append(booster.best_score['valid_0']['rmse']) 
    
    print(booster.best_score['valid_0']['rmse'])

12.557252203246863
11.43594391669928
12.138783965071363




In [1695]:
# Смотрим скоры и сохраняем лучшую модель в переменную best_lgb

best_lgb = lgb_models[lgb_scores.index(min(lgb_scores))]
print(lgb_scores)

[12.557252203246863, 11.43594391669928, 12.138783965071363]


In [1696]:
# Обучаем XGBoost модели

xgb_models = []
xgb_scores = []

X_copy = X.copy()

for col in cat_features:
        X_copy[col] = X_copy[col].astype('category')

for train_index, test_index in kf.split(X_copy):
    
    X_train, X_test = X_copy.iloc[train_index], X_copy.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    iters = 50
    learning_rates = np.linspace(0.3, 0.005, iters).tolist()
    scheduler = xgb.callback.LearningRateScheduler(learning_rates)
    
    dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1,
                         enable_categorical=True)

    dtest = xgb.DMatrix(X_test, y_test, nthread=-1,
                        enable_categorical=True)
    
    params = {
        'objective': 'reg:squarederror',
        'subsample': 0.7999,
        'lambda': 30,
        'gamma': 200,
        'booster': 'dart',
        'rate_drop': 0.1,
        'one_drop': 1,
        'eval_metric': 'rmse',
        'tree_method': 'hist',
        'seed': 42}
    
    booster = xgb.train(params,
                        dtrain=dtrain,
                        num_boost_round=50,
                        evals=[(dtrain, 'dtrain'), (dtest, 'dtest')],
                        early_stopping_rounds=5,
                        callbacks=[scheduler],
                        verbose_eval=500)
    
    xgb_models.append(booster)
    xgb_scores.append(booster.best_score)
    
    print(booster.best_score)

[0]	dtrain-rmse:15.04665	dtest-rmse:16.10540
[49]	dtrain-rmse:9.66245	dtest-rmse:12.21464
12.214636679928054
[0]	dtrain-rmse:15.60989	dtest-rmse:14.88538
[35]	dtrain-rmse:9.94785	dtest-rmse:11.43535
11.431465473904174
[0]	dtrain-rmse:15.24654	dtest-rmse:15.53254
[49]	dtrain-rmse:9.56985	dtest-rmse:12.03387
12.033426556451548


In [1697]:
# Смотрим скоры и сохраняем лучшую модель в переменную best_xgb

best_xgb = xgb_models[xgb_scores.index(min(xgb_scores))]
print(xgb_scores)

[12.214636679928054, 11.431465473904174, 12.033426556451548]


In [1698]:
# Сохраняем лучшие модели

catboost_models[catboost_scores.index(min(catboost_scores))].save_model('../models/catboost_model.cbm', format='cbm')
lgb_booster = lgb_models[lgb_scores.index(min(lgb_scores))]
lgb_booster.save_model('../models/lgb_model.mod')
xgb_models[xgb_scores.index(min(xgb_scores))].save_model('../models/xgb_model.json')

In [108]:
test = pd.read_csv('https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/quickstart_test.csv')

In [1700]:
# Проводим предобработку и сохраняем id'шники машин

for c in cat_features:
    test[c] = test[c].astype(str)

filtered_features = [i for i in test.columns if (i not in targets and i not in features2drop)]

In [109]:
X_test = test[filtered_features].drop(targets, axis=1, errors='ignore')

In [1702]:
# Предсказываем reg у тестового датасета

y_catboost_pred  = best_catboost.predict(X_test)

X_test_copy = X_test.copy()
cat_columns = X_test_copy.select_dtypes(['object']).columns
X_test_copy[cat_columns] = X_test_copy[cat_columns].apply(lambda x: pd.factorize(x)[0])
y_lgb_pred = best_lgb.predict(X_test_copy)

for col in cat_features:
    X_test[col] = X_test[col].astype('category')
    
X_dtest = xgb.DMatrix(X_test, nthread=-1, enable_categorical=True)
y_xgb_pred = best_xgb.predict(X_dtest)

In [1703]:
# Сохраняем предикты лучших моделей

test['target_reg_catboost'] = y_catboost_pred
test['target_reg_lgm'] = y_lgb_pred
test['target_reg_xgb'] = y_xgb_pred

In [1704]:
# Усредняем предикты и сохраняем итоговый submission

test['target_reg'] = test.apply(lambda x: np.mean([x['target_reg_catboost'], 
                                                   x['target_reg_lgm'], x['target_reg_xgb']]), axis=1)
submission = test[['car_id', 'target_reg']]
submission.to_csv('../data/submission.csv', index=False)

In [1705]:
# Считаем средний RMSE:

print(np.mean([catboost_scores, lgb_scores, xgb_scores]))

11.839299588366808


**Conclusions:** благодаря этому практическому заданию я научился подбирать гиперпараметры, делать нормальную k-fold валидацию, также разобрался с Object Importance. А, и ещё я понял что ненавижу LightGBM :) (оверфитинг в нём просто прекрасен).

In [1706]:
submission

Unnamed: 0,car_id,target_reg
0,P17494612l,49.315983
1,N-1530212S,38.059120
2,B-1154399t,35.643124
3,F12725233R,37.220514
4,l-1139189J,35.613622
...,...,...
1908,x13640960Q,59.824158
1909,Z-2276652N,46.188659
1910,F-2165841B,33.659334
1911,x-1331529J,51.040665


In [1707]:
train

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class,mean_rating,distance_sum,rating_min,speed_max,user_ride_quality_median,deviation_normal_count,user_uniq
0,y13744087j,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,109.99,another_bug,4.737759,1.214131e+07,0.10,180.855726,0.023174,174,170
1,O41613818T,VW Polo VI,economy,petrol,3.90,2015,78218,2021,34.48,electro_bug,4.480517,1.803909e+07,0.00,187.862734,12.306011,174,174
2,d-2109686j,Renault Sandero,standart,petrol,6.30,2012,23340,2017,34.93,gear_stick,4.768391,1.588366e+07,0.10,102.382857,2.513319,174,173
3,u29695600e,Mercedes-Benz GLC,business,petrol,4.04,2011,1263,2020,32.22,engine_fuel,3.880920,1.651883e+07,0.10,172.793237,-5.029476,174,170
4,N-8915870N,Renault Sandero,standart,petrol,4.70,2012,26428,2017,27.51,engine_fuel,4.181149,1.398317e+07,0.10,203.462289,-14.260456,174,171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2332,j21246192N,Smart ForFour,economy,petrol,4.38,2017,121239,2018,24.62,wheel_shake,4.608908,1.739222e+07,0.10,141.502350,-6.624534,174,171
2333,h-1554287F,Audi A4,premium,petrol,4.30,2016,107793,2020,70.58,engine_check,4.683793,1.174052e+07,0.10,155.000000,-8.582467,174,169
2334,A15262612g,Kia Rio,economy,petrol,3.88,2015,80234,2019,45.50,gear_stick,4.655345,1.202022e+07,0.10,104.180940,-0.778524,174,172
2335,W-2514493U,Renault Sandero,standart,petrol,4.50,2014,60048,2020,75.48,another_bug,4.638333,1.788307e+07,0.10,200.000000,2.464975,174,171
