In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool, CatBoostClassifier

In [13]:
df = pd.read_csv('https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/car_train.csv')

In [15]:
cat_columns = df.select_dtypes(['object']).columns
df[cat_columns] = df[cat_columns].apply ( lambda x: pd.factorize (x)[ 0 ])

In [16]:
X = df.drop(['target_reg', 'target_class'], axis=1)
y = df[['target_reg']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,)

In [17]:
def custom_l2_loss(y, data):
    t = data.get_label()
    n = len(t)
    grad = y - t
    hess = np.ones_like(y)
    
    return grad, hess

def custom_l2_eval(y, data):
    t = data.get_label()
    l2 = np.mean((y - t) ** 2)
    
    return 'l2', l2, False

In [51]:
lgbm_params = {
    'random_seed': 42
    }

model = lgb.train(lgbm_params, 
                  lgb.Dataset(X_train, y_train),
                  fobj=custom_l2_loss,
                  feval=custom_l2_eval)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 794
[LightGBM] [Info] Number of data points in the train set: 1869, number of used features: 8


In [3]:
df

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class
0,y13744087j,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,108.53,another_bug
1,O41613818T,VW Polo VI,economy,petrol,3.90,2015,78218,2021,35.20,electro_bug
2,d-2109686j,Renault Sandero,standart,petrol,6.30,2012,23340,2017,38.62,gear_stick
3,u29695600e,Mercedes-Benz GLC,business,petrol,4.04,2011,1263,2020,30.34,engine_fuel
4,N-8915870N,Renault Sandero,standart,petrol,4.70,2012,26428,2017,30.45,engine_fuel
...,...,...,...,...,...,...,...,...,...,...
2332,j21246192N,Smart ForFour,economy,petrol,4.38,2017,121239,2018,25.48,wheel_shake
2333,h-1554287F,Audi A4,premium,petrol,4.30,2016,107793,2020,69.26,engine_check
2334,A15262612g,Kia Rio,economy,petrol,3.88,2015,80234,2019,46.03,gear_stick
2335,W-2514493U,Renault Sandero,standart,petrol,4.50,2014,60048,2020,77.19,another_bug


In [4]:
!pip install lightgbm -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [12]:
!pip install lightgbm==3.3.5

Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm==3.3.5
  Downloading lightgbm-3.3.5-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m0m
Installing collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 4.1.0
    Uninstalling lightgbm-4.1.0:
      Successfully uninstalled lightgbm-4.1.0
Successfully installed lightgbm-3.3.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [47]:
def generate_lr_list(start:float, stop:float, iterations:int, step:int)->list():
    generated_lr = []
    num_integer, num_remainder = iterations//step, iterations%step

    for i in np.linspace(start, stop, num_integer+(1 if num_remainder>0 else 0)): 
        if i == stop and num_remainder > 0:
            generated_lr.extend([i] * num_remainder)
        else:
            generated_lr.extend([i] * step)

    return generated_lr

In [49]:
generate_lr_list(1, 0.1, 10, 3)

[1.0, 1.0, 1.0, 0.7, 0.7, 0.7, 0.4, 0.4, 0.4, 0.1]

In [3]:
df = pd.read_csv('https://stepik.org/media/attachments/lesson/779917/CB_bin_target_data.csv')

In [159]:
class P4Metric(object):
    def get_final_error(self, error, weight):
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        return True

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        
        approx_exp = np.exp(approxes[0])
        probabilities = approx_exp / (1 + approx_exp)
        predicts = [1 if p > 0.5 else 0 for p in probabilities]
        tn, fp, fn, tp = confusion_matrix(target, predicts).ravel()
        p4 = (4 * tp * tn) / (4 * tp * tn + (tp + tn) * (fp + fn))
        
        return p4, 1

In [160]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

rides_info = df
cat_cols = ['car_type', 'fuel_type', 'model']
X = rides_info.drop(['target_bin'], axis=1)
y = rides_info['target_bin']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = CatBoostClassifier(random_state=42,
                           cat_features=cat_cols,
                           eval_metric=P4Metric())

model.fit(X_train, y_train, 
          eval_set=(X_test, y_test),
          verbose=50, plot=False, 
          early_stopping_rounds=100)

Learning rate set to 0.036983
0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 66.6ms	remaining: 1m 6s


Failed in nopython mode pipeline (step: nopython frontend)
[1mUntyped global name 'confusion_matrix':[0m [1m[1mCannot determine Numba type of <class 'function'>[0m
[1m
File "../../../../../tmp/ipykernel_37109/1577173455.py", line 15:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
  self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)


50:	learn: 0.6618212	test: 0.5148397	best: 0.5148397 (50)	total: 387ms	remaining: 7.21s
100:	learn: 0.8116578	test: 0.5942377	best: 0.5989402 (92)	total: 822ms	remaining: 7.32s
150:	learn: 0.8649428	test: 0.6610884	best: 0.6610884 (145)	total: 1.2s	remaining: 6.75s
200:	learn: 0.8952646	test: 0.6438122	best: 0.6779758 (178)	total: 1.53s	remaining: 6.08s
250:	learn: 0.9362061	test: 0.6305286	best: 0.6779758 (178)	total: 1.85s	remaining: 5.52s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6779758482
bestIteration = 178

Shrink model to first 179 iterations.


<catboost.core.CatBoostClassifier at 0x7fe344c80790>

In [4]:
df

Unnamed: 0,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,mean_rating,distance_sum,rating_min,speed_max,user_ride_quality_median,deviation_normal_count,user_uniq,target_bin
0,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,4.737759,1.214131e+07,0.10,180.855726,0.023174,174,170,0
1,VW Polo VI,economy,petrol,3.90,2015,78218,2021,4.480517,1.803909e+07,0.00,187.862734,12.306011,174,174,0
2,Renault Sandero,standart,petrol,6.30,2012,23340,2017,4.768391,1.588366e+07,0.10,102.382857,2.513319,174,173,0
3,Mercedes-Benz GLC,business,petrol,4.04,2011,1263,2020,3.880920,1.651883e+07,0.10,172.793237,-5.029476,174,170,0
4,Renault Sandero,standart,petrol,4.70,2012,26428,2017,4.181149,1.398317e+07,0.10,203.462289,-14.260456,174,171,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2332,Smart ForFour,economy,petrol,4.38,2017,121239,2018,4.608908,1.739222e+07,0.10,141.502350,-6.624534,174,171,0
2333,Audi A4,premium,petrol,4.30,2016,107793,2020,4.683793,1.174052e+07,0.10,155.000000,-8.582467,174,169,0
2334,Kia Rio,economy,petrol,3.88,2015,80234,2019,4.655345,1.202022e+07,0.10,104.180940,-0.778524,174,172,0
2335,Renault Sandero,standart,petrol,4.50,2014,60048,2020,4.638333,1.788307e+07,0.10,200.000000,2.464975,174,171,0


In [14]:
import xgboost as xgb

In [12]:
df = pd.read_csv('https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/car_train.csv')

In [127]:
cat_columns = df.select_dtypes(['object']).columns
df[cat_columns] = df[cat_columns].apply(lambda x: pd.factorize(x)[0])

X = df.drop(['target_reg', 'target_class'], axis=1)
y = df['target_reg']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

dtrain = xgb.DMatrix(X_train,
                     label=y_train,
                     nthread=-1,
                     enable_categorical=True,)

dtest = xgb.DMatrix(X_test,
                    y_test,
                    nthread=-1,
                    enable_categorical=True)

In [91]:
def custom_rmsle_eval(predictions, dmat):
    labels = dmat.get_label()
    vec = np.vectorize(float)
    rmsle = np.sqrt((1 / len(predictions)) * np.sum((np.log(vec(predictions) + 1) - np.log(vec(labels) + 1))**2))
    
    return 'RMSLE', rmsle

In [198]:
def custom_log_cosh_loss(predictions, dmat):
    labels = dmat.get_label()
    diff = predictions - labels
    grad = len(diff) * np.tanh(diff)
    hess = len(labels) / np.power(np.cosh(diff), 2)
    
    return grad, hess

In [199]:
params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'seed': 42,
}

booster = xgb.train(params,
                    dtrain=dtrain,
                    num_boost_round=30,
                    evals=[(dtrain, 'dtrain'), (dtest, 'dtest')],
                    obj=custom_log_cosh_loss,
                    early_stopping_rounds=10,
                    verbose_eval=5)

[0]	dtrain-rmse:47.56752	dtest-rmse:48.69590
[5]	dtrain-rmse:47.56752	dtest-rmse:48.69590
[9]	dtrain-rmse:47.56752	dtest-rmse:48.69590


  hess = len(labels) / np.power(np.cosh(diff), 2)
  hess = len(labels) / np.power(np.cosh(diff), 2)


In [13]:
df

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class
0,y13744087j,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,108.53,another_bug
1,O41613818T,VW Polo VI,economy,petrol,3.90,2015,78218,2021,35.20,electro_bug
2,d-2109686j,Renault Sandero,standart,petrol,6.30,2012,23340,2017,38.62,gear_stick
3,u29695600e,Mercedes-Benz GLC,business,petrol,4.04,2011,1263,2020,30.34,engine_fuel
4,N-8915870N,Renault Sandero,standart,petrol,4.70,2012,26428,2017,30.45,engine_fuel
...,...,...,...,...,...,...,...,...,...,...
2332,j21246192N,Smart ForFour,economy,petrol,4.38,2017,121239,2018,25.48,wheel_shake
2333,h-1554287F,Audi A4,premium,petrol,4.30,2016,107793,2020,69.26,engine_check
2334,A15262612g,Kia Rio,economy,petrol,3.88,2015,80234,2019,46.03,gear_stick
2335,W-2514493U,Renault Sandero,standart,petrol,4.50,2014,60048,2020,77.19,another_bug
