In [None]:
# Reference reading (kept as comments so this cell is valid Python when run):
# https://medium.com/almabetter/catboost-the-fastest-algorithm-c21d44f8b990
# https://medium.com/nerd-for-tech/catboost-quickstart-ml-classification-f1d7fb70fea8
# https://catboost.ai/en/docs/references/training-parameters/common#min_data_in_leaf
# https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier
# https://dhavalthakur.medium.com/what-is-catboost-algorithm-step-by-step-tutorial-2c93aa566068

In [1]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [2]:
from catboost import CatBoostClassifier, Pool, cv
import pandas as pd
import numpy as np
import dask
import scipy.stats as ss
import missingno as mno
from sklearn.neighbors import BallTree, DistanceMetric
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, auc, roc_curve
from copy import copy, deepcopy
import zipfile
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the CSV read later in the notebook lives under this mount point.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
class MultiAgent:
    """Shared state and operators for population-based (multi-agent) optimisers.

    The class stores the population, its fitness, the global best (``gbest``) and the
    neighbourhood best (``lbest``), and offers helper operators (greedy selection,
    Levy-flight-style perturbation, schedule for an exploration/exploitation coefficient).
    Fitness is minimised: smaller values are considered better everywhere in this class.

    Parameters
    ----------
    objf : callable
        Objective; called as ``objf(positions)`` and expected to return ``(positions, fitness)``.
    population_size, dim : int
        Number of agents and number of decision variables.
    var_type : str
        One of ``'continuous'``, ``'binary'``, ``'discrete'`` or ``'lhs'``; selects the initialiser.
    lb, ub : scalar or sequence
        Lower / upper bounds, broadcast to length ``dim`` with ``np.full``.
    max_i : int
        Iteration budget (used by the ``'linear'`` schedule and by subclasses).
    metric : str or None
        Name passed to ``DistanceMetric.get_metric``; ``None``/empty leaves ``self.metric`` as ``None``.
    mode : str
        Schedule used by ``exploration_exploitation_tradoff``.
    stepsize : number
        Half-period of the ``'cyclic'`` schedule.
    k : int, optional
        Neighbourhood size used by ``update_lbest``.
    ccf : callable, optional
        Stored as-is; subclasses call it on the population before evaluation.
    file_save_path : str, optional
        Destination used by ``save``.
    ys : float
        Stored as-is; subclasses use it as a target fitness for early stopping.
    custome_init_func : numpy.ndarray, optional
        Despite the name this is a ready-made population of shape ``(population_size, dim)``.
    start_prob : float
        Probability of a 1 for the ``'binary'`` initialiser.
    callback : optional
        Stored but not invoked anywhere in this class.
    """

    def __init__(self, objf, population_size, var_type, dim, lb, ub, max_i, metric, mode, stepsize, k=None, ccf=None,
                 file_save_path=None, ys=-np.inf, custome_init_func=None, start_prob=0.5, callback=None):
        self.objf, self.var_type, self.ccf = objf, var_type, ccf
        self.population_size, self.dim = population_size, dim
        self.lb, self.ub = np.full(self.dim, lb), np.full(self.dim, ub)
        self.start_prob = start_prob
        self.max_i, self.stepsize = max_i, stepsize
        self.k, self.mode = k, mode
        self.custome_init_func = custome_init_func
        # Everything init_population() reads has been assigned above.
        self.population = self.init_population()
        self.fitness, self.ys = None, ys
        self.gbest, self.lbest = None, None
        self.gbest_curve, self.convergence_curve = {'x': [], 'y': []}, {'x': [], 'y': []}
        self.metric = DistanceMetric.get_metric(metric) if metric else None
        self.file_save_path = file_save_path
        self.callback = callback

    def init_population(self):
        """Return the initial population array of shape ``(population_size, dim)``.

        Raises
        ------
        ValueError
            If no initial population was supplied and ``var_type`` is not recognised.
        """
        if self.custome_init_func is not None:
            assert self.custome_init_func.shape == (self.population_size, self.dim)
            return self.custome_init_func

        shape = (self.population_size, self.dim)
        if self.var_type == 'continuous':
            return np.random.uniform(self.lb, self.ub, size=shape)
        if self.var_type == 'binary':
            # The `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24; use the builtin.
            return np.random.binomial(1, self.start_prob, shape).astype(bool)
        if self.var_type == 'discrete':
            # NOTE(review): values are drawn from [0, dim), not from [lb, ub] -- confirm this is intended.
            return np.random.randint(0, self.dim, shape)
        if self.var_type == 'lhs':
            print('LHS')
            # NOTE(review): `Lhs` is not imported in the visible part of this file -- confirm where it comes from.
            lhs = Lhs(criterion="maximin", iterations=1000)
            return np.asarray(lhs.generate(list(zip(self.lb, self.ub)), self.population_size))
        raise ValueError(
            "Unknown var_type {!r}; expected 'continuous', 'binary', 'discrete' or 'lhs'".format(self.var_type))

    def update_gbest(self):
        """Replace ``self.gbest`` when the current population contains a strictly better agent.

        NaN fitness values are first replaced by ``+inf`` (in ``self.fitness`` itself) so they can never win.
        Saving to ``file_save_path`` and invoking ``callback`` are not performed here.
        """
        self.fitness = np.nan_to_num(self.fitness, nan=np.inf)
        idx = np.argmin(self.fitness)
        if self.gbest is None or self.fitness[idx] < self.gbest['fitness']:
            self.gbest = {'fitness': np.copy(self.fitness[idx]), 'position': np.copy(self.population[idx]),
                          'index': np.copy(idx)}

    def update_lbest(self, pos_arg, fit_arg, dist=False, rnd_k=True):
        """For every row of ``pos_arg`` find the fittest of its k nearest neighbours.

        Parameters
        ----------
        pos_arg : array of positions, one row per agent.
        fit_arg : array of fitness values aligned with ``pos_arg``.
        dist : bool
            When true, also store the distance from each agent to its local best.
        rnd_k : bool
            When true the neighbourhood size is drawn uniformly from ``[1, self.k)``;
            otherwise ``self.k`` is used directly.

        The result is stored in ``self.lbest`` (keys: position, fitness, distance, index, knn_index).
        """
        size = len(pos_arg)
        if rnd_k and self.k > 1:
            k = np.random.randint(1, self.k, 1)[0]
        else:
            # randint(1, 1) would raise, so a neighbourhood size of 1 is used as-is.
            k = self.k
        # A query cannot ask for more neighbours than there are points in the tree.
        k = min(k, size)
        tree = BallTree(pos_arg, metric=self.metric)
        distance, knn_index = tree.query([*pos_arg], k)
        best_knn_index = np.argmin(fit_arg[knn_index], axis=1)
        rows = np.arange(size)
        lbest_index = knn_index[rows, best_knn_index]  # index of lbest in population
        lbest_dist = distance[rows, best_knn_index] if dist else None
        self.lbest = {'position': pos_arg[lbest_index], 'fitness': fit_arg[lbest_index], 'distance': lbest_dist,
                      'index': lbest_index, 'knn_index': knn_index}

    def exploration_exploitation_tradoff(self, i, max, min):
        """Return a coefficient between ``min`` and ``max`` for iteration ``i`` according to ``self.mode``.

        * ``'cyclic'``: triangular wave that equals ``max`` at even multiples of ``stepsize``
          and ``min`` at odd multiples.
        * ``'linear'``: decreases linearly from ``max`` at ``i = 0`` to ``min`` at ``i = max_i``.
        * ``'random'``: uniform draw from ``[min, max)``.
        * ``'cst'``: always ``max``.

        Any other mode returns ``None``.
        """
        if self.mode == 'cyclic':
            cycle = np.floor(1 + i / (2 * self.stepsize))
            x = np.abs((i / self.stepsize) - 2 * cycle + 1)
            return max - (max - min) * np.maximum(0, 1 - x)
        elif self.mode == 'linear':
            return min + (max - min) * (1 - i / self.max_i)
        elif self.mode == 'random':
            return np.random.uniform(min, max, 1)[0]
        elif self.mode == 'cst':
            return max

    def proba_from_fit(self, fit):
        """Turn fitness values into weights that sum to 1 (lower fitness gives a larger weight).

        Negative fitness ``f`` is mapped to ``1 + |f|``, non-negative fitness to ``1 / (1 + f)``.
        """
        is_negative = fit < 0
        weights = np.where(is_negative, 1 + np.abs(fit), 1 / (1 + fit))
        return weights / np.sum(weights)

    def levy_flight(self, pop, local_search_prob):
        """Apply a Levy-flight-style random perturbation to a random subset of rows of ``pop``.

        Each row is selected independently with probability ``local_search_prob``; unselected rows
        are returned unchanged. The result is clipped to ``[lb, ub]``.
        """
        beta, sigma, lf_index = 1.5, 0.6966, np.random.binomial(1, local_search_prob, (pop.shape[0], 1))
        r1, r2 = np.random.normal(size=pop.shape), np.random.normal(size=pop.shape)
        # Step size is proportional to the current position.
        step = 0.01 * ((r1 * sigma) / np.power(np.abs(r2), 1.0 / beta)) * pop
        lf = pop + step * np.random.normal(size=pop.shape)
        lf = np.where(lf_index, lf, pop)
        return np.clip(lf, self.lb, self.ub)

    def selection(self, pos, fit, trials, trials_fit=None):
        """Greedy per-agent selection between current positions and trial positions.

        When ``trials_fit`` is ``None`` the trials are evaluated with ``self.objf`` (which may also
        return adjusted trial positions). A trial replaces the current agent only when its fitness
        is strictly lower.

        Returns
        -------
        (positions, fitness, improvement) where ``improvement`` is the boolean mask of replaced agents.
        """
        trials, trials_fit = self.objf(trials) if trials_fit is None else (trials, trials_fit)
        improvement = trials_fit < fit
        pos, fit = np.where(improvement[:, np.newaxis], trials, pos), np.where(improvement, trials_fit, fit)
        return pos, fit, improvement

    def plot(self, i):
        """Record iteration ``i`` in the history curves (best fitness and mean population fitness).

        Despite the name nothing is drawn; the data are appended to ``gbest_curve`` and
        ``convergence_curve``.
        """
        self.gbest_curve['x'].append(i)
        self.convergence_curve['x'].append(i)
        self.gbest_curve['y'].append(self.gbest['fitness'])
        self.convergence_curve['y'].append(np.mean(self.fitness))

    def save(self):
        """Write the global best (one ``hp_<j>`` column per dimension plus ``fitness``) to ``file_save_path`` as CSV."""
        col = ['hp_' + str(i) for i in range(self.dim)]
        data = dict(zip(col, self.gbest['position']))
        data['fitness'] = self.gbest['fitness']
        pd.DataFrame(data=data, index=[0]).to_csv(self.file_save_path)


class PSO(MultiAgent):
    """Particle-swarm operators (personal best, velocity and position updates) on top of ``MultiAgent``.

    Note the spelling of the ``vat_type`` parameter: it is forwarded to ``MultiAgent`` as ``var_type``.
    ``local_search_prob`` is the per-agent probability of taking the Levy-flight-based move in
    ``update_position``. All other parameters are forwarded unchanged to ``MultiAgent.__init__``.
    """

    def __init__(self, objf, ccf, vat_type, dim, lb, ub, population_size, max_i, mode, stepsize, metric='euclidean', k=7,
                 local_search_prob=0.5, ys=-np.inf, file_save_path=None, custome_init_func=None, start_prob=0.5):
        super().__init__(objf=objf, ccf=ccf, var_type=vat_type, population_size=population_size, dim=dim, ub=ub, lb=lb, start_prob=start_prob,
                         max_i=max_i, metric=metric, k=k, mode=mode, ys=ys, stepsize=stepsize, file_save_path=file_save_path,
                         custome_init_func=custome_init_func)
        self.pbest = None
        self.local_search_prob = local_search_prob

    def update_pbest(self):
        """Keep, for every agent, the best position it has visited so far (strictly lower fitness wins)."""
        if self.pbest is None:
            self.pbest = {'fitness': self.fitness, 'position': self.population}
        else:
            self.pbest['position'], self.pbest['fitness'], _ = self.selection(self.pbest['position'],
                                                                self.pbest['fitness'], self.population, self.fitness)

    def update_velocity(self, param, index, i):
        """Compute the new velocity.

        Parameters
        ----------
        param : sequence of two arrays
            ``param[0]`` is used as the inertia term where ``index`` is truthy, ``param[1]`` elsewhere.
        index : array broadcastable to the population shape
            Per-agent selector between the two inertia terms.
        i : int
            Current iteration, used to schedule the inertia weight between 0.9 and 0.1.
        """
        w = self.exploration_exploitation_tradoff(i, 0.9, 0.1)
        r1 = np.random.uniform(0, 1, (self.population_size, self.dim))
        r2 = np.random.uniform(0, 1, (self.population_size, self.dim))
        # Attraction towards the personal best and towards the neighbourhood best.
        delta = r1 * (self.pbest['position'] - self.population) + r2 * (self.lbest['position'] - self.population)
        self.velocity = np.where(index, w * param[0] + delta, w * param[1] + delta)

    def update_position(self, i):
        """Move the swarm one step and clip it to the bounds.

        Agents selected with probability ``local_search_prob`` are placed at the new velocity
        (built from a Levy-flight copy of the population); the others are moved by it.
        """
        # PSO.__init__ does not create a velocity (subclasses such as CPSO do); create one on first
        # use so that this method also works when PSO is used directly.
        if getattr(self, 'velocity', None) is None:
            self.velocity = np.random.uniform(self.lb, self.ub, (self.population_size, self.dim))
        index = np.random.binomial(1, self.local_search_prob, size=(self.population_size, 1))
        lf = self.levy_flight(self.population, 1)
        self.update_velocity([lf, self.velocity], index, i)
        self.population = np.where(index, self.velocity, self.population + self.velocity)
        self.population = np.clip(self.population, self.lb, self.ub)


class CPSO(PSO):
    """PSO variant that draws an initial velocity and runs the whole optimisation from its constructor.

    Creating an instance therefore blocks until ``run`` has finished; results are available
    afterwards on the instance (``gbest``, ``gbest_curve``, ``convergence_curve``).
    """

    def __init__(self, objf, ccf, dim, lb, ub, population_size, max_i, local_search_prob=0.15, metric='euclidean', k=7,
                 mode='linear', stepsize=200, ys=-np.inf, file_save_path=None, custome_init_func=None, var_type='continuous'):
        super().__init__(
            objf=objf, ccf=ccf, vat_type=var_type, population_size=population_size, dim=dim, ub=ub, lb=lb,
            max_i=max_i, local_search_prob=local_search_prob, k=k, metric=metric, mode=mode,
            stepsize=stepsize, ys=ys, file_save_path=file_save_path, custome_init_func=custome_init_func,
        )
        swarm_shape = (self.population_size, self.dim)
        self.velocity = np.random.uniform(self.lb, self.ub, swarm_shape)
        self.pbest = None
        self.run()

    def run(self):
        """Iterate until ``max_i`` iterations are done or the best fitness is no longer above ``ys``."""
        iteration = 0
        while iteration < self.max_i:
            # Constrain, then evaluate; the objective may hand back adjusted positions.
            self.population = self.ccf(self.population)
            self.population, self.fitness = self.objf(self.population)
            self.update_gbest()
            print('iteration ', iteration, ': ', self.gbest)
            self.plot(iteration)
            self.update_pbest()
            self.update_lbest(self.pbest['position'], self.pbest['fitness'])
            self.update_position(iteration)
            # Stop as soon as the target fitness `ys` has been reached.
            if not self.ys < self.gbest['fitness']:
                break
            iteration += 1

In [3]:
# Read the cleaned hotel-bookings table from the mounted Drive folder and display it.
df = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/classification/hotel_cancellation/cleaned_hotel_bookings.csv'
)
df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,...,required_car_parking_spaces,total_of_special_requests,Date_sin_dayofweek,Date_cos_dayofweek,Date_sin_dayofyear,Date_cos_dayofyear,reservation_status_date_sin_dayofweek,reservation_status_date_cos_dayofweek,reservation_status_date_sin_dayofyear,reservation_status_date_cos_dayofyear
0,Resort Hotel,0,342,27,0,0,2,0.0,0,BB,...,0,0,8.660254e-01,-0.5,0.017166,-0.999853,8.660254e-01,-0.5,1.716633e-02,-0.999853
1,Resort Hotel,0,737,27,0,0,2,0.0,0,BB,...,0,0,8.660254e-01,-0.5,0.017166,-0.999853,8.660254e-01,-0.5,1.716633e-02,-0.999853
2,Resort Hotel,0,7,27,0,1,1,0.0,0,BB,...,0,0,8.660254e-01,-0.5,0.017166,-0.999853,1.224647e-16,-1.0,1.224647e-16,-1.000000
3,Resort Hotel,0,13,27,0,1,1,0.0,0,BB,...,0,0,8.660254e-01,-0.5,0.017166,-0.999853,1.224647e-16,-1.0,1.224647e-16,-1.000000
4,Resort Hotel,0,14,27,0,2,2,0.0,0,BB,...,0,1,8.660254e-01,-0.5,0.017166,-0.999853,-8.660254e-01,-0.5,-1.716633e-02,-0.999853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119201,City Hotel,0,23,35,2,5,2,0.0,0,BB,...,0,0,8.660254e-01,-0.5,-0.848351,-0.529434,8.660254e-01,-0.5,-9.057023e-01,-0.423914
119202,City Hotel,0,102,35,2,5,3,0.0,0,BB,...,0,2,1.224647e-16,-1.0,-0.857315,-0.514793,1.224647e-16,-1.0,-9.128459e-01,-0.408304
119203,City Hotel,0,34,35,2,5,2,0.0,0,BB,...,0,4,1.224647e-16,-1.0,-0.857315,-0.514793,1.224647e-16,-1.0,-9.128459e-01,-0.408304
119204,City Hotel,0,109,35,2,5,2,0.0,0,BB,...,0,0,1.224647e-16,-1.0,-0.857315,-0.514793,1.224647e-16,-1.0,-9.128459e-01,-0.408304


In [4]:
# Target column and the feature groups derived from the loaded frame.
out_column = 'is_canceled'
to_remove = []  # columns to leave out of the feature set (currently none)
features = [c for c in df.columns if (c != out_column) and (c not in to_remove)]
# Text columns with more than two distinct values.
categorical = [c for c in features if (df[c].dtype == 'object') and (df[c].nunique() > 2)]
# Any column, whatever its dtype, with exactly two distinct values.
binary = [c for c in features if df[c].nunique() == 2]
numerical = [col for col in features if col not in categorical + binary]
# Convert column by column (the default axis). Passing axis=1 converted the frame one row at a
# time, which is far slower on a large table and upcasts mixed int/float rows to float.
df[numerical] = df[numerical].apply(pd.to_numeric)
# Correlation heatmap of the numerical features.
fig = go.Figure(data=go.Heatmap(z=df[numerical].corr(), x=numerical, y=numerical))
fig.show()

In [5]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split on the target, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    df[features],
    df[out_column],
    test_size=0.2,
    random_state=42,
    stratify=df[out_column],
)
# CatBoost pools; categorical and binary columns are both declared as categorical features.
# The test pool carries no labels.
train_dataset = Pool(data=X_train, label=Y_train, feature_names=features, cat_features=categorical + binary)
test_dataset = Pool(data=X_test, feature_names=features, cat_features=categorical + binary)

X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((95364, 28), (23842, 28), (95364,), (23842,))

In [None]:
def cv_score(hp):
    """Score one hyper-parameter vector with 7-fold stratified CatBoost cross-validation.

    ``hp`` is ``[iterations, learning_rate, min_data_in_leaf, reg_lambda]``; the first and third
    entries are rounded to integers. Uses the module-level ``train_dataset`` pool.

    Returns the rounded vector and the negated mean test F1 of the last boosting iteration
    (negated so that a minimiser can be used).
    """
    iterations = np.round(hp[0]).astype(int)
    learning_rate = hp[1]
    min_data_in_leaf = np.round(hp[2]).astype(int)
    reg_lambda = hp[3]
    hp = [iterations, learning_rate, min_data_in_leaf, reg_lambda]
    params = {
        "iterations": iterations,
        'learning_rate': learning_rate,
        "min_data_in_leaf": min_data_in_leaf,
        'reg_lambda': reg_lambda,
        "loss_function": "Logloss",
        'eval_metric': 'F1',
        "verbose": False,
    }
    cv_res = cv(train_dataset, params, fold_count=7, stratified=True, verbose=False)
    final_f1 = cv_res['test-F1-mean'].values[-1]
    return hp, -final_f1

def objf(pop, dim=None):
    """Evaluate every row of ``pop`` with ``cv_score`` through dask and return ``(positions, fitness)``.

    ``dim`` is accepted but not used. The returned positions are the (rounded) vectors handed
    back by ``cv_score``; the fitness values are its negated F1 scores.
    """
    # One lazy task per candidate; each task yields a (position, score) pair.
    tasks = [dask.delayed(cv_score, nout=2)(candidate) for candidate in pop]
    lazy_positions = [task[0] for task in tasks]
    lazy_scores = [task[1] for task in tasks]
    positions, scores = dask.compute(lazy_positions, lazy_scores)
    return np.asarray(positions), np.asarray(scores)

def ccf(population):
    """Constraint hook for the optimiser: no constraint is applied, the input is returned as-is."""
    unchanged = population
    return unchanged

# Earlier search space, kept for reference:
#lb, ub = [2, 1e-5, 1e-5, 1], [100, 1, 0.5, X_train.shape[1]]
# Search space, in the order cv_score reads it:
# [iterations, learning_rate, min_data_in_leaf, reg_lambda]
lb = [2, 1e-15, 1, 1e-5]
ub = [100, 2, len(X_train)/2, 100]
dim = len(lb)
# Constructing CPSO starts the search immediately: 50 agents, at most 5000 iterations.
optimizer = CPSO(objf=objf, ccf=ccf, dim=dim, lb=lb, ub=ub, population_size=50, max_i=5000)

In [6]:
# Hyper-parameter vector [iterations, learning_rate, min_data_in_leaf, reg_lambda].
# NOTE(review): presumably the best position reported by the CPSO search -- confirm.
hp = [51.0, 1.0, 50662.0, 9.78179653]
hp = [np.round(hp[0]).astype(int), hp[1], np.round(hp[2]).astype(int), hp[3]]
params = {
    "iterations": hp[0],
    'learning_rate': hp[1],
    "min_data_in_leaf": hp[2],
    'reg_lambda': hp[3],
    "loss_function": "Logloss",
    'eval_metric': 'F1',
    "verbose": True,
    'use_best_model': True,
}
# 7-fold stratified cross-validation on the training pool; the per-iteration table is displayed.
cv_res = cv(train_dataset, params, fold_count=7, stratified=True)
cv_res

Training on fold [0/7]
0:	learn: 0.6386222	test: 0.6398439	best: 0.6398439 (0)	total: 121ms	remaining: 6.07s
1:	learn: 0.7833352	test: 0.7802285	best: 0.7802285 (1)	total: 186ms	remaining: 4.56s
2:	learn: 0.8625525	test: 0.8597621	best: 0.8597621 (2)	total: 248ms	remaining: 3.96s
3:	learn: 0.8979383	test: 0.8978899	best: 0.8978899 (3)	total: 310ms	remaining: 3.64s
4:	learn: 0.9111418	test: 0.9116504	best: 0.9116504 (4)	total: 376ms	remaining: 3.46s
5:	learn: 0.9253887	test: 0.9273335	best: 0.9273335 (5)	total: 469ms	remaining: 3.52s
6:	learn: 0.9345749	test: 0.9352892	best: 0.9352892 (6)	total: 579ms	remaining: 3.64s
7:	learn: 0.9457707	test: 0.9443129	best: 0.9443129 (7)	total: 691ms	remaining: 3.71s
8:	learn: 0.9604787	test: 0.9598775	best: 0.9598775 (8)	total: 799ms	remaining: 3.73s
9:	learn: 0.9633549	test: 0.9618601	best: 0.9618601 (9)	total: 930ms	remaining: 3.81s
10:	learn: 0.9650335	test: 0.9628120	best: 0.9628120 (10)	total: 1.07s	remaining: 3.91s
11:	learn: 0.9675937	test: 0.

Unnamed: 0,iterations,test-F1-mean,test-F1-std,train-F1-mean,train-F1-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.654121,0.058331,0.657883,0.061279,0.435853,0.025407,0.434425,0.026296
1,1,0.794646,0.024173,0.797371,0.023571,0.322774,0.024722,0.320913,0.022941
2,2,0.861347,0.012554,0.862995,0.011013,0.24601,0.014352,0.24373,0.013061
3,3,0.888452,0.008133,0.889468,0.006179,0.212868,0.009259,0.210792,0.007706
4,4,0.906847,0.005847,0.908445,0.005026,0.186821,0.010729,0.183829,0.008493
5,5,0.921124,0.008747,0.922013,0.007412,0.165838,0.01495,0.163928,0.012813
6,6,0.926155,0.01079,0.927387,0.00881,0.156329,0.017573,0.153455,0.015595
7,7,0.936853,0.008986,0.937904,0.007491,0.139364,0.016265,0.136889,0.015835
8,8,0.942851,0.011777,0.944384,0.010634,0.127277,0.021381,0.125249,0.021911
9,9,0.946734,0.010075,0.948438,0.009193,0.119903,0.018884,0.11771,0.01954


In [7]:
model = CatBoostClassifier(iterations=hp[0], learning_rate=hp[1], min_data_in_leaf=hp[2], reg_lambda=hp[3],
                           loss_function='Logloss')
# train the model
model.fit(train_dataset, verbose=True)

# In-sample metrics: probability of the positive class, thresholded by rounding.
pred = model.predict_proba(train_dataset)[:, 1]
pred_label = np.round(pred)  # computed once and reused by every label-based metric
# Named `train_auc` so the `auc` function imported from sklearn.metrics is not overwritten.
train_auc = roc_auc_score(Y_train, pred)
pd.DataFrame({'AUC': train_auc,
              'ACC': accuracy_score(Y_train, pred_label),
              'PRE': precision_score(Y_train, pred_label),
              'REC': recall_score(Y_train, pred_label),
              'F1': f1_score(Y_train, pred_label)}, index=[0])

0:	learn: 0.4618540	total: 72.4ms	remaining: 3.62s
1:	learn: 0.3277144	total: 146ms	remaining: 3.57s
2:	learn: 0.2553226	total: 212ms	remaining: 3.4s
3:	learn: 0.2472661	total: 281ms	remaining: 3.3s
4:	learn: 0.2220047	total: 343ms	remaining: 3.16s
5:	learn: 0.1977416	total: 412ms	remaining: 3.09s
6:	learn: 0.1798861	total: 487ms	remaining: 3.06s
7:	learn: 0.1633433	total: 546ms	remaining: 2.93s
8:	learn: 0.1346812	total: 620ms	remaining: 2.89s
9:	learn: 0.1168682	total: 686ms	remaining: 2.81s
10:	learn: 0.1000937	total: 754ms	remaining: 2.74s
11:	learn: 0.0948265	total: 818ms	remaining: 2.66s
12:	learn: 0.0868629	total: 877ms	remaining: 2.56s
13:	learn: 0.0830806	total: 939ms	remaining: 2.48s
14:	learn: 0.0768322	total: 1.01s	remaining: 2.41s
15:	learn: 0.0684136	total: 1.08s	remaining: 2.36s
16:	learn: 0.0662867	total: 1.14s	remaining: 2.29s
17:	learn: 0.0613268	total: 1.21s	remaining: 2.22s
18:	learn: 0.0562479	total: 1.27s	remaining: 2.14s
19:	learn: 0.0562317	total: 1.34s	remainin

Unnamed: 0,AUC,ACC,PRE,REC,F1
0,0.999113,0.993541,0.99888,0.98368,0.991222


In [8]:
# Hold-out metrics: probability of the positive class, thresholded by rounding.
pred = model.predict_proba(test_dataset)[:, 1]
pred_label = np.round(pred)  # computed once and reused by every label-based metric
# Named `test_auc` so the `auc` function imported from sklearn.metrics is not overwritten.
test_auc = roc_auc_score(Y_test, pred)
pd.DataFrame({'AUC': test_auc,
              'ACC': accuracy_score(Y_test, pred_label),
              'PRE': precision_score(Y_test, pred_label),
              'REC': recall_score(Y_test, pred_label),
              'F1': f1_score(Y_test, pred_label)}, index=[0])

Unnamed: 0,AUC,ACC,PRE,REC,F1
0,0.998108,0.991653,0.997581,0.979862,0.988642


In [9]:
# Bar chart of the fitted model's feature importances, labelled with the feature column names.
importance_bar = go.Bar(
    x=list(X_test.columns),
    y=model.feature_importances_,
    text=model.feature_importances_,
    textposition='auto',
)
fig = go.Figure(data=[importance_bar])
fig.show()

In [None]:
from catboost import CatBoostClassifier, Pool, cv
import numpy as np
import pandas as pd
import dask
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optimization.numpy_version.single_objective.continuous as co

def cv_score(hp):
    """Score one hyper-parameter vector with 7-fold stratified CatBoost cross-validation.

    ``hp`` is ``[iterations, learning_rate, min_data_in_leaf, reg_lambda]``; the first and third
    entries are rounded to integers. Uses the module-level ``train_dataset`` pool.

    Returns the rounded vector and the negated mean test F1 of the last boosting iteration
    (negated so that a minimiser can be used).
    """
    iterations = np.round(hp[0]).astype(int)
    learning_rate = hp[1]
    min_data_in_leaf = np.round(hp[2]).astype(int)
    reg_lambda = hp[3]
    hp = [iterations, learning_rate, min_data_in_leaf, reg_lambda]
    params = {
        "iterations": iterations,
        'learning_rate': learning_rate,
        "min_data_in_leaf": min_data_in_leaf,
        'reg_lambda': reg_lambda,
        "loss_function": "Logloss",
        'eval_metric': 'F1',
        "verbose": False,
    }
    cv_res = cv(train_dataset, params, fold_count=7, stratified=True, verbose=False)
    final_f1 = cv_res['test-F1-mean'].values[-1]
    return hp, -final_f1

def objf(pop, dim=None):
    """Evaluate every row of ``pop`` with ``cv_score`` through dask and return ``(positions, fitness)``.

    ``dim`` is accepted but not used. The returned positions are the (rounded) vectors handed
    back by ``cv_score``; the fitness values are its negated F1 scores.
    """
    # One lazy task per candidate; each task yields a (position, score) pair.
    tasks = [dask.delayed(cv_score, nout=2)(candidate) for candidate in pop]
    lazy_positions = [task[0] for task in tasks]
    lazy_scores = [task[1] for task in tasks]
    positions, scores = dask.compute(lazy_positions, lazy_scores)
    return np.asarray(positions), np.asarray(scores)

def ccf(population):
    """Constraint hook for the optimiser: no constraint is applied, the input is returned as-is."""
    unchanged = population
    return unchanged

# Load the cleaned bookings table from the working directory.
df = pd.read_csv('cleaned_hotel_bookings.csv')

# Target column and the feature groups derived from the loaded frame.
out_column = 'is_canceled'
to_remove = []  # columns to leave out of the feature set (currently none)
features = [c for c in df.columns if (c != out_column) and (c not in to_remove)]
# Text columns with more than two distinct values.
categorical = [c for c in features if (df[c].dtype == 'object') and (df[c].nunique() > 2)]
# Any column, whatever its dtype, with exactly two distinct values.
binary = [c for c in features if df[c].nunique() == 2]
numerical = [col for col in features if col not in categorical + binary]
# Convert column by column (the default axis). Passing axis=1 converted the frame one row at a
# time, which is far slower on a large table and upcasts mixed int/float rows to float.
df[numerical] = df[numerical].apply(pd.to_numeric)

# Stratified 85/15 split with a fixed seed; the test pool carries no labels.
X_train, X_test, Y_train, Y_test = train_test_split(df[features], df[out_column], test_size=0.15, random_state=0, stratify=df[out_column])
train_dataset = Pool(X_train, Y_train, feature_names=features, cat_features=categorical+binary)
test_dataset = Pool(X_test, feature_names=features, cat_features=categorical+binary)

print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

# Search space, in the order cv_score reads it:
# [iterations, learning_rate, min_data_in_leaf, reg_lambda]
lb, ub = [2, 1e-15, 1, 1e-5], [51, 1, len(X_train)/2, 100]
dim = len(lb)
# Positional arguments after ub are 50 and 5000.
# NOTE(review): presumably population_size and max_i, as in the CPSO class defined earlier in this notebook -- confirm against the imported module.
optimizer = co.CPSO(objf, ccf, dim, lb, ub, 50, 5000)

Accuracy Score of Random Forest is : 0.862479727084615
Confusion Matrix : n[[10375   877]
 [ 1582  5047]]
