# CatBoost

**Tutorial:** https://catboost.ai/docs/concepts/tutorials.html



In [None]:
import os
import sys
import pandas as pd
import numpy as np
import importlib
import itertools
from pandas.io.json import json_normalize
import sklearn.metrics as metrics
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from matplotlib import rcParams
import json
import math

%matplotlib inline

from IPython.core.display import display, HTML

# Widen the notebook container to 95% of the browser window.
# The closing tag is now well-formed (`</style>` instead of `</style`).
display(HTML("<style>.container { width:95% !important; }</style>"))

import catboost

In [None]:
# Global variables: where the inputs are read from and the results written to
meta_data_path = "../../data-campaigns/meta-data/"

input_path = "../../2019-12-16.out/"
# The results folder is a sub-folder of the input folder
out_path = input_path + "WI_results/"
# File name of the legs pickle, resolved against input_path when it is read
legs = "all_legs_merged_no_outlier_0.01.pkl"

# Graphical parameters shared by the figures of the notebook
rcParams.update(
    {
        "axes.titlepad": 45,
        "font.size": 16,
        "figure.figsize": (12, 8),
    }
)
sns.set_style("whitegrid")

**READ DATA**

In [None]:
#### all_legs ####

# NOTE(review): unpickling can execute code embedded in the file; only load
# pickles that were produced by this project.
all_legs = pd.read_pickle(input_path + legs)

# Keep the legs that have a known transport category ("Unknown" removed) and a
# wasted-time score strictly between 0 and 6.
# `.copy()` detaches the filtered frame from the loaded one, so the column
# assignments below do not trigger SettingWithCopyWarning.
all_legs = all_legs[
    (all_legs.transp_category != "Unknown")
    & (all_legs.wastedTime > 0)
    & (all_legs.wastedTime < 6)
].copy()

# Round the score to the nearest whole number. Rounding does not cast the
# column to an integer dtype.
all_legs["wastedTime"] = all_legs["wastedTime"].round()

# country - assign 'CHE' to the class Other (AAA)
all_legs["onCampaigns"] = all_legs["onCampaigns"].apply(
    lambda x: "AAA" if x == "CHE" else x
)
top10 = list(all_legs.onCampaigns.unique())


#### values_from_trip ####
values_from_trip = pd.read_pickle(input_path + "values_from_trip.pkl")


# add info: attach the leg-level attributes to every value row of that leg
leg_info_columns = [
    "legid",
    "wastedTime",
    "userid",
    "gender",
    "onCampaigns",
    "age",
    "transp_category",
]
values_from_trip = values_from_trip.merge(
    all_legs[leg_info_columns], on="legid"
).drop_duplicates()

values_from_trip.head()

### Model 1. wt ~ E + P + F

In [None]:
# One row per leg and one column per value type (exact repeats dropped first)
tmp = values_from_trip[["legid", "value", "valueFromTrip"]].drop_duplicates()
values_from_trip_pivot = pd.pivot(
    data=tmp, index="legid", columns="valueFromTrip", values="value"
).reset_index()
# add transport category and userid
values_from_trip_pivot = values_from_trip_pivot.merge(
    all_legs[["legid", "userid", "transp_category", "wastedTime"]], on="legid"
).drop_duplicates()
# Merge Paid_work and Personal_tasks into Productivity taking the **maximum** value
values_from_trip_pivot["Productivity"] = values_from_trip_pivot[
    ["Paid_work", "Personal_tasks"]
].max(axis=1)

## select columns (Paid_work and Personal_tasks are discarded by this selection)
values_from_trip_pivot = values_from_trip_pivot[
    ["Enjoyment", "Productivity", "Fitness", "wastedTime"]
]  # , 'transp_category']]

# remove legs with a missing value in any of E, P, F.
# The integer cast below raises on NaN, so a leg with even one missing value
# cannot be kept; whenever that cast succeeded on the previous "all three
# missing" filter, this filter selects exactly the same rows.
epf_columns = ["Enjoyment", "Productivity", "Fitness"]
values_from_trip_pivot = values_from_trip_pivot.dropna(subset=epf_columns)
# remove legs with null tc
# values_from_trip_pivot = values_from_trip_pivot[~ values_from_trip_pivot.transp_category.isnull()]

# convert E P F into int values.
# The builtin `int` replaces `np.int`, an alias of it that was removed in
# NumPy 1.24; `astype` returns a new frame, so no filtered slice is assigned to.
values_from_trip_pivot = values_from_trip_pivot.astype(
    {column: int for column in epf_columns}
)

values_from_trip_pivot.head()

In [None]:
# values_from_trip_pivot.to_csv('values_from_trip_pivot.csv', index=False)

In [None]:
# wastedTime distribution of the legs whose E, P and F are all equal to zero
no_value_mask = (
    values_from_trip_pivot[["Enjoyment", "Productivity", "Fitness"]] == 0
).all(axis=1)
values_from_trip_pivot[no_value_mask].groupby("wastedTime").size().reset_index(
    name="nlegs"
)

In [None]:
# Number of legs in each wastedTime class
legs_per_class = values_from_trip_pivot.groupby("wastedTime").size()
legs_per_class.reset_index(name="count")

**Train - Test split**

In [None]:
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score

In [None]:
# Seeded split: a fraction `th` of the legs is sampled for training and the
# remaining legs form the test set
random.seed(123)

th = 0.8
nlegs_train = np.int64(len(values_from_trip_pivot) * th)
nlegs_train_lst = random.sample(list(values_from_trip_pivot.index), nlegs_train)

# True for the rows whose index label was drawn into the training sample
in_train = values_from_trip_pivot.index.isin(nlegs_train_lst)
train_df = values_from_trip_pivot[in_train]
test_df = values_from_trip_pivot[~in_train]

In [None]:
# Feature matrix (every column except wastedTime) and target of the training set
X = train_df.drop(columns="wastedTime")
y = train_df["wastedTime"]

In [None]:
from catboost.utils import create_cd

# Map (column position - 1) -> column name for every column of train_df except
# the first one
feature_names = {
    position - 1: column_name
    for position, column_name in enumerate(train_df)
    if position != 0
}

# NOTE(review): label=0 declares the first column as the target, but the
# columns selected for train_df put "wastedTime" last — confirm that this
# column description is what is intended.
create_cd(
    label=0,
    cat_features=list(range(1, len(train_df.columns))),
    feature_names=feature_names,
    # output_path=os.path.join(dataset_dir, 'train.cd')
)

In [None]:
!cat train.cd

In [None]:
# Positions of the categorical columns of X (Enjoyment, Productivity, Fitness).
# Defined here because this is the first cell that needs `cat_features`; it is
# otherwise only assigned much further down the notebook, so running the cells
# in order raised a NameError.
cat_features = [0, 1, 2]
pool1 = Pool(data=X, label=y, cat_features=cat_features)

In [None]:
### Train and Validation set
from sklearn.model_selection import train_test_split

th = 0.8
# Seeded split of the training data into a fitting part (fraction `th`) and a
# validation part
split_parts = train_test_split(X, y, train_size=th, random_state=42)
X_train, X_validation, y_train, y_validation = split_parts

# The test set is the held-out frame as is (it still contains wastedTime)
X_test = test_df

In [None]:
# MultiClass classifier that additionally reports Accuracy
model_params = {
    "iterations": 1000,
    "loss_function": "MultiClass",
    # "learning_rate": 0.1,
    "custom_loss": "Accuracy",
}
model = CatBoostClassifier(**model_params)

# Fit on the training part and evaluate on the validation part, logging every
# 100 iterations
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    verbose=100,
    plot=True,
)
print(f"Model is fitted: {model.is_fitted()}")
print("Model params:")
print(model.get_params())

In [None]:
## OVERFITTING: when the test error increases over the iterations
# and the optimum is reached within the first iterations

In [None]:
## the model automatically cuts off after the overfitting point
print(f"Tree count: {model.tree_count_}")

In [None]:
# Predicted classes and class probabilities for the test legs
predictions, predictions_probs = model.predict(X_test), model.predict_proba(X_test)
# Preview the first ten rows of each
for preview in (predictions, predictions_probs):
    print(preview[:10])

In [None]:
# How many test legs were assigned to each predicted class
unique, counts = np.unique(predictions, return_counts=True)
{label: count for label, count in zip(unique, counts)}

**CROSS VALIDATION**

In [None]:
from catboost import cv

# Training parameters used for every fold
params = {
    "loss_function": "MultiClass",
    "iterations": 80,
    "custom_loss": "Accuracy",
    "random_seed": 63,
    "learning_rate": 0.5,
}

# 5-fold cross-validation on the whole training set, shuffled with a fixed seed
cv_pool = Pool(X, label=y, cat_features=cat_features)
cv_data = cv(
    params=params,
    pool=cv_pool,
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    # stratified=False,
    verbose=False,
)

In [None]:
# Lowest mean test loss reported by the cross-validation and the position at
# which it is reached
best_value = np.min(cv_data["test-MultiClass-mean"])
best_iter = np.argmin(cv_data["test-MultiClass-mean"])

# The separator between mean and standard deviation is the plus-minus sign; it
# was previously stored as a mis-decoded two-character sequence.
print(
    "Best validation Logloss score, not stratified: {:.4f}±{:.4f} on step {}".format(
        best_value, cv_data["test-MultiClass-std"][best_iter], best_iter
    )
)

**Overfitting detector**

In [None]:
# Classifier with the overfitting detector enabled
early_stop_params = {
    "iterations": 200,
    "random_seed": 63,
    "learning_rate": 0.5,
    # stop when there is no improvement after 20 iterations
    "early_stopping_rounds": 20,
}
model_with_early_stop = CatBoostClassifier(**early_stop_params)
model_with_early_stop.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    verbose=False,
    plot=True,
)

### Multiclass


For multiclass problems with many classes it is sometimes better to solve the classification problem using ranking. To do that we build a dataset with groups. Every group represents one object from the initial dataset, with one additional categorical feature: a candidate class value. The target is 1 if the candidate class value equals the correct class, and 0 otherwise. Thus each group has exactly one 1 among its labels and zeros elsewhere. You can put all possible class values in each group, or only hard negatives if there are too many labels. The helper below defaults to binary labels; in this notebook it is applied to the five wasted-time classes (1–5).

In [None]:
from copy import deepcopy


def build_multiclass_ranking_dataset(
    X, y, cat_features, label_values=(0, 1), start_group_id=0
):
    """Expand a classification dataset into a grouped ranking ``Pool``.

    Every row of ``X`` becomes one group holding one copy of the row per
    candidate label. The candidate label is appended as an extra feature and
    the target is 1.0 for the copy whose candidate label equals the true label
    and 0.0 for the others.

    Args:
        X: feature table exposing ``.values`` and ``.shape``.
        y: true labels exposing ``.values``, in the same row order as ``X``.
        cat_features: positions of the categorical columns of ``X``. The
            sequence is copied, never modified.
        label_values: candidate labels added to every group. The default is a
            tuple so that no mutable object is shared between calls.
        start_group_id: group id of the first row; row ``i`` is assigned
            ``start_group_id + i``.

    Returns:
        A ``Pool`` built from the expanded rows, their 0/1 targets, the
        extended categorical positions and the group ids.
    """
    ranking_matrix = []
    ranking_labels = []
    group_ids = []

    rows_with_targets = zip(X.values, y.values)
    for group_id, (row, true_label) in enumerate(
        rows_with_targets, start=start_group_id
    ):
        features = list(row)
        for label in label_values:
            # `features + [label]` builds a fresh list for every candidate
            ranking_matrix.append(features + [label])
            ranking_labels.append(float(true_label == label))
            group_ids.append(group_id)

    # The appended candidate-label column sits right after the original
    # columns and is declared categorical as well.
    final_cat_features = list(cat_features) + [X.shape[1]]
    return Pool(
        ranking_matrix,
        ranking_labels,
        cat_features=final_cat_features,
        group_id=group_ids,
    )

In [None]:
# Candidate classes offered inside every group
class_labels = [1, 2, 3, 4, 5]

groupwise_train_pool = build_multiclass_ranking_dataset(
    X_train, y_train, cat_features, class_labels
)
# Group ids of the validation pool start at the number of training rows
groupwise_eval_pool = build_multiclass_ranking_dataset(
    X_validation,
    y_validation,
    cat_features,
    class_labels,
    start_group_id=X_train.shape[0],
)

In [None]:
# The generic CatBoost model class is imported here: the import cells of the
# notebook only bring in CatBoostClassifier, Pool and cv, so the bare name
# `CatBoost` was undefined.
from catboost import CatBoost

# Parameters of the groupwise model
params = {"iterations": 100, "learning_rate": 0.01, "loss_function": "QuerySoftMax"}

model = CatBoost(params)
model.fit(
    X=groupwise_train_pool, verbose=False, eval_set=groupwise_eval_pool, plot=True
)

In [None]:
import math

# Raw model output for the first validation object paired with each candidate
# class (the class is appended as the last feature)
obj = list(X_validation.values[0])
ratings = [model.predict([obj + [label]])[0] for label in [1, 2, 3, 4, 5]]
print("Raw values:", np.array(ratings))


def soft_max(values):
    """Return the softmax of ``values`` as a list of probabilities.

    Args:
        values: iterable of real-valued scores.

    Returns:
        A list with one probability per score, in the same order. An empty
        input yields an empty list.
    """
    scores = list(values)
    if not scores:
        return []
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps every exponent <= 0, so math.exp cannot overflow on large scores.
    peak = max(scores)
    exps = [math.exp(score - peak) for score in scores]
    # The normalising constant is computed once instead of once per element.
    total = sum(exps)
    return [value / total for value in exps]


# Softmax of the raw ratings
probabilities = np.array(soft_max(ratings))
print("Probabilities", probabilities)

### Cleaned dataset

Remove from the data all the legs with E = P = F = 0 and wt ≥ 3 (i.e. wt = 3, 4 or 5)

In [None]:
# Legs with E = P = F = 0 and a wastedTime of 3 or more are filtered out
zero_value_high_wt = (
    (values_from_trip_pivot.Enjoyment == 0)
    & (values_from_trip_pivot.Fitness == 0)
    & (values_from_trip_pivot.Productivity == 0)
    & (values_from_trip_pivot.wastedTime >= 3)
)
cleaned_df = values_from_trip_pivot[~zero_value_high_wt]

# Remaining legs per wastedTime class
cleaned_df.groupby("wastedTime").size().reset_index(name="count")

**save each TC df**

In [None]:
# One row per leg and one column per value type, ignoring the "Unknown" type
tmp = values_from_trip[["legid", "value", "valueFromTrip"]].drop_duplicates()
tmp = tmp[tmp.valueFromTrip != "Unknown"]
values_from_trip_pivot = pd.pivot(
    data=tmp, index="legid", columns="valueFromTrip", values="value"
).reset_index()
# add transport category and userid
values_from_trip_pivot = values_from_trip_pivot.merge(
    all_legs[["legid", "userid", "transp_category", "wastedTime"]], on="legid"
).drop_duplicates()
# Keep only the legs with a transport category. `.copy()` detaches the result,
# so the column assignment below does not trigger SettingWithCopyWarning.
values_from_trip_pivot = values_from_trip_pivot.dropna(
    subset=["transp_category"]
).copy()
# Merge Paid_work and Personal_tasks into Productivity taking the **maximum** value
values_from_trip_pivot["Productivity"] = values_from_trip_pivot[
    ["Paid_work", "Personal_tasks"]
].max(axis=1)

# select columns (Paid_work and Personal_tasks are discarded by this selection)
# NOTE(review): unlike the Model 1 preparation, E/P/F are neither cast to int
# nor filtered for missing values here — confirm this is intended.
values_from_trip_pivot = values_from_trip_pivot[
    ["Enjoyment", "Productivity", "Fitness", "wastedTime", "transp_category"]
]

# Legs with E = P = F = 0 and a wastedTime of 3 or more are filtered out
zero_value_high_wt = (
    (values_from_trip_pivot.Enjoyment == 0)
    & (values_from_trip_pivot.Fitness == 0)
    & (values_from_trip_pivot.Productivity == 0)
    & (values_from_trip_pivot.wastedTime >= 3)
)
cleaned_df = values_from_trip_pivot[~zero_value_high_wt]

# Make sure the output folder exists: to_csv does not create missing folders
tc_out_dir = out_path + "OLR_results/"
os.makedirs(tc_out_dir, exist_ok=True)

# One CSV per transport category, without the transp_category column itself
for tc in cleaned_df.transp_category.unique():
    print(tc)

    tc_df = cleaned_df[cleaned_df.transp_category == tc]
    # transp_category is the last selected column, so this drops exactly it
    tc_df = tc_df.iloc[:, :-1]
    # save
    tc_df.to_csv(tc_out_dir + tc + ".csv", index=False)

In [None]:
# Seeded split of the cleaned legs: a fraction `th` is sampled for training and
# the remaining legs form the test set
random.seed(123)

th = 0.8
nlegs_train = np.int64(len(cleaned_df) * th)
nlegs_train_lst = random.sample(list(cleaned_df.index), nlegs_train)

# True for the rows whose index label was drawn into the training sample
in_train = cleaned_df.index.isin(nlegs_train_lst)
train_df = cleaned_df[in_train]
test_df = cleaned_df[~in_train]

In [None]:
# Feature matrix (every column except wastedTime) and target of the training set
X = train_df.drop(columns="wastedTime")
y = train_df["wastedTime"]

In [None]:
from catboost.utils import create_cd

# Map (column position - 1) -> column name for every column of train_df except
# the first one
feature_names = {
    position - 1: column_name
    for position, column_name in enumerate(train_df)
    if position != 0
}

# NOTE(review): label=0 declares the first column as the target, but
# "wastedTime" is not the first column selected for cleaned_df — confirm that
# this column description is what is intended.
create_cd(
    label=0,
    cat_features=list(range(1, len(train_df.columns))),
    feature_names=feature_names,
    # output_path=os.path.join(dataset_dir, 'train.cd')
)

!cat train.cd

In [None]:
# Positions of the first three feature columns, treated as categorical.
# NOTE(review): X built from cleaned_df also carries transp_category (position
# 3), which is not listed here — confirm whether it should be.
cat_features = list(range(3))

In [None]:
# Training pool for the cleaned dataset
pool1 = Pool(X, label=y, cat_features=cat_features)

In [None]:
### Train and Validation set
from sklearn.model_selection import train_test_split

th = 0.8
# Seeded split of the training data into a fitting part (fraction `th`) and a
# validation part
split_parts = train_test_split(X, y, train_size=th, random_state=42)
X_train, X_validation, y_train, y_validation = split_parts

# The test set is the held-out frame as is (it still contains wastedTime)
X_test = test_df

In [None]:
# MultiClass classifier that additionally reports AUC
model_params = {
    "iterations": 1000,
    "loss_function": "MultiClass",
    # "learning_rate": 0.1,
    "custom_loss": "AUC",
}
model = CatBoostClassifier(**model_params)

# Fit on the training part and evaluate on the validation part, logging every
# 100 iterations
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    verbose=100,
    plot=True,
)
print(f"Model is fitted: {model.is_fitted()}")
print("Model params:")
print(model.get_params())

In [None]:
# Predicted classes and class probabilities for the test legs
predictions, predictions_probs = model.predict(X_test), model.predict_proba(X_test)
# Preview the first ten rows of each
for preview in (predictions, predictions_probs):
    print(preview[:10])

In [None]:
# How many test legs were assigned to each predicted class
unique, counts = np.unique(predictions, return_counts=True)
{label: count for label, count in zip(unique, counts)}

In [None]:
# Observed number of test legs in each wastedTime class
observed_per_class = X_test.groupby("wastedTime").size()
observed_per_class.reset_index()