# Случайные леса
__Суммарное количество баллов: 10__

__Решение отправлять на `ml.course.practice@gmail.com`__

__Тема письма: `[HSE][ML][HW09] <ФИ>`, где вместо `<ФИ>` указаны фамилия и имя__

В этом задании вам предстоит реализовать ансамбль деревьев решений, известный как случайный лес, применить его к публичным данным пользователей социальной сети Вконтакте, и сравнить его эффективность с ансамблем, предоставляемым библиотекой CatBoost.

В результате мы сможем определить, какие подписки пользователей больше всего влияют на определение возраста и пола человека. 

In [24]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas
import random
import matplotlib.pyplot as plt
import matplotlib
from math import sqrt
import copy
from typing import Callable, Union, NoReturn, Optional, Dict, Any, List
from catboost import CatBoostClassifier
from scipy import stats


In [25]:
def gini(x):
    _, counts = np.unique(x, return_counts=True)
    proba = counts / len(x)
    return np.sum(proba * (1 - proba))
    
def entropy(x):
    _, counts = np.unique(x, return_counts=True)
    proba = counts / len(x)
    return -np.sum(proba * np.log2(proba))

def gain(left_y, right_y, criterion):
    y = np.concatenate((left_y, right_y))
    return criterion(y) - (criterion(left_y) * len(left_y) + criterion(right_y) * len(right_y)) / len(y)

### Задание 1 (2 балла)
Random Forest состоит из деревьев решений. Каждое такое дерево строится на одной из выборок, полученных при помощи bagging. Элементы, которые не вошли в новую обучающую выборку, образуют out-of-bag выборку. Кроме того, в каждом узле дерева мы случайным образом выбираем набор из `max_features` и ищем признак для предиката разбиения только в этом наборе.

Сегодня мы будем работать только с бинарными признаками, поэтому нет необходимости выбирать значение признака для разбиения.

#### Методы
`predict(X)` - возвращает предсказанные метки для элементов выборки `X`

#### Параметры конструктора
`X, y` - обучающая выборка и соответствующие ей метки классов. Из нее нужно получить выборку для построения дерева при помощи bagging. Out-of-bag выборку нужно запомнить, она понадобится потом.

`criterion="gini"` - задает критерий, который будет использоваться при построении дерева. Возможные значения: `"gini"`, `"entropy"`.

`max_depth=None` - ограничение глубины дерева. Если `None` - глубина не ограничена

`min_samples_leaf=1` - минимальное количество элементов в каждом листе дерева.

`max_features="auto"` - количество признаков, которые могут использоваться в узле. Если `"auto"` - равно `sqrt(X.shape[1])`

In [26]:
class DecisionTreeLeaf:
    """

    Attributes
    ----------
    y : Тип метки (напр., int или str)
        Метка класса, который встречается чаще всего среди элементов листа дерева
    """
    def __init__(self, y):
        values, counts = np.unique(y, return_counts=True)
        counts = counts / len(y)
        tmp = list(zip(values, counts))
        self.mapa = dict(tmp)
        self.y = max(tmp, key=lambda x: x[1])[0]


class DecisionTreeNode:
    """

    Attributes
    ----------
    split_dim : int
        Измерение, по которому разбиваем выборку.
    split_value : float
        Значение, по которому разбираем выборку.
    left : Union[DecisionTreeNode, DecisionTreeLeaf]
        Поддерево, отвечающее за случай x[split_dim] < split_value.
    right : Union[DecisionTreeNode, DecisionTreeLeaf]
        Поддерево, отвечающее за случай x[split_dim] >= split_value. 
    """
    def __init__(self, split_dim: int, split_value: float, 
                 left: Union['DecisionTreeNode', DecisionTreeLeaf], 
                 right: Union['DecisionTreeNode', DecisionTreeLeaf]):
        self.split_dim = split_dim
        self.split_value = split_value
        self.left = left
        self.right = right

In [59]:
class DecisionTree:        
    def __init__(self, X, y, criterion="gini", max_depth=None, min_samples_leaf=1, max_features="auto"):
        self.max_features = max_features
        self.min_samples_leaf = min_samples_leaf
        
        if max_features == "auto":
            self.max_features = int(sqrt(X.shape[1])) + 1
            
        self.root = None
        
        if criterion == "gini":
            self.criterion = gini
        else:
            self.criterion = entropy
            
        self.max_depth = len(X)
        self.min_samples_leaf = min_samples_leaf
        
        ind = random.choices(range(len(X)), k=len(X))
        out_of_bag_ind = []
                
        for i in range(len(X)):
            if i not in ind:
                out_of_bag_ind.append(i)
                
        self.out_of_bag  = ([X[i] for i in ind], [y[i] for i in ind])  
        
        self.fit(np.array([X[i] for i in ind]), np.array([y[i] for i in ind]))
        
        
    def build_tree(self, X: np.ndarray, y: np.ndarray, depth):
        if depth >= self.max_depth:
            return DecisionTreeLeaf(y)

        n, m = X.shape
        split_value = None
        split_dim = None
        split_gain = None
        
        
        for dim in random.sample(range(m), self.max_features):
            for value in np.unique(X[:, dim]):
                mask = X[:, dim] < value
                left = y[mask]
                right = y[~mask]
                if len(left) < self.min_samples_leaf or len(right) < self.min_samples_leaf:
                    continue
                g = gain(left, right, self.criterion)
                if split_gain == None or g > split_gain:
                    split_value = value
                    split_dim = dim
                    split_gain = g    
    
        if split_dim == None:
            return DecisionTreeLeaf(y)

        mask = X[:, split_dim] < split_value
        left = self.build_tree(X[mask], y[mask], depth + 1)
        right = self.build_tree(X[~mask], y[~mask], depth + 1)
        
        return DecisionTreeNode(split_dim, split_value, left, right)
    
    def fit(self, X: np.ndarray, y: np.ndarray) -> NoReturn:
        """
        Строит дерево решений по обучающей выборке.

        Parameters
        ----------
        X : np.ndarray
            Обучающая выборка.
        y : np.ndarray
            Вектор меток классов.
        """
        values = np.unique(y)
        
        self.mapa = np.unique(y)
        new_y = np.zeros(len(y))
        for i in range(len(values)):
            for j in range(len(y)):
                if y[j] == values[i]:
                    new_y[j] = i  
        y = new_y
            
        self.root = self.build_tree(X, y, 0)
    
    def walk_down(self, node: Union[DecisionTreeNode, DecisionTreeLeaf], x):
        if type(node) is DecisionTreeLeaf:
            return node.mapa
        if x[node.split_dim] < node.split_value:
            return self.walk_down(node.left, x)
        else:
            return self.walk_down(node.right, x)
        
    def predict_proba(self, X: np.ndarray) ->  List[Dict[Any, float]]:
        """
        Предсказывает вероятность классов для элементов из X.

        Parameters
        ----------
        X : np.ndarray
            Элементы для предсказания.
        
        Return
        ------
        List[Dict[Any, float]]
            Для каждого элемента из X возвращает словарь 
            {метка класса -> вероятность класса}.
        """
        return [self.walk_down(self.root, x) for x in X]
    
    def predict(self, X : np.ndarray) -> list:
        """
        Предсказывает классы для элементов X.

        Parameters
        ----------
        X : np.ndarray
            Элементы для предсказания.
        
        Return
        ------
        list
            Вектор предсказанных меток для элементов X.
        """
        proba = self.predict_proba(X)
        tmp = [max(p.keys(), key=lambda k: p[k]) for p in proba]
        return [self.mapa[int(i)] for i in tmp]

### Задание 2 (2 балла)
Теперь реализуем сам Random Forest. Идея очень простая: строим `n` деревьев, а затем берем модальное предсказание.

#### Параметры конструктора
`n_estimators` - количество используемых для предсказания деревьев.

Остальное - параметры деревьев.

#### Методы
`fit(X, y)` - строит `n_estimators` деревьев по выборке `X`.

`predict(X)` - для каждого элемента выборки `X` возвращает самый частый класс, который предсказывают для него деревья.

In [60]:
class RandomForestClassifier:
    def __init__(self, criterion="gini", max_depth=None, min_samples_leaf=1, max_features="auto", n_estimators=10):
        self.criterion = "gini"
        self.max_depth = max_depth 
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.n_estimators = n_estimators
        
    def fit(self, X, y):
        self.trees = []
        for i in range(self.n_estimators):
            self.trees.append(DecisionTree(X, y, self.criterion, self.max_depth, self.min_samples_leaf, self.max_features))
    
    def predict(self, X):
        arr = []
        for tree in self.trees:
            arr.append(tree.predict(X))
            
        return stats.mode(arr).mode[0]

        
        

### Задание 3 (2 балла)
Часто хочется понимать, насколько большую роль играет тот или иной признак для предсказания класса объекта. Есть различные способы посчитать его важность. Один из простых способов сделать это для Random Forest - посчитать out-of-bag ошибку предсказания `err_oob`, а затем перемешать значения признака `j` и посчитать ее (`err_oob_j`) еще раз. Оценкой важности признака `j` для одного дерева будет разность `err_oob_j - err_oob`, важность для всего леса считается как среднее значение важности по деревьям.

Реализуйте функцию `feature_importance`, которая принимает на вход Random Forest и возвращает массив, в котором содержится важность для каждого признака.

In [61]:
def feature_importance(rfc):
    
    raise NotImplementedError()

def most_important_features(importance, names, k=20):
    # Выводит названия k самых важных признаков
    idicies = np.argsort(importance)[::-1][:k]
    return np.array(names)[idicies]

Наконец, пришло время протестировать наше дерево на простом синтетическом наборе данных. В результате точность должна быть примерно равна `1.0`, наибольшее значение важности должно быть у признака с индексом `4`, признаки с индексами `2` и `3`  должны быть одинаково важны, а остальные признаки - не важны совсем.

In [62]:
def synthetic_dataset(size):
    X = [(np.random.randint(0, 2), np.random.randint(0, 2), i % 6 == 3, 
          i % 6 == 0, i % 3 == 2, np.random.randint(0, 2)) for i in range(size)]
    y = [i % 3 for i in range(size)]
    return np.array(X), np.array(y)

X, y = synthetic_dataset(1000)
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X, y)
print("Accuracy:", np.mean(rfc.predict(X) == y))
# print("Importance:", feature_importance(rfc))

Accuracy: 1.0


### Задание 4 (1 балл)
Теперь поработаем с реальными данными.

Выборка состоит из публичных анонимизированных данных пользователей социальной сети Вконтакте. Первые два столбца отражают возрастную группу (`zoomer`, `doomer` и `boomer`) и пол (`female`, `male`). Все остальные столбцы являются бинарными признаками, каждый из них определяет, подписан ли пользователь на определенную группу/публичную страницу или нет.\
\
Необходимо обучить два классификатора, один из которых определяет возрастную группу, а второй - пол.\
\
Эксперименты с множеством используемых признаков и подбор гиперпараметров приветствуются. Лес должен строиться за какое-то разумное время.

In [63]:
def read_dataset(path):
    dataframe = pandas.read_csv(path, header=0)
    dataset = dataframe.values.tolist()
    random.shuffle(dataset)
    y_age = [row[0] for row in dataset]
    y_sex = [row[1] for row in dataset]
    X = [row[2:] for row in dataset]
    
    return np.array(X), np.array(y_age), np.array(y_sex), list(dataframe.columns)[2:]

In [64]:
X, y_age, y_sex, features = read_dataset("vk.csv")
X_train, X_test, y_age_train, y_age_test, y_sex_train, y_sex_test = train_test_split(X, y_age, y_sex, train_size=0.9)

#### Возраст

In [65]:
rfc = RandomForestClassifier(n_estimators=10)

rfc.fit(X_train, y_age_train)
print("Accuracy:", np.mean(rfc.predict(X_test) == y_age_test))
print("Most important features:")
for i, name in enumerate(most_important_features(feature_importance(rfc), features, 20)):
    print(str(i+1) + ".", name)

Accuracy: 0.7213114754098361
Most important features:


NotImplementedError: 

#### Пол

In [66]:
rfc = RandomForestClassifier(n_estimators=10)
rfc.fit(X_train, y_sex_train)
print("Accuracy:", np.mean(rfc.predict(X_test) == y_sex_test))
print("Most important features:")
for i, name in enumerate(most_important_features(feature_importance(rfc), features, 20)):
    print(str(i+1) + ".", name)

Accuracy: 0.8486759142496847
Most important features:


NotImplementedError: 

### CatBoost
В качестве аьтернативы попробуем CatBoost. 

Устаниовить его можно просто с помощью `pip install catboost`. Туториалы можно найти, например, [здесь](https://catboost.ai/docs/concepts/python-usages-examples.html#multiclassification) и [здесь](https://github.com/catboost/tutorials/blob/master/python_tutorial.ipynb). Главное - не забудьте использовать `loss_function='MultiClass'`.\
\
Сначала протестируйте CatBoost на синтетических данных. Выведите точность и важность признаков.

In [96]:
X, y = synthetic_dataset(1000)
rfc = CatBoostClassifier(loss_function='MultiClass')
rfc.fit(X, y)
print("Accuracy:", np.mean(rfc.predict(X).reshape(-1) == y))
print("Importance:", None)

Learning rate set to 0.079127
0:	learn: 0.9564860	total: 907us	remaining: 907ms
1:	learn: 0.8385819	total: 1.61ms	remaining: 804ms
2:	learn: 0.7446587	total: 2.5ms	remaining: 829ms
3:	learn: 0.6654846	total: 3.29ms	remaining: 820ms
4:	learn: 0.5957678	total: 3.94ms	remaining: 785ms
5:	learn: 0.5357213	total: 4.61ms	remaining: 763ms
6:	learn: 0.4834957	total: 5.17ms	remaining: 734ms
7:	learn: 0.4390978	total: 6ms	remaining: 744ms
8:	learn: 0.3998020	total: 6.79ms	remaining: 747ms
9:	learn: 0.3636802	total: 7.29ms	remaining: 722ms
10:	learn: 0.3325702	total: 7.99ms	remaining: 718ms
11:	learn: 0.3036588	total: 8.5ms	remaining: 700ms
12:	learn: 0.2786521	total: 9.13ms	remaining: 694ms
13:	learn: 0.2551899	total: 9.72ms	remaining: 685ms
14:	learn: 0.2348596	total: 10.3ms	remaining: 680ms
15:	learn: 0.2164248	total: 11ms	remaining: 678ms
16:	learn: 0.2031045	total: 11.9ms	remaining: 689ms
17:	learn: 0.1868079	total: 12.4ms	remaining: 678ms
18:	learn: 0.1727065	total: 13.1ms	remaining: 675ms


319:	learn: 0.0011769	total: 185ms	remaining: 394ms
320:	learn: 0.0011736	total: 186ms	remaining: 394ms
321:	learn: 0.0011704	total: 187ms	remaining: 393ms
322:	learn: 0.0011671	total: 187ms	remaining: 393ms
323:	learn: 0.0011638	total: 188ms	remaining: 392ms
324:	learn: 0.0011606	total: 189ms	remaining: 392ms
325:	learn: 0.0011574	total: 189ms	remaining: 391ms
326:	learn: 0.0011542	total: 190ms	remaining: 391ms
327:	learn: 0.0011510	total: 190ms	remaining: 390ms
328:	learn: 0.0011479	total: 191ms	remaining: 389ms
329:	learn: 0.0011447	total: 192ms	remaining: 389ms
330:	learn: 0.0011416	total: 192ms	remaining: 388ms
331:	learn: 0.0011385	total: 193ms	remaining: 388ms
332:	learn: 0.0011354	total: 193ms	remaining: 387ms
333:	learn: 0.0011323	total: 194ms	remaining: 386ms
334:	learn: 0.0011292	total: 194ms	remaining: 386ms
335:	learn: 0.0011262	total: 195ms	remaining: 385ms
336:	learn: 0.0011232	total: 196ms	remaining: 385ms
337:	learn: 0.0011202	total: 196ms	remaining: 384ms
338:	learn: 

625:	learn: 0.0006298	total: 373ms	remaining: 223ms
626:	learn: 0.0006288	total: 374ms	remaining: 222ms
627:	learn: 0.0006279	total: 374ms	remaining: 222ms
628:	learn: 0.0006269	total: 375ms	remaining: 221ms
629:	learn: 0.0006260	total: 376ms	remaining: 221ms
630:	learn: 0.0006250	total: 376ms	remaining: 220ms
631:	learn: 0.0006241	total: 377ms	remaining: 219ms
632:	learn: 0.0006231	total: 377ms	remaining: 219ms
633:	learn: 0.0006222	total: 378ms	remaining: 218ms
634:	learn: 0.0006212	total: 379ms	remaining: 218ms
635:	learn: 0.0006203	total: 379ms	remaining: 217ms
636:	learn: 0.0006194	total: 380ms	remaining: 216ms
637:	learn: 0.0006184	total: 380ms	remaining: 216ms
638:	learn: 0.0006175	total: 381ms	remaining: 215ms
639:	learn: 0.0006166	total: 381ms	remaining: 215ms
640:	learn: 0.0006157	total: 382ms	remaining: 214ms
641:	learn: 0.0006147	total: 383ms	remaining: 213ms
642:	learn: 0.0006138	total: 383ms	remaining: 213ms
643:	learn: 0.0006129	total: 384ms	remaining: 212ms
644:	learn: 

933:	learn: 0.0004278	total: 560ms	remaining: 39.6ms
934:	learn: 0.0004273	total: 561ms	remaining: 39ms
935:	learn: 0.0004269	total: 562ms	remaining: 38.4ms
936:	learn: 0.0004264	total: 562ms	remaining: 37.8ms
937:	learn: 0.0004260	total: 563ms	remaining: 37.2ms
938:	learn: 0.0004255	total: 564ms	remaining: 36.6ms
939:	learn: 0.0004251	total: 564ms	remaining: 36ms
940:	learn: 0.0004246	total: 565ms	remaining: 35.4ms
941:	learn: 0.0004242	total: 565ms	remaining: 34.8ms
942:	learn: 0.0004238	total: 566ms	remaining: 34.2ms
943:	learn: 0.0004233	total: 566ms	remaining: 33.6ms
944:	learn: 0.0004229	total: 567ms	remaining: 33ms
945:	learn: 0.0004225	total: 568ms	remaining: 32.4ms
946:	learn: 0.0004220	total: 568ms	remaining: 31.8ms
947:	learn: 0.0004216	total: 569ms	remaining: 31.2ms
948:	learn: 0.0004212	total: 569ms	remaining: 30.6ms
949:	learn: 0.0004207	total: 570ms	remaining: 30ms
950:	learn: 0.0004203	total: 570ms	remaining: 29.4ms
951:	learn: 0.0004199	total: 571ms	remaining: 28.8ms
9

### Задание 5 (3 балла)
Попробуем применить один из используемых на практике алгоритмов. В этом нам поможет CatBoost. Также, как и реализованный ними RandomForest, применим его для определения пола и возраста пользователей сети Вконтакте, выведите названия наиболее важных признаков так же, как в задании 3.\
\
Эксперименты с множеством используемых признаков и подбор гиперпараметров приветствуются.

In [69]:
X, y_age, y_sex, features = read_dataset("vk.csv")
X_train, X_test, y_age_train, y_age_test, y_sex_train, y_sex_test = train_test_split(X, y_age, y_sex, train_size=0.9)
X_train, X_eval, y_age_train, y_age_eval, y_sex_train, y_sex_eval = train_test_split(X_train, y_age_train, y_sex_train, train_size=0.8)

#### Возраст

In [98]:
rfc = CatBoostClassifier(loss_function='MultiClass')
rfc.fit(X_train, y_age_train)
print("Accuracy:", np.mean(rfc.predict(X_test).reshape(-1) == y_age_test))
print("Most important features:")
for i, name in enumerate(most_important_features(None, features, 10)):
    print(str(i+1) + ".", name)

Learning rate set to 0.086475
0:	learn: 1.0689176	total: 7.04ms	remaining: 7.03s
1:	learn: 1.0444473	total: 13.9ms	remaining: 6.96s
2:	learn: 1.0198652	total: 21.1ms	remaining: 7.01s
3:	learn: 1.0005432	total: 28.7ms	remaining: 7.15s
4:	learn: 0.9814270	total: 35.4ms	remaining: 7.04s
5:	learn: 0.9660803	total: 42.5ms	remaining: 7.04s
6:	learn: 0.9510011	total: 49.2ms	remaining: 6.99s
7:	learn: 0.9366228	total: 56.5ms	remaining: 7s
8:	learn: 0.9239072	total: 63.1ms	remaining: 6.95s
9:	learn: 0.9131731	total: 70.5ms	remaining: 6.98s
10:	learn: 0.9007739	total: 77.3ms	remaining: 6.95s
11:	learn: 0.8900916	total: 84.3ms	remaining: 6.94s
12:	learn: 0.8798922	total: 91.2ms	remaining: 6.92s
13:	learn: 0.8714077	total: 98.2ms	remaining: 6.91s
14:	learn: 0.8612349	total: 105ms	remaining: 6.89s
15:	learn: 0.8528636	total: 112ms	remaining: 6.86s
16:	learn: 0.8455905	total: 118ms	remaining: 6.85s
17:	learn: 0.8380250	total: 125ms	remaining: 6.83s
18:	learn: 0.8310194	total: 132ms	remaining: 6.82s


171:	learn: 0.5508598	total: 1.21s	remaining: 5.8s
172:	learn: 0.5503860	total: 1.21s	remaining: 5.8s
173:	learn: 0.5497792	total: 1.22s	remaining: 5.79s
174:	learn: 0.5488290	total: 1.23s	remaining: 5.79s
175:	learn: 0.5481344	total: 1.24s	remaining: 5.79s
176:	learn: 0.5475500	total: 1.24s	remaining: 5.78s
177:	learn: 0.5468516	total: 1.25s	remaining: 5.77s
178:	learn: 0.5460425	total: 1.26s	remaining: 5.76s
179:	learn: 0.5454894	total: 1.26s	remaining: 5.75s
180:	learn: 0.5449918	total: 1.27s	remaining: 5.75s
181:	learn: 0.5441282	total: 1.28s	remaining: 5.74s
182:	learn: 0.5434847	total: 1.28s	remaining: 5.73s
183:	learn: 0.5428337	total: 1.29s	remaining: 5.72s
184:	learn: 0.5419410	total: 1.3s	remaining: 5.71s
185:	learn: 0.5413464	total: 1.3s	remaining: 5.71s
186:	learn: 0.5409031	total: 1.31s	remaining: 5.7s
187:	learn: 0.5401080	total: 1.32s	remaining: 5.69s
188:	learn: 0.5395684	total: 1.32s	remaining: 5.69s
189:	learn: 0.5390730	total: 1.33s	remaining: 5.68s
190:	learn: 0.538

340:	learn: 0.4673711	total: 2.41s	remaining: 4.66s
341:	learn: 0.4670690	total: 2.42s	remaining: 4.65s
342:	learn: 0.4667221	total: 2.43s	remaining: 4.65s
343:	learn: 0.4661824	total: 2.44s	remaining: 4.64s
344:	learn: 0.4657887	total: 2.44s	remaining: 4.64s
345:	learn: 0.4653995	total: 2.45s	remaining: 4.63s
346:	learn: 0.4651385	total: 2.46s	remaining: 4.62s
347:	learn: 0.4648078	total: 2.46s	remaining: 4.62s
348:	learn: 0.4645784	total: 2.47s	remaining: 4.61s
349:	learn: 0.4642543	total: 2.48s	remaining: 4.6s
350:	learn: 0.4639385	total: 2.48s	remaining: 4.6s
351:	learn: 0.4636281	total: 2.49s	remaining: 4.59s
352:	learn: 0.4630875	total: 2.5s	remaining: 4.58s
353:	learn: 0.4626843	total: 2.51s	remaining: 4.58s
354:	learn: 0.4623160	total: 2.52s	remaining: 4.57s
355:	learn: 0.4619780	total: 2.52s	remaining: 4.56s
356:	learn: 0.4616415	total: 2.53s	remaining: 4.56s
357:	learn: 0.4614127	total: 2.54s	remaining: 4.55s
358:	learn: 0.4611428	total: 2.54s	remaining: 4.54s
359:	learn: 0.4

507:	learn: 0.4143741	total: 3.62s	remaining: 3.5s
508:	learn: 0.4141210	total: 3.63s	remaining: 3.5s
509:	learn: 0.4139087	total: 3.63s	remaining: 3.49s
510:	learn: 0.4135620	total: 3.64s	remaining: 3.49s
511:	learn: 0.4132349	total: 3.65s	remaining: 3.48s
512:	learn: 0.4129759	total: 3.66s	remaining: 3.47s
513:	learn: 0.4126847	total: 3.66s	remaining: 3.46s
514:	learn: 0.4123112	total: 3.67s	remaining: 3.46s
515:	learn: 0.4121233	total: 3.68s	remaining: 3.45s
516:	learn: 0.4118996	total: 3.69s	remaining: 3.44s
517:	learn: 0.4116129	total: 3.69s	remaining: 3.44s
518:	learn: 0.4112578	total: 3.7s	remaining: 3.43s
519:	learn: 0.4110286	total: 3.71s	remaining: 3.42s
520:	learn: 0.4107989	total: 3.71s	remaining: 3.41s
521:	learn: 0.4104990	total: 3.72s	remaining: 3.41s
522:	learn: 0.4103328	total: 3.73s	remaining: 3.4s
523:	learn: 0.4101097	total: 3.73s	remaining: 3.39s
524:	learn: 0.4099287	total: 3.74s	remaining: 3.39s
525:	learn: 0.4096172	total: 3.75s	remaining: 3.38s
526:	learn: 0.40

675:	learn: 0.3739273	total: 4.82s	remaining: 2.31s
676:	learn: 0.3737606	total: 4.83s	remaining: 2.3s
677:	learn: 0.3734615	total: 4.84s	remaining: 2.3s
678:	learn: 0.3733325	total: 4.84s	remaining: 2.29s
679:	learn: 0.3731882	total: 4.85s	remaining: 2.28s
680:	learn: 0.3729108	total: 4.86s	remaining: 2.28s
681:	learn: 0.3727178	total: 4.87s	remaining: 2.27s
682:	learn: 0.3724543	total: 4.87s	remaining: 2.26s
683:	learn: 0.3723224	total: 4.88s	remaining: 2.25s
684:	learn: 0.3720094	total: 4.89s	remaining: 2.25s
685:	learn: 0.3717540	total: 4.89s	remaining: 2.24s
686:	learn: 0.3714587	total: 4.9s	remaining: 2.23s
687:	learn: 0.3713257	total: 4.91s	remaining: 2.23s
688:	learn: 0.3710764	total: 4.92s	remaining: 2.22s
689:	learn: 0.3708952	total: 4.92s	remaining: 2.21s
690:	learn: 0.3707792	total: 4.93s	remaining: 2.2s
691:	learn: 0.3706387	total: 4.94s	remaining: 2.2s
692:	learn: 0.3704126	total: 4.94s	remaining: 2.19s
693:	learn: 0.3701842	total: 4.95s	remaining: 2.18s
694:	learn: 0.370

843:	learn: 0.3408020	total: 6.03s	remaining: 1.11s
844:	learn: 0.3406415	total: 6.04s	remaining: 1.11s
845:	learn: 0.3404664	total: 6.04s	remaining: 1.1s
846:	learn: 0.3402947	total: 6.05s	remaining: 1.09s
847:	learn: 0.3401762	total: 6.06s	remaining: 1.09s
848:	learn: 0.3399659	total: 6.07s	remaining: 1.08s
849:	learn: 0.3397958	total: 6.08s	remaining: 1.07s
850:	learn: 0.3396555	total: 6.08s	remaining: 1.06s
851:	learn: 0.3394792	total: 6.09s	remaining: 1.06s
852:	learn: 0.3393173	total: 6.1s	remaining: 1.05s
853:	learn: 0.3391746	total: 6.11s	remaining: 1.04s
854:	learn: 0.3389726	total: 6.11s	remaining: 1.04s
855:	learn: 0.3387678	total: 6.12s	remaining: 1.03s
856:	learn: 0.3385691	total: 6.13s	remaining: 1.02s
857:	learn: 0.3383383	total: 6.14s	remaining: 1.01s
858:	learn: 0.3381876	total: 6.14s	remaining: 1.01s
859:	learn: 0.3379969	total: 6.15s	remaining: 1s
860:	learn: 0.3378506	total: 6.16s	remaining: 994ms
861:	learn: 0.3377249	total: 6.17s	remaining: 987ms
862:	learn: 0.337

#### Пол

In [102]:
rfc = CatBoostClassifier(loss_function='MultiClass')
rfc.fit(X_train, y_sex_train)
print("Accuracy:", np.mean(rfc.predict(X_test).reshape(-1) == y_sex_test))
print("Most important features:")
for i, name in enumerate(most_important_features(None, features, 10)):
    print(str(i+1) + ".", name)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.086475
0:	learn: 0.6725031	total: 5.42ms	remaining: 5.41s
1:	learn: 0.6555478	total: 11.2ms	remaining: 5.59s
2:	learn: 0.6419685	total: 17ms	remaining: 5.67s
3:	learn: 0.6270649	total: 22.5ms	remaining: 5.61s
4:	learn: 0.6140698	total: 27.9ms	remaining: 5.56s
5:	learn: 0.6043081	total: 33.5ms	remaining: 5.54s
6:	learn: 0.5931784	total: 38.8ms	remaining: 5.51s
7:	learn: 0.5833587	total: 44.4ms	remaining: 5.51s
8:	learn: 0.5726939	total: 49.5ms	remaining: 5.45s
9:	learn: 0.5649406	total: 55ms	remaining: 5.45s
10:	learn: 0.5572403	total: 60.2ms	remaining: 5.42s
11:	learn: 0.5498620	total: 65.9ms	remaining: 5.43s
12:	learn: 0.5416705	total: 71.1ms	remaining: 5.4s
13:	learn: 0.5349354	total: 77ms	remaining: 5.42s
14:	learn: 0.5279810	total: 81.9ms	remaining: 5.38s
15:	learn: 0.5221481	total: 87ms	remaining: 5.35s
16:	learn: 0.5165678	total: 91.8ms	remaining: 5.31s
17:	learn: 0.5102193	total: 96.7ms	remaining: 5.27s
18:	learn: 0.5051900	total: 102ms	remaining: 5.24s
19

187:	learn: 0.2991611	total: 999ms	remaining: 4.32s
188:	learn: 0.2988278	total: 1s	remaining: 4.31s
189:	learn: 0.2984650	total: 1.01s	remaining: 4.3s
190:	learn: 0.2979111	total: 1.01s	remaining: 4.3s
191:	learn: 0.2975521	total: 1.02s	remaining: 4.29s
192:	learn: 0.2971082	total: 1.03s	remaining: 4.29s
193:	learn: 0.2966356	total: 1.03s	remaining: 4.28s
194:	learn: 0.2963620	total: 1.04s	remaining: 4.28s
195:	learn: 0.2961229	total: 1.04s	remaining: 4.28s
196:	learn: 0.2957819	total: 1.05s	remaining: 4.27s
197:	learn: 0.2952834	total: 1.05s	remaining: 4.26s
198:	learn: 0.2949342	total: 1.06s	remaining: 4.25s
199:	learn: 0.2945697	total: 1.06s	remaining: 4.25s
200:	learn: 0.2940851	total: 1.07s	remaining: 4.24s
201:	learn: 0.2937684	total: 1.07s	remaining: 4.24s
202:	learn: 0.2933803	total: 1.08s	remaining: 4.23s
203:	learn: 0.2930451	total: 1.08s	remaining: 4.22s
204:	learn: 0.2927268	total: 1.09s	remaining: 4.22s
205:	learn: 0.2924315	total: 1.09s	remaining: 4.21s
206:	learn: 0.292

378:	learn: 0.2468094	total: 2s	remaining: 3.27s
379:	learn: 0.2465205	total: 2s	remaining: 3.27s
380:	learn: 0.2463500	total: 2.01s	remaining: 3.27s
381:	learn: 0.2461457	total: 2.02s	remaining: 3.26s
382:	learn: 0.2459835	total: 2.03s	remaining: 3.27s
383:	learn: 0.2458095	total: 2.04s	remaining: 3.27s
384:	learn: 0.2457049	total: 2.04s	remaining: 3.26s
385:	learn: 0.2454876	total: 2.05s	remaining: 3.26s
386:	learn: 0.2452218	total: 2.06s	remaining: 3.26s
387:	learn: 0.2450350	total: 2.06s	remaining: 3.25s
388:	learn: 0.2448486	total: 2.07s	remaining: 3.25s
389:	learn: 0.2447109	total: 2.07s	remaining: 3.24s
390:	learn: 0.2445281	total: 2.08s	remaining: 3.24s
391:	learn: 0.2443738	total: 2.08s	remaining: 3.23s
392:	learn: 0.2441511	total: 2.09s	remaining: 3.23s
393:	learn: 0.2439062	total: 2.1s	remaining: 3.22s
394:	learn: 0.2437161	total: 2.1s	remaining: 3.22s
395:	learn: 0.2435871	total: 2.11s	remaining: 3.21s
396:	learn: 0.2434440	total: 2.11s	remaining: 3.21s
397:	learn: 0.243263

547:	learn: 0.2184908	total: 3.01s	remaining: 2.48s
548:	learn: 0.2183835	total: 3.02s	remaining: 2.48s
549:	learn: 0.2182474	total: 3.02s	remaining: 2.47s
550:	learn: 0.2180770	total: 3.03s	remaining: 2.47s
551:	learn: 0.2179884	total: 3.03s	remaining: 2.46s
552:	learn: 0.2178958	total: 3.04s	remaining: 2.46s
553:	learn: 0.2177474	total: 3.04s	remaining: 2.45s
554:	learn: 0.2175561	total: 3.05s	remaining: 2.44s
555:	learn: 0.2174301	total: 3.05s	remaining: 2.44s
556:	learn: 0.2173401	total: 3.06s	remaining: 2.43s
557:	learn: 0.2171282	total: 3.06s	remaining: 2.43s
558:	learn: 0.2169503	total: 3.07s	remaining: 2.42s
559:	learn: 0.2168412	total: 3.08s	remaining: 2.42s
560:	learn: 0.2166625	total: 3.08s	remaining: 2.41s
561:	learn: 0.2164089	total: 3.09s	remaining: 2.4s
562:	learn: 0.2162216	total: 3.09s	remaining: 2.4s
563:	learn: 0.2160587	total: 3.1s	remaining: 2.39s
564:	learn: 0.2158607	total: 3.1s	remaining: 2.39s
565:	learn: 0.2156972	total: 3.11s	remaining: 2.38s
566:	learn: 0.21

730:	learn: 0.1956783	total: 4.01s	remaining: 1.48s
731:	learn: 0.1955737	total: 4.01s	remaining: 1.47s
732:	learn: 0.1954634	total: 4.02s	remaining: 1.46s
733:	learn: 0.1953443	total: 4.03s	remaining: 1.46s
734:	learn: 0.1952700	total: 4.03s	remaining: 1.45s
735:	learn: 0.1951989	total: 4.04s	remaining: 1.45s
736:	learn: 0.1951086	total: 4.04s	remaining: 1.44s
737:	learn: 0.1949452	total: 4.04s	remaining: 1.44s
738:	learn: 0.1948354	total: 4.05s	remaining: 1.43s
739:	learn: 0.1947558	total: 4.05s	remaining: 1.42s
740:	learn: 0.1946100	total: 4.06s	remaining: 1.42s
741:	learn: 0.1944919	total: 4.07s	remaining: 1.41s
742:	learn: 0.1944294	total: 4.07s	remaining: 1.41s
743:	learn: 0.1942462	total: 4.08s	remaining: 1.4s
744:	learn: 0.1941191	total: 4.08s	remaining: 1.4s
745:	learn: 0.1940114	total: 4.09s	remaining: 1.39s
746:	learn: 0.1938608	total: 4.09s	remaining: 1.39s
747:	learn: 0.1938119	total: 4.1s	remaining: 1.38s
748:	learn: 0.1936805	total: 4.1s	remaining: 1.37s
749:	learn: 0.19

917:	learn: 0.1771542	total: 5.01s	remaining: 448ms
918:	learn: 0.1770378	total: 5.02s	remaining: 442ms
919:	learn: 0.1769028	total: 5.02s	remaining: 437ms
920:	learn: 0.1767617	total: 5.03s	remaining: 431ms
921:	learn: 0.1766889	total: 5.03s	remaining: 426ms
922:	learn: 0.1766041	total: 5.04s	remaining: 420ms
923:	learn: 0.1765082	total: 5.04s	remaining: 415ms
924:	learn: 0.1763922	total: 5.05s	remaining: 409ms
925:	learn: 0.1763078	total: 5.05s	remaining: 404ms
926:	learn: 0.1762475	total: 5.06s	remaining: 398ms
927:	learn: 0.1761586	total: 5.07s	remaining: 393ms
928:	learn: 0.1759990	total: 5.07s	remaining: 388ms
929:	learn: 0.1759557	total: 5.08s	remaining: 382ms
930:	learn: 0.1758493	total: 5.08s	remaining: 377ms
931:	learn: 0.1757712	total: 5.09s	remaining: 371ms
932:	learn: 0.1756562	total: 5.09s	remaining: 366ms
933:	learn: 0.1755786	total: 5.1s	remaining: 360ms
934:	learn: 0.1755106	total: 5.1s	remaining: 355ms
935:	learn: 0.1753925	total: 5.11s	remaining: 349ms
936:	learn: 0.