In [9]:
from sklearn.ensemble import RandomForestClassifier
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

#### Загружем все необходимое

In [2]:
features = pickle.load(open('./processed/bpe_8000_features.pkl', 'rb'))

In [3]:
labels = pd.read_csv('./processed/labels.csv')

#### Поделим выборку на трейн/тест

In [4]:
x_train, x_test, y_train, y_test = train_test_split(features, labels.values[:, -1], test_size=0.2)

#### Вспомогательная функция для подсчета метрик

In [12]:
def evaluate_score(model):
    predict = model.predict(x_test)
    print(f'accuracy: {accuracy_score(predict, y_test)}')
    print(f'rocauc : {roc_auc_score(y_test, predict)}')

#### Поймем, что accuracy почти ничего не дает, так как константый ноль набирает 0.89

In [23]:
y_test[y_test == 0].shape[0] / y_test.shape[0]

0.8987623374588751

### Попробуем разные модели

In [5]:
model = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=1000,
    max_depth=30, 
)

In [6]:
%%time

model.fit(x_train, y_train)

CPU times: user 24min 5s, sys: 2.28 s, total: 24min 7s
Wall time: 3min 13s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
%%time 

evaluate_score(model)

accuracy: 0.9159642801190663
rocauc : 0.5916565164433618
CPU times: user 11.3 s, sys: 48.3 ms, total: 11.3 s
Wall time: 1.62 s


In [14]:
%%time

model = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=1000,
    max_depth=20, 
)

model.fit(x_train, y_train)

CPU times: user 12min 48s, sys: 1.5 s, total: 12min 50s
Wall time: 1min 43s


In [15]:
%%time 

evaluate_score(model)

accuracy: 0.9054363152122826
rocauc : 0.5404993909866017
CPU times: user 10 s, sys: 104 ms, total: 10.1 s
Wall time: 1.63 s


In [16]:
%%time

model = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=1000,
    max_depth=40, 
)

model.fit(x_train, y_train)

CPU times: user 37min 49s, sys: 3.44 s, total: 37min 52s
Wall time: 5min 4s


In [17]:
%%time 

evaluate_score(model)

accuracy: 0.9248942503524988
rocauc : 0.6358574596387969
CPU times: user 18.3 s, sys: 92.3 ms, total: 18.4 s
Wall time: 2.66 s


Увличи еще глубину

In [18]:
%%time

model = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=1000,
    max_depth=50, 
)

model.fit(x_train, y_train)

CPU times: user 52min 30s, sys: 5.03 s, total: 52min 35s
Wall time: 7min 2s


In [19]:
%%time 

evaluate_score(model)

accuracy: 0.9322262259125803
rocauc : 0.6722934820059736
CPU times: user 21 s, sys: 83.9 ms, total: 21.1 s
Wall time: 2.97 s


а может еще глубже?

In [20]:
%%time

model = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=1000,
    max_depth=80, 
)

model.fit(x_train, y_train)

CPU times: user 1h 33min 47s, sys: 7.55 s, total: 1h 33min 55s
Wall time: 12min 25s


In [21]:
%%time 

evaluate_score(model)

accuracy: 0.9454488485038384
rocauc : 0.7403185088384263
CPU times: user 27 s, sys: 124 ms, total: 27.2 s
Wall time: 3.63 s


In [23]:
pickle.dump(model, open('./models/rf1000_deep_80.pkl', 'wb'))

К сожалению пикл последней модели, которая приблизилась по метрикам к нейронной модели, весит $906$MB, что уже явно не является легкой моделью.