## Imports

In [1]:
import numpy as np
import pandas as pd
import sys

sys.path.append("/home/khaymonenko/trees-and-ensembles")

from spooky_trees.criterions import SpookyEntropy, SpookyMSE
from spooky_trees.models import GradSpookyBoost, SpookyTree
from spooky_trees.preprocess import OneHotEncoder, LabelEncoder
from spooky_trees.model_selection import KFold, train_test_split
from spooky_trees.metrics import f1_score

## Data

In [2]:
TRAIN_PATH = "/home/khaymonenko/trees-and-ensembles/data/competition/train.csv"
TEST_PATH = "/home/khaymonenko/trees-and-ensembles/data/competition/test.csv"

data = pd.read_csv(TRAIN_PATH, index_col="Id")

X = data.drop(["class"], axis=1)
y = data["class"].to_numpy()

X.sample()

Unnamed: 0_level_0,does-bruise-or-bleed,habitat,season,cap-diameter,stem-height,stem-width
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
28177,t,d,a,6.42,4.5,11.71


In [3]:
y_preprocessed = np.zeros((len(y), 2))
y_preprocessed[y == 1, 1] = 1
y_preprocessed[y == 0, 0] = 1

In [4]:
TRANSFORM_COLUMNS = ["does-bruise-or-bleed", "habitat", "season"]
CAT_COLUMNS_IDS = {0, 1, 2}

# encoder = OneHotEncoder(transform_columns=TRANSFORM_COLUMNS)
encoder = LabelEncoder(TRANSFORM_COLUMNS)

X_preprocessed = encoder.fit_transform(X)
X_preprocessed

array([[ 0.  ,  0.  ,  0.  ,  8.94,  7.91, 24.55],
       [ 1.  ,  1.  ,  1.  ,  2.92,  3.59,  6.59],
       [ 1.  ,  1.  ,  1.  ,  4.11,  4.78,  3.56],
       ...,
       [ 1.  ,  0.  ,  2.  ,  3.28,  4.29,  2.64],
       [ 1.  ,  4.  ,  1.  ,  6.7 ,  7.67, 50.19],
       [ 1.  ,  0.  ,  1.  , 21.51,  4.99, 37.01]])

## CV

In [5]:
kfold = KFold()

metrics = []
for split_idx, (train_ids, test_ids) in enumerate(kfold.split(len(X_preprocessed))):
    X_train, X_test = X_preprocessed[train_ids], X_preprocessed[test_ids]
    y_train, y_test = y_preprocessed[train_ids], y_preprocessed[test_ids]

    forest = GradSpookyBoost(criterion=SpookyEntropy, max_depth=2, n_estimators=100, n_classes=2, learning_rate=1e-2)
    predictions = forest.predict(X_test)
#     print(predictions)
    current_f1 = f1_score(y_test, predictions.argmax(axis=1))

    print(f"Split idx: {split_idx}, f1: {current_f1}")
    metrics.append(current_f1)

print(f"Avg f1: {np.mean(metrics)}")

AttributeError: 'GradSpookyBoost' object has no attribute 'predict'

## Train

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y_preprocessed, test_size=0.2)

In [None]:
forest = GradSpookyBoost(
    criterion=SpookyEntropy,
    n_estimators=100,
    n_classes=2,
    learning_rate=0.02,
    learning_rate_decay=0.9,
    max_depth=10,
    min_information_gain=5e-5,
    min_samples_split=2,
    rsm=0.9,
    early_stop=True,
    early_stop_patience=20,
    reduce_lr_on_plateu=True,
    reduce_lr_on_plateu_patience=15,
#     cat_features=CAT_COLUMNS_IDS,
)

forest.fit(X_train, y_train)

val metric: 0.840, lr: 0.020:  19%|██▎         | 19/100 [00:53<03:48,  2.82s/it]

In [15]:
f1_score(y_test, forest.predict_proba(X_test))

0.8431603773584906