## Imports

In [1]:
import numpy as np
import pandas as pd
import sys

sys.path.append("/home/khaymon/spooky-trees/")

from spooky_trees.criterions import SpookyEntropy, SpookyMSE
from spooky_trees.models import AdaSpookyBoost, GradSpookyBoost, SpookyTree
from spooky_trees.preprocess import OneHotEncoder, LabelEncoder
from spooky_trees.model_selection import KFold, train_test_split
from spooky_trees.metrics import f1_score

## Data

In [2]:
# Absolute paths to the competition CSV files.
TRAIN_PATH = "/home/khaymon/spooky-trees/data/competition/train.csv"
TEST_PATH = "/home/khaymon/spooky-trees/data/competition/test.csv"

# Read the training table; the "Id" column becomes the row index.
data = pd.read_csv(TRAIN_PATH, index_col="Id")

# Split the table into the target vector (the "class" column, as a NumPy array)
# and the remaining feature columns.
y = data["class"].to_numpy()
X = data.drop(columns=["class"])

# Show one randomly sampled feature row as a quick look at the data.
X.sample()

Unnamed: 0_level_0,does-bruise-or-bleed,habitat,season,cap-diameter,stem-height,stem-width
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20548,f,m,a,3.32,4.1,6.7


In [3]:
# One-hot encode the binary labels into an (n_samples, 2) float array:
# column k is set to 1 on every row where y == k, for k in {0, 1}.
# Rows whose label is neither 0 nor 1 stay all-zero.
y_preprocessed = np.zeros((len(y), 2))
for class_id in (0, 1):
    y_preprocessed[y == class_id, class_id] = 1

In [4]:
# Categorical feature columns handed to the encoder.
TRANSFORM_COLUMNS = ["does-bruise-or-bleed", "habitat", "season"]

# Indices {0, 1, 2}, one per entry of TRANSFORM_COLUMNS.
# NOTE(review): presumably the positions of the categorical columns in the raw
# frame X; confirm before using them with an encoded array.
CAT_COLUMNS_IDS = set(range(len(TRANSFORM_COLUMNS)))

# Alternative encoder kept for reference:
# encoder = LabelEncoder(TRANSFORM_COLUMNS)
encoder = OneHotEncoder(transform_columns=TRANSFORM_COLUMNS)

# Fit the encoder on the feature frame and keep the transformed result.
X_preprocessed = encoder.fit_transform(X)
X_preprocessed

array([[ 8.94,  7.91, 24.55, ...,  0.  ,  1.  ,  0.  ],
       [ 2.92,  3.59,  6.59, ...,  0.  ,  0.  ,  0.  ],
       [ 4.11,  4.78,  3.56, ...,  0.  ,  0.  ,  0.  ],
       ...,
       [ 3.28,  4.29,  2.64, ...,  1.  ,  0.  ,  0.  ],
       [ 6.7 ,  7.67, 50.19, ...,  0.  ,  0.  ,  0.  ],
       [21.51,  4.99, 37.01, ...,  0.  ,  0.  ,  0.  ]])

## Cross-validation (CV)

In [6]:
# K-fold evaluation of AdaSpookyBoost on the encoded features: fit a fresh model
# on each training fold, score it with f1 on the held-out fold, then report the mean.
# NOTE(review): the output saved with this cell ends in an AssertionError during
# the second split; the cause is not visible from this cell.
kfold = KFold()
metrics = []

# The same hyperparameters are used for every fold, so they are declared once.
adaboost_params = dict(
    criterion=SpookyEntropy,
    max_depth=4,
    n_estimators=100,
    n_classes=2,
    learning_rate=1e-5,
)

n_samples = len(X_preprocessed)
for split_idx, (train_ids, test_ids) in enumerate(kfold.split(n_samples)):
    # Select this fold's rows from the features and the one-hot targets.
    X_train, y_train = X_preprocessed[train_ids], y_preprocessed[train_ids]
    X_test, y_test = X_preprocessed[test_ids], y_preprocessed[test_ids]

    adaboost = AdaSpookyBoost(**adaboost_params)
    adaboost.fit(X_train, y_train)
    predictions = adaboost.predict(X_test)

    current_f1 = f1_score(y_test, predictions)
    metrics.append(current_f1)
    print(f"Split idx: {split_idx}, f1: {current_f1}")

print(f"Avg f1: {np.mean(metrics)}")

val metric: 0.596, lr: 0.000: 100%|███████████| 100/100 [02:44<00:00,  1.64s/it]


Split idx: 0, f1: 0.6899563318777293


val metric: 0.631, lr: 0.000:  45%|█████▍      | 45/100 [01:13<01:30,  1.64s/it]

(2,) (2,)





AssertionError: 

## Train

In [None]:
# Split the encoded features and one-hot targets into train and test parts,
# passing test_size=0.2 to the project's train_test_split.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed,
    y_preprocessed,
    test_size=0.2,
)

In [None]:
# Hyperparameters for the GradSpookyBoost model, gathered in one mapping so they
# can be read and tuned in one place. Commented-out entries are options that
# are currently disabled.
forest_params = dict(
    criterion=SpookyEntropy,
    n_estimators=100,
    n_classes=2,
    learning_rate=0.02,
    learning_rate_decay=0.9,
    max_depth=10,
    # min_information_gain=5e-5,
    min_samples_split=5,
    rsm=0.8,
    early_stop=True,
    early_stop_patience=20,
    # The "plateu" spelling is the keyword name this call uses; keep it as is.
    reduce_lr_on_plateu=True,
    reduce_lr_on_plateu_patience=15,
    # cat_features=CAT_COLUMNS_IDS,
)

forest = GradSpookyBoost(**forest_params)
forest.fit(X_train, y_train)

val metric: 0.787, lr: 0.020:  14%|█▋          | 14/100 [00:38<03:48,  2.65s/it]

In [None]:
# Score the fitted model on the held-out split.
# NOTE(review): this cell feeds predict_proba() output to f1_score, while the
# cross-validation cell feeds predict() output; confirm which one f1_score expects.
test_probabilities = forest.predict_proba(X_test)
f1_score(y_test, test_probabilities)

0.8091511137868754