## Imports

In [1]:
import numpy as np
import pandas as pd
import sys

sys.path.append("/home/khaymonenko/trees-and-ensembles")

from spooky_trees.criterions import SpookyEntropy
from spooky_trees.models import SpookyForest
from spooky_trees.preprocess import OneHotEncoder
from spooky_trees.model_selection import KFold
from spooky_trees.metrics import f1_score

## Data

In [2]:
# Locations of the competition CSV files on disk.
TRAIN_PATH = "/home/khaymonenko/trees-and-ensembles/data/competition/train.csv"
TEST_PATH = "/home/khaymonenko/trees-and-ensembles/data/competition/test.csv"

# Read the training table, using its "Id" column as the row index.
data = pd.read_csv(TRAIN_PATH, index_col="Id")

# Split the table into the target vector (as a NumPy array) and the feature frame.
y = data["class"].to_numpy()
X = data.drop(columns=["class"])

# Display one randomly drawn feature row as a quick sanity check.
X.sample(n=1)

Unnamed: 0_level_0,does-bruise-or-bleed,habitat,season,cap-diameter,stem-height,stem-width
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
26180,f,d,w,10.58,5.27,18.35


In [3]:
# One-hot encode the target as floats: column 0 is 1.0 where y == 0 and
# column 1 is 1.0 where y == 1; every other entry is 0.0.
y_preprocessed = np.column_stack((y == 0, y == 1)).astype(float)

In [4]:
# Feature columns handed to the encoder; the sampled row shown earlier holds
# single-letter codes in these columns.
TRANSFORM_COLUMNS = ["does-bruise-or-bleed", "habitat", "season"]

# Project-local encoder (spooky_trees.preprocess) configured with the columns above.
encoder = OneHotEncoder(transform_columns=TRANSFORM_COLUMNS)

# Fit the encoder on the training features and transform them in a single call.
# NOTE(review): the recorded output is a NumPy array; the exact column order
# produced by OneHotEncoder is not visible here — confirm against its implementation.
X_preprocessed = encoder.fit_transform(X)
# Bare last expression so the notebook renders the encoded feature matrix.
X_preprocessed

array([[ 8.94,  7.91, 24.55, ...,  0.  ,  1.  ,  0.  ],
       [ 2.92,  3.59,  6.59, ...,  0.  ,  0.  ,  0.  ],
       [ 4.11,  4.78,  3.56, ...,  0.  ,  0.  ,  0.  ],
       ...,
       [ 3.28,  4.29,  2.64, ...,  1.  ,  0.  ,  0.  ],
       [ 6.7 ,  7.67, 50.19, ...,  0.  ,  0.  ,  0.  ],
       [21.51,  4.99, 37.01, ...,  0.  ,  0.  ,  0.  ]])

## Cross-validation

In [6]:
# Cross-validate the forest: for every split produced by KFold, fit a fresh
# model on the training rows and score it with F1 on the held-out rows.
kfold = KFold()
metrics = []

for split_idx, (train_ids, test_ids) in enumerate(kfold.split(len(X_preprocessed))):
    # Slice features and one-hot targets for this split.
    X_train = X_preprocessed[train_ids]
    y_train = y_preprocessed[train_ids]
    X_test = X_preprocessed[test_ids]
    y_test = y_preprocessed[test_ids]

    # A new forest per split so no fitted state carries over between splits.
    forest = SpookyForest(
        criterion=SpookyEntropy,
        n_estimators=100,
        n_workers=4,
        n_classes=2,
    )
    forest.fit(X_train, y_train)

    # Score the held-out rows and record the result.
    predictions = forest.predict(X_test)
    current_f1 = f1_score(y_test, predictions)
    metrics.append(current_f1)

    print(f"Split idx: {split_idx}, f1: {current_f1}")

# Report the mean F1 across all splits.
print(f"Avg f1: {np.mean(metrics)}")

100%|███████████████████████████████████████| 100/100 [00:00<00:00, 4736.65it/s]
100%|████████████████████████████████████████| 100/100 [00:00<00:00, 411.34it/s]

Split idx: 0, f1: 0.7189341532154754



100%|███████████████████████████████████████| 100/100 [00:00<00:00, 4309.10it/s]
100%|████████████████████████████████████████| 100/100 [00:00<00:00, 360.54it/s]

Split idx: 1, f1: 0.7120041215868109



100%|███████████████████████████████████████| 100/100 [00:00<00:00, 5597.48it/s]
100%|████████████████████████████████████████| 100/100 [00:00<00:00, 380.36it/s]

Split idx: 2, f1: 0.7103430487490328
Avg f1: 0.7137604411837731





## Train