## Imports

In [1]:
import pandas as pd
import sys

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Extend the import search path with the local project checkout so that the
# `spooky_trees` imports below can resolve without installing the package
# (presumably the package lives in this directory — TODO confirm).
# NOTE(review): this absolute path is machine-specific.
sys.path.append("/home/khaymonenko/trees-and-ensembles")

from spooky_trees.models.spooky_tree import SpookyTree
from spooky_trees.preprocess import LabelEncoder

## Data

In [2]:
# Locations of the competition CSV files (only the training file is read here).
TRAIN_PATH = "/home/khaymonenko/trees-and-ensembles/data/competition/train.csv"
TEST_PATH = "/home/khaymonenko/trees-and-ensembles/data/competition/test.csv"

# Load the training table, using the "Id" column as the row index.
data = pd.read_csv(TRAIN_PATH, index_col="Id")

# Separate the target vector (as a NumPy array) from the feature frame.
y = data["class"].to_numpy()
X = data.drop(columns=["class"])

# Display one randomly chosen feature row as a quick sanity check.
X.sample()

Unnamed: 0_level_0,does-bruise-or-bleed,habitat,season,cap-diameter,stem-height,stem-width
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
32273,f,m,u,4.16,5.88,4.16


In [3]:
# Names of the columns that are passed to the label encoder as categorical.
CAT_FEATURES = ["does-bruise-or-bleed", "habitat", "season"]

# Positional indices of those columns within X, collected into a set.
CAT_FEATURES_IDS = {
    position
    for position, name in enumerate(X.columns)
    if name in CAT_FEATURES
}

# Run the project's LabelEncoder over X and display the resulting array.
preprocessor = LabelEncoder(CAT_FEATURES)
X_preprocessed = preprocessor.fit_transform(X)
X_preprocessed

array([[ 0.  ,  0.  ,  0.  ,  8.94,  7.91, 24.55],
       [ 1.  ,  1.  ,  1.  ,  2.92,  3.59,  6.59],
       [ 1.  ,  1.  ,  1.  ,  4.11,  4.78,  3.56],
       ...,
       [ 1.  ,  0.  ,  2.  ,  3.28,  4.29,  2.64],
       [ 1.  ,  4.  ,  1.  ,  6.7 ,  7.67, 50.19],
       [ 1.  ,  0.  ,  1.  , 21.51,  4.99, 37.01]])

In [6]:
# Construct the tree with max_depth=10, passing the positional indices of the
# categorical columns, then fit it on the whole preprocessed training set.
tree = SpookyTree(
    cat_features=CAT_FEATURES_IDS,
    max_depth=10,
)
tree.fit(X_preprocessed, y)

<spooky_trees.models.spooky_tree.SpookyTree at 0x7f78d0a9ec50>

In [7]:
# Predict on the same rows the tree was fitted on, so the score below is
# *training* accuracy and does not estimate performance on unseen data.
predictions = tree.predict(X_preprocessed)

# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth
# goes first. Accuracy itself is symmetric, but keeping the documented order
# avoids silent mistakes if this line is later adapted to an asymmetric metric.
accuracy_score(y, predictions)

0.8326666666666667