In [1]:
import numpy as np
import pandas as pd
from pandasgui import show
from sklearn import datasets
from sklearn.utils import Bunch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Fetch the covtype dataset (downloaded on first use) as a scikit-learn Bunch.
bunch: Bunch = datasets.fetch_covtype(download_if_missing=True)

# Build one frame holding the feature matrix with the target appended as the
# last column; column names come straight from the bunch metadata.
_column_names = bunch.feature_names + bunch.target_names
data = pd.DataFrame(
    np.column_stack([bunch.data, bunch.target]),
    columns=_column_names,
)

In [None]:
# Hand the assembled frame to pandasgui's `show` for interactive inspection.
show(data)

In [4]:
# Column-wise dtype cleanup of `data`, done in place:
#   * 'Wilderness_Area*' / 'Soil_Type*' columns are cast to int,
#   * the label column is cast to int and re-encoded with LabelEncoder
#     (consecutive class ids starting at 0),
#   * every other column is left as is.
features = []  # not populated in this cell
label: str = 'Cover_Type'

_int_column_prefixes = ('Wilderness_Area', 'Soil_Type')
for feature in data.columns:
    if feature == label:
        data[feature] = LabelEncoder().fit_transform(data[feature].astype(int))
    elif feature.startswith(_int_column_prefixes):
        data[feature] = data[feature].astype(int)

In [None]:
# Remove, in place, every row containing at least one missing value and report
# how many rows were dropped.
_orig_size = data.shape[0]
data.dropna(axis='index', how='any', inplace=True)
print(f"DropNA: {_orig_size - len(data)} instances have been removed.")

In [None]:
# Display the feature names carried by the dataset bunch.
bunch.feature_names

In [None]:
# Display the target name(s) carried by the dataset bunch.
bunch.target_names

In [8]:
# Reference for the split setup:
# https://github.com/RAMitchell/GBM-Benchmarks/blob/a0bbed08c918b0a82e9a5e2207d1f43134b445e0/benchmark.py#L150
test_size = 0.2
validation_size = 0.2

# Both sizes are fractions of the full dataset. The validation split is carved
# out of what remains after the test split, so its fraction is rescaled by the
# remaining share (0.2 / 0.8 = 0.25 of the remainder).
train, test = train_test_split(data, test_size=test_size, random_state=0)
train, valid = train_test_split(
    train,
    test_size=validation_size / (1.0 - test_size),
    random_state=0,
)

In [None]:
from xtime.estimators import Estimator
from xtime.ml import TaskType, ClassificationTask
from xtime.datasets import (Dataset, DatasetSplit, DatasetMetadata)
from sklearn.dummy import DummyClassifier


def _make_split(frame: pd.DataFrame) -> DatasetSplit:
    """Build a DatasetSplit from `frame`: x is every column except `label`, y is the `label` column."""
    return DatasetSplit(x=frame.drop(columns=label), y=frame[label])


# Take the class count from the encoded label column instead of hard-coding it.
# The original metadata declared a binary task with 2 classes, which does not
# match this dataset's label (more than two cover types).
num_classes = int(data[label].nunique())

dataset = Dataset(
    metadata=DatasetMetadata(
        name='Forest_Cover_Type',
        version='NA',
        task=ClassificationTask(type_=TaskType.MULTI_CLASS_CLASSIFICATION, num_classes=num_classes),
    ),
    splits={
        'train': _make_split(train),
        # Use the held-out validation frame here; the original reused `train`,
        # so "validation" metrics were really training metrics.
        'valid': _make_split(valid),
        'test': _make_split(test),
    }
)

# Baseline: a classifier that always predicts from the training class prior.
estimator = Estimator()
estimator.model = DummyClassifier(strategy="prior").fit(dataset.splits['train'].x, dataset.splits['train'].y)

metrics = estimator.evaluate(dataset)
print(metrics)