In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import Bunch

In [None]:
# This does not work on headless Linux systems.
from pandasgui import show

In [None]:
# Fetch the Forest Cover Type data (downloading it when it is missing) and assemble
# a single frame holding the feature columns followed by the target column.
bunch: Bunch = datasets.fetch_covtype(download_if_missing=True)

# Reshape the target into a single column so it can be stacked to the right of the features.
_target_column = bunch.target.reshape((-1, 1))
_column_names = bunch.feature_names + bunch.target_names
data = pd.DataFrame(np.hstack([bunch.data, _target_column]), columns=_column_names)

In [None]:
# Open the assembled frame in the PandasGUI viewer for manual inspection.
# Needs a display: the import cell notes that this does not work on headless Linux.
show(data)

In [None]:
from xtime.ml import Feature, FeatureType

# Two kinds of features are present in this dataset: continuous and binary. The binary ones appear to be
# one-hot encodings of two original categorical features - Wilderness_Area (4 values) and Soil_Type (40 values).
# Every column starts out with the `float` data type.
label: str = "Cover_Type"
_one_hot_prefixes = ("Wilderness_Area", "Soil_Type")

features = []
for feature in data.columns:
    if feature == label:
        # Target column: cast to int and re-encode it; it is not registered as a feature.
        data[feature] = LabelEncoder().fit_transform(data[feature].astype(int))
        continue
    if feature.startswith(_one_hot_prefixes):
        # One-hot indicator column: store as int and register as a binary feature.
        data[feature] = data[feature].astype(int)
        _spec = Feature(feature, FeatureType.BINARY)
    else:
        # Continuous column: dtype is left alone; the number of distinct values is recorded as cardinality.
        _spec = Feature(feature, FeatureType.CONTINUOUS, cardinality=int(data[feature].nunique()))
    features.append(_spec)

In [None]:
# Drop every row that has at least one missing value and report how many rows were removed.
# NOTE(review): an earlier cell already casts columns with `astype(int)`, so this is
# presumably expected to remove nothing for this dataset - confirm.
_orig_size = data.shape[0]
data.dropna(axis="index", how="any", inplace=True)
print(f"DropNA: {_orig_size - len(data)} instances have been removed.")

In [None]:
# Display the feature names shipped with the dataset bunch (used above as the feature column names).
bunch.feature_names

In [None]:
# Display the target name(s) shipped with the dataset bunch (used above as the target column name).
bunch.target_names

In [None]:
# https://github.com/RAMitchell/GBM-Benchmarks/blob/a0bbed08c918b0a82e9a5e2207d1f43134b445e0/benchmark.py#L150
# Both sizes are fractions of the full dataset.
test_size = 0.2
validation_size = 0.2

# The validation split is carved out of what remains after the test split, so its
# fraction has to be rescaled relative to that remainder.
_valid_share = validation_size / (1.0 - test_size)

_train_valid, test = train_test_split(data, test_size=test_size, random_state=0)
train, valid = train_test_split(_train_valid, test_size=_valid_share, random_state=0)
del _train_valid  # the intermediate train+valid frame is not needed afterwards

In [None]:
from sklearn.dummy import DummyClassifier

from xtime.datasets import Dataset, DatasetMetadata, DatasetSplit
from xtime.estimators import Estimator
from xtime.ml import ClassificationTask, TaskType

# Package the three partitions into an xtime dataset; every split separates the label column from the features.
_partitions = {"train": train, "valid": valid, "test": test}
dataset = Dataset(
    metadata=DatasetMetadata(
        name="Forest_Cover_Type",
        version="NA",
        features=features,
        task=ClassificationTask(TaskType.MULTI_CLASS_CLASSIFICATION, num_classes=7),
    ),
    splits={
        _name: DatasetSplit(x=_frame.drop(columns=label), y=_frame[label])
        for _name, _frame in _partitions.items()
    },
)

# Baseline check: wrap a dummy classifier (strategy "prior") fitted on the training split
# in an estimator and evaluate it on the dataset.
estimator = Estimator()
_train_split = dataset.splits["train"]
_baseline = DummyClassifier(strategy="prior")
_baseline.fit(_train_split.x, _train_split.y)
estimator.model = _baseline

metrics = estimator.evaluate(dataset)
print(metrics)