In [1]:
from openml.datasets import get_dataset as get_openml_dataset
from openml.datasets.dataset import OpenMLDataset
from sklearn.model_selection import train_test_split

In [None]:
# This does not work on headless Linux systems.
from pandasgui import show

In [3]:
# Look up the `eye_movements` dataset (version 1) on OpenML. `error_if_multiple=True`
# is passed so that an ambiguous name lookup is reported instead of silently resolved.
data: OpenMLDataset = get_openml_dataset(
    dataset_id="eye_movements",
    version=1,
    error_if_multiple=True,
    download_data=True,
)

# Split the dataset into features (`x`) and the default target column (`y`), both as
# pandas objects. The data was requested with `download_data=True` above, so this is
# expected to read from the local cache.
x, y, categorical_indicator, attributed_names = data.get_data(
    dataset_format="dataframe",
    target=data.default_target_attribute,
)

# Sanity check of the returned container types.
print(type(x), type(y))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


In [None]:
# Optional: inspect the feature frame interactively with the Pandas GUI library.
# `show` comes from the `pandasgui` import cell above, which is noted there as not
# working on headless Linux systems. To inspect the target instead, pass `y`.
show(x)

In [4]:
# Encode labels: cast the target from the pandas `category` dtype to plain integers
# (class labels 0, 1, 2).
y = y.astype(int)

In [5]:
# Drop the `lineNo` column (described in the original notes as a unique-valued column,
# i.e. presumably a row identifier with no predictive value).
# The drop is done out-of-place and with `errors="ignore"` so that this cell can be
# re-run without raising `KeyError` once the column is already gone, and so that the
# frame returned by the loader is not mutated behind the reader's back.
x = x.drop(columns=["lineNo"], errors="ignore")

# Convert the `category`-typed binary (0/1) features to int in a single `astype` call.
# Re-running is safe: casting an already-int column to int is a no-op.
x = x.astype(
    {
        "P1stFixation": int,
        "P2stFixation": int,
        "nextWordRegress": int,
    }
)

In [6]:
# Hold-out fractions, both expressed relative to the full dataset.
test_size = 0.2
validation_size = 0.1

# Stage 1: carve the test split off the full data.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=test_size,
    shuffle=True,
    random_state=1,
)

# Stage 2: carve the validation split off what remains. The fraction is rescaled by the
# remaining share (1 - test_size) so that validation ends up as `validation_size` of the
# full dataset rather than of the reduced training pool.
train_x, valid_x, train_y, valid_y = train_test_split(
    train_x,
    train_y,
    test_size=validation_size / (1.0 - test_size),
    shuffle=True,
    random_state=1,
)

In [None]:
from sklearn.dummy import DummyClassifier

from xtime.datasets import Dataset, DatasetMetadata, DatasetSplit
from xtime.estimators import Estimator
from xtime.ml import ClassificationTask, Feature, FeatureType, TaskType

# Column groups used when describing features: 0/1 flag columns, and columns that must
# not be described at all.
_binary_features = ["P1stFixation", "P2stFixation", "nextWordRegress"]
_drop_features = ["lineNo"]

# Build one feature descriptor per remaining column. Columns not listed as binary are
# declared continuous, with the number of distinct observed values as cardinality.
features = []
for feature in x.columns:
    if feature in _drop_features:
        continue
    if feature not in _binary_features:
        features.append(Feature(feature, FeatureType.CONTINUOUS, cardinality=int(x[feature].nunique())))
    else:
        features.append(Feature(feature, FeatureType.BINARY))

# Dataset description: a 3-class classification task over the features above.
_metadata = DatasetMetadata(
    name="eye_movements",
    version="NA",
    features=features,
    task=ClassificationTask(type_=TaskType.MULTI_CLASS_CLASSIFICATION, num_classes=3),
)

# The three splits produced by the train/valid/test cell.
_splits = {
    "train": DatasetSplit(x=train_x, y=train_y),
    "valid": DatasetSplit(x=valid_x, y=valid_y),
    "test": DatasetSplit(x=test_x, y=test_y),
}

dataset = Dataset(metadata=_metadata, splits=_splits)

# Baseline: a dummy classifier using the "prior" strategy, fitted on the train split only
# and wrapped in an xtime estimator so it can be evaluated on the dataset.
estimator = Estimator()
_train = dataset.splits["train"]
estimator.model = DummyClassifier(strategy="prior").fit(_train.x, _train.y)

metrics = estimator.evaluate(dataset)
print(metrics)