In [1]:
from pathlib import Path

import pandas as pd
from pandasgui import show

from xtime.datasets._year_prediction_msd import YearPredictionMSDBuilder

In [2]:
# Local cache location of the UCI archive; download it only when it is not already on disk.
data_dir = Path("~/.cache/uci/datasets/00203").expanduser()
file_name = "YearPredictionMSD.txt.zip"
archive_path = data_dir / file_name
if not archive_path.is_file():
    YearPredictionMSDBuilder.download(data_dir, file_name)

In [3]:
# Every column is numerical (continuous features). The first column is the target
# (the year to predict / classify). The file is read without a header row.
archive_file = (data_dir / file_name).as_posix()
data: pd.DataFrame = pd.read_csv(archive_file, header=None)

In [4]:
# Name of the target column; used below to separate the targets (y) from the features (x).
label = "Year"

In [5]:
# The raw file carries no column names. There are 90 attributes: 12 timbre averages
# followed by 78 timbre covariances. The leading value is the year (target), ranging
# from 1922 to 2011.
columns = (
    ["Year"]
    + [f"TimbreAvg_{idx}" for idx in range(12)]
    + [f"TimbreCov_{idx}" for idx in range(78)]
)
# 1 target + 12 averages + 78 covariances.
assert len(columns) == 91, f"Fix me. Length = {len(columns)}"

# Attach the generated names to the frame that was read without a header (mutates `data` in place).
data.columns = columns

In [None]:
# Open the full frame in the pandasgui viewer for interactive inspection.
show(data)

According to the dataset [documentation](https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd):
> You should respect the following train / test split:
> - train: first 463,715 examples
> - test: last 51,630 examples
>
> It avoids the 'producer effect' by making sure no song from a given artist ends up in both the train and test set.

In [7]:
# Official split from the dataset documentation: the first 463,715 rows are for training
# and the last 51,630 rows are for testing. Keeping this exact split ensures that no artist
# contributes songs to both sets (the 'producer effect').
n_train_examples = 463715
n_test_examples = 51630

# Guard against a truncated or otherwise unexpected file: with a different number of rows
# the two positional slices below would silently overlap or leave rows out.
assert len(data) == n_train_examples + n_test_examples, (
    f"Expected {n_train_examples + n_test_examples} examples, found {len(data)}"
)

train: pd.DataFrame = data.iloc[:n_train_examples, :]
test: pd.DataFrame = data.iloc[-n_test_examples:, :]

In [None]:
# Sanity check: report the (rows, columns) shape of each split.
print(f"train = {train.shape} test = {test.shape}")

In [None]:
from sklearn.dummy import DummyRegressor

from xtime.datasets import Dataset, DatasetMetadata, DatasetSplit
from xtime.estimators import Estimator
from xtime.ml import RegressionTask, TaskType

# Wrap the train / test partitions into an xtime dataset: features are every column except
# the target, and the target column becomes `y`. The `test` frame is exposed under the
# "valid" split name.
train_split = DatasetSplit(x=train.drop(columns=[label]), y=train[label])
valid_split = DatasetSplit(x=test.drop(columns=[label]), y=test[label])
dataset = Dataset(
    metadata=DatasetMetadata(
        name="year_prediction_msd",
        version="NA",
        task=RegressionTask(ttype=TaskType.REGRESSION),
    ),
    splits={"train": train_split, "valid": valid_split},
)

# Baseline: a regressor that always predicts the mean of the training targets.
estimator = Estimator()
baseline_model = DummyRegressor(strategy="mean")
baseline_model.fit(dataset.splits["train"].x, dataset.splits["train"].y)
estimator.model = baseline_model

metrics = estimator.evaluate(dataset)
print(metrics)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline on the raw features.
# NOTE: `copy_X` is intentionally left at its default (True). With `copy_X=False`
# scikit-learn is allowed to overwrite the training matrix while fitting, which could
# corrupt `dataset.splits["train"].x` before the same dataset is passed to `evaluate`
# and would make this cell non-idempotent on re-run.
estimator = Estimator()
estimator.model = LinearRegression().fit(dataset.splits["train"].x, dataset.splits["train"].y)

metrics = estimator.evaluate(dataset)
print(metrics)

In [None]:
# https://dainesanalytics.blog/2019/04/15/regression-model-for-song-year-prediction-using-python/
# https://merelydoit.blog/2020/03/09/regression-deep-learning-model-for-song-year-prediction-using-tensorflow-take-2/
# page 6: https://bdataanalytics.biomedcentral.com/track/pdf/10.1186/s41044-016-0010-4.pdf