In [67]:
from clearml import Dataset
from os import listdir
from os.path import join as joinpath
import pandas as pd
from pathlib import Path

In [2]:
# Coordinates of the source ClearML dataset: the project, then the dataset in it.
dataset_project = "News Classification"
dataset_name = "news_clsf"

In [5]:
# Look up the source dataset in ClearML first, then ask it for a local copy;
# dataset_path holds whatever get_local_copy() returns.
source_dataset = Dataset.get(
    dataset_project=dataset_project, dataset_name=dataset_name
)
dataset_path = source_dataset.get_local_copy()

In [6]:
def load_data(directory):
    """Read a folder-per-class text corpus into parallel lists.

    Every sub-directory of ``directory`` is treated as one class. Classes are
    numbered by the sorted order of their folder names, and the documents in
    each class are read in sorted file-name order so repeated runs produce the
    same output.

    Args:
        directory: Path (``str`` or ``Path``) of the folder holding one
            sub-directory per class.

    Returns:
        A ``(data, labels, label_names)`` tuple where ``data[i]`` is the text
        of the i-th document, ``labels[i]`` is its class index, and
        ``label_names[k]`` is the folder name of class ``k``.
    """
    data = []
    labels = []
    label_names = []

    # Only sub-directories are classes. A stray file at this level (e.g.
    # .DS_Store or a README) would otherwise crash the inner listing and
    # shift every following label index.
    label_dirs = sorted(
        (entry for entry in Path(directory).iterdir() if entry.is_dir()),
        key=lambda entry: entry.name,
    )

    for label_index, label_dir in enumerate(label_dirs):
        label_names.append(label_dir.name)

        # Directory listing order is arbitrary; sort by name for determinism
        # and skip anything that is not a regular file.
        document_paths = sorted(
            (entry for entry in label_dir.iterdir() if entry.is_file()),
            key=lambda entry: entry.name,
        )
        for document_path in document_paths:
            # latin-1 maps every byte to a character, so decoding cannot fail.
            data.append(document_path.read_text(encoding="latin-1"))
            labels.append(label_index)

    return data, labels, label_names

In [7]:
# Sub-folder names of the two splits inside the downloaded dataset.
train_directory = "20news-bydate-train"
test_directory = "20news-bydate-test"

# Build each split's folder path first, then read it with load_data.
train_path = joinpath(dataset_path, train_directory)
test_path = joinpath(dataset_path, test_directory)

X_train, y_train, target_names_train = load_data(train_path)
X_test, y_test, target_names_test = load_data(test_path)

In [68]:
def create_csv(X, y, label_names, file_name):
    """Write texts and their labels to a CSV file.

    The file has three columns: ``text``, ``label_index`` and ``label_name``,
    with ``label_name`` looked up from ``label_names`` by index. The parent
    directory of ``file_name`` is created if it does not exist yet.

    Args:
        X: Sequence of document texts.
        y: Sequence of integer label indices, parallel to ``X``.
        label_names: Sequence mapping a label index to its name.
        file_name: Destination path of the CSV file.
    """
    df = pd.DataFrame({"text": X, "label_index": y})
    df["label_name"] = df["label_index"].map(lambda index: label_names[index])

    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing (no-op when it is already there).
    Path(file_name).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(file_name, index=False)

In [75]:
# Output location: <parent of the resolved current directory>/data/converted.
BASE_DIR = Path().resolve().parent
CONVERTED = "data/converted"
converted_dir = BASE_DIR / CONVERTED

# Write one CSV per split.
create_csv(X_train, y_train, target_names_train, str(converted_dir / "train.csv"))
create_csv(X_test, y_test, target_names_test, str(converted_dir / "test.csv"))

In [81]:
# Register a new ClearML dataset that will hold the CSV version of the data.
new_dataset = Dataset.create(
    dataset_project="News Classification",
    dataset_name="news_clsf_csv",
)

# Attach both converted splits, train first.
for csv_name in ("train.csv", "test.csv"):
    new_dataset.add_files(path=str(BASE_DIR / CONVERTED / csv_name))

# Upload the attached files, then finalize the dataset.
new_dataset.upload()
new_dataset.finalize()

ClearML results page: https://app.clear.ml/projects/1ac7fdc2f02d4bf5a5598f150047fd47/experiments/c08dcfca66eb41e8911496c1077162bb/output/log
ClearML dataset page: https://app.clear.ml/datasets/simple/1ac7fdc2f02d4bf5a5598f150047fd47/experiments/c08dcfca66eb41e8911496c1077162bb
Uploading dataset changes (2 files compressed to 21.65 MiB) to https://files.clear.ml
File compression and upload completed: total size 21.65 MiB, 1 chunk(s) stored (average size 21.65 MiB)


True

In [82]:
# Re-open the dataset created above through its own id instead of a pasted
# literal: a fresh run creates a dataset with a different id, so a hard-coded
# id would silently point at the dataset from an earlier run.
dataset = Dataset.get(dataset_id=new_dataset.id)

In [84]:
from collections import Counter

# Tally how many documents carry each label index, per split.
label_counts_train = Counter(y_train)
label_counts_test = Counter(y_test)

# Unzip each tally into (label indices, counts), then swap indices for names.
labels_train, values_train = zip(*label_counts_train.items())
labels_train = [target_names_train[index] for index in labels_train]

labels_test, values_test = zip(*label_counts_test.items())
labels_test = [target_names_test[index] for index in labels_test]

logger = dataset.get_logger()

# Report both splits as separate series of the same "Class Distribution" plot.
for series_name, class_counts, class_names in (
    ("Train Classes", values_train, labels_train),
    ("Test Classes", values_test, labels_test),
):
    logger.report_histogram(
        title="Class Distribution",
        series=series_name,
        iteration=0,
        values=class_counts,
        xaxis=class_names,
    )