In [1]:
from IPython.display import display

In [2]:
import datasets
import polars as pl

# Load the "NetherQuartz/tatoeba-tokipona" dataset; the bare name on the last
# line makes the notebook display its splits and features.
tatoeba_dataset = datasets.load_dataset("NetherQuartz/tatoeba-tokipona")
tatoeba_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'source_id', 'target_id', 'source', 'source_lang', 'tok', 'en', 'ru', 'uk', 'be', 'fr', 'es', 'pt', 'it', 'de', 'vi', 'ja', 'zh', 'ko', 'ar', 'he', 'pl', 'tr', 'la', 'el'],
        num_rows: 74668
    })
    validation: Dataset({
        features: ['id', 'source_id', 'target_id', 'source', 'source_lang', 'tok', 'en', 'ru', 'uk', 'be', 'fr', 'es', 'pt', 'it', 'de', 'vi', 'ja', 'zh', 'ko', 'ar', 'he', 'pl', 'tr', 'la', 'el'],
        num_rows: 3930
    })
})

In [3]:
# Load the "NetherQuartz/lipu-sewi" dataset and display its splits and features.
sewi_dataset = datasets.load_dataset("NetherQuartz/lipu-sewi")
sewi_dataset

DatasetDict({
    train: Dataset({
        features: ['key', 'part', 'book', 'chapter', 'verse', 'tok', 'en', 'ru', 'uk', 'be', 'fr', 'es', 'pt', 'it', 'de', 'vi', 'ja', 'zh', 'ko', 'ar', 'he', 'pl', 'tr', 'la', 'el'],
        num_rows: 1081
    })
    validation: Dataset({
        features: ['key', 'part', 'book', 'chapter', 'verse', 'tok', 'en', 'ru', 'uk', 'be', 'fr', 'es', 'pt', 'it', 'de', 'vi', 'ja', 'zh', 'ko', 'ar', 'he', 'pl', 'tr', 'la', 'el'],
        num_rows: 65
    })
})

In [4]:
# Load the "NetherQuartz/wikipesija" dataset and display its splits and features.
wiki_dataset = datasets.load_dataset("NetherQuartz/wikipesija")
wiki_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 2917
    })
    validation: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 146
    })
})

In [5]:
# Load the "NetherQuartz/poki-lapo" dataset and display its splits and features.
poki_dataset = datasets.load_dataset("NetherQuartz/poki-lapo")
poki_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'description', 'authors', 'proofreaders', 'date', 'date-precision', 'tags', 'original', 'license', 'sources', 'archives', 'preprocessing', 'accessibility-notes', 'notes', 'text'],
        num_rows: 1433
    })
    validation: Dataset({
        features: ['id', 'title', 'description', 'authors', 'proofreaders', 'date', 'date-precision', 'tags', 'original', 'license', 'sources', 'archives', 'preprocessing', 'accessibility-notes', 'notes', 'text'],
        num_rows: 69
    })
})

In [6]:
# Load the "NetherQuartz/lipu-kule" dataset and display its splits and features.
# The recorded output shows only a "train" split for this one.
kule_dataset = datasets.load_dataset("NetherQuartz/lipu-kule")
kule_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'text'],
        num_rows: 42
    })
})

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("NetherQuartz/tatoeba-tok-multi-gemma-2-2b-merged")

def count_tokens(s: str) -> int:
    """Return how many token ids the module-level ``tokenizer`` produces for ``s``.

    The count is the length of ``tokenizer(s)["input_ids"]``, so any special
    tokens the tokenizer adds on its own are included in the number.
    """
    # Counting only needs the plain list of ids. Asking for return_tensors="pt"
    # would allocate a torch tensor on every call, which adds up when this
    # function is applied row by row through map_elements.
    return len(tokenizer(s)["input_ids"])

# Smoke test: display the token count of a short string.
count_tokens("hi")

2

In [118]:
import re, unicodedata

EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U00002700-\U000027BF"  # dingbats
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE
)

def clean_text(
    text: str,
    remove_emoji: bool = True,
    emoji_token: str | None = None,
    remove_music_notes: bool = True,
    normalize_newlines: bool = True
) -> str:
    """
    Щадящая очистка текста:
    - удаляет HTML, управляющие, невидимые и частные юникод-символы (󱥁, ,  и т.п.)
    - опционально удаляет или заменяет эмодзи
    - опционально удаляет музыкальные ноты
    - сжимает переносы и пробелы
    - нормализует пунктуацию
    """
    if text is None:
        return ""
    text = unicodedata.normalize("NFKC", text)

    ELLIPSIS = "<ELLIPSIS_PLACEHOLDER>"
    text = text.replace("...", ELLIPSIS)

    # убрать HTML
    text = re.sub(r"<[^>]+>", "", text)

    # убрать невидимые и управляющие символы
    text = re.sub(r"[\u200b-\u200f\uFEFF\x00-\x08\x0b-\x1f\x7f]+", "", text)

    # удалить "Private Use Area" (все диапазоны U+E000–U+F8FF и Plane 15–16)
    text = re.sub(r"[\ue000-\uf8ff\U000F0000-\U0010FFFF]+", "", text)

    # удалить явно мусорные суррогаты (вроде 󱥁)
    text = re.sub(r"[\ud800-\udfff]", "", text)

    if remove_music_notes:
        text = text.replace("♪", " ").replace("♫", " ").replace("♩", " ")

    if remove_emoji:
        if emoji_token is None:
            text = EMOJI_PATTERN.sub("", text)
        else:
            text = EMOJI_PATTERN.sub(" " + emoji_token + " ", text)
    else:
        if emoji_token is not None:
            text = EMOJI_PATTERN.sub(" " + emoji_token + " ", text)

    text = re.sub(r"[-–—]{2,}", " — ", text)
    text = re.sub(r"[`*_~]{2,}", "", text)
    text = re.sub(r"(?m)^\s*[-=]{2,}\s*$", "", text)
    text = re.sub(r"([!?;,:])\1{1,}", r"\1", text)

    if normalize_newlines:
        text = re.sub(r"\n{2,}", "\n", text)

    text = text.replace("\t", " ")
    text = re.sub(r"[ \u00A0]{2,}", " ", text)
    text = re.sub(r" *([,;:!?]) *", r"\1 ", text)
    text = re.sub(r"\.([^\s\.])", r". \1", text)
    text = "\n".join(line.strip() for line in text.splitlines())
    text = text.replace(ELLIPSIS, "...")
    text = re.sub(r" {2,}", " ", text)
    text = re.sub(r"\b[a-fA-F0-9]{16,}\b", "", text)
    text = re.sub(r"\b[a-zA-Z0-9\-/\\;:\.\,]{20,}\b", "", text)
    return text.strip()

In [119]:
# Spot-check clean_text: sort the poki-lapo train split by text length
# (longest first) and print the cleaned version of the document at index 4.
df = poki_dataset["train"].to_polars().sort(pl.col("text").str.len_chars(), descending=True)
t = df["text"][4]
# print(t)
print("---")
print(clean_text(t))

---
jan Sitata
toki open tan jan pi ante toki
lipu lili ni li lipu nanpa wan tan lipu suli Sitata · wile kulupu la mi kama pana e lipu nanpa tu · lipu tu taso li lon · o toki e pilin sina e wile sina · toki kulupu la mi ken sona e wile jan ·
lipu ni li kama tan kon Eman Ese · taso ni li jan seme? tenpo pini la ona li jan lipu suli · ona li kama tan ma Tosi suli · lipu mute la ona li wile pana e sitelen pi alasa sona · kon jan pi lipu ona li alasa e lon e sona e nasin sewi · kon Ese la nasin mute li ken sewi · nasin wan li nasin taso ala ·
jan pi toki pona li lukin e lipu ni la ona o awen sona e ni · lipu ni li pana ala e sona lon pi nasin Puta, li pana ala e nasin sewi pi kon Ese · taso la ona li pana e sitelen wile pi kon Ese: sitelen la jan li ken alasa e lon e sewi kepeken nasin pi ona taso · jan ale o kama sona e lon ona · ken la lon ona li lon pi jan ante ala · taso ni li ike ala · sina wile kama sona lili e nasin Puta lon la mi pana e sitelen lili lon pini lipu · ni la jan li son

In [66]:
# Token-count summary for every column from index 5 onwards of the Tatoeba
# train split; count_tokens is applied element by element via map_elements.
# NOTE(review): the recorded output below stops with KeyboardInterrupt right
# after the first column label, so this cell was not run to completion.
df = tatoeba_dataset["train"].to_polars()
for col in df.columns[5:]:
    display(col)
    display(df[col].map_elements(count_tokens).describe())

'tok'

KeyboardInterrupt: 

In [39]:
# Token-count summary for every column from index 5 onwards ("tok" and the
# translation columns) of the lipu-sewi train split.
df = sewi_dataset["train"].to_polars()
for col in df.columns[5:]:
    display(col)
    display(df[col].map_elements(count_tokens).describe())

'tok'

statistic,value
str,f64
"""count""",1081.0
"""null_count""",0.0
"""mean""",43.013876
"""std""",19.159728
"""min""",8.0
"""25%""",29.0
"""50%""",40.0
"""75%""",54.0
"""max""",133.0


'en'

statistic,value
str,f64
"""count""",1072.0
"""null_count""",9.0
"""mean""",28.853545
"""std""",12.458109
"""min""",6.0
"""25%""",19.0
"""50%""",27.0
"""75%""",36.0
"""max""",98.0


'ru'

statistic,value
str,f64
"""count""",971.0
"""null_count""",110.0
"""mean""",33.533471
"""std""",14.78895
"""min""",5.0
"""25%""",22.0
"""50%""",31.0
"""75%""",43.0
"""max""",116.0


'uk'

statistic,value
str,f64
"""count""",1074.0
"""null_count""",7.0
"""mean""",39.798883
"""std""",17.295862
"""min""",8.0
"""25%""",26.0
"""50%""",37.0
"""75%""",51.0
"""max""",122.0


'be'

statistic,value
str,f64
"""count""",1076.0
"""null_count""",5.0
"""mean""",45.618959
"""std""",19.92511
"""min""",6.0
"""25%""",30.0
"""50%""",42.0
"""75%""",59.0
"""max""",164.0


'fr'

statistic,value
str,f64
"""count""",1080.0
"""null_count""",1.0
"""mean""",36.099074
"""std""",15.535913
"""min""",6.0
"""25%""",24.0
"""50%""",34.0
"""75%""",46.0
"""max""",114.0


'es'

statistic,value
str,f64
"""count""",1072.0
"""null_count""",9.0
"""mean""",31.58209
"""std""",13.590543
"""min""",7.0
"""25%""",21.0
"""50%""",30.0
"""75%""",40.0
"""max""",121.0


'pt'

statistic,value
str,f64
"""count""",1069.0
"""null_count""",12.0
"""mean""",32.973807
"""std""",13.671071
"""min""",7.0
"""25%""",22.0
"""50%""",32.0
"""75%""",41.0
"""max""",120.0


'it'

statistic,value
str,f64
"""count""",1069.0
"""null_count""",12.0
"""mean""",37.210477
"""std""",16.15218
"""min""",5.0
"""25%""",25.0
"""50%""",35.0
"""75%""",48.0
"""max""",133.0


'de'

statistic,value
str,f64
"""count""",1077.0
"""null_count""",4.0
"""mean""",31.504178
"""std""",13.429467
"""min""",7.0
"""25%""",21.0
"""50%""",30.0
"""75%""",40.0
"""max""",102.0


'vi'

statistic,value
str,f64
"""count""",1069.0
"""null_count""",12.0
"""mean""",38.809167
"""std""",16.54828
"""min""",8.0
"""25%""",26.0
"""50%""",37.0
"""75%""",49.0
"""max""",133.0


'ja'

statistic,value
str,f64
"""count""",199.0
"""null_count""",882.0
"""mean""",30.351759
"""std""",10.914585
"""min""",7.0
"""25%""",23.0
"""50%""",30.0
"""75%""",37.0
"""max""",63.0


'zh'

statistic,value
str,f64
"""count""",1069.0
"""null_count""",12.0
"""mean""",27.115996
"""std""",11.041953
"""min""",4.0
"""25%""",19.0
"""50%""",25.0
"""75%""",34.0
"""max""",83.0


'ko'

statistic,value
str,f64
"""count""",1067.0
"""null_count""",14.0
"""mean""",39.779756
"""std""",16.692148
"""min""",3.0
"""25%""",27.0
"""50%""",37.0
"""75%""",51.0
"""max""",119.0


'ar'

statistic,value
str,f64
"""count""",1069.0
"""null_count""",12.0
"""mean""",88.540692
"""std""",40.177573
"""min""",10.0
"""25%""",56.0
"""50%""",83.0
"""75%""",116.0
"""max""",340.0


'he'

statistic,value
str,f64
"""count""",1077.0
"""null_count""",4.0
"""mean""",28.31662
"""std""",11.432891
"""min""",6.0
"""25%""",19.0
"""50%""",27.0
"""75%""",36.0
"""max""",97.0


'pl'

statistic,value
str,f64
"""count""",1069.0
"""null_count""",12.0
"""mean""",36.885875
"""std""",15.604129
"""min""",8.0
"""25%""",24.0
"""50%""",35.0
"""75%""",47.0
"""max""",132.0


'tr'

statistic,value
str,f64
"""count""",1069.0
"""null_count""",12.0
"""mean""",34.767072
"""std""",14.571758
"""min""",4.0
"""25%""",23.0
"""50%""",33.0
"""75%""",45.0
"""max""",120.0


'la'

statistic,value
str,f64
"""count""",807.0
"""null_count""",274.0
"""mean""",33.174721
"""std""",15.067193
"""min""",5.0
"""25%""",21.0
"""50%""",30.0
"""75%""",43.0
"""max""",125.0


'el'

statistic,value
str,f64
"""count""",974.0
"""null_count""",107.0
"""mean""",60.069815
"""std""",25.923384
"""min""",7.0
"""25%""",39.0
"""50%""",56.0
"""75%""",78.0
"""max""",173.0


In [120]:
# Token-count summary of the raw (uncleaned) wikipesija train texts.
df = wiki_dataset["train"].to_polars()
df["text"].map_elements(count_tokens).describe()

statistic,value
str,f64
"""count""",2917.0
"""null_count""",0.0
"""mean""",226.772369
"""std""",509.421358
"""min""",4.0
"""25%""",58.0
"""50%""",113.0
"""75%""",228.0
"""max""",15212.0


In [121]:
# Same summary, but each text is passed through clean_text before counting.
df = wiki_dataset["train"].to_polars()
df["text"].map_elements(clean_text).map_elements(count_tokens).describe()

statistic,value
str,f64
"""count""",2917.0
"""null_count""",0.0
"""mean""",224.757628
"""std""",506.993372
"""min""",4.0
"""25%""",58.0
"""50%""",111.0
"""75%""",227.0
"""max""",15313.0


In [122]:
# Token-count summary of the raw (uncleaned) poki-lapo train texts.
poki_dataset["train"].to_polars()["text"].map_elements(count_tokens).describe()

statistic,value
str,f64
"""count""",1433.0
"""null_count""",0.0
"""mean""",1237.900907
"""std""",16624.123236
"""min""",1.0
"""25%""",149.0
"""50%""",361.0
"""75%""",735.0
"""max""",618874.0


In [123]:
# Token-count summary of the poki-lapo train texts after clean_text.
poki_dataset["train"].to_polars()["text"].map_elements(clean_text).map_elements(count_tokens).describe()

statistic,value
str,f64
"""count""",1433.0
"""null_count""",0.0
"""mean""",831.460572
"""std""",3464.325029
"""min""",1.0
"""25%""",144.0
"""50%""",345.0
"""75%""",726.0
"""max""",97669.0


In [124]:
# Token-count summary of the raw (uncleaned) lipu-kule train texts.
kule_dataset["train"].to_polars()["text"].map_elements(count_tokens).describe()

statistic,value
str,f64
"""count""",42.0
"""null_count""",0.0
"""mean""",801.738095
"""std""",537.579764
"""min""",27.0
"""25%""",399.0
"""50%""",756.0
"""75%""",1071.0
"""max""",2224.0


In [125]:
# Token-count summary of the lipu-kule train texts after clean_text.
kule_dataset["train"].to_polars()["text"].map_elements(clean_text).map_elements(count_tokens).describe()

statistic,value
str,f64
"""count""",42.0
"""null_count""",0.0
"""mean""",767.547619
"""std""",535.904715
"""min""",27.0
"""25%""",363.0
"""50%""",717.0
"""75%""",1074.0
"""max""",2190.0


In [156]:
# Per-split accumulator: the following cells append one polars DataFrame per
# source corpus to these lists.
# NOTE: those cells only append, so re-running any of them without re-running
# this cell first adds the same corpus twice.
new_data = {
    "train": [],
    "validation": []
}

In [157]:
# Tatoeba: keep the toki pona column plus the three target languages and add
# the rows unchanged (no chunking is applied to this corpus).
# drop_nulls() removes every row in which any of the four columns is null.
for key in ["train", "validation"]:
    df = tatoeba_dataset[key].to_polars().select(["tok", "en", "ru", "vi"]).drop_nulls()
    new_data[key].append(df)

In [158]:
# lipu sewi: keep the toki pona column plus the three target languages.
# Rows with null translations are NOT dropped here; make_pairs skips a
# language whose value is missing or empty.
text_columns = ["tok", "en", "ru", "vi"]

for key in ["train", "validation"]:
    # .select() with an explicit list is unambiguous, whereas indexing a frame
    # with a tuple of names is read as (rows, columns) when it has two items.
    df = sewi_dataset[key].to_polars().select(text_columns)
    new_data[key].append(df)

# Token statistics for the last split processed above ("validation").
for col in text_columns:
    display(col)
    display(df[col].map_elements(count_tokens).describe())


'tok'

statistic,value
str,f64
"""count""",65.0
"""null_count""",0.0
"""mean""",39.184615
"""std""",17.649267
"""min""",14.0
"""25%""",27.0
"""50%""",36.0
"""75%""",49.0
"""max""",90.0


'en'

statistic,value
str,f64
"""count""",65.0
"""null_count""",0.0
"""mean""",27.092308
"""std""",9.830951
"""min""",8.0
"""25%""",19.0
"""50%""",27.0
"""75%""",35.0
"""max""",49.0


'ru'

statistic,value
str,f64
"""count""",65.0
"""null_count""",0.0
"""mean""",30.630769
"""std""",12.249451
"""min""",9.0
"""25%""",22.0
"""50%""",30.0
"""75%""",37.0
"""max""",67.0


'vi'

statistic,value
str,f64
"""count""",65.0
"""null_count""",0.0
"""mean""",38.907692
"""std""",16.100973
"""min""",12.0
"""25%""",25.0
"""50%""",38.0
"""75%""",47.0
"""max""",76.0


In [163]:
batch_size = 5
overlap = 2

# NOTE(review): despite its name, `overlap` does not make neighbouring chunks
# share paragraphs. It only reduces the chunk size: each chunk holds up to
# (batch_size - overlap) consecutive paragraphs.
paragraphs_per_chunk = batch_size - overlap

for key in ["train", "validation"]:

    df = wiki_dataset[key].to_polars()
    df = df.with_columns(pl.col("text").map_elements(clean_text))

    # One row per non-empty line of every article. inclusive=True keeps the
    # trailing "\n" on each piece, so joining with "" restores the line breaks.
    exploded = (
        df
        .with_row_index("doc_id")
        .with_columns(pl.col("text").str.split("\n", inclusive=True).alias("paragraphs"))
        .explode("paragraphs")
        .filter(pl.col("paragraphs").str.strip_chars().str.len_chars() > 0)
    )

    # NOTE(review): row_id runs over all articles, not per article, so chunk
    # boundaries are aligned to the global row index. Grouping by doc_id as
    # well means the first or last chunk of an article can be shorter.
    result = (
        exploded
        .with_row_index("row_id")
        .with_columns((pl.col("row_id") // paragraphs_per_chunk).alias("chunk"))
        .group_by(["doc_id", "chunk"], maintain_order=True)
        .agg([
            pl.col("paragraphs").str.join("").alias("text")
        ])
        .sort(["doc_id", "chunk"])
        # maintain_order=True keeps the sorted order; without it unique()
        # returns rows in arbitrary order and the output is not reproducible.
        .unique("text", keep="first", maintain_order=True)
        .select(["text"])
    )
    new_data[key].append(result)

# Token statistics of the chunks from the last split processed ("validation").
result["text"].map_elements(count_tokens).describe()

statistic,value
str,f64
"""count""",326.0
"""null_count""",0.0
"""mean""",98.380368
"""std""",93.920733
"""min""",3.0
"""25%""",30.0
"""50%""",73.0
"""75%""",131.0
"""max""",748.0


In [166]:
batch_size = 6
overlap = 1

# NOTE(review): despite its name, `overlap` does not make neighbouring chunks
# share lines. It only reduces the chunk size: each chunk holds up to
# (batch_size - overlap) consecutive lines.
lines_per_chunk = batch_size - overlap

for key in ["train", "validation"]:

    df = poki_dataset[key].to_polars()
    df = df.with_columns(pl.col("text").map_elements(clean_text))

    # One row per line of every document. inclusive=True keeps the trailing
    # "\n" on each piece, so joining with "" restores the line breaks.
    # Lines that are empty after stripping, or longer than 1500 characters,
    # are discarded.
    exploded = (
        df
        .with_row_index("doc_id")
        .with_columns(pl.col("text").str.split("\n", inclusive=True).alias("sentences"))
        .explode("sentences")
        .filter(pl.col("sentences").str.strip_chars().str.len_chars().is_between(1, 1500))
    )

    # NOTE(review): row_id runs over all documents, not per document, so chunk
    # boundaries are aligned to the global row index. Grouping by doc_id as
    # well means the first or last chunk of a document can be shorter.
    result = (
        exploded
        .with_row_index("row_id")
        .with_columns((pl.col("row_id") // lines_per_chunk).alias("chunk"))
        .group_by(["doc_id", "chunk"], maintain_order=True)
        .agg([
            pl.col("sentences").str.join("").alias("text")
        ])
        .sort(["doc_id", "chunk"])
        # maintain_order=True keeps the sorted order; without it unique()
        # returns rows in arbitrary order and the output is not reproducible.
        .unique("text", keep="first", maintain_order=True)
        .select(["text"])
    )
    new_data[key].append(result)

# Token statistics of the chunks from the last split processed ("validation").
result["text"].map_elements(count_tokens).describe()

statistic,value
str,f64
"""count""",673.0
"""null_count""",0.0
"""mean""",119.540862
"""std""",111.71823
"""min""",4.0
"""25%""",43.0
"""50%""",86.0
"""75%""",160.0
"""max""",650.0


In [167]:
# lipu kule is loaded with a "train" split only, so it contributes to train alone.
# Each document is cleaned and added whole (no chunking).
for key in ["train"]:
    df = (
        kule_dataset[key]
        .to_polars()
        .with_columns(pl.col("text").map_elements(clean_text))
        # maintain_order=True keeps the original row order; plain unique()
        # returns rows in arbitrary order.
        .unique("text", keep="first", maintain_order=True)
    )
    new_data[key].append(df[["text"]])

In [168]:
# Sanity check: how many DataFrames were collected for each split.
for key in new_data.keys():
    print(key, len(new_data[key]))

train 5
validation 4


In [169]:
# Sanity check: (rows, columns) of every DataFrame collected for the train split.
for d in new_data["train"]:
    print(d.shape)

(74668, 4)
(1081, 4)
(7080, 1)
(9695, 1)
(42, 1)


In [170]:
languages = ["en", "ru", "vi"]

def make_pairs(seq):
    """Turn row mappings into flat training records.

    A row that has a "text" key is monolingual: it yields a single record with
    the text as ``source`` and ``None`` for both ``to`` and ``target``.

    Any other row is treated as parallel: for every code in ``languages`` whose
    value is truthy, two records are yielded, toki pona -> language first and
    language -> toki pona second. Languages with a missing or empty value are
    skipped.

    Every record has the keys "from", "to", "source" and "target".
    """
    for record in seq:
        if "text" in record:
            yield {"from": "tok", "to": None, "source": record["text"], "target": None}
        else:
            for lang in languages:
                translation = record[lang]
                if translation:
                    yield {"from": "tok", "to": lang, "source": record["tok"], "target": translation}
                    yield {"from": lang, "to": "tok", "source": translation, "target": record["tok"]}

In [171]:
# Preview the first record produced from the third train DataFrame
# (index 2 of new_data["train"]); the loop stops after one item.
for pair in make_pairs(new_data["train"][2].to_dicts()):
    print(pair)
    break

{'from': 'tok', 'to': None, 'source': 'pipi\npoka\npoki\n', 'target': None}


In [172]:
new_dataset = {"train": [], "validation": []}
dataset_dict = datasets.DatasetDict()

# Explicit string schema for the records produced by make_pairs. "to" and
# "target" are None for monolingual rows, so relying on inference from the
# leading rows would break for a split that starts with monolingual data.
pair_schema = {"from": pl.Utf8, "to": pl.Utf8, "source": pl.Utf8, "target": pl.Utf8}

for key, value in new_data.items():
    # Flatten every collected DataFrame of this split into make_pairs records.
    for df in value:
        new_dataset[key].extend(make_pairs(df.iter_rows(named=True)))

    data_df = pl.DataFrame(data=new_dataset[key], schema=pair_schema)
    dataset_dict[key] = datasets.Dataset.from_polars(data_df)

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['from', 'to', 'source', 'target'],
        num_rows: 471049
    })
    validation: Dataset({
        features: ['from', 'to', 'source', 'target'],
        num_rows: 24969
    })
})

In [177]:
# Distinct toki pona targets among the train records whose "to" is "tok".
dataset_dict["train"].to_polars().filter(pl.col("to") == "tok")["target"].unique()

target
str
"""tan kala lili la akesi li kama…"
"""ona mute li toki."""
"""jan seme li jan Kete?"""
"""mama sewi o pana e pona tawa m…"
"""mi pilin la mi sona la ni li s…"
…
"""jan Ton li lukin e sitelen taw…"
"""ona li kama sinpin lon poka pi…"
"""mi mute li wawa."""
"""ona li toki e ni: ona li awen …"


In [178]:
# Final check per split: token-count summary of "source", the number of
# records with a null "target", and the frame shape (the shape includes the
# added "tokens" column).
for key in dataset_dict.keys():
    display(key)
    tt = dataset_dict[key].to_polars()
    tt = tt.with_columns(tt["source"].map_elements(count_tokens).alias("tokens"))
    display(tt["tokens"].describe())
    display(tt["target"].null_count())
    display(tt.shape)

'train'

statistic,value
str,f64
"""count""",471049.0
"""null_count""",0.0
"""mean""",15.699114
"""std""",31.884079
"""min""",2.0
"""25%""",8.0
"""50%""",11.0
"""75%""",15.0
"""max""",2190.0


16817

(471049, 5)

'validation'

statistic,value
str,f64
"""count""",24969.0
"""null_count""",0.0
"""mean""",16.354159
"""std""",29.756315
"""min""",3.0
"""25%""",8.0
"""50%""",11.0
"""75%""",15.0
"""max""",748.0


999

(24969, 5)

In [179]:
# Push the combined DatasetDict to the "NetherQuartz/combined-tokipona-dataset"
# repo with the given commit message.
dataset_dict.push_to_hub("NetherQuartz/combined-tokipona-dataset", commit_message="Fix preprocessing")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md:   0%|          | 0.00/692 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/NetherQuartz/combined-tokipona-dataset/commit/bbc00ad3ae272c1b3913d8061cad38144d0f0323', commit_message='Fix preprocessing', commit_description='', oid='bbc00ad3ae272c1b3913d8061cad38144d0f0323', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/NetherQuartz/combined-tokipona-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='NetherQuartz/combined-tokipona-dataset'), pr_revision=None, pr_num=None)