In [1]:
from IPython.display import display

In [2]:
import datasets
import polars as pl

tatoeba_dataset = datasets.load_dataset("NetherQuartz/tatoeba-tokipona")
tatoeba_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'source_id', 'target_id', 'source', 'source_lang', 'tok', 'en', 'ru', 'uk', 'be', 'fr', 'es', 'pt', 'it', 'de', 'vi', 'ja', 'zh', 'ko', 'ar', 'he', 'pl', 'tr', 'la', 'el'],
        num_rows: 74668
    })
    validation: Dataset({
        features: ['id', 'source_id', 'target_id', 'source', 'source_lang', 'tok', 'en', 'ru', 'uk', 'be', 'fr', 'es', 'pt', 'it', 'de', 'vi', 'ja', 'zh', 'ko', 'ar', 'he', 'pl', 'tr', 'la', 'el'],
        num_rows: 3930
    })
})

In [3]:
sewi_dataset = datasets.load_dataset("NetherQuartz/lipu-sewi")
sewi_dataset

DatasetDict({
    train: Dataset({
        features: ['key', 'part', 'book', 'chapter', 'verse', 'tok', 'en', 'ru', 'uk', 'be', 'fr', 'es', 'pt', 'it', 'de', 'vi', 'ja', 'zh', 'ko', 'ar', 'he', 'pl', 'tr', 'la', 'el'],
        num_rows: 1081
    })
    validation: Dataset({
        features: ['key', 'part', 'book', 'chapter', 'verse', 'tok', 'en', 'ru', 'uk', 'be', 'fr', 'es', 'pt', 'it', 'de', 'vi', 'ja', 'zh', 'ko', 'ar', 'he', 'pl', 'tr', 'la', 'el'],
        num_rows: 65
    })
})

In [4]:
wiki_dataset = datasets.load_dataset("NetherQuartz/wikipesija")
wiki_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 2917
    })
    validation: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 146
    })
})

In [5]:
poki_dataset = datasets.load_dataset("NetherQuartz/poki-lapo")
poki_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'description', 'authors', 'proofreaders', 'date', 'date-precision', 'tags', 'original', 'license', 'sources', 'archives', 'preprocessing', 'accessibility-notes', 'notes', 'text'],
        num_rows: 1433
    })
    validation: Dataset({
        features: ['id', 'title', 'description', 'authors', 'proofreaders', 'date', 'date-precision', 'tags', 'original', 'license', 'sources', 'archives', 'preprocessing', 'accessibility-notes', 'notes', 'text'],
        num_rows: 69
    })
})

In [6]:
kule_dataset = datasets.load_dataset("NetherQuartz/lipu-kule")
kule_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'text'],
        num_rows: 42
    })
})

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("NetherQuartz/tatoeba-tok-multi-gemma-2-2b-merged")

def count_tokens(s: str) -> int:
    return tokenizer(s, return_tensors="pt")["input_ids"].shape[1]

count_tokens("hi")

2

In [8]:
df = tatoeba_dataset["train"].to_polars()
for col in df.columns[5:]:
    display(col)
    display(df[col].map_elements(count_tokens).describe())

'tok'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",14.188434
"""std""",6.710767
"""min""",3.0
"""25%""",10.0
"""50%""",13.0
"""75%""",17.0
"""max""",189.0


'en'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",8.82532
"""std""",3.698366
"""min""",3.0
"""25%""",7.0
"""50%""",8.0
"""75%""",10.0
"""max""",86.0


'ru'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",9.929716
"""std""",4.531394
"""min""",3.0
"""25%""",7.0
"""50%""",9.0
"""75%""",11.0
"""max""",126.0


'uk'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",11.933452
"""std""",5.503235
"""min""",3.0
"""25%""",9.0
"""50%""",11.0
"""75%""",14.0
"""max""",143.0


'be'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",14.574302
"""std""",7.049529
"""min""",2.0
"""25%""",10.0
"""50%""",13.0
"""75%""",17.0
"""max""",187.0


'fr'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",10.583784
"""std""",4.732388
"""min""",3.0
"""25%""",8.0
"""50%""",10.0
"""75%""",12.0
"""max""",140.0


'es'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",9.000656
"""std""",3.873709
"""min""",2.0
"""25%""",7.0
"""50%""",8.0
"""75%""",10.0
"""max""",122.0


'pt'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",9.452871
"""std""",4.117414
"""min""",3.0
"""25%""",7.0
"""50%""",9.0
"""75%""",11.0
"""max""",146.0


'it'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",9.499652
"""std""",4.231946
"""min""",3.0
"""25%""",7.0
"""50%""",9.0
"""75%""",11.0
"""max""",122.0


'de'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",9.625154
"""std""",4.243031
"""min""",2.0
"""25%""",7.0
"""50%""",9.0
"""75%""",11.0
"""max""",94.0


'vi'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",10.643502
"""std""",4.79255
"""min""",2.0
"""25%""",8.0
"""50%""",10.0
"""75%""",12.0
"""max""",117.0


'ja'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",9.109444
"""std""",4.03968
"""min""",2.0
"""25%""",7.0
"""50%""",8.0
"""75%""",10.0
"""max""",131.0


'zh'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",8.070057
"""std""",3.537622
"""min""",2.0
"""25%""",6.0
"""50%""",7.0
"""75%""",9.0
"""max""",93.0


'ko'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",13.189318
"""std""",5.557673
"""min""",2.0
"""25%""",10.0
"""50%""",12.0
"""75%""",15.0
"""max""",135.0


'ar'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",10.730032
"""std""",5.043661
"""min""",2.0
"""25%""",8.0
"""50%""",10.0
"""75%""",12.0
"""max""",134.0


'he'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",11.647694
"""std""",5.437531
"""min""",2.0
"""25%""",8.0
"""50%""",11.0
"""75%""",13.0
"""max""",145.0


'pl'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",10.822521
"""std""",4.910211
"""min""",2.0
"""25%""",8.0
"""50%""",10.0
"""75%""",13.0
"""max""",137.0


'tr'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",10.274669
"""std""",4.534276
"""min""",3.0
"""25%""",8.0
"""50%""",9.0
"""75%""",12.0
"""max""",102.0


'la'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",9.941126
"""std""",4.314012
"""min""",2.0
"""25%""",7.0
"""50%""",9.0
"""75%""",11.0
"""max""",112.0


'el'

statistic,value
str,f64
"""count""",74668.0
"""null_count""",0.0
"""mean""",15.514076
"""std""",7.216036
"""min""",3.0
"""25%""",11.0
"""50%""",14.0
"""75%""",18.0
"""max""",178.0


In [9]:
df = sewi_dataset["train"].to_polars()
for col in df.columns[5:]:
    display(col)
    display(df[col].map_elements(count_tokens).describe())

'tok'

statistic,value
str,f64
"""count""",1081.0
"""null_count""",0.0
"""mean""",43.013876
"""std""",19.159728
"""min""",8.0
"""25%""",29.0
"""50%""",40.0
"""75%""",54.0
"""max""",133.0


'en'

statistic,value
str,f64
"""count""",1072.0
"""null_count""",9.0
"""mean""",28.853545
"""std""",12.458109
"""min""",6.0
"""25%""",19.0
"""50%""",27.0
"""75%""",36.0
"""max""",98.0


'ru'

statistic,value
str,f64
"""count""",971.0
"""null_count""",110.0
"""mean""",33.533471
"""std""",14.78895
"""min""",5.0
"""25%""",22.0
"""50%""",31.0
"""75%""",43.0
"""max""",116.0


'uk'

statistic,value
str,f64
"""count""",1074.0
"""null_count""",7.0
"""mean""",39.798883
"""std""",17.295862
"""min""",8.0
"""25%""",26.0
"""50%""",37.0
"""75%""",51.0
"""max""",122.0


'be'

statistic,value
str,f64
"""count""",1076.0
"""null_count""",5.0
"""mean""",45.618959
"""std""",19.92511
"""min""",6.0
"""25%""",30.0
"""50%""",42.0
"""75%""",59.0
"""max""",164.0


'fr'

statistic,value
str,f64
"""count""",1080.0
"""null_count""",1.0
"""mean""",36.099074
"""std""",15.535913
"""min""",6.0
"""25%""",24.0
"""50%""",34.0
"""75%""",46.0
"""max""",114.0


'es'

statistic,value
str,f64
"""count""",1072.0
"""null_count""",9.0
"""mean""",31.58209
"""std""",13.590543
"""min""",7.0
"""25%""",21.0
"""50%""",30.0
"""75%""",40.0
"""max""",121.0


'pt'

statistic,value
str,f64
"""count""",1069.0
"""null_count""",12.0
"""mean""",32.973807
"""std""",13.671071
"""min""",7.0
"""25%""",22.0
"""50%""",32.0
"""75%""",41.0
"""max""",120.0


'it'

statistic,value
str,f64
"""count""",1069.0
"""null_count""",12.0
"""mean""",37.210477
"""std""",16.15218
"""min""",5.0
"""25%""",25.0
"""50%""",35.0
"""75%""",48.0
"""max""",133.0


'de'

statistic,value
str,f64
"""count""",1077.0
"""null_count""",4.0
"""mean""",31.504178
"""std""",13.429467
"""min""",7.0
"""25%""",21.0
"""50%""",30.0
"""75%""",40.0
"""max""",102.0


'vi'

statistic,value
str,f64
"""count""",1069.0
"""null_count""",12.0
"""mean""",38.809167
"""std""",16.54828
"""min""",8.0
"""25%""",26.0
"""50%""",37.0
"""75%""",49.0
"""max""",133.0


'ja'

statistic,value
str,f64
"""count""",199.0
"""null_count""",882.0
"""mean""",30.351759
"""std""",10.914585
"""min""",7.0
"""25%""",23.0
"""50%""",30.0
"""75%""",37.0
"""max""",63.0


'zh'

statistic,value
str,f64
"""count""",1069.0
"""null_count""",12.0
"""mean""",27.115996
"""std""",11.041953
"""min""",4.0
"""25%""",19.0
"""50%""",25.0
"""75%""",34.0
"""max""",83.0


'ko'

statistic,value
str,f64
"""count""",1067.0
"""null_count""",14.0
"""mean""",39.779756
"""std""",16.692148
"""min""",3.0
"""25%""",27.0
"""50%""",37.0
"""75%""",51.0
"""max""",119.0


'ar'

statistic,value
str,f64
"""count""",1069.0
"""null_count""",12.0
"""mean""",88.540692
"""std""",40.177573
"""min""",10.0
"""25%""",56.0
"""50%""",83.0
"""75%""",116.0
"""max""",340.0


'he'

statistic,value
str,f64
"""count""",1077.0
"""null_count""",4.0
"""mean""",28.31662
"""std""",11.432891
"""min""",6.0
"""25%""",19.0
"""50%""",27.0
"""75%""",36.0
"""max""",97.0


'pl'

statistic,value
str,f64
"""count""",1069.0
"""null_count""",12.0
"""mean""",36.885875
"""std""",15.604129
"""min""",8.0
"""25%""",24.0
"""50%""",35.0
"""75%""",47.0
"""max""",132.0


'tr'

statistic,value
str,f64
"""count""",1069.0
"""null_count""",12.0
"""mean""",34.767072
"""std""",14.571758
"""min""",4.0
"""25%""",23.0
"""50%""",33.0
"""75%""",45.0
"""max""",120.0


'la'

statistic,value
str,f64
"""count""",807.0
"""null_count""",274.0
"""mean""",33.174721
"""std""",15.067193
"""min""",5.0
"""25%""",21.0
"""50%""",30.0
"""75%""",43.0
"""max""",125.0


'el'

statistic,value
str,f64
"""count""",974.0
"""null_count""",107.0
"""mean""",60.069815
"""std""",25.923384
"""min""",7.0
"""25%""",39.0
"""50%""",56.0
"""75%""",78.0
"""max""",173.0


In [10]:
df = wiki_dataset["train"].to_polars()
df["text"].map_elements(count_tokens).describe()

statistic,value
str,f64
"""count""",2917.0
"""null_count""",0.0
"""mean""",226.772369
"""std""",509.421358
"""min""",4.0
"""25%""",58.0
"""50%""",113.0
"""75%""",228.0
"""max""",15212.0


In [11]:
poki_dataset["train"].to_polars()["text"].map_elements(count_tokens).describe()

statistic,value
str,f64
"""count""",1433.0
"""null_count""",0.0
"""mean""",1237.900907
"""std""",16624.123236
"""min""",1.0
"""25%""",149.0
"""50%""",361.0
"""75%""",735.0
"""max""",618874.0


In [12]:
kule_dataset["train"].to_polars()["text"].map_elements(count_tokens).describe()

statistic,value
str,f64
"""count""",42.0
"""null_count""",0.0
"""mean""",801.738095
"""std""",537.579764
"""min""",27.0
"""25%""",399.0
"""50%""",756.0
"""75%""",1071.0
"""max""",2224.0


In [13]:
new_data = {
    "train": [],
    "validation": []
}

In [14]:
new_data

{'train': [], 'validation': []}

In [15]:
batch_size = 5
overlap = 2

for key in ["train", "validation"]:
    df = tatoeba_dataset[key].to_polars()[["tok", "en", "ru", "vi"]].drop_nulls()
    conc_df = df\
        .with_row_index("row_id")\
        .with_columns([
            ((pl.col("row_id") - pl.col("row_id") % (batch_size - overlap)) // (batch_size - overlap))
            .alias("chunk")
        ])\
        .group_by(["chunk"], maintain_order=True)\
        .agg([
            pl.col(col).str.join(" ").alias(col)
            for col in df.columns
        ])
    new_data[key].append(conc_df[["tok", "en", "ru", "vi"]])
#     break

# display(conc_df.shape)
# for col in df.columns:
#     display(col)
#     display(conc_df[col].map_elements(count_tokens).describe())

In [16]:
batch_size = 5
overlap = 3

for key in ["train", "validation"]:
    df = sewi_dataset[key].to_polars()["key", "part", "book", "chapter", "verse", "tok", "en", "ru", "vi"].drop_nulls()

    conc_df = df\
        .sort(["part", "book", "chapter", "verse"])\
        .with_row_index("row_id")\
        .with_columns((pl.col("row_id") - pl.col("row_id") % (batch_size - overlap)).alias("chunk"))\
        .group_by(["chunk"], maintain_order=True)\
        .agg([
            pl.col(col).str.join(" ").alias(col)
            for col in df.columns[5:]
        ])
    new_data[key].append(conc_df[["tok", "en", "ru", "vi"]])

# for col in df.columns[5:]:
#     display(col)
#     display(conc_df[col].map_elements(count_tokens).describe())


In [17]:
batch_size = 6
overlap = 3

for key in ["train", "validation"]:

    df = wiki_dataset[key].to_polars()

    exploded = (
        df
        .with_row_index("doc_id")
        .with_columns(pl.col("text").str.split("\n").alias("paragraphs"))
        .explode("paragraphs")
    )

    result = (
        exploded
        .with_row_index("row_id")
        .with_columns([
            ((pl.col("row_id") - pl.col("row_id") % (batch_size - overlap)) // (batch_size - overlap))
            .alias("chunk")
        ])
        .group_by(["doc_id", "chunk"], maintain_order=True)
        .agg([
            pl.col("paragraphs").str.join("\n").alias("text_concat")
        ])
        .sort(["doc_id", "chunk"])
        .select(["doc_id", "text_concat"])
        .rename({"text_concat": "text"})
    )
    new_data[key].append(result[["text"]])

result["text"].map_elements(count_tokens).describe()

statistic,value
str,f64
"""count""",326.0
"""null_count""",0.0
"""mean""",98.070552
"""std""",94.17838
"""min""",3.0
"""25%""",29.0
"""50%""",73.0
"""75%""",133.0
"""max""",751.0


In [18]:
batch_size = 10
overlap = 5

for key in ["train", "validation"]:

    df = poki_dataset[key].to_polars()

    exploded = (
        df
        .with_row_index("doc_id")
        .with_columns(pl.col("text").str.split(". ", inclusive=True).alias("sentences"))
        .explode("sentences")
    )

    exploded = exploded.with_columns(exploded["sentences"].map_elements(count_tokens).alias("tokens"))

    exploded = exploded.filter(exploded["tokens"] < 1000)
    exploded = exploded.with_columns(exploded["text"].str.strip_chars_start(" \n"))
    exploded = exploded.filter(exploded["text"].str.len_chars() > 3)

    result = (
        exploded
        .with_row_index("row_id")
        .with_columns([
            ((pl.col("row_id") - pl.col("row_id") % (batch_size - overlap)) // (batch_size - overlap))
            .alias("chunk")
        ])
        .group_by(["doc_id", "chunk"], maintain_order=True)
        .agg([
            pl.col("sentences").str.join("").alias("text_concat")
        ])
        .sort(["doc_id", "chunk"])
        .select(["doc_id", "text_concat"])
        .rename({"text_concat": "text"})
    )
    new_data[key].append(result[["text"]])

result["text"].map_elements(count_tokens).describe()

statistic,value
str,f64
"""count""",721.0
"""null_count""",0.0
"""mean""",112.510402
"""std""",82.323111
"""min""",5.0
"""25%""",61.0
"""50%""",89.0
"""75%""",139.0
"""max""",797.0


In [19]:
for key in ["train"]:
    df = kule_dataset[key].to_polars()
    new_data[key].append(df[["text"]])

In [20]:
for key in new_data.keys():
    print(key, len(new_data[key]))

train 5
validation 4


In [27]:
languages = ["en", "ru", "vi"]

def make_pairs(seq):
    for row in seq:
        if "text" in row.keys():
            yield {"from": "tok", "to": None, "source": row["text"], "target": None}
            continue

        for code in languages:
            if not row[code]:
                continue
            yield {"from": "tok", "to": code, "source": row["tok"], "target": row[code]}
            yield {"from": code, "to": "tok", "source": row[code], "target": row["tok"]}

In [30]:
for pair in make_pairs(new_data["train"][2].to_dicts()):
    print(pair)
    break

{'from': 'tok', 'to': None, 'source': 'Z li sitelen pini lon nasin sitelen Lasina.', 'target': None}


In [31]:
new_dataset = {"train": [], "validation": []}
dataset_dict = datasets.DatasetDict()

for key, value in new_data.items():
    for df in value:
        for row in make_pairs(df.iter_rows(named=True)):
            new_dataset[key].append(row)

    data_df = pl.DataFrame(data=new_dataset[key])
    dataset_dict[key] = datasets.Dataset.from_polars(data_df)

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['from', 'to', 'source', 'target'],
        num_rows: 168981
    })
    validation: Dataset({
        features: ['from', 'to', 'source', 'target'],
        num_rows: 9105
    })
})

In [33]:
for key in dataset_dict.keys():
    display(key)
    tt = dataset_dict[key].to_polars()
    tt = tt.with_columns(tt["source"].map_elements(count_tokens).alias("tokens"))
    display(tt["tokens"].describe())
    display(tt["target"].null_count())
    display(tt.shape)

'train'

statistic,value
str,f64
"""count""",168981.0
"""null_count""",0.0
"""mean""",42.227878
"""std""",42.193817
"""min""",2.0
"""25%""",26.0
"""50%""",33.0
"""75%""",44.0
"""max""",2224.0


16755

(168981, 5)

'validation'

statistic,value
str,f64
"""count""",9105.0
"""null_count""",0.0
"""mean""",43.126085
"""std""",39.481608
"""min""",3.0
"""25%""",26.0
"""50%""",34.0
"""75%""",45.0
"""max""",797.0


1047

(9105, 5)

In [34]:
dataset_dict.push_to_hub("NetherQuartz/combined-tokipona-dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md:   0%|          | 0.00/683 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/NetherQuartz/combined-tokipona-dataset/commit/6dbcb66b0995936f615a37f747b23caaa3e960cf', commit_message='Upload dataset', commit_description='', oid='6dbcb66b0995936f615a37f747b23caaa3e960cf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/NetherQuartz/combined-tokipona-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='NetherQuartz/combined-tokipona-dataset'), pr_revision=None, pr_num=None)