In [1]:
import datasets
import polars as pl

from hashlib import sha256

In [2]:
# Load the Wikipesija articles from the local parquet file and attach a
# 0-based running integer "id" (one per row, in file order).
data = pl.read_parquet("wikipesija.pq")
row_ids = pl.Series("id", range(data.height))

# Keep only the published columns, with "id" first.
data = data.with_columns(row_ids).select("id", "url", "title", "text")
data

id,url,title,text
i64,str,str,str
0,"""https://wikipesija.org/wiki/Z""","""Z""","""Z li sitelen pini lon nasin si…"
1,"""https://wikipesija.org/wiki/ku…","""kulupu_SCP""","""kulupu SCP anu kulupu Esipi li…"
2,"""https://wikipesija.org/wiki/po…","""poki_mani""","""lon poki mani la, jan li ken p…"
3,"""https://wikipesija.org/wiki/so…","""sona_pi_pu_taso""","""sona pi pu taso (toki Inli: pu…"
4,"""https://wikipesija.org/wiki/IS…","""ISO_3166:CR""","""ma Kosalika (Costa Rica) anu m…"
…,…,…,…
3058,"""https://wikipesija.org/wiki/ma…","""ma_Upe""","""ma Upe(toki Sonko:湖北省hu2 bei3 …"
3059,"""https://wikipesija.org/wiki/IS…","""ISO_3166:ER""","""ma Eliteja li ma lawa lon ma A…"
3060,"""https://wikipesija.org/wiki/ja…","""jaki""","""jaki li ijo ike. jan li wile w…"
3061,"""https://wikipesija.org/wiki/ja…","""jan_sewi_Asokasuntali""","""jan sewi Asokasuntali (toki Sa…"


In [3]:
def _text_bucket(text: str) -> int:
    """Map an article text to a stable bucket in the range 0..9999.

    The bucket is the SHA-256 digest of the UTF-8 encoded text, read as an
    integer, modulo 10000. It depends only on the text itself, so the same
    text always lands in the same bucket across runs and machines.
    """
    return int(sha256(text.encode()).hexdigest(), 16) % 10000


# Add the helper column "hash" used to split the data deterministically.
# `return_dtype` is given explicitly so polars does not have to infer the
# dtype from the first returned value (which also triggers a warning in
# current polars releases).
data = data.with_columns(
    data["text"]
        .map_elements(_text_bucket, return_dtype=pl.Int64)
        .alias("hash")
)

# Visual sanity check: the buckets should be spread roughly uniformly.
data["hash"].plot.hist()

In [4]:
# Rows whose "hash" bucket is <= this value go to validation, the rest to
# train. With buckets in 0..9999 this is 501 of 10000 buckets (about 5%).
VALIDATION_MAX_HASH = 500


def _to_split(rows: pl.DataFrame) -> datasets.Dataset:
    """Convert one split to a `datasets.Dataset` without the helper column.

    "hash" is dropped by name rather than by position, so the published
    features stay correct even if the column order of `data` changes.
    """
    return datasets.Dataset.from_polars(rows.drop("hash"))


dataset_dict = datasets.DatasetDict()
dataset_dict["train"] = _to_split(data.filter(data["hash"] > VALIDATION_MAX_HASH))
dataset_dict["validation"] = _to_split(data.filter(data["hash"] <= VALIDATION_MAX_HASH))

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 2917
    })
    validation: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 146
    })
})

In [5]:
# Upload every split in `dataset_dict` to this dataset repository on the
# Hugging Face Hub; the call's return value is shown as the cell output.
HUB_REPO_ID = "NetherQuartz/wikipesija"
dataset_dict.push_to_hub(repo_id=HUB_REPO_ID)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/NetherQuartz/wikipesija/commit/b7c0e1fad208224900b1a21f3e9d4d11fd16afef', commit_message='Upload dataset', commit_description='', oid='b7c0e1fad208224900b1a21f3e9d4d11fd16afef', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/NetherQuartz/wikipesija', endpoint='https://huggingface.co', repo_type='dataset', repo_id='NetherQuartz/wikipesija'), pr_revision=None, pr_num=None)