### This notebook converts the raw CSV datasets into Hugging Face datasets.

In [1]:
import polars as pl

# One-off snippet (kept disabled) that produced dataset-tiny.csv from the
# first 1000 rows of the training CSV:
#   df_full = pl.read_csv("../data/dataset-train.csv")
#   df_tiny = df_full.limit(1000)
#   df_tiny.write_csv("../data/dataset-tiny.csv")

# Read every split; the paths are listed in the same order as the target names.
df_tiny, df_train, df_test, df_test_mini = (
    pl.read_csv(csv_path)
    for csv_path in (
        "../data/dataset-tiny.csv",
        "../data/dataset-train.csv",
        "../data/dataset-test.csv",
        "../data/dataset-test-mini.csv",
    )
)

In [2]:
# Sketch of the nested structure built for each example: several input records
# grouped under the single output record they belong to.
# NOTE(review): the key names below are illustrative only; the keys actually
# emitted are the ones set in extract_sample / extract_input (e.g. "id",
# "datetime", "nujna"), not "OutputID" / "Input_ID".
# This bare string is the cell's last expression, so the notebook echoes it.
'''
Structure:
  ...
  Example:
    Inputs:
      ...
      Input_1:
        Input_ID:
        Input_Datetime:
        A1:
        B1:
        C1:
        ...
        
    Output:
      OutputID:
      Output_Datetime:
      Nujna:
      Nova:
      Programs:
      Content:

'''

'\nStructure:\n  ...\n  Example:\n    Inputs:\n      ...\n      Input_1:\n        Input_ID:\n        Input_Datetime:\n        A1:\n        B1:\n        C1:\n        ...\n\n    Output:\n      OutputID:\n      Output_Datetime:\n      Nujna:\n      Nova:\n      Programs:\n      Content:\n\n'

In [3]:
def extract_sample(row):
    """Start a new sample from one row: output fields filled in, no inputs yet.

    Args:
        row: Mapping of column name to value for a single row. Must provide
            'id_output', 'datetime_output', 'nujna', 'nova', 'programs' and
            'content'; any other columns are ignored here.

    Returns:
        A dict with two keys: "output" (the six output fields under the keys
        id, datetime, nujna, nova, programs, content) and "inputs" (a fresh
        empty list to be filled by the caller).
    """
    output_record = dict(
        id=row['id_output'],
        datetime=row['datetime_output'],
        nujna=row['nujna'],
        nova=row['nova'],
        programs=row['programs'],
        content=row['content'],
    )
    return dict(output=output_record, inputs=[])

def extract_input(row):
    """Collect the input-side fields of one row into a flat dict.

    Args:
        row: Mapping of column name to value for a single row. Must provide
            'id_input', 'datetime_input', 'A1', 'B1', 'C1' and, for every
            category listed below, the 'Title<Category>SLO' and
            'Content<Category>SLO' columns.

    Returns:
        A dict with keys "id", "datetime", "A1", "B1", "C1" followed by a
        "Title<Category>" / "Content<Category>" pair per category. The "SLO"
        suffix of the source columns is dropped from the output keys.
    """
    record = {
        "id": row['id_input'],
        "datetime": row['datetime_input'],
    }

    # Columns copied over under their own name.
    for column in ("A1", "B1", "C1"):
        record[column] = row[column]

    # Each category contributes a title and a content field, read from the
    # matching '...SLO' column.
    categories = (
        "Pomembno",
        "Nesrece",
        "Zastoji",
        "Vreme",
        "Ovire",
        "DeloNaCesti",
        "Opozorila",
        "MednarodneInformacije",
        "Splosno",
    )
    for category in categories:
        for prefix in ("Title", "Content"):
            field = prefix + category
            record[field] = row[field + "SLO"]

    return record

def find_sample_index(samples, id):
    """Return the position of the first sample whose output id equals ``id``.

    Args:
        samples: Sequence of sample dicts, each with an ``['output']['id']``.
        id: The output id to look for (compared with ``==``).

    Returns:
        The index of the first match, or ``None`` when no sample matches.
    """
    matching_positions = (
        position
        for position, sample in enumerate(samples)
        if sample['output']['id'] == id
    )
    return next(matching_positions, None)

def build_dataset(df):
    """Group the flat rows of ``df`` into one sample per distinct ``id_output``.

    Each row contributes one input record. The first row seen for a given
    ``id_output`` also creates the sample and supplies its output fields.

    Args:
        df: A frame exposing ``iter_rows(named=True)`` that yields one
            column-name -> value mapping per row, with the columns read by
            ``extract_sample`` and ``extract_input``.

    Returns:
        A list of sample dicts (``{"output": {...}, "inputs": [...]}``) in
        order of first appearance of each ``id_output``.

    Note:
        Samples are indexed by ``id_output`` in a dict, so each row is placed
        in constant time instead of rescanning the whole sample list. This
        assumes the ids are hashable values such as ints or strings.
    """
    samples = []
    # id_output -> the sample dict already stored in `samples`.
    sample_by_output_id = {}

    for row in df.iter_rows(named=True):
        output_id = row['id_output']
        sample = sample_by_output_id.get(output_id)
        if sample is None:
            # First row for this output: create the sample and register it.
            sample = extract_sample(row)
            sample_by_output_id[output_id] = sample
            samples.append(sample)

        sample['inputs'].append(extract_input(row))

    return samples



In [4]:
from datasets import Dataset

# Group each split's flat rows into nested samples (one per output id).
d_tiny, d_test, d_test_mini, d_train = (
    build_dataset(frame)
    for frame in (df_tiny, df_test, df_test_mini, df_train)
)

# Wrap the lists of samples as Hugging Face datasets, in the same split order.
ds_tiny, ds_test, ds_test_mini, ds_train = (
    Dataset.from_list(split_samples)
    for split_samples in (d_tiny, d_test, d_test_mini, d_train)
)

In [5]:
# Debug peek (disabled): print(d_tiny[0]["inputs"][0]["B1"])

In [6]:
# Sanity check: display the tiny dataset's summary (feature names and row count).
ds_tiny

Dataset({
    features: ['output', 'inputs'],
    num_rows: 183
})

In [7]:
# Persist every split to its own directory under ../data/hf, in the same
# order as before: tiny, test, test-mini, train.
for dataset, target_dir in (
    (ds_tiny, "../data/hf/dataset-tiny"),
    (ds_test, "../data/hf/dataset-test"),
    (ds_test_mini, "../data/hf/dataset-test-mini"),
    (ds_train, "../data/hf/dataset-train"),
):
    dataset.save_to_disk(target_dir)

Saving the dataset (0/1 shards):   0%|          | 0/183 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4625 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/232 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18499 [00:00<?, ? examples/s]