In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
import xml.etree.ElementTree as ET
from datasets import Dataset, DatasetDict, Features, Image, Value

load_dotenv()

In [None]:
# Notebook configuration.
# DATASET_DIR is the root folder that holds one sub-folder per split. It can be
# overridden with the NORHAND_DATASET_DIR environment variable (for example in
# the .env file read by load_dotenv()), so the notebook is not tied to a single
# machine's layout; the original location remains the default.
DATASET_DIR = os.environ.get(
    "NORHAND_DATASET_DIR",
    "/home/ubuntu/meta_hackathon_oslo/datasets/norhand_v3",
)
# Hugging Face Hub destination: the dataset is pushed to "<HF_ORG_NAME>/<HF_REPO_NAME>".
HF_ORG_NAME = "MykMaks"
HF_REPO_NAME = "norhand_v3"

In [None]:
def parse_xml_label(xml_file):
    """Extract the transcription text from a PAGE XML file.

    Looks up the first ``Page/TextRegion/TextEquiv/Unicode`` element in the
    PAGE ``2013-07-15`` namespace and returns its text content.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        str: Text of the first matching ``Unicode`` element, or ``""`` when
        no such element exists or the element has no text.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML
        (propagated from ``ET.parse``).
    """
    namespace = {'ns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}

    root = ET.parse(xml_file).getroot()
    unicode_elem = root.find(".//ns:Page/ns:TextRegion/ns:TextEquiv/ns:Unicode", namespace)

    if unicode_elem is None:
        return ""

    # An empty <Unicode/> element has ``text is None``; normalise it to "" so
    # callers always receive a string.
    return unicode_elem.text or ""

In [None]:
def create_dataframe(image_dir, label_dir):
    """Pair every image in ``image_dir`` with its PAGE XML label.

    An image is included only when ``label_dir`` contains an ``.xml`` file
    with the same base name. Images without a label file are skipped.

    Args:
        image_dir: Directory containing ``.png`` / ``.jpg`` / ``.jpeg`` files
            (the extension is matched case-insensitively).
        label_dir: Directory containing the matching ``<stem>.xml`` files.

    Returns:
        pandas.DataFrame: One row per labelled image with the columns
        ``image`` (file path), ``solution`` (parsed label),
        ``original_question``, ``original_answer``, ``question``,
        ``language`` and ``source``. The columns are present even when no
        image matched.

    Raises:
        FileNotFoundError: If ``image_dir`` does not exist (from ``os.listdir``).
    """
    columns = [
        'image',
        'solution',
        'original_question',
        'original_answer',
        'question',
        'language',
        'source',
    ]
    image_extensions = ('.png', '.jpg', '.jpeg')

    records = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and therefore the published dataset) reproducible across runs.
    for image_file in sorted(os.listdir(image_dir)):
        # Lower-case before matching so files such as "scan.PNG" are kept.
        if not image_file.lower().endswith(image_extensions):
            continue

        stem = os.path.splitext(image_file)[0]
        xml_file = os.path.join(label_dir, stem + '.xml')
        if not os.path.exists(xml_file):
            continue

        records.append({
            'image': os.path.join(image_dir, image_file),
            'solution': parse_xml_label(xml_file),
            'original_question': "",
            'original_answer': "",
            'question': "What's on this image?",
            'language': "no",
            'source': f"Zenodo/{HF_REPO_NAME}",
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=columns)


In [None]:
def create_dataset_dict(base_dir, splits=('train', 'val', 'test')):
    """Build a ``DatasetDict`` with one ``Dataset`` per split.

    For each split the images are read from ``<base_dir>/<split>/images`` and
    the PAGE XML labels from ``<base_dir>/<split>/page``.

    Args:
        base_dir: Root directory of the dataset on disk.
        splits: Names of the split sub-directories to load. Defaults to
            ``('train', 'val', 'test')``.

    Returns:
        DatasetDict: Maps each split name to the ``Dataset`` built from the
        rows returned by ``create_dataframe`` for that split.
    """
    # The schema is identical for every split, so build it once up front
    # instead of on every loop iteration.
    features = Features({
        'image': Image(),
        'solution': Value('string'),
        'original_question': Value('string'),
        'original_answer': Value('string'),
        'question': Value('string'),
        'language': Value('string'),
        'source': Value('string'),
    })

    dataset_dict = {}
    for split in splits:
        image_dir = os.path.join(base_dir, split, 'images')
        label_dir = os.path.join(base_dir, split, 'page')
        df = create_dataframe(image_dir, label_dir)
        dataset_dict[split] = Dataset.from_pandas(df, features=features)
    return DatasetDict(dataset_dict)

In [None]:
def push_to_hub(data_dict, repo_name, repo_owner):
    """Upload ``data_dict`` to the Hugging Face Hub.

    Args:
        data_dict: Object exposing ``push_to_hub(repo_id)``; the notebook
            passes the ``DatasetDict`` it built.
        repo_name: Name of the target repository.
        repo_owner: User or organisation that owns the repository.
    """
    # NOTE(review): no token is passed explicitly; this presumably relies on
    # credentials resolved by the library itself - confirm before running.
    repo_id = f"{repo_owner}/{repo_name}"
    data_dict.push_to_hub(repo_id)

In [None]:
# Create the DatasetDict by loading every split found under DATASET_DIR.
dataset_dict = create_dataset_dict(DATASET_DIR)

In [None]:
# Sanity check: display the first example of the "test" split before uploading.
dataset_dict["test"][0]

In [None]:
# Push the dataset to the Hugging Face Hub as "<HF_ORG_NAME>/<HF_REPO_NAME>".
push_to_hub(dataset_dict, repo_name=HF_REPO_NAME, repo_owner=HF_ORG_NAME)