In [1]:
import json

### 1. Create the structure of the dataset (context_qa type)

In [2]:
def get_dictonary(c,q):
    """Build one context_qa record from a context string and a Q/A mapping.

    Args:
        c: Context text; it is wrapped as ``table(<c>)``.
        q: Mapping that provides the ``'question'`` and ``'answer'`` entries.

    Returns:
        A dict with the keys ``context``, ``question`` and ``answer`` (in that
        order).

    Raises:
        KeyError: If ``q`` lacks ``'question'`` or ``'answer'``.
    """
    wrapped_context = "table(" + c + ")"
    record = dict(context=wrapped_context)
    # Copy the two Q/A fields over unchanged, question first.
    for field in ("question", "answer"):
        record[field] = q[field]
    return record

### 2. Synthetic data
A simple example: to automate the process, you should randomly generate object poses.

In [3]:
import random

def _pick_and_place_answer(source, target, x_offset):
    """Return the motion script that picks ``source`` and releases it at ``target``.

    Offsets are written symbolically into the text (e.g. ``z: 12+30``), they are
    not added numerically. ``x_offset`` is the value written after the target x.
    """
    x_a, y_a, z_a = source['x'], source['y'], source['z']
    x_c, y_c, z_c = target['x'], target['y'], target['z']
    return (
        f"go to x: {x_a}, y: {y_a}, z: {z_a}+30; "
        f"go to x: {x_a}, y: {y_a}, z: {z_a}; "
        "close the gripper; "
        f"go to x: {x_c}+{x_offset}, y: {y_c}, z: {z_c}+10; "
        "open the gripper; "
        "go home;"
    )


def generate_dataset_and_qa(n_samples=10000, seed=None):
    """Generate synthetic pick-and-place samples (context, question, answer).

    Every sample places all eight objects at random integer positions and then
    draws one of three task types:

    * ``place_next_to``: pick a fruit and put it next to a *different* fruit.
    * ``place_into``: pick a fruit and put it into a recipient.
    * ``invalid_task``: the object to pick is a recipient, so the answer is a
      refusal naming that recipient.

    Only fruits are picked in the valid tasks. This keeps the labels consistent
    with the refusal text: the same question never receives both a motion
    script and a refusal.

    Args:
        n_samples: Number of samples to generate (default 10000).
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible; when ``None`` the global
            ``random`` module state is used.

    Returns:
        A list of dicts with the keys ``context``, ``question`` and ``answer``.
    """
    # The ``random`` module exposes the same API as a ``Random`` instance.
    rng = random if seed is None else random.Random(seed)

    # Define object names
    fruit = ["apple", "banana", "pear", "melon"]
    recipient = ["cup", "bowl", "box", "basket"]
    object_names = fruit + recipient

    # Final list of generated samples
    questions_and_answers = []

    for _ in range(n_samples):
        # Generate a position for each object
        objects = [
            {
                "name": name,
                "x": rng.randint(-500, 500),
                "y": rng.randint(-500, 500),
                "z": rng.randint(0, 500),
            }
            for name in object_names
        ]
        fruits_in_scene = [obj for obj in objects if obj['name'] in fruit]
        recipients_in_scene = [obj for obj in objects if obj['name'] in recipient]

        # Textual scene description shared by all task types
        context = "; ".join(f"{obj['name']} x: {obj['x']}, y: {obj['y']}, z: {obj['z']}" for obj in objects) + ";"

        # Randomly assign the task type based on the required proportions
        task_type = rng.choices(
            ["place_next_to", "place_into", "invalid_task"], weights=[0.33, 0.34, 0.33], k=1
        )[0]

        if task_type == "place_next_to":
            # Pick a fruit and place it next to another (different) fruit.
            obj1 = rng.choice(fruits_in_scene)
            obj2 = rng.choice([obj for obj in fruits_in_scene if obj is not obj1])
            q = f"Pick the object {obj1['name']} and place next to the {obj2['name']}."
            a = _pick_and_place_answer(obj1, obj2, x_offset=10)

        elif task_type == "place_into":
            # Pick a fruit and drop it into a recipient.
            obj1 = rng.choice(fruits_in_scene)
            obj2 = rng.choice(recipients_in_scene)
            q = f"Pick the object {obj1['name']} and place it in the {obj2['name']}."
            a = _pick_and_place_answer(obj1, obj2, x_offset=0)

        else:  # "invalid_task"
            # The object to pick must really be a recipient, otherwise the
            # refusal below would be a false statement about a fruit.
            obj1 = rng.choice(recipients_in_scene)
            obj2 = rng.choice(recipients_in_scene)
            q = f"Pick the object {obj1['name']} and place it in the {obj2['name']}."
            a = f"This task cannot be performed as {obj1['name']} is a recipient."

        # Store the sample
        questions_and_answers.append({
            "context": context,
            "question": q,
            "answer": a
            }
        )

    return questions_and_answers

# Generate the synthetic samples once; the result is a list of
# {"context", "question", "answer"} dicts.
final_data = generate_dataset_and_qa()

In [4]:
# Write the generated samples to disk as pretty-printed JSON (4-space indent).
with open('data_2.json', 'w') as out_file:
    out_file.write(json.dumps(final_data, indent=4))


# Read the file back and parse it into `data`.
with open('data_2.json', 'r') as in_file:
    data = json.loads(in_file.read())

### 5. Log in to Hugging Face

In [1]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hard-code the access token in the notebook: read it from the HF_TOKEN
# environment variable, or ask for it interactively without echoing it.
# NOTE: any token that was ever committed in a notebook must be treated as
# leaked and revoked in the Hugging Face account settings.
access_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=access_token)

  from .autonotebook import tqdm as notebook_tqdm


### 6. Upload the dataset to HuggingFace

In [2]:
import datasets
from datasets import load_dataset

# Local JSON file to read and the Hub repository that receives the dataset.
json_path = 'data_2.json'
hub_repo = "endritnazifi/test_lab_dataset2"

# Load the JSON file with the 'json' builder, then push the result to the Hub.
dataset = load_dataset('json', data_files=json_path)
dataset.push_to_hub(hub_repo)

Generating train split: 10000 examples [00:00, 259189.24 examples/s]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 1111.07ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.00s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/endritnazifi/test_lab_dataset2/commit/687d3f78a9fe44e18b355da9d4b9595b161fb78b', commit_message='Upload dataset', commit_description='', oid='687d3f78a9fe44e18b355da9d4b9595b161fb78b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/endritnazifi/test_lab_dataset2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='endritnazifi/test_lab_dataset2'), pr_revision=None, pr_num=None)