In [15]:
import os
# --- Notebook configuration -------------------------------------------------
# Name under which the labeled dataset is registered in the workspace.
dataset_name = 'MyDatasetnew2'
# Task-type identifier; it is used as the lookup key into Labeling_Project_Type.
task_type = 'OBJECT_DETECTION'
# Absolute path of the "JsonLines" folder inside the current working directory.
jsonline_files_path = os.path.abspath('JsonLines')


In [16]:
import jsonlines
import pathlib

# Keys of the tag dictionary attached to the registered dataset.
TAG_CREATED_BY = 'labelingCreatedBy'
TAG_PROJECT_TYPE = 'labelingProjectType'
TAG_SOURCE_DATASTORE_NAME = 'SourceDatastoreName'
TAG_SOURCE_RELATIVE_PATH = 'SourceRelativePath'
TAG_LABEL_NAME = 'labelingLabelName'

# Task-type identifier -> project-type text stored under TAG_PROJECT_TYPE.
Labeling_Project_Type = dict(
    IMAGE_CLASSIFICATION='Image Classification Multi-class',
    IMAGE_MULTI_LABEL_CLASSIFICATION='Image Classification',
    OBJECT_DETECTION='Object Identification (Bounding Box)',
    IMAGE_INSTANCE_SEGMENTATION='Instance Segmentation',
    TEXT_CLASSIFICATION='Text Classification Multi-class',
    TEXT_MULTI_LABEL_CLASSIFICATION='Text Classification Multi-label',
    TEXT_NAMED_ENTITY_RECOGNITION='Text Named Entity Recognition',
)

# Record keys read from each JSON line, and the URL prefix that is stripped
# from image URLs before the datastore name is extracted.
LABEL_KEY = 'label'
URL_KEY = 'image_url'
URL_SCHEME = 'AmlDatastore:/'


class DatasetTagsGenerator:
    """Builds the tag dictionary for a labeled dataset from JSON Lines files.

    The generator walks a folder of ``.jsonl`` files, collects every label
    name it meets and derives the datastore name and the common image folder
    from the records' ``image_url`` values.
    """

    def __init__(self):
        # Distinct label names collected during the most recent scan.
        self.labelSet = set()
        self.tags = {TAG_CREATED_BY: 'Labeled Dataset Registration NoteBook (v.4)',
                     TAG_PROJECT_TYPE: None,
                     TAG_SOURCE_DATASTORE_NAME: None,
                     TAG_SOURCE_RELATIVE_PATH: None,
                     TAG_LABEL_NAME: []}

    def get_tags_from_jsonl_files(self, jsonl_file_folder: str, task_type: str) -> dict:
        """Scan every ``.jsonl`` file under a folder and return the dataset tags.

        Args:
            jsonl_file_folder: Folder that is walked recursively for ``.jsonl`` files.
            task_type: Key of ``Labeling_Project_Type`` naming the labeling task.

        Returns:
            The ``self.tags`` dictionary with project type, datastore name,
            source relative path (ending in ``/**``) and label names filled in.

        Raises:
            Exception: If the folder does not exist, if no record was found in
                it, or if no datastore name can be derived from the image URLs.
            KeyError: If ``task_type`` is not a key of ``Labeling_Project_Type``.
        """
        if not os.path.exists(jsonl_file_folder):
            raise Exception("JsonLine folder {} not found.".format(jsonl_file_folder))

        # Look the task type up before reading any file so a typo fails fast.
        project_type = Labeling_Project_Type[task_type]

        # Reset per-scan state: a previous call leaves the relative path in its
        # final "<folder>/**" form, which must not seed the next common-path search.
        self.labelSet = set()
        self.tags[TAG_SOURCE_RELATIVE_PATH] = None

        for root, _, files in os.walk(jsonl_file_folder):
            for file in files:
                if pathlib.PurePath(file).suffix == '.jsonl':
                    with jsonlines.open(os.path.join(root, file)) as reader:
                        for json_line in reader:
                            self._populate_label_names(json_line)
                            self._populate_source_relative_path(json_line[URL_KEY])

        common_folder = self.tags[TAG_SOURCE_RELATIVE_PATH]
        if common_folder is None:
            # Nothing was read: no .jsonl file, or only empty ones.
            raise Exception("No labeled records found in the .jsonl files under {}.".format(jsonl_file_folder))

        # Strip the URL scheme; the first remaining part is the datastore name.
        p = pathlib.PurePath(common_folder).relative_to(URL_SCHEME)
        if not p.parts:
            raise Exception("Could not derive a datastore name from the image urls; "
                            "they must all point into the same datastore.")

        self.tags[TAG_PROJECT_TYPE] = project_type
        self.tags[TAG_SOURCE_DATASTORE_NAME] = p.parts[0]
        self.tags[TAG_SOURCE_RELATIVE_PATH] = str(pathlib.PurePosixPath(*p.parts[1:])) + "/**"
        # Sorted so the tag value is reproducible across runs (set order is arbitrary).
        self.tags[TAG_LABEL_NAME] = sorted(self.labelSet, key=str)
        return self.tags

    def _populate_label_names(self, json_line: dict):
        """Add the label name(s) of one record to ``self.labelSet``.

        The record's label entry may be a single value, a list of values, or a
        list of dicts that each carry the name under the label key again.
        """
        labels = json_line[LABEL_KEY]
        if isinstance(labels, list):
            for label in labels:
                if isinstance(label, dict):
                    self.labelSet.add(label[LABEL_KEY])
                else:
                    self.labelSet.add(label)
        else:
            self.labelSet.add(labels)

    def _populate_source_relative_path(self, image_url: str):
        """Narrow the stored source path to the folder common to all image URLs seen."""
        # Compare folders, not files: otherwise a single record (or repeated
        # identical URLs) would leave the file name inside the "common" path.
        image_folder = image_url.rsplit('/', 1)[0]
        known_folder = self.tags[TAG_SOURCE_RELATIVE_PATH]
        candidates = [image_folder] if known_folder is None else [known_folder, image_folder]
        self.tags[TAG_SOURCE_RELATIVE_PATH] = os.path.commonpath(candidates)

In [17]:
# Connect to the AzureML workspace used for uploading and registering the dataset.
from azureml.core import Workspace
from azureml.core.authentication import InteractiveLoginAuthentication

# Authenticate with InteractiveLoginAuthentication for the given tenant.
# Replace every <...> placeholder below with the values of your own workspace.
interactive_auth = InteractiveLoginAuthentication(tenant_id="<TenantID>")
ws = Workspace.get(name='<workspace-name>',
            subscription_id='<sub-id>',
            resource_group='<resource-group>',
            location='<location>',
            cloud='AzureCloud',
            auth=interactive_auth
            )

In [18]:
# Default datastore of the workspace; the JSON Lines files are uploaded to it.
def_blob_store = ws.get_default_datastore()
# Target folder on the datastore: /Labeling/datasets/<dataset_name>.
path_on_datastore = pathlib.PurePosixPath('/', 'Labeling', 'datasets', dataset_name)
# Collect every .jsonl file below jsonline_files_path. Each file name is joined
# with the directory os.walk found it in (root), so files that live in
# sub-folders resolve to paths that actually exist.
jsonline_files = [
    os.path.join(root, file)
    for root, _, files in os.walk(jsonline_files_path)
    for file in files
    if pathlib.PurePath(file).suffix == '.jsonl'
]
dataset_source = def_blob_store.upload_files(jsonline_files, target_path = str(path_on_datastore), overwrite = True, show_progress = True)

Uploading an estimated of 1 files
Uploading c:\Users\varmag\Documents\AzureML-Playground\COCO-AzureMLDataset\JsonLines\LabeledData.jsonl
Uploaded c:\Users\varmag\Documents\AzureML-Playground\COCO-AzureMLDataset\JsonLines\LabeledData.jsonl, 1 files out of an estimated total of 1
Uploaded 1 files


In [19]:
from azureml.data.dataset_factory import TabularDatasetFactory, DataType
# Register the uploaded JSON Lines files as a tabular dataset, unless a dataset
# with the same name is already registered in the workspace.
if dataset_name in ws.datasets:
    # Bind the existing dataset so output_tabular_dataset is defined on both branches.
    output_tabular_dataset = ws.datasets.get(dataset_name)
    # Original (misspelled) variable name, kept so any cell relying on it still works.
    ouput_labeled_dataset = output_tabular_dataset
    print('Dataset "{}" has been registered in workspace "{}", please provide a different dataset name.'.format(dataset_name, ws.name))
else:
    # Derive the dataset tags (project type, datastore, relative path, labels)
    # from the local JSON Lines files.
    tagsGenerator = DatasetTagsGenerator()
    tags = tagsGenerator.get_tags_from_jsonl_files(jsonline_files_path, task_type)
    # The image_url column is typed with DataType.to_stream(ws) rather than left as-is.
    output_tabular_dataset = TabularDatasetFactory.from_json_lines_files(path = dataset_source, set_column_types = {'image_url': DataType.to_stream(ws)} )
    output_tabular_dataset = output_tabular_dataset.register(workspace = ws, name = dataset_name, tags = tags)

print('Done.')

Done.
