## Azure Data

In [None]:
    # Load the Azure ML SDK entry points, plus `os` to read the storage key.
    import os

    from azureml.core import Workspace, Datastore

    # Load a configured workspace.
    ws = Workspace.from_config()

    # Read the storage account key from the environment instead of embedding
    # it in the notebook, so the secret never ends up in source control.
    # Set AZURE_STORAGE_ACCOUNT_KEY before running this cell; if it is missing,
    # a KeyError is raised here, before any registration call is made.
    storage_account_key = os.environ['AZURE_STORAGE_ACCOUNT_KEY']

    # Register a new blob-container datastore under the name 'blob_data'.
    blob_ds = Datastore.register_azure_blob_container(workspace=ws,
        datastore_name='blob_data',
        container_name='data_container',
        account_name='az_store_acct',
        account_key=storage_account_key)

In [None]:
# Print each entry yielded by iterating the workspace's datastores collection.
for name in ws.datastores:
    print(name)

In [None]:
# Fetch a reference to the datastore named 'blob_data' from the workspace.
blob_store = Datastore.get(workspace=ws, datastore_name='blob_data')

In [None]:
# Load the workspace's default datastore and keep a reference to it.
default_store = ws.get_default_datastore()

In [None]:
# Make the 'blob_data' datastore the workspace default.
ws.set_default_datastore(name='blob_data')

In [None]:
# Move files between the local machine and the blob datastore.

# Upload: send the local '/files' directory to '/data/files' in the
# datastore, with overwrite enabled and progress reporting on.
blob_ds.upload(
    src_dir='/files',
    target_path='/data/files',
    show_progress=True,
    overwrite=True,
)

# Download: fetch the datastore content under the '/data' prefix into the
# local 'downloads' folder, with progress reporting on.
blob_ds.download(
    prefix='/data',
    target_path='downloads',
    show_progress=True,
)

In [None]:
# Pass a datastore location to a training script.
# SKLearn is not imported in any of the visible cells, so import it here.
from azureml.train.sklearn import SKLearn

# Reference to 'data/files' in the datastore, set to download mode with
# 'training_data' as the path on the compute target.
data_ref = blob_ds.path('data/files').as_download(path_on_compute='training_data')

# The data reference is handed to the script through its --data_folder
# argument. (The original was missing the comma after entry_script, which
# made this cell a SyntaxError.)
estimator = SKLearn(source_directory='experiment_folder',
                    entry_script='training_script.py',
                    compute_target='local',
                    script_params={'--data_folder': data_ref})

In [None]:
# Script side: read the data folder passed on the command line and list it
# as an ordinary local directory.
import os
import argparse

# Declare the single --data_folder option; its value lands on args.data_folder.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--data_folder',
    dest='data_folder',
    type=str,
)
args = parser.parse_args()

# Names of the entries found in the supplied folder.
data_files = os.listdir(args.data_folder)

In [None]:
# Build a tabular (structured) dataset from CSV files and register it.
from azureml.core import Dataset

blob_ds = ws.get_default_datastore()

# Pair each CSV location (one file plus an archive wildcard) with the datastore.
csv_paths = [
    (blob_ds, csv_location)
    for csv_location in ('data/files/current_data.csv',
                         'data/files/archive/*.csv')
]

# Create the dataset from the delimited files, then register it in the
# workspace under the name 'csv_table'.
tab_ds = Dataset.Tabular.from_delimited_files(path=csv_paths)
tab_ds = tab_ds.register(name='csv_table', workspace=ws)

In [None]:
# Build a file (unstructured) dataset from JPG images and register it.
from azureml.core import Dataset

blob_ds = ws.get_default_datastore()

# A single (datastore, wildcard path) pair selects the .jpg files.
file_ds = Dataset.File.from_files(
    path=(blob_ds, 'data/files/images/*.jpg'),
)

# Register the dataset in the workspace under the name 'img_files'.
file_ds = file_ds.register(name='img_files', workspace=ws)

In [None]:
# Retrieve registered datasets in two different ways.
import azureml.core
from azureml.core import Workspace, Dataset

# Connect to the workspace using the saved config file.
ws = Workspace.from_config()

# Option 1: index the workspace's datasets collection by name.
ds1 = ws.datasets['csv_table']

# Option 2: look the dataset up by name through the Dataset class.
ds2 = Dataset.get_by_name(workspace=ws, name='img_files')

In [None]:
# Register a new version of 'img_files' that covers both JPG and PNG images.
img_paths = [
    (blob_ds, image_pattern)
    for image_pattern in ('data/files/images/*.jpg',
                          'data/files/images/*.png')
]
file_ds = Dataset.File.from_files(path=img_paths)

# create_new_version=True is passed so the registration under the existing
# name 'img_files' is requested as a new version.
file_ds = file_ds.register(
    workspace=ws,
    name='img_files',
    create_new_version=True,
)

In [None]:
# Retrieve version 2 of the 'img_files' dataset.
img_ds = Dataset.get_by_name(ws, 'img_files', version=2)

In [None]:
# Load the tabular dataset into a pandas DataFrame.
df = tab_ds.to_pandas_dataframe()
# Code that works with the DataFrame goes here.

In [None]:
# Print every path reported by the file dataset.
for path in file_ds.to_path():
    print(path)

In [None]:
# Give an experiment script access to a registered dataset.
# SKLearn is not imported in any of the visible cells, so import it here.
from azureml.train.sklearn import SKLearn

# The tabular dataset is passed as a named input ('csv_data').
# 'azureml-dataprep[pandas]' is added to the run's pip packages.
# (The original never closed the pip_packages list, which made this cell a
# SyntaxError.)
estimator = SKLearn(source_directory='experiment_folder',
                    entry_script='training_script.py',
                    compute_target='local',
                    inputs=[tab_ds.as_named_input('csv_data')],
                    pip_packages=['azureml-dataprep[pandas]'])