# Accessing Delta tables from Azure ML compute

This notebook walks through how to query Delta tables written by Azure Databricks from within an Azure ML compute instance, using a Datastore and a FileDataset.

- [Create a FileDataset](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-create-register-datasets#create-a-filedataset)
- [Register datasets](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-create-register-datasets#register-datasets)
- [Mount vs. Download](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-with-datasets#mount-vs-download)

In [None]:
import azureml.core
from azureml.core import Workspace, Datastore, Dataset
import pandas as pd
import tempfile
import shutil
from tempfile import TemporaryDirectory
import os

# Connect to the Azure ML workspace and show which one was picked up
ws = Workspace.from_config()
print(ws, end="\n\n")

# Print the name and storage type of every datastore registered in the workspace
datastores = ws.datastores
for ds_name in datastores:
    print(ds_name, datastores[ds_name].datastore_type)

# Look up the datastore that holds the Delta table by name
# (it must already be registered in the workspace, e.g. through the Azure ML UI)
datastore = Datastore.get(ws, datastore_name='<DATASTORE_NAME>')

# FileDataset

In [None]:
# (Run once) Describe which files make up the dataset: every .parquet file
# exactly one directory level below delta/events/ in the datastore.
# NOTE(review): presumably a registered FileDataset stores this path pattern
# rather than a snapshot, so files added later are picked up on the next
# mount -- confirm.
# Hint kept from the original cell: partition_format="/date={date:yyyy-MM-dd}"
datastore_paths = [(datastore, 'delta/events/*/*.parquet')]
events_ds = Dataset.File.from_files(path=datastore_paths)

# (Run once) Register the dataset in the workspace so later cells and other
# notebooks can fetch it by name; re-running registers a new version.
events_ds = events_ds.register(
    workspace=ws,
    name='delta_events',
    description='Data in Delta format generated by Databricks',
    create_new_version=True,
)

In [None]:
# Fetch the registered FileDataset by name
events_ds = Dataset.get_by_name(workspace=ws, name='delta_events')

# Mount the FileDataset's files on a fresh temporary directory at `mounted_path`.
# The mount is started explicitly here and is only released by the
# mount_context.stop() call in the last cell, so make sure that cell is run.
mounted_path = tempfile.mkdtemp()
mount_context = events_ds.mount(mounted_path)

mount_context.start()

print("Delta/Parquet files are located (temporarily) here: {}".format(mounted_path))
# The original format string had a stray ")" after the placeholder
print("Child directories: {}".format(os.listdir(mounted_path)))

In [None]:
# Read parquet files into Pandas DataFrame from temp directory.
# pandas is handed the mount directory itself rather than individual file names.
# NOTE(review): this reads the raw parquet files and does not consult the Delta
# transaction log -- presumably fine for append-only tables; confirm for tables
# that see updates, deletes or compaction.
df = pd.read_parquet(mounted_path)

In [None]:
# Preview the first rows of the loaded DataFrame
df.head()

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

In [None]:
# Count the rows for each (date, action) pair ...
action_counts = df.groupby(['date', 'action'])['action'].count()
# ... and show them as a flat table with columns date, action and count
action_counts.to_frame('count').reset_index()

In [None]:
# Stop the mount context that was started in the mount cell. Presumably the
# dataset files are no longer readable under `mounted_path` afterwards; re-run
# the mount cell to access them again.
# NOTE(review): the directory created by tempfile.mkdtemp() is not removed here.
mount_context.stop()