In [0]:
%run "/Shared/AV_Data_Discovery_Helper"

In [0]:
# Templated Shared Notebook

Here is an example of different steps of the user's workflow.

1. Filter the datastreams based on search criteria
2. Get the data URI path of the datastream
3. Read the data from the data URI path
4. Apply the modifications to the data
5. Create a new derived datastream with the applicable type and add lineage to it
6. Save the data to the data URI returned from the datastream so that downstream applications can use it

In [0]:
# Look up the datastream to work on: the metadata storage (Cosmos DB) is queried
# through the REST helper using a URL-style filter string.
searchQuery = 'type=EXTRACTED&status=COMPLETED&tags=yolo'
(datastreamId,
 measurementId,
 datastreamDataUri) = GetDatastreamsDataUri(searchQuery)

# Echo the three identifiers, one per line, so later cells can be traced back to them.
print(
    f"Dataset url - {datastreamDataUri}",
    f"Datastream Id - {datastreamId}",
    f"Measurement Id - {measurementId}",
    sep="\n",
)

#nuscenesDataUri="abfss://preannotateddataset@preannotatedatasetsdev.dfs.core.windows.net"
#path=nuscenesDataUri+"/v1.0-mini/attribute.json"
#print(path)
#abfss://datalake@raihanstorage.dfs.core.windows.net/curated
#df = spark.read.option("sep", '\t').csv(f"{datastreamDataUri}/*.json")
#df = spark.read.json(path,  multiLine=True)
#df.printSchema()
#df.show()

In [0]:
# Access the datastream using the datastream URL retrieved from the metadata storage (Cosmos DB)

# Fetch the service principal secret from the Databricks secret scope and pass it
# directly to the Spark configuration helper.
# NOTE: the secret is deliberately never printed or displayed -- cell output is
# stored with the notebook and visible to anyone who can open it.
service_principal_secret = dbutils.secrets.get(scope="spdatabrickssecret",key="spdatabrickssecret")
set_spark_conf("avdataopsderivedzone",service_principal_secret)

# Load the datastream contents into a DataFrame.
# NOTE(review): the "\t" and "txt" arguments presumably select the field separator
# and the file extension -- confirm against ReadDatastream in the helper notebook.
df = ReadDatastream(datastreamDataUri, "\t", "txt")

display(df)


_c0,_c1,_c2,_c3,_c4,_c5,_c6
0,person,909.2471923828124,676.335693359375,173.156982421875,161.59075927734375,0.731320858001709
2,car,1581.3782958984375,616.2061767578125,149.49755859375,42.14697265625,0.7025194764137268
3,motorcycle,315.1141967773437,742.4796142578125,122.78956604003906,131.65118408203125,0.6847835183143616
3,motorcycle,760.3540649414062,726.8539428710938,190.9237060546875,126.611328125,0.6552481651306152
4,airplane,1396.21044921875,521.6889038085938,394.271240234375,229.42556762695312,0.632475733757019
0,person,312.305908203125,682.1528930664062,109.02993774414062,183.9417724609375,0.6091387867927551
3,motorcycle,391.6471557617188,724.5401611328125,92.78640747070312,104.71038818359376,0.5680833458900452
2,car,244.0111389160156,644.7286987304688,140.9042205810547,111.4307861328125,0.5540120005607605
0,person,436.7977294921875,628.9166259765625,49.91253662109375,80.6666259765625,0.5309588313102722
0,person,382.300537109375,634.8646240234375,62.030670166015625,94.52655029296876,0.4800456464290619


In [0]:
# Give the positional columns (_c0 .. _c6) descriptive names.
columnRenames = {
    '_c0': 'class_id',
    '_c1': 'type',
    '_c2': 'X',
    '_c3': 'Y',
    '_c4': 'width',
    '_c5': 'height',
    '_c6': 'confidence',
}

# Apply the renames one at a time, in the order listed above.
for positionalName, descriptiveName in columnRenames.items():
    df = df.withColumnRenamed(positionalName, descriptiveName)

display(df)

class_id,type,X,Y,width,height,confidence
0,person,909.2471923828124,676.335693359375,173.156982421875,161.59075927734375,0.731320858001709
2,car,1581.3782958984375,616.2061767578125,149.49755859375,42.14697265625,0.7025194764137268
3,motorcycle,315.1141967773437,742.4796142578125,122.78956604003906,131.65118408203125,0.6847835183143616
3,motorcycle,760.3540649414062,726.8539428710938,190.9237060546875,126.611328125,0.6552481651306152
4,airplane,1396.21044921875,521.6889038085938,394.271240234375,229.42556762695312,0.632475733757019
0,person,312.305908203125,682.1528930664062,109.02993774414062,183.9417724609375,0.6091387867927551
3,motorcycle,391.6471557617188,724.5401611328125,92.78640747070312,104.71038818359376,0.5680833458900452
2,car,244.0111389160156,644.7286987304688,140.9042205810547,111.4307861328125,0.5540120005607605
0,person,436.7977294921875,628.9166259765625,49.91253662109375,80.6666259765625,0.5309588313102722
0,person,382.300537109375,634.8646240234375,62.030670166015625,94.52655029296876,0.4800456464290619


<h2> Creating a datastream is optional </h2>

In [0]:
# Register a new CURATED datastream whose lineage points back at the source datastream
import json

# Description of the producer recorded in the lineage section
curatedProducerMetadata = {
    "name": "ML_Databricks",
    "type": "datastream",
    "version": "1.0",
    "additionalProperties": {"Databricks_JobId": "something"},
}

# Lineage: the producer plus the id of the datastream the data was read from
curatedLineage = {
    "producerMetadata": curatedProducerMetadata,
    "sources": [datastreamId],
}

# Serialise the request payload to a JSON string (key order is kept as written)
datastreamData = json.dumps(
    {
        "name": "CuratedYoloDemo",
        "type": "CURATED",
        "lineage": curatedLineage,
        "tags": ["curatedyolo"],
    }
)

# The value returned by the helper is kept as the data URI of the new datastream
curatedDatastreamDataUri = CreateDatastream(measurementId, datastreamData)
print(f"Curated Datastream Data URI - {curatedDatastreamDataUri}")


In [0]:
# Persist the renamed DataFrame as an external table so that downstream
# use cases (such as the SQL cells below) can query it by name.
externalTableName = "curatedyolo"
write_table(
    df,
    externalTableName,
    "append",
    curatedDatastreamDataUri,
)


In [0]:
%sql
-- Read back the rows of the external table whose type is 'car'
SELECT * FROM curatedyolo WHERE type = 'car';

<h3>Explore the data with SQL syntax</h3>

In [0]:
%sql
-- Count the non-null type values for each type in the external table
SELECT type, count(type)
FROM curatedyolo
GROUP BY type