In [1]:
from hyperspace import *  
from com.microsoft.hyperspace import *
from com.microsoft.hyperspace.index import *

# Hyperspace indexes currently speed up queries through SortMergeJoin, so switch off
# BroadcastHashJoin to make Spark fall back to the standard SortMergeJoin.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

# Name of the primary ADLS Gen2 account of your Synapse workspace - replace with your own
datalake = 'asadatalakeic6nl8d'

# Sales transactions (parquet files for December 2019)
salesPath = "abfss://wwi-02@" + datalake + ".dfs.core.windows.net/sale-small/Year=2019/Quarter=Q4/Month=12/*/*.parquet"
dfSales = spark.read.parquet(salesPath)
dfSales.show(10)

# Customer data (CSV file whose first row holds the column names)
customersPath = "abfss://wwi-02@" + datalake + ".dfs.core.windows.net/data-generators/generator-customer-clean.csv"
dfCustomers = spark.read.load(customersPath, format="csv", header=True)
dfCustomers.show(10)

# Hyperspace entry point bound to the current Spark session
hyperspace = Hyperspace(spark)

StatementMeta(SparkPool01, 0, 1, Finished, Available)

+--------------------+----------+---------+--------+--------------------+--------------------+---------------+--------------------+----+------+-------+
|       TransactionId|CustomerId|ProductId|Quantity|               Price|         TotalAmount|TransactionDate|        ProfitAmount|Hour|Minute|StoreId|
+--------------------+----------+---------+--------+--------------------+--------------------+---------------+--------------------+----+------+-------+
|6efc5ea6-36f6-4c4...|         1|     2857|       4|20.08000000000000...|80.32000000000000...|       20191230|24.00000000000000...|  19|    38|   7405|
|6efc5ea6-36f6-4c4...|         1|     1488|       1|26.84000000000000...|26.84000000000000...|       20191230|6.610000000000000000|  19|    38|   7405|
|6efc5ea6-36f6-4c4...|         1|     4021|       3|31.12000000000000...|93.36000000000000...|       20191230|28.11000000000000...|  19|    38|   7405|
|6efc5ea6-36f6-4c4...|         1|      728|       1|28.39000000000000...|28.390000000000

In [2]:
# Create indexes: each one contains a name, a set of indexed columns and a set of included columns
indexConfigSales = IndexConfig("indexSALES", ["CustomerId"], ["TotalAmount"])
indexConfigCustomers = IndexConfig("indexCUSTOMERS", ["CustomerId"], ["FullName"])

# An index can only be created once, so an unconditional createIndex makes this cell fail
# when it is re-run. Look up the indexes that are already ACTIVE and create only the missing
# ones. An index that exists in any other state is deliberately not skipped: createIndex is
# still called for it, so the problem surfaces instead of being hidden.
activeIndexNames = {
    row["name"]
    for row in hyperspace.indexes().filter("state = 'ACTIVE'").select("name").collect()
}

if "indexSALES" not in activeIndexNames:
    hyperspace.createIndex(dfSales, indexConfigSales)
if "indexCUSTOMERS" not in activeIndexNames:
    hyperspace.createIndex(dfCustomers, indexConfigCustomers)

# List all indexes known to Hyperspace together with their state
hyperspace.indexes().show()

StatementMeta(SparkPool01, 0, 2, Finished, Available)

+--------------+--------------+---------------+----------+--------------------+--------------------+------+
|          name|indexedColumns|includedColumns|numBuckets|              schema|       indexLocation| state|
+--------------+--------------+---------------+----------+--------------------+--------------------+------+
|indexCUSTOMERS|  [CustomerId]|     [FullName]|       200|{"type":"struct",...|abfss://tempdata@...|ACTIVE|
|    indexSALES|  [CustomerId]|  [TotalAmount]|       200|{"type":"struct",...|abfss://tempdata@...|ACTIVE|
+--------------+--------------+---------------+----------+--------------------+--------------------+------+



In [3]:
# Baseline: run the lookup before Hyperspace is enabled later in the notebook and print
# the parsed, analyzed, optimized and physical plans for comparison.
df1 = (
    dfSales
    .filter("CustomerId = 6")
    .select("TotalAmount")
)
df1.show()
df1.explain(extended=True)

StatementMeta(SparkPool01, 0, 3, Finished, Available)

+--------------------+
|         TotalAmount|
+--------------------+
|140.8800000000000...|
|70.44000000000000...|
|22.51000000000000...|
|30.45000000000000...|
|88.00000000000000...|
|49.30000000000000...|
|22.75000000000000...|
+--------------------+

== Parsed Logical Plan ==
'Project [unresolvedalias('TotalAmount, None)]
+- Filter (CustomerId#1 = 6)
   +- Relation[TransactionId#0,CustomerId#1,ProductId#2,Quantity#3,Price#4,TotalAmount#5,TransactionDate#6,ProfitAmount#7,Hour#8,Minute#9,StoreId#10] parquet

== Analyzed Logical Plan ==
TotalAmount: decimal(38,18)
Project [TotalAmount#5]
+- Filter (CustomerId#1 = 6)
   +- Relation[TransactionId#0,CustomerId#1,ProductId#2,Quantity#3,Price#4,TotalAmount#5,TransactionDate#6,ProfitAmount#7,Hour#8,Minute#9,StoreId#10] parquet

== Optimized Logical Plan ==
Project [TotalAmount#5]
+- Filter (isnotnull(CustomerId#1) && (CustomerId#1 = 6))
   +- Relation[TransactionId#0,CustomerId#1,ProductId#2,Quantity#3,Price#4,TotalAmount#5,TransactionDate#6

In [4]:
# Enable Hyperspace: its optimization rules become visible to the Spark optimizer, which can
# then exploit existing Hyperspace indexes when optimizing user queries.
Hyperspace.enable(spark)

# Build the same lookup again and print its plans
df1 = (
    dfSales
    .filter("CustomerId = 6")
    .select("TotalAmount")
)
df1.show()
df1.explain(extended=True)

StatementMeta(SparkPool01, 0, 4, Finished, Available)

+--------------------+
|         TotalAmount|
+--------------------+
|140.8800000000000...|
|70.44000000000000...|
|22.51000000000000...|
|30.45000000000000...|
|88.00000000000000...|
|49.30000000000000...|
|22.75000000000000...|
+--------------------+

== Parsed Logical Plan ==
'Project [unresolvedalias('TotalAmount, None)]
+- Filter (CustomerId#1 = 6)
   +- Relation[TransactionId#0,CustomerId#1,ProductId#2,Quantity#3,Price#4,TotalAmount#5,TransactionDate#6,ProfitAmount#7,Hour#8,Minute#9,StoreId#10] parquet

== Analyzed Logical Plan ==
TotalAmount: decimal(38,18)
Project [TotalAmount#5]
+- Filter (CustomerId#1 = 6)
   +- Relation[TransactionId#0,CustomerId#1,ProductId#2,Quantity#3,Price#4,TotalAmount#5,TransactionDate#6,ProfitAmount#7,Hour#8,Minute#9,StoreId#10] parquet

== Optimized Logical Plan ==
Project [TotalAmount#5]
+- Filter (isnotnull(CustomerId#1) && (CustomerId#1 = 6))
   +- Relation[CustomerId#1,TotalAmount#5] Hyperspace(Type: CI, Name: indexSALES, LogVersion: 1)

== Physi

In [5]:
# Switch Hyperspace's explain output to HTML
spark.conf.set("spark.hyperspace.explain.displayMode", "html")

# Explain the lookup query through Hyperspace, handing the result to displayHTML for rendering
df1 = dfSales.filter("CustomerId = 6").select("TotalAmount")
hyperspace.explain(df1, True, displayHTML)

StatementMeta(SparkPool01, 0, 5, Finished, Available)

In [6]:
# Equi-join sales with customers on CustomerId, keeping only the sale amount and customer name
joinCondition = dfSales.CustomerId == dfCustomers.CustomerId
eqJoin = (
    dfSales
    .join(dfCustomers, joinCondition)
    .select(dfSales.TotalAmount, dfCustomers.FullName)
)

# Explain the join through Hyperspace, handing the result to displayHTML for rendering
hyperspace.explain(eqJoin, True, displayHTML)

StatementMeta(SparkPool01, 0, 6, Finished, Available)

In [7]:
# Disable Hyperspace: its rules no longer apply during query optimization. Disabling alone
# has no impact on created indexes - they remain intact until removed below.
Hyperspace.disable(spark)

# Clean up: delete and then vacuum each index created by this notebook
for indexName in ("indexSALES", "indexCUSTOMERS"):
    hyperspace.deleteIndex(indexName)
    hyperspace.vacuumIndex(indexName)

StatementMeta(SparkPool01, 0, 7, Finished, Available)

In [8]:
from notebookutils import mssparkutils

#
# Microsoft Spark Utilities
#
# https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/microsoft-spark-utilities?pivots=programming-language-python
#


def printEntries(entries):
    """Print name, isDir, isFile, path and size for every entry of a file listing."""
    for entry in entries:
        print(entry.name, entry.isDir, entry.isFile, entry.path, entry.size)


# Azure storage access info
blob_account_name = datalake
blob_container_name = 'wwi-02'
blob_relative_path = '/'
linkedServiceName = datalake
blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linkedServiceName)

# Register the credential with Spark so it can access the Blob container remotely
sasConfKey = 'fs.azure.sas.%s.%s.blob.core.windows.net' % (blob_container_name, blob_account_name)
spark.conf.set(sasConfKey, blob_sas_token)

# Listing of the root folder before the new folder is created
files = mssparkutils.fs.ls('/')
printEntries(files)

mssparkutils.fs.mkdirs('/SomeNewFolder')

# Listing of the root folder after the new folder has been created
files = mssparkutils.fs.ls('/')
printEntries(files)

StatementMeta(SparkPool01, 0, 8, Finished, Available)

synapse True False abfss://tempdata@asadatalakeic6nl8d.dfs.core.windows.net/synapse 0
SomeNewFolder True False abfss://tempdata@asadatalakeic6nl8d.dfs.core.windows.net/SomeNewFolder 0
synapse True False abfss://tempdata@asadatalakeic6nl8d.dfs.core.windows.net/synapse 0
