## Databricks Overview (Demo)
- Intro to Apache Spark and Databricks
- Databricks Clusters (Compute)
- Access to Data Lake (Storage)
- Data Preparation and Analysis using Spark SQL (Python, R and SQL)
- Integration with RStudio (Open source version)
- Persistently store the cleansed dataset in the data lake

<img src ='/files/Spark_Cluster.JPG'>

<img src ='/files/Spark_Eco_System.JPG'>

# Easily store and access data from the Data Hub (Data Lake)

#### Access a file in Data Lake storage and load it into a Spark DataFrame using Python

In [0]:
# Read the demo CSV export from the data lake into a Spark DataFrame.
# The first line of the file is used as the header (column names); no schema
# is supplied and schema inference is not enabled on this reader.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("abfss://databricks-demo@datahubdatalakedev.dfs.core.windows.net/demo/export-2.csv")
)

# Data manipulation using Spark SQL

#### Display data from the dataframe

In [0]:
# Print a preview of the loaded DataFrame to the cell output.
df.show()

#### Select specific columns

In [0]:
# Show GEO, Value and Date for rows whose value exceeds 6000.
# The CSV is loaded without a schema (columns default to string in that case),
# so the column is cast to DOUBLE explicitly instead of relying on Spark's
# implicit string-to-number coercion when comparing against the literal.
(
    df
    .select("GEO", "Value", "Date")
    .where("CAST(Value AS DOUBLE) > 6000")
    .show()
)

#### Filter out rows

In [0]:
# Keep only the rows that carry a VALUE; rows where VALUE is null are dropped.
dfND = df.filter("VALUE IS NOT NULL")

#### Aggregate/Count by Column

In [0]:
# Count the non-null rows for each labour force characteristic
# and render the result as a table.
dfCountByLF = (
    dfND
    .groupBy("Labour_force_characteristics")
    .count()
)
display(dfCountByLF)

Labour_force_characteristics,count
Labour force,177
Full-time employment,171
Employment,170
Part-time employment,166
Unemployment,160
Unemployment rate,152


#### Filter for records whose labour force characteristic is "Full-time employment"

In [0]:
# Restrict the non-null rows to the "Full-time employment" characteristic
# and render the matching rows as a table.
dfFullTimeEmp = dfND.filter(
    dfND["Labour_force_characteristics"] == "Full-time employment"
)
display(dfFullTimeEmp)

GEO,Labour_force_characteristics,Sex,Age_group,VALUE,Date,NAICS
Canada,Full-time employment,Both sexes,15 years and over,8007.4,,"Total, all industries"
Canada,Full-time employment,Both sexes,15 years and over,2893.0,,Goods-producing sector
Canada,Full-time employment,Both sexes,15 years and over,312.6,,"Agriculture [111-112, 1100, 1151-1152]"
Canada,Full-time employment,Both sexes,15 years and over,220.8,,"Forestry, fishing, mining, quarrying, oil and gas [21, 113-114, 1153, 2100]"
Canada,Full-time employment,Both sexes,15 years and over,103.8,,Utilities [22]
Canada,Full-time employment,Both sexes,15 years and over,534.2,,Construction [23]
Canada,Full-time employment,Both sexes,15 years and over,1721.6,,Manufacturing [31-33]
Canada,Full-time employment,Both sexes,15 years and over,5114.4,,Services-producing sector
Canada,Full-time employment,Both sexes,15 years and over,1199.3,,"Wholesale and retail trade [41, 44-45]"
Canada,Full-time employment,Both sexes,15 years and over,507.8,,Transportation and warehousing [48-49]


# Use pandas and Spark DataFrames in the same notebook

#### Convert a Spark DataFrame to a regular Python (pandas) DataFrame if needed

In [0]:
# Materialise the full-time employment rows (all columns) as a pandas DataFrame.
pandadf = dfFullTimeEmp.toPandas()

In [0]:
# Show the first rows of the pandas DataFrame.
pandadf.head()

Unnamed: 0,GEO,Labour_force_characteristics,Sex,Age_group,VALUE,Date,NAICS
0,Canada,Full-time employment,Both sexes,15 years and over,8007.4,,"Total, all industries"
1,Canada,Full-time employment,Both sexes,15 years and over,2893.0,,Goods-producing sector
2,Canada,Full-time employment,Both sexes,15 years and over,312.6,,"Agriculture [111-112, 1100, 1151-1152]"
3,Canada,Full-time employment,Both sexes,15 years and over,220.8,,"Forestry, fishing, mining, quarrying, oil and ..."
4,Canada,Full-time employment,Both sexes,15 years and over,103.8,,Utilities [22]


#### Create Pandas dataframe from scratch

In [0]:
import pandas as pd

# Build a small single-column pandas DataFrame from an in-memory list.
# Note that this rebinds `pandadf`, replacing whatever it referred to before.
data = list(range(1, 6))
pandadf = pd.DataFrame(data)
print(pandadf)

#### Convert a pandas DataFrame to a Spark DataFrame
###### *Note: not all data types are supported.*

In [0]:
# Convert the pandas DataFrame currently bound to `pandadf` into a Spark DataFrame.
sparkdf = spark.createDataFrame(pandadf)

# Use Standard SQL

#### Register the DataFrame as a temporary view to analyze the data using SQL

In [0]:
# Expose dfFullTimeEmp to SQL cells under the view name vw_dfFullTimeEmp
# (an existing view with that name is replaced).
dfFullTimeEmp.createOrReplaceTempView("vw_dfFullTimeEmp")

In [0]:
%sql
-- Return every row and column of the temporary view built from dfFullTimeEmp.
SELECT * FROM vw_dfFullTimeEmp 

GEO,Labour_force_characteristics,Sex,Age_group,VALUE,Date,NAICS
Canada,Full-time employment,Both sexes,15 years and over,8007.4,,"Total, all industries"
Canada,Full-time employment,Both sexes,15 years and over,2893.0,,Goods-producing sector
Canada,Full-time employment,Both sexes,15 years and over,312.6,,"Agriculture [111-112, 1100, 1151-1152]"
Canada,Full-time employment,Both sexes,15 years and over,220.8,,"Forestry, fishing, mining, quarrying, oil and gas [21, 113-114, 1153, 2100]"
Canada,Full-time employment,Both sexes,15 years and over,103.8,,Utilities [22]
Canada,Full-time employment,Both sexes,15 years and over,534.2,,Construction [23]
Canada,Full-time employment,Both sexes,15 years and over,1721.6,,Manufacturing [31-33]
Canada,Full-time employment,Both sexes,15 years and over,5114.4,,Services-producing sector
Canada,Full-time employment,Both sexes,15 years and over,1199.3,,"Wholesale and retail trade [41, 44-45]"
Canada,Full-time employment,Both sexes,15 years and over,507.8,,Transportation and warehousing [48-49]


In [0]:
%sql
-- Sum VALUE per NAICS category, leaving out the 'Total, all industries' row,
-- and list the categories from the largest sum to the smallest.
-- NOTE(review): sector-level rows (e.g. 'Goods-producing sector') stay in the
-- result alongside individual industries; confirm that is intended.
SELECT
  SUM(value) AS Employment,
  NAICS
FROM vw_dfFullTimeEmp
WHERE NAICS <> 'Total, all industries'
GROUP BY NAICS
ORDER BY Employment DESC

Employment,NAICS
48149.3,Services-producing sector
28822.2,Goods-producing sector
16319.7,Manufacturing [31-33]
11038.2,"Wholesale and retail trade [41, 44-45]"
5903.5,Health care and social assistance [62]
5799.7,Construction [23]
5557.4,Public administration [91]
5083.299999999999,Educational services [61]
4699.700000000001,Transportation and warehousing [48-49]
4294.0,"Finance, insurance, real estate, rental and leasing [52, 53]"


# Create User Defined Functions in Python (or Scala, R) and use them in SQL

In [0]:
# FloatType is the declared return type of the UDF; import it here so the cell
# runs on a fresh session instead of failing with a NameError.
from pyspark.sql.types import FloatType


def converted_value(pval: float):
  """Scale a numeric value by 1.5.

  Args:
    pval: The value to scale. May be None when the SQL input is NULL.

  Returns:
    ``pval * 1.5``, or None when ``pval`` is None so that NULL inputs yield
    NULL instead of raising a TypeError inside the UDF.
  """
  if pval is None:
    return None
  return pval * 1.5


# Register the function for use from SQL. "alteredvalue" is the original
# registration name and is kept for existing callers; "converted_value" is the
# name the SQL cell in this notebook actually calls.
spark.udf.register("alteredvalue", converted_value, FloatType())
spark.udf.register("converted_value", converted_value, FloatType())

In [0]:
%sql
-- Apply the converted_value SQL function to VALUE (cast to double first)
-- and return it next to the NAICS category.
SELECT
  NAICS,
  converted_value(CAST(VALUE AS DOUBLE)) AS new_value
FROM vw_dfFullTimeEmp

NAICS,new_value
"Total, all industries",12011.1
Goods-producing sector,4339.5
"Agriculture [111-112, 1100, 1151-1152]",468.9
"Forestry, fishing, mining, quarrying, oil and gas [21, 113-114, 1153, 2100]",331.2
Utilities [22],155.7
Construction [23],801.3
Manufacturing [31-33],2582.4
Services-producing sector,7671.6
"Wholesale and retail trade [41, 44-45]",1798.95
Transportation and warehousing [48-49],761.7


# Create a persistent table in the Data Lake
- Data is stored in columnar (Parquet) format
- Can be queried and updated by multiple users (ACID compliance)
- Versioned
- Accessible via JDBC (Power BI, Tableau, etc.)

In [0]:
%sql
-- Create the Delta table DEMO_LABOUR_FORCE at the given data lake location,
-- populated from the temporary view. Because of IF NOT EXISTS, nothing is
-- created or reloaded when the table already exists.
CREATE TABLE IF NOT EXISTS DEMO_LABOUR_FORCE 
USING DELTA
LOCATION 'abfss://databricks-demo@datahubdatalakedev.dfs.core.windows.net/demo/tbl/'
AS SELECT * FROM vw_dfFullTimeEmp


In [0]:
%sql

-- Relabel the characteristic column. There is no WHERE clause, so every row
-- of the table is updated.
UPDATE DEMO_LABOUR_FORCE SET Labour_force_characteristics = 'Full Time'

In [0]:
%sql
-- List the table's commit history (one row per version/operation).
DESCRIBE HISTORY DEMO_LABOUR_FORCE

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata
1,2021-02-22T22:32:33.000+0000,4980114848272222,ilango.sivapathasundaram@nrcan-rncan.gc.ca,UPDATE,Map(),,List(2073514716953218),0222-154221-pub857,0.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numAddedFiles -> 1, numUpdatedRows -> 171, numCopiedRows -> 0)",
0,2021-02-22T20:54:02.000+0000,4980114848272222,ilango.sivapathasundaram@nrcan-rncan.gc.ca,CREATE TABLE AS SELECT,"Map(isManaged -> false, description -> null, partitionBy -> [], properties -> {})",,List(2073514716953218),0222-154221-pub857,,WriteSerializable,True,"Map(numFiles -> 1, numOutputBytes -> 4347, numOutputRows -> 171)",


In [0]:
%sql
-- Time travel: read the table as it was at version 1, the UPDATE commit in
-- the table history. The version number is used instead of a hard-coded commit
-- timestamp so the query still resolves when the table is recreated at a
-- different time (CREATE is version 0, the first UPDATE is version 1).
SELECT * FROM DEMO_LABOUR_FORCE VERSION AS OF 1

GEO,Labour_force_characteristics,Sex,Age_group,VALUE,Date,NAICS
Canada,Full Time,Both sexes,15 years and over,8007.4,,"Total, all industries"
Canada,Full Time,Both sexes,15 years and over,2893.0,,Goods-producing sector
Canada,Full Time,Both sexes,15 years and over,312.6,,"Agriculture [111-112, 1100, 1151-1152]"
Canada,Full Time,Both sexes,15 years and over,220.8,,"Forestry, fishing, mining, quarrying, oil and gas [21, 113-114, 1153, 2100]"
Canada,Full Time,Both sexes,15 years and over,103.8,,Utilities [22]
Canada,Full Time,Both sexes,15 years and over,534.2,,Construction [23]
Canada,Full Time,Both sexes,15 years and over,1721.6,,Manufacturing [31-33]
Canada,Full Time,Both sexes,15 years and over,5114.4,,Services-producing sector
Canada,Full Time,Both sexes,15 years and over,1199.3,,"Wholesale and retail trade [41, 44-45]"
Canada,Full Time,Both sexes,15 years and over,507.8,,Transportation and warehousing [48-49]


# Using R in Databricks

#### Integration of RStudio in Databricks (Open Source Version)

#### Convert data to SparkR dataframe

In [0]:
%r
# Attach SparkR so that sql() can be called from this R cell.
library(SparkR)

# Query the DEMO_LABOUR_FORCE table into a SparkR DataFrame.
sdrf2<-sql("SELECT * FROM DEMO_LABOUR_FORCE")

In [0]:
%r
# Print a preview of the SparkR DataFrame to the cell output.
showDF(sdrf2)