In [1]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
import ConnectionConfigKaloyan as cc
from ConnectionConfigKaloyan import config

print(config.sections())
cc.setupEnvironment()
spark = cc.startLocalCluster("SQLExcercise")
spark.getActiveSession()

['default', 'tutorial_op', 'kafka']


Go to https://spark.apache.org/docs/latest/sql-getting-started.html and https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html#Quickstart:-DataFrame to get some insights in coding Spark SQL. Always select 'Python' as the language.

Use the Spark SQL Reference documentation to complete this excercise
- To write dataframe operations: Python SparkSQL API: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/index.html
- To write pure SQL statements: Spark SQL API: https://spark.apache.org/docs/2.3.0/api/sql/index.html and https://spark.apache.org/docs/latest/sql-ref.html
Helpfull site with examples: https://sparkbyexamples.com/pyspark/


## Load employees.csv as a Spark Dataframe
Tip: https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html#Getting-Data-In/Out

In [2]:
#Extract 
df = spark.read.format("csv") \
  .option("header", "true") \
  .option("inferSchema", "true") \
  .load("./FileStore/tables/employees.csv")

## Display the schema of the DataFrame
Tip: https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html#Getting-Data-In/Out

In [3]:
df.printSchema()


root
 |-- employee_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- hire_date: date (nullable = true)



## Create a temperary view of the dataset with name tbl_employees
https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html#Getting-Data-In/Out

In [4]:
df.createOrReplaceTempView("tbl_employees")


## Calculate the total number of employees in two ways:
-   Via dataframe operations: Tip: https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html#Grouping-Data
-   With a sql statement op tbl_employees: use spark.sql()

In [5]:
df.groupBy().count().show()

+-----+
|count|
+-----+
|   10|
+-----+



In [6]:
spark.sql("select count(employee_id) from tbl_employees").show()

+------------------+
|count(employee_id)|
+------------------+
|                10|
+------------------+



# Find the average salary of all employees in two ways:
-   Via the dataframe operation 'select'
-   With a sql statement ont tbl_employees

In [7]:
from pyspark.sql.functions import *
df.select(avg("salary")).show()

+-----------+
|avg(salary)|
+-----------+
|     4820.0|
+-----------+



In [8]:
spark.sql("select avg(salary) from tbl_employees").show()

+-----------+
|avg(salary)|
+-----------+
|     4820.0|
+-----------+



# Get the explain plan of the sql statement
1. use the method explain(mode="extended") on the spark.sql statement and look  at the different plans Spark created to excecute the query.
2. Read the physical plan from bottom to top and try to match the plan with the query you wrote. (Exchange means that the data is being shuffled between the executors)

In [9]:
spark.sql("select avg(salary) from tbl_employees").explain(mode="extended")


== Parsed Logical Plan ==
'Project [unresolvedalias('avg('salary), None)]
+- 'UnresolvedRelation [tbl_employees], [], false

== Analyzed Logical Plan ==
avg(salary): double
Aggregate [avg(salary#20) AS avg(salary)#91]
+- SubqueryAlias tbl_employees
   +- View (`tbl_employees`, [employee_id#17,name#18,department#19,salary#20,hire_date#21])
      +- Relation [employee_id#17,name#18,department#19,salary#20,hire_date#21] csv

== Optimized Logical Plan ==
Aggregate [avg(salary#20) AS avg(salary)#91]
+- Project [salary#20]
   +- Relation [employee_id#17,name#18,department#19,salary#20,hire_date#21] csv

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[], functions=[avg(salary#20)], output=[avg(salary)#91])
   +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=159]
      +- HashAggregate(keys=[], functions=[partial_avg(salary#20)], output=[sum#95, count#96L])
         +- FileScan csv [salary#20] Batched: false, DataFilters: [], Format: CSV, Location: InMe

# Go to the SparkUI in the tab SQL/Dataframe
1. Search for the query plans in the SparkUI SQL tab.
2. Try to understand the excution plan.
3. Go to https://dzone.com/articles/debugging-spark-performance-using-explain-plan to get some insights in the operators of the plan. 

# Find the highest salary in each department in two ways:
-  Via the dataframe operation 'groupBy'
-  With a sql statement ont tbl_employees

In [10]:
from pyspark.sql.functions import max
highestSalariesByDepartment = df.groupBy("department").agg(max("salary").alias("highest_salary"))
highestSalariesByDepartment.show()

spark.sql(f'select department, max(salary) as highest_salary from tbl_employees group by department' ).explain(mode="extended")

+-----------+--------------+
| department|highest_salary|
+-----------+--------------+
|         HR|          4800|
|  Marketing|          4300|
|Engineering|          6000|
+-----------+--------------+

== Parsed Logical Plan ==
'Aggregate ['department], ['department, 'max('salary) AS highest_salary#121]
+- 'UnresolvedRelation [tbl_employees], [], false

== Analyzed Logical Plan ==
department: string, highest_salary: int
Aggregate [department#19], [department#19, max(salary#20) AS highest_salary#121]
+- SubqueryAlias tbl_employees
   +- View (`tbl_employees`, [employee_id#17,name#18,department#19,salary#20,hire_date#21])
      +- Relation [employee_id#17,name#18,department#19,salary#20,hire_date#21] csv

== Optimized Logical Plan ==
Aggregate [department#19], [department#19, max(salary#20) AS highest_salary#121]
+- Project [department#19, salary#20]
   +- Relation [employee_id#17,name#18,department#19,salary#20,hire_date#21] csv

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false

# Calculate the total salary expenditure for each year

In [11]:
totalSalaryExpenditureByYear = df.groupBy(year("hire_date").alias("year")).agg(sum("salary").alias("total_salary_expenditure"))
totalSalaryExpenditureByYear.show()

spark.sql(f"select year(hire_date), sum(salary) as total_salary_expenditure from tbl_employees group by year(hire_date)").explain(mode="extended")

+----+------------------------+
|year|total_salary_expenditure|
+----+------------------------+
|2019|                    8800|
|2021|                   10300|
|2020|                    9200|
|2022|                    8700|
|2018|                    6000|
|2023|                    5200|
+----+------------------------+

== Parsed Logical Plan ==
'Aggregate ['year('hire_date)], [unresolvedalias('year('hire_date), None), 'sum('salary) AS total_salary_expenditure#153]
+- 'UnresolvedRelation [tbl_employees], [], false

== Analyzed Logical Plan ==
year(hire_date): int, total_salary_expenditure: bigint
Aggregate [year(hire_date#21)], [year(hire_date#21) AS year(hire_date)#155, sum(salary#20) AS total_salary_expenditure#153L]
+- SubqueryAlias tbl_employees
   +- View (`tbl_employees`, [employee_id#17,name#18,department#19,salary#20,hire_date#21])
      +- Relation [employee_id#17,name#18,department#19,salary#20,hire_date#21] csv

== Optimized Logical Plan ==
Aggregate [_groupingexpression#158]

# Calculate the number of employees per postal code
Postal codes are available in the parquet file empPostalCodes
Create a view for the parquet file and join the two datasets

In [12]:
df_PC =spark.read.format("parquet").load("./FileStore/tables/empPostalCodes")
df_PC.createOrReplaceTempView("tbl_empPostalCodes")
df_empPerPc = spark.sql("select p.postal_code, count(e.employee_id) as number_of_employees from tbl_employees e inner join tbl_empPostalCodes p on e.employee_id = p.emp_id group by postal_code")
df_empPerPc.explain(mode="extended")

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/C:/Users/kkiva/data4_project_group5/examples/FileStore/tables/empPostalCodes.

# Write the results to a DeltaTable in the spark-warehouse

In [13]:
df_empPerPc.write.format("delta").mode("overwrite").saveAsTable("employeesPerPostalCode")

NameError: name 'df_empPerPc' is not defined

In [14]:
spark.stop()

Stuff to create the excercises. Not part of the excercise

In [14]:
#df_PC =spark.read.format("csv").option("header", "true").load("./FileStore/tables/empPostalCodes.csv")
#df_PC.write.format("parquet").mode("overwrite").save("./FileStore/tables/empPostalCodes")