In [1]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
import ConnectionConfig as cc
from ConnectionConfig import config

print("Loaded sections:", config.sections())
cc.setupEnvironment()
spark = cc.startLocalCluster("SQLExcercise")
spark.getActiveSession()

Dynamically set JAVA_HOME: /Users/user/Library/Java/JavaVirtualMachines/temurin-21.0.2/Contents/Home
Loaded sections: ['veloDB', 'tutorial_op', 'kafka', 'default']


25/03/12 10:32:58 WARN Utils: Your hostname, MacBook-Pro-170.local resolves to a loopback address: 127.0.0.1; using 10.140.33.207 instead (on interface en0)
25/03/12 10:32:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/user/Desktop/spark_and_hadop/spark-3.5.4-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/user/.ivy2/cache
The jars for the packages stored in: /Users/user/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.postgresql#postgresql added as a dependency
org.elasticsearch#elasticsearch-spark-30_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a2919bbe-03d9-4560-9dc5-f0ffadfff55a;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.2.0 in central
	found io.delta#delta-storage;3.2.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.0 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.9.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.

Go to https://spark.apache.org/docs/latest/sql-getting-started.html and https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html#Quickstart:-DataFrame to get some insights in coding Spark SQL. Always select 'Python' as the language.

Use the Spark SQL Reference documentation to complete this excercise
- To write dataframe operations: Python SparkSQL API: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/index.html
- To write pure SQL statements: Spark SQL API: https://spark.apache.org/docs/2.3.0/api/sql/index.html and https://spark.apache.org/docs/latest/sql-ref.html
Helpfull site with examples: https://sparkbyexamples.com/pyspark/


## Load employees.csv as a Spark Dataframe
Tip: https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html#Getting-Data-In/Out

In [2]:
#Extract 
df = spark.read.format("csv") \
  .option("header", "true") \
  .option("inferSchema", "true") \
  .load("./FileStore/tables/employees.csv")

                                                                                

## Display the schema of the DataFrame
Tip: https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html#Getting-Data-In/Out

In [3]:
df.printSchema()


root
 |-- employee_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- hire_date: date (nullable = true)



## Create a temperary view of the dataset with name tbl_employees
https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html#Getting-Data-In/Out

In [4]:
df.createOrReplaceTempView("tbl_employees")


## Calculate the total number of employees in two ways:
-   Via dataframe operations: Tip: https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html#Grouping-Data
-   With a sql statement op tbl_employees: use spark.sql()

In [5]:
df.groupBy().count().show()

+-----+
|count|
+-----+
|   10|
+-----+



In [6]:
spark.sql("select count(employee_id) from tbl_employees").show()

+------------------+
|count(employee_id)|
+------------------+
|                10|
+------------------+



# Find the average salary of all employees in two ways:
-   Via the dataframe operation 'select'
-   With a sql statement ont tbl_employees

In [9]:
from pyspark.sql.functions import *
df.select(avg("salary")).show()

+-----------+
|avg(salary)|
+-----------+
|     4820.0|
+-----------+



In [10]:
spark.sql("select avg(salary) from tbl_employees").show()

+-----------+
|avg(salary)|
+-----------+
|     4820.0|
+-----------+



# Get the explain plan of the sql statement
1. use the method explain(mode="extended") on the spark.sql statement and look  at the different plans Spark created to excecute the query.
2. Read the physical plan from bottom to top and try to match the plan with the query you wrote. (Exchange means that the data is being shuffled between the executors)

In [11]:
spark.sql("select avg(salary) from tbl_employees").explain(mode="extended")


== Parsed Logical Plan ==
'Project [unresolvedalias('avg('salary), None)]
+- 'UnresolvedRelation [tbl_employees], [], false

== Analyzed Logical Plan ==
avg(salary): double
Aggregate [avg(salary#20) AS avg(salary)#115]
+- SubqueryAlias tbl_employees
   +- View (`tbl_employees`, [employee_id#17,name#18,department#19,salary#20,hire_date#21])
      +- Relation [employee_id#17,name#18,department#19,salary#20,hire_date#21] csv

== Optimized Logical Plan ==
Aggregate [avg(salary#20) AS avg(salary)#115]
+- Project [salary#20]
   +- Relation [employee_id#17,name#18,department#19,salary#20,hire_date#21] csv

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[], functions=[avg(salary#20)], output=[avg(salary)#115])
   +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=204]
      +- HashAggregate(keys=[], functions=[partial_avg(salary#20)], output=[sum#119, count#120L])
         +- FileScan csv [salary#20] Batched: false, DataFilters: [], Format: CSV, Location:

# Go to the SparkUI in the tab SQL/Dataframe
1. Search for the query plans in the SparkUI SQL tab.
2. Try to understand the excution plan.
3. Go to https://dzone.com/articles/debugging-spark-performance-using-explain-plan to get some insights in the operators of the plan. 

# Find the highest salary in each department in two ways:
-  Via the dataframe operation 'groupBy'
-  With a sql statement ont tbl_employees

In [12]:
from pyspark.sql.functions import max
highestSalariesByDepartment = df.groupBy("department").agg(max("salary").alias("highest_salary"))
highestSalariesByDepartment.show()

spark.sql(f'select department, max(salary) as highest_salary from tbl_employees group by department' ).explain(mode="extended")

+-----------+--------------+
| department|highest_salary|
+-----------+--------------+
|         HR|          4800|
|  Marketing|          4300|
|Engineering|          6000|
+-----------+--------------+

== Parsed Logical Plan ==
'Aggregate ['department], ['department, 'max('salary) AS highest_salary#145]
+- 'UnresolvedRelation [tbl_employees], [], false

== Analyzed Logical Plan ==
department: string, highest_salary: int
Aggregate [department#19], [department#19, max(salary#20) AS highest_salary#145]
+- SubqueryAlias tbl_employees
   +- View (`tbl_employees`, [employee_id#17,name#18,department#19,salary#20,hire_date#21])
      +- Relation [employee_id#17,name#18,department#19,salary#20,hire_date#21] csv

== Optimized Logical Plan ==
Aggregate [department#19], [department#19, max(salary#20) AS highest_salary#145]
+- Project [department#19, salary#20]
   +- Relation [employee_id#17,name#18,department#19,salary#20,hire_date#21] csv

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false

# Calculate the total salary expenditure for each year

In [13]:
totalSalaryExpenditureByYear = df.groupBy(year("hire_date").alias("year")).agg(sum("salary").alias("total_salary_expenditure"))
totalSalaryExpenditureByYear.show()

spark.sql(f"select year(hire_date), sum(salary) as total_salary_expenditure from tbl_employees group by year(hire_date)").explain(mode="extended")

+----+------------------------+
|year|total_salary_expenditure|
+----+------------------------+
|2019|                    8800|
|2021|                   10300|
|2020|                    9200|
|2022|                    8700|
|2018|                    6000|
|2023|                    5200|
+----+------------------------+

== Parsed Logical Plan ==
'Aggregate ['year('hire_date)], [unresolvedalias('year('hire_date), None), 'sum('salary) AS total_salary_expenditure#177]
+- 'UnresolvedRelation [tbl_employees], [], false

== Analyzed Logical Plan ==
year(hire_date): int, total_salary_expenditure: bigint
Aggregate [year(hire_date#21)], [year(hire_date#21) AS year(hire_date)#179, sum(salary#20) AS total_salary_expenditure#177L]
+- SubqueryAlias tbl_employees
   +- View (`tbl_employees`, [employee_id#17,name#18,department#19,salary#20,hire_date#21])
      +- Relation [employee_id#17,name#18,department#19,salary#20,hire_date#21] csv

== Optimized Logical Plan ==
Aggregate [_groupingexpression#182]

# Calculate the number of employees per postal code
Postal codes are available in the parquet file empPostalCodes
Create a view for the parquet file and join the two datasets

In [16]:
df_PC =spark.read.format("parquet").load("./FileStore/tables/empPostalCodes.csv")
df_PC.createOrReplaceTempView("tbl_empPostalCodes")
df_empPerPc = spark.sql("select p.postal_code, count(e.employee_id) as number_of_employees from tbl_employees e inner join tbl_empPostalCodes p on e.employee_id = p.emp_id group by postal_code")
df_empPerPc.explain(mode="extended")
/Users/user/Desktop/data4_project_group5/examples/FileStore/tables/empPostalCodes.csv

25/03/12 10:38:11 ERROR Executor: Exception in task 0.0 in stage 23.0 (TID 16)
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:387)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:443)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1(ParquetFileFormat.scala:493)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1$adapted(ParquetFileFormat.scala:485)
	at org.apache.spark.sql.execution.datasources.SchemaMergeUtils$.$anonfun$mergeSchemasInParallel$2(SchemaMergeUtils.scala:80)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:858)
	at org.apache.spar

Py4JJavaError: An error occurred while calling o169.load.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 23.0 failed 1 times, most recent failure: Lost task 0.0 in stage 23.0 (TID 16) (10.140.33.207 executor driver): org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:387)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:443)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1(ParquetFileFormat.scala:493)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1$adapted(ParquetFileFormat.scala:485)
	at org.apache.spark.sql.execution.datasources.SchemaMergeUtils$.$anonfun$mergeSchemasInParallel$2(SchemaMergeUtils.scala:80)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:858)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.SparkException: [CANNOT_READ_FILE_FOOTER] Could not read footer for file: file:/Users/user/Desktop/data4_project_group5/examples/FileStore/tables/empPostalCodes.csv. Please ensure that the file is in either ORC or Parquet format. If not, please convert it to a valid format. If the file is in the valid format, please check if it is corrupt. If it is, you can choose to either ignore it or fix the corruption.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFooterForFileError(QueryExecutionErrors.scala:1057)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$readParquetFootersInParallel$1(ParquetFileFormat.scala:456)
	at org.apache.spark.util.ThreadUtils$.$anonfun$parmap$2(ThreadUtils.scala:384)
	at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
	at scala.util.Success.$anonfun$map$1(Try.scala:255)
	at scala.util.Success.map(Try.scala:213)
	at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
	at java.base/java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(ForkJoinTask.java:1423)
	at java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:387)
	at java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1312)
	at java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1843)
	at java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1808)
	at java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:188)
Caused by: java.lang.RuntimeException: file:/Users/user/Desktop/data4_project_group5/examples/FileStore/tables/empPostalCodes.csv is not a Parquet file. Expected magic number at tail, but found [57, 48, 48, 48]
	at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:565)
	at org.apache.parquet.hadoop.ParquetFileReader.<init>(ParquetFileReader.java:799)
	at org.apache.parquet.hadoop.ParquetFileReader.open(ParquetFileReader.java:666)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFooterReader.readFooter(ParquetFooterReader.java:85)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFooterReader.readFooter(ParquetFooterReader.java:76)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$readParquetFootersInParallel$1(ParquetFileFormat.scala:450)
	... 14 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.sql.execution.datasources.SchemaMergeUtils$.mergeSchemasInParallel(SchemaMergeUtils.scala:74)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.mergeSchemasInParallel(ParquetFileFormat.scala:497)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetUtils$.inferSchema(ParquetUtils.scala:132)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.inferSchema(ParquetFileFormat.scala:79)
	at org.apache.spark.sql.execution.datasources.DataSource.$anonfun$getOrInferFileFormatSchema$11(DataSource.scala:208)
	at scala.Option.orElse(Option.scala:447)
	at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:205)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:407)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:186)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:387)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:443)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1(ParquetFileFormat.scala:493)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1$adapted(ParquetFileFormat.scala:485)
	at org.apache.spark.sql.execution.datasources.SchemaMergeUtils$.$anonfun$mergeSchemasInParallel$2(SchemaMergeUtils.scala:80)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:858)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: org.apache.spark.SparkException: [CANNOT_READ_FILE_FOOTER] Could not read footer for file: file:/Users/user/Desktop/data4_project_group5/examples/FileStore/tables/empPostalCodes.csv. Please ensure that the file is in either ORC or Parquet format. If not, please convert it to a valid format. If the file is in the valid format, please check if it is corrupt. If it is, you can choose to either ignore it or fix the corruption.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFooterForFileError(QueryExecutionErrors.scala:1057)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$readParquetFootersInParallel$1(ParquetFileFormat.scala:456)
	at org.apache.spark.util.ThreadUtils$.$anonfun$parmap$2(ThreadUtils.scala:384)
	at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
	at scala.util.Success.$anonfun$map$1(Try.scala:255)
	at scala.util.Success.map(Try.scala:213)
	at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
	at java.base/java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(ForkJoinTask.java:1423)
	at java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:387)
	at java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1312)
	at java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1843)
	at java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1808)
	at java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:188)
Caused by: java.lang.RuntimeException: file:/Users/user/Desktop/data4_project_group5/examples/FileStore/tables/empPostalCodes.csv is not a Parquet file. Expected magic number at tail, but found [57, 48, 48, 48]
	at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:565)
	at org.apache.parquet.hadoop.ParquetFileReader.<init>(ParquetFileReader.java:799)
	at org.apache.parquet.hadoop.ParquetFileReader.open(ParquetFileReader.java:666)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFooterReader.readFooter(ParquetFooterReader.java:85)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFooterReader.readFooter(ParquetFooterReader.java:76)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$readParquetFootersInParallel$1(ParquetFileFormat.scala:450)
	... 14 more


# Write the results to a DeltaTable in the spark-warehouse

In [17]:
df_empPerPc.write.format("delta").mode("overwrite").saveAsTable("employeesPerPostalCode")

NameError: name 'df_empPerPc' is not defined

In [15]:
spark.stop()

Stuff to create the excercises. Not part of the excercise

In [14]:
#df_PC =spark.read.format("csv").option("header", "true").load("./FileStore/tables/empPostalCodes.csv")
#df_PC.write.format("parquet").mode("overwrite").save("./FileStore/tables/empPostalCodes")