# Creating a Spark DB
## Loading a JSON file using PySpark

In [1]:
from pyspark.sql import SparkSession


# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Create Database from JSON") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Read JSON file into DataFrame

df = spark.read.format("json") \
    .load(r"C:\Users\jose\Downloads\goodreads_reviews.json") # This file will be pushed into GitHub in a zipped folder using Git LFS

# Create a database
spark.sql("CREATE DATABASE IF NOT EXISTS goodreads_reviews")

# Use the database
spark.sql("USE goodreads_reviews")

# Save DataFrame as a table in the database
df.write.mode("overwrite").saveAsTable("reviews")

# Save DataFrame as a table in a database
df.write.mode("overwrite").saveAsTable("goodreads_reviews.reviews")


## Spark Web UI access localhost:4040

In [2]:
# Use the database
spark.sql("USE goodreads_reviews")

# Query the data from the table
result = spark.sql("SELECT * FROM reviews")

# Show the result
result.show()

+--------+--------------------+--------------------+----------+-------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
| book_id|          date_added|        date_updated|n_comments|n_votes|rating|             read_at|           review_id|         review_text|          started_at|             user_id|
+--------+--------------------+--------------------+----------+-------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|25913122|Tue Dec 29 13:45:...|Fri Aug 26 11:01:...|         2|      5|     4|Tue Jul 12 00:00:...|6083f89f3e7ea38c4...|Compre este libro...|Tue Jul 12 00:00:...|2ce9cf233c1503ed5...|
|27823971|Wed Oct 28 07:51:...|Wed Feb 17 21:02:...|         1|      2|     3|Wed Feb 17 00:00:...|28b70411d46da4471...|No puedo ser impa...|Sat Feb 13 00:00:...|2ce9cf233c1503ed5...|
|25796670|Sat Oct 03 14:14:...|Wed Mar 16 00:51:...|         1|      4|     2|Tu

In [3]:
# Save DataFrame as parquet files (a columnar storage format) in a directory
# Parket files will be pushed into GitHub in a zipped folder using Git LFS

df.write.mode("overwrite").parquet(r"C:\Users\jose\Downloads\goodreads_reviews_parquet") 

# Register the DataFrame as a temporary view
df.createOrReplaceTempView("reviews")

In [3]:
# Assuming df is your Spark DataFrame
columns_to_drop = ["date_added", "date_updated","n_comments","n_votes","review_id","started_at","read_at"]  # List of columns to drop

# Drop the specified columns
result = result.drop(*columns_to_drop)

# Show the result
result.show()

+--------+------+--------------------+--------------------+
| book_id|rating|         review_text|             user_id|
+--------+------+--------------------+--------------------+
|25913122|     4|Compre este libro...|2ce9cf233c1503ed5...|
|27823971|     3|No puedo ser impa...|2ce9cf233c1503ed5...|
|25796670|     2|Habia desistido d...|2ce9cf233c1503ed5...|
|25778324|     5|Llegue a este lib...|2ce9cf233c1503ed5...|
|18712886|     2|Confieso desde un...|2ce9cf233c1503ed5...|
|28963557|     5|           Perfecto.|2ce9cf233c1503ed5...|
|25469092|     4|Aunque me gusto m...|2ce9cf233c1503ed5...|
|22037944|     4|Que dificil es re...|2ce9cf233c1503ed5...|
|18499402|     4|Quise leer este l...|2ce9cf233c1503ed5...|
|26798985|     4|En la resena ante...|2ce9cf233c1503ed5...|
|25432891|     4|Bueno, voy a ser ...|2ce9cf233c1503ed5...|
| 3373897|     2|Estuvo bien, pero...|2ce9cf233c1503ed5...|
|16080337|     4|Cuando vi este li...|2ce9cf233c1503ed5...|
|18076991|     4|I liked this one ...|2c

In [4]:
from pyspark.sql import *  
from pyspark.sql.functions import *  
from pyspark.sql.types import *  
import numpy as np    
import pandas as pd


reviews_pandas_df= result.select("*").toPandas()

  An error occurred while calling o58.getResult.
: org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.security.SocketAuthServer.getResult(SocketAuthServer.scala:98)
	at org.apache.spark.security.SocketAuthServer.getResult(SocketAuthServer.scala:94)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.co

Py4JJavaError: An error occurred while calling o58.getResult.
: org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.security.SocketAuthServer.getResult(SocketAuthServer.scala:98)
	at org.apache.spark.security.SocketAuthServer.getResult(SocketAuthServer.scala:94)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 6 in stage 5.0 failed 1 times, most recent failure: Lost task 6.0 in stage 5.0 (TID 95) (DESKTOP-IK25UHA executor driver): java.lang.UnsupportedOperationException: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
	at org.apache.arrow.memory.util.MemoryUtil.directBuffer(MemoryUtil.java:174)
	at org.apache.arrow.memory.ArrowBuf.getDirectBuffer(ArrowBuf.java:229)
	at org.apache.arrow.memory.ArrowBuf.nioBuffer(ArrowBuf.java:224)
	at org.apache.arrow.vector.ipc.WriteChannel.write(WriteChannel.java:133)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.writeBatchBuffers(MessageSerializer.java:303)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:276)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:237)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$next$1(ArrowConverters.scala:117)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:120)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:77)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.foreach(ArrowConverters.scala:77)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.to(ArrowConverters.scala:77)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toBuffer(ArrowConverters.scala:77)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toArray(ArrowConverters.scala:77)
	at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$6(Dataset.scala:4264)
	at org.apache.spark.SparkContext.$anonfun$runJob$6(SparkContext.scala:2492)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)
	Suppressed: org.apache.spark.util.TaskCompletionListenerException: Memory was leaked by query. Memory leaked: (17760256)
Allocator(toArrowBatchIterator) 0/17760256/25657344/9223372036854775807 (res/actual/peak/limit)


Previous exception in task: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
	org.apache.arrow.memory.util.MemoryUtil.directBuffer(MemoryUtil.java:174)
	org.apache.arrow.memory.ArrowBuf.getDirectBuffer(ArrowBuf.java:229)
	org.apache.arrow.memory.ArrowBuf.nioBuffer(ArrowBuf.java:224)
	org.apache.arrow.vector.ipc.WriteChannel.write(WriteChannel.java:133)
	org.apache.arrow.vector.ipc.message.MessageSerializer.writeBatchBuffers(MessageSerializer.java:303)
	org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:276)
	org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:237)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$next$1(ArrowConverters.scala:117)
	scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:120)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:77)
	scala.collection.Iterator.foreach(Iterator.scala:943)
	scala.collection.Iterator.foreach$(Iterator.scala:943)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.foreach(ArrowConverters.scala:77)
	scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.to(ArrowConverters.scala:77)
	scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toBuffer(ArrowConverters.scala:77)
	scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toArray(ArrowConverters.scala:77)
	org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$6(Dataset.scala:4264)
	org.apache.spark.SparkContext.$anonfun$runJob$6(SparkContext.scala:2492)
	org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	org.apache.spark.scheduler.Task.run(Task.scala:141)
	org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	java.base/java.lang.Thread.run(Thread.java:1583)
		at org.apache.spark.TaskContextImpl.invokeListeners(TaskContextImpl.scala:254)
		at org.apache.spark.TaskContextImpl.invokeTaskCompletionListeners(TaskContextImpl.scala:144)
		at org.apache.spark.TaskContextImpl.markTaskCompleted(TaskContextImpl.scala:137)
		at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:177)
		... 9 more
		Suppressed: java.lang.IllegalStateException: Memory was leaked by query. Memory leaked: (17760256)
Allocator(toArrowBatchIterator) 0/17760256/25657344/9223372036854775807 (res/actual/peak/limit)

			at org.apache.arrow.memory.BaseAllocator.close(BaseAllocator.java:476)
			at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.close(ArrowConverters.scala:128)
			at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$new$2(ArrowConverters.scala:96)
			at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$new$2$adapted(ArrowConverters.scala:95)
			at org.apache.spark.TaskContext$$anon$1.onTaskCompletion(TaskContext.scala:137)
			at org.apache.spark.TaskContextImpl.$anonfun$invokeTaskCompletionListeners$1(TaskContextImpl.scala:144)
			at org.apache.spark.TaskContextImpl.$anonfun$invokeTaskCompletionListeners$1$adapted(TaskContextImpl.scala:144)
			at org.apache.spark.TaskContextImpl.invokeListeners(TaskContextImpl.scala:199)
			... 12 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2493)
	at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$5(Dataset.scala:4262)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$2(Dataset.scala:4266)
	at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$2$adapted(Dataset.scala:4242)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4322)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4320)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4320)
	at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$1(Dataset.scala:4242)
	at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$1$adapted(Dataset.scala:4241)
	at org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$2(SocketAuthServer.scala:140)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$1(SocketAuthServer.scala:142)
	at org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$1$adapted(SocketAuthServer.scala:137)
	at org.apache.spark.security.SocketFuncServer.handleConnection(SocketAuthServer.scala:114)
	at org.apache.spark.security.SocketFuncServer.handleConnection(SocketAuthServer.scala:108)
	at org.apache.spark.security.SocketAuthServer$$anon$1.$anonfun$run$4(SocketAuthServer.scala:69)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:69)
Caused by: java.lang.UnsupportedOperationException: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
	at org.apache.arrow.memory.util.MemoryUtil.directBuffer(MemoryUtil.java:174)
	at org.apache.arrow.memory.ArrowBuf.getDirectBuffer(ArrowBuf.java:229)
	at org.apache.arrow.memory.ArrowBuf.nioBuffer(ArrowBuf.java:224)
	at org.apache.arrow.vector.ipc.WriteChannel.write(WriteChannel.java:133)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.writeBatchBuffers(MessageSerializer.java:303)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:276)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:237)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$next$1(ArrowConverters.scala:117)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:120)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:77)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.foreach(ArrowConverters.scala:77)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.to(ArrowConverters.scala:77)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toBuffer(ArrowConverters.scala:77)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toArray(ArrowConverters.scala:77)
	at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$6(Dataset.scala:4264)
	at org.apache.spark.SparkContext.$anonfun$runJob$6(SparkContext.scala:2492)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)
	Suppressed: org.apache.spark.util.TaskCompletionListenerException: Memory was leaked by query. Memory leaked: (17760256)
Allocator(toArrowBatchIterator) 0/17760256/25657344/9223372036854775807 (res/actual/peak/limit)


Previous exception in task: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
	org.apache.arrow.memory.util.MemoryUtil.directBuffer(MemoryUtil.java:174)
	org.apache.arrow.memory.ArrowBuf.getDirectBuffer(ArrowBuf.java:229)
	org.apache.arrow.memory.ArrowBuf.nioBuffer(ArrowBuf.java:224)
	org.apache.arrow.vector.ipc.WriteChannel.write(WriteChannel.java:133)
	org.apache.arrow.vector.ipc.message.MessageSerializer.writeBatchBuffers(MessageSerializer.java:303)
	org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:276)
	org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:237)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$next$1(ArrowConverters.scala:117)
	scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:120)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:77)
	scala.collection.Iterator.foreach(Iterator.scala:943)
	scala.collection.Iterator.foreach$(Iterator.scala:943)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.foreach(ArrowConverters.scala:77)
	scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.to(ArrowConverters.scala:77)
	scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toBuffer(ArrowConverters.scala:77)
	scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toArray(ArrowConverters.scala:77)
	org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$6(Dataset.scala:4264)
	org.apache.spark.SparkContext.$anonfun$runJob$6(SparkContext.scala:2492)
	org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	org.apache.spark.scheduler.Task.run(Task.scala:141)
	org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	java.base/java.lang.Thread.run(Thread.java:1583)
		at org.apache.spark.TaskContextImpl.invokeListeners(TaskContextImpl.scala:254)
		at org.apache.spark.TaskContextImpl.invokeTaskCompletionListeners(TaskContextImpl.scala:144)
		at org.apache.spark.TaskContextImpl.markTaskCompleted(TaskContextImpl.scala:137)
		at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:177)
		... 9 more
		Suppressed: java.lang.IllegalStateException: Memory was leaked by query. Memory leaked: (17760256)
Allocator(toArrowBatchIterator) 0/17760256/25657344/9223372036854775807 (res/actual/peak/limit)

			at org.apache.arrow.memory.BaseAllocator.close(BaseAllocator.java:476)
			at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.close(ArrowConverters.scala:128)
			at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$new$2(ArrowConverters.scala:96)
			at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$new$2$adapted(ArrowConverters.scala:95)
			at org.apache.spark.TaskContext$$anon$1.onTaskCompletion(TaskContext.scala:137)
			at org.apache.spark.TaskContextImpl.$anonfun$invokeTaskCompletionListeners$1(TaskContextImpl.scala:144)
			at org.apache.spark.TaskContextImpl.$anonfun$invokeTaskCompletionListeners$1$adapted(TaskContextImpl.scala:144)
			at org.apache.spark.TaskContextImpl.invokeListeners(TaskContextImpl.scala:199)
			... 12 more


In [5]:
# Assuming df is your Spark DataFrame
columns_to_drop = ["review_text"]  # List of columns to drop

# Drop the specified columns
result = result.drop(*columns_to_drop)

# Show the result
result.show()

+--------+------+--------------------+
| book_id|rating|             user_id|
+--------+------+--------------------+
|25913122|     4|2ce9cf233c1503ed5...|
|27823971|     3|2ce9cf233c1503ed5...|
|25796670|     2|2ce9cf233c1503ed5...|
|25778324|     5|2ce9cf233c1503ed5...|
|18712886|     2|2ce9cf233c1503ed5...|
|28963557|     5|2ce9cf233c1503ed5...|
|25469092|     4|2ce9cf233c1503ed5...|
|22037944|     4|2ce9cf233c1503ed5...|
|18499402|     4|2ce9cf233c1503ed5...|
|26798985|     4|2ce9cf233c1503ed5...|
|25432891|     4|2ce9cf233c1503ed5...|
| 3373897|     2|2ce9cf233c1503ed5...|
|16080337|     4|2ce9cf233c1503ed5...|
|18076991|     4|2ce9cf233c1503ed5...|
|18483104|     3|2ce9cf233c1503ed5...|
|20560907|     3|2ce9cf233c1503ed5...|
|17661625|     4|2ce9cf233c1503ed5...|
|20613470|     5|2ce9cf233c1503ed5...|
|22037901|     5|2ce9cf233c1503ed5...|
|13266803|     5|2ce9cf233c1503ed5...|
+--------+------+--------------------+
only showing top 20 rows



In [6]:
reviews_pandas_df= result.select("*").toPandas()

  An error occurred while calling o75.getResult.
: org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.security.SocketAuthServer.getResult(SocketAuthServer.scala:98)
	at org.apache.spark.security.SocketAuthServer.getResult(SocketAuthServer.scala:94)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.co

Py4JJavaError: An error occurred while calling o75.getResult.
: org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.security.SocketAuthServer.getResult(SocketAuthServer.scala:98)
	at org.apache.spark.security.SocketAuthServer.getResult(SocketAuthServer.scala:94)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 7.0 failed 1 times, most recent failure: Lost task 2.0 in stage 7.0 (TID 107) (DESKTOP-IK25UHA executor driver): java.lang.UnsupportedOperationException: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
	at org.apache.arrow.memory.util.MemoryUtil.directBuffer(MemoryUtil.java:174)
	at org.apache.arrow.memory.ArrowBuf.getDirectBuffer(ArrowBuf.java:229)
	at org.apache.arrow.memory.ArrowBuf.nioBuffer(ArrowBuf.java:224)
	at org.apache.arrow.vector.ipc.WriteChannel.write(WriteChannel.java:133)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.writeBatchBuffers(MessageSerializer.java:303)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:276)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:237)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$next$1(ArrowConverters.scala:117)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:120)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:77)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.foreach(ArrowConverters.scala:77)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.to(ArrowConverters.scala:77)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toBuffer(ArrowConverters.scala:77)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toArray(ArrowConverters.scala:77)
	at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$6(Dataset.scala:4264)
	at org.apache.spark.SparkContext.$anonfun$runJob$6(SparkContext.scala:2492)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)
	Suppressed: org.apache.spark.util.TaskCompletionListenerException: Memory was leaked by query. Memory leaked: (917504)
Allocator(toArrowBatchIterator) 0/917504/1114112/9223372036854775807 (res/actual/peak/limit)


Previous exception in task: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
	org.apache.arrow.memory.util.MemoryUtil.directBuffer(MemoryUtil.java:174)
	org.apache.arrow.memory.ArrowBuf.getDirectBuffer(ArrowBuf.java:229)
	org.apache.arrow.memory.ArrowBuf.nioBuffer(ArrowBuf.java:224)
	org.apache.arrow.vector.ipc.WriteChannel.write(WriteChannel.java:133)
	org.apache.arrow.vector.ipc.message.MessageSerializer.writeBatchBuffers(MessageSerializer.java:303)
	org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:276)
	org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:237)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$next$1(ArrowConverters.scala:117)
	scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:120)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:77)
	scala.collection.Iterator.foreach(Iterator.scala:943)
	scala.collection.Iterator.foreach$(Iterator.scala:943)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.foreach(ArrowConverters.scala:77)
	scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.to(ArrowConverters.scala:77)
	scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toBuffer(ArrowConverters.scala:77)
	scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toArray(ArrowConverters.scala:77)
	org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$6(Dataset.scala:4264)
	org.apache.spark.SparkContext.$anonfun$runJob$6(SparkContext.scala:2492)
	org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	org.apache.spark.scheduler.Task.run(Task.scala:141)
	org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	java.base/java.lang.Thread.run(Thread.java:1583)
		at org.apache.spark.TaskContextImpl.invokeListeners(TaskContextImpl.scala:254)
		at org.apache.spark.TaskContextImpl.invokeTaskCompletionListeners(TaskContextImpl.scala:144)
		at org.apache.spark.TaskContextImpl.markTaskCompleted(TaskContextImpl.scala:137)
		at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:177)
		... 9 more
		Suppressed: java.lang.IllegalStateException: Memory was leaked by query. Memory leaked: (917504)
Allocator(toArrowBatchIterator) 0/917504/1114112/9223372036854775807 (res/actual/peak/limit)

			at org.apache.arrow.memory.BaseAllocator.close(BaseAllocator.java:476)
			at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.close(ArrowConverters.scala:128)
			at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$new$2(ArrowConverters.scala:96)
			at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$new$2$adapted(ArrowConverters.scala:95)
			at org.apache.spark.TaskContext$$anon$1.onTaskCompletion(TaskContext.scala:137)
			at org.apache.spark.TaskContextImpl.$anonfun$invokeTaskCompletionListeners$1(TaskContextImpl.scala:144)
			at org.apache.spark.TaskContextImpl.$anonfun$invokeTaskCompletionListeners$1$adapted(TaskContextImpl.scala:144)
			at org.apache.spark.TaskContextImpl.invokeListeners(TaskContextImpl.scala:199)
			... 12 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2493)
	at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$5(Dataset.scala:4262)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$2(Dataset.scala:4266)
	at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$2$adapted(Dataset.scala:4242)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4322)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4320)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4320)
	at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$1(Dataset.scala:4242)
	at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$1$adapted(Dataset.scala:4241)
	at org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$2(SocketAuthServer.scala:140)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$1(SocketAuthServer.scala:142)
	at org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$1$adapted(SocketAuthServer.scala:137)
	at org.apache.spark.security.SocketFuncServer.handleConnection(SocketAuthServer.scala:114)
	at org.apache.spark.security.SocketFuncServer.handleConnection(SocketAuthServer.scala:108)
	at org.apache.spark.security.SocketAuthServer$$anon$1.$anonfun$run$4(SocketAuthServer.scala:69)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:69)
Caused by: java.lang.UnsupportedOperationException: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
	at org.apache.arrow.memory.util.MemoryUtil.directBuffer(MemoryUtil.java:174)
	at org.apache.arrow.memory.ArrowBuf.getDirectBuffer(ArrowBuf.java:229)
	at org.apache.arrow.memory.ArrowBuf.nioBuffer(ArrowBuf.java:224)
	at org.apache.arrow.vector.ipc.WriteChannel.write(WriteChannel.java:133)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.writeBatchBuffers(MessageSerializer.java:303)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:276)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:237)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$next$1(ArrowConverters.scala:117)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:120)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:77)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.foreach(ArrowConverters.scala:77)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.to(ArrowConverters.scala:77)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toBuffer(ArrowConverters.scala:77)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toArray(ArrowConverters.scala:77)
	at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$6(Dataset.scala:4264)
	at org.apache.spark.SparkContext.$anonfun$runJob$6(SparkContext.scala:2492)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)
	Suppressed: org.apache.spark.util.TaskCompletionListenerException: Memory was leaked by query. Memory leaked: (917504)
Allocator(toArrowBatchIterator) 0/917504/1114112/9223372036854775807 (res/actual/peak/limit)


Previous exception in task: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
	org.apache.arrow.memory.util.MemoryUtil.directBuffer(MemoryUtil.java:174)
	org.apache.arrow.memory.ArrowBuf.getDirectBuffer(ArrowBuf.java:229)
	org.apache.arrow.memory.ArrowBuf.nioBuffer(ArrowBuf.java:224)
	org.apache.arrow.vector.ipc.WriteChannel.write(WriteChannel.java:133)
	org.apache.arrow.vector.ipc.message.MessageSerializer.writeBatchBuffers(MessageSerializer.java:303)
	org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:276)
	org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:237)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$next$1(ArrowConverters.scala:117)
	scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:120)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.next(ArrowConverters.scala:77)
	scala.collection.Iterator.foreach(Iterator.scala:943)
	scala.collection.Iterator.foreach$(Iterator.scala:943)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.foreach(ArrowConverters.scala:77)
	scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.to(ArrowConverters.scala:77)
	scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toBuffer(ArrowConverters.scala:77)
	scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.toArray(ArrowConverters.scala:77)
	org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$6(Dataset.scala:4264)
	org.apache.spark.SparkContext.$anonfun$runJob$6(SparkContext.scala:2492)
	org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	org.apache.spark.scheduler.Task.run(Task.scala:141)
	org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	java.base/java.lang.Thread.run(Thread.java:1583)
		at org.apache.spark.TaskContextImpl.invokeListeners(TaskContextImpl.scala:254)
		at org.apache.spark.TaskContextImpl.invokeTaskCompletionListeners(TaskContextImpl.scala:144)
		at org.apache.spark.TaskContextImpl.markTaskCompleted(TaskContextImpl.scala:137)
		at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:177)
		... 9 more
		Suppressed: java.lang.IllegalStateException: Memory was leaked by query. Memory leaked: (917504)
Allocator(toArrowBatchIterator) 0/917504/1114112/9223372036854775807 (res/actual/peak/limit)

			at org.apache.arrow.memory.BaseAllocator.close(BaseAllocator.java:476)
			at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.close(ArrowConverters.scala:128)
			at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$new$2(ArrowConverters.scala:96)
			at org.apache.spark.sql.execution.arrow.ArrowConverters$ArrowBatchIterator.$anonfun$new$2$adapted(ArrowConverters.scala:95)
			at org.apache.spark.TaskContext$$anon$1.onTaskCompletion(TaskContext.scala:137)
			at org.apache.spark.TaskContextImpl.$anonfun$invokeTaskCompletionListeners$1(TaskContextImpl.scala:144)
			at org.apache.spark.TaskContextImpl.$anonfun$invokeTaskCompletionListeners$1$adapted(TaskContextImpl.scala:144)
			at org.apache.spark.TaskContextImpl.invokeListeners(TaskContextImpl.scala:199)
			... 12 more


## Ending Spark Session to allow parket files to be loaded

In [10]:
# Stop SparkSession
spark.stop()