# Creating a Spark DB
## Loading a JSON file using PySpark

In [1]:
from pyspark.sql import SparkSession


# Location of the raw Goodreads reviews dump.
# This file will be pushed into GitHub in a zipped folder using Git LFS.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
REVIEWS_JSON_PATH = r"C:\Users\jose\Downloads\goodreads_reviews.json"

# Initialize (or reuse) the SparkSession for this notebook
spark = (
    SparkSession.builder
    .appName("Create Database from JSON")
    .getOrCreate()
)

# Turn on Arrow-based data transfer for Spark <-> pandas conversions
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Read the JSON file into a DataFrame
df = spark.read.format("json").load(REVIEWS_JSON_PATH)

# Create the database if it does not exist yet and make it the session's
# current database, so unqualified table names resolve inside it
spark.sql("CREATE DATABASE IF NOT EXISTS goodreads_reviews")
spark.sql("USE goodreads_reviews")

# Save the DataFrame as table `reviews` in the database.
# The table is written once, using the fully qualified name: with
# `goodreads_reviews` as the current database, an additional unqualified
# saveAsTable("reviews") would target the very same table and only repeat
# the full read-and-overwrite.
df.write.mode("overwrite").saveAsTable("goodreads_reviews.reviews")


## The Spark Web UI can be accessed at localhost:4040

In [2]:
# Make goodreads_reviews the current database so that the unqualified
# table name below resolves inside it
spark.sql("USE goodreads_reviews")

# Select every column of the reviews table and print the first rows
result = spark.sql("SELECT * FROM reviews")
result.show()

+--------+--------------------+--------------------+----------+-------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
| book_id|          date_added|        date_updated|n_comments|n_votes|rating|             read_at|           review_id|         review_text|          started_at|             user_id|
+--------+--------------------+--------------------+----------+-------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|25913122|Tue Dec 29 13:45:...|Fri Aug 26 11:01:...|         2|      5|     4|Tue Jul 12 00:00:...|6083f89f3e7ea38c4...|Compre este libro...|Tue Jul 12 00:00:...|2ce9cf233c1503ed5...|
|27823971|Wed Oct 28 07:51:...|Wed Feb 17 21:02:...|         1|      2|     3|Wed Feb 17 00:00:...|28b70411d46da4471...|No puedo ser impa...|Sat Feb 13 00:00:...|2ce9cf233c1503ed5...|
|25796670|Sat Oct 03 14:14:...|Wed Mar 16 00:51:...|         1|      4|     2|Tu

In [3]:
# Export the DataFrame to a directory of Parquet files (a columnar storage format),
# replacing any previous export at that location.
# The Parquet files will be pushed into GitHub in a zipped folder using Git LFS.
df.write.parquet(r"C:\Users\jose\Downloads\goodreads_reviews_parquet", mode="overwrite")

# Expose the DataFrame to Spark SQL under the temporary view name "reviews"
df.createOrReplaceTempView("reviews")

## Ending the Spark session to allow the Parquet files to be loaded

In [4]:
# Stop the SparkSession created at the top of the notebook; cells that use
# `spark` cannot be run after this point without creating a new session.
spark.stop()