# PySpark create tables

In [1]:
import os
import sys
# Make PySpark use the same Python interpreter that is running this kernel.
os.environ.update({
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
})

In [2]:
from pyspark.sql import SparkSession

In [4]:
# Start (or reuse) a session with master "local" and app name "demo".
spark = (
    SparkSession.builder
    .master("local")
    .appName("demo")
    .getOrCreate()
)
# Only log messages at ERROR level to keep cell output readable.
spark.sparkContext.setLogLevel("ERROR")

## Create managed tables

In [5]:
# Sample transactions: one (transaction_id, item_name) tuple per row.
transaction_rows = [
    (1, "socks"),
    (2, "chips"),
    (3, "air conditioner"),
    (4, "tea"),
]
df = spark.createDataFrame(transaction_rows, ["transaction_id", "item_name"])

In [6]:
# Persist the DataFrame as a Parquet-backed table named "transactions".
# No path is given, so Spark decides where the files live (a managed table).
# mode("overwrite") replaces the table when it is already registered, so this
# cell can be re-run in the same session; the default mode raises if it exists.
df.write.format("parquet").mode("overwrite").saveAsTable("transactions")

                                                                                

In [7]:
# Look the table up by name through the session catalog and print its rows.
transactions_table = spark.table("transactions")
transactions_table.show()

+--------------+---------------+
|transaction_id|      item_name|
+--------------+---------------+
|             1|          socks|
|             2|          chips|
|             3|air conditioner|
|             4|            tea|
+--------------+---------------+



In [9]:
# Ids of the transactions that were voided; single-column rows need 1-tuples.
voided_rows = [(1,), (4,)]
voided = spark.createDataFrame(voided_rows, ["transaction_id"])

In [10]:
# Print the voided transaction ids to confirm the DataFrame contents.
voided.show()

+--------------+
|transaction_id|
+--------------+
|             1|
|             4|
+--------------+



In [11]:
# Write the voided ids as Parquet files under the relative path "voided"
# (a plain directory of files, not a table registered in the catalog).
# mode("overwrite") lets the cell be re-run; the default mode raises when
# the target path already exists.
voided.write.format("parquet").mode("overwrite").save("voided")

### Join the datasets

In [12]:
# "transactions" was created with saveAsTable, so load it by table name through
# the catalog. Reading the bare relative path "transactions" only works if a
# directory of that name happens to exist in the working directory.
# "voided" was written with save("voided"), so it is read back by path.
transactions = spark.table("transactions")
voided = spark.read.parquet("voided")

# Left anti join: keep only the transactions whose transaction_id has no match
# in `voided`; the result carries the left-hand columns only.
transactions.join(voided, on="transaction_id", how="leftanti").show()

+--------------+---------------+
|transaction_id|      item_name|
+--------------+---------------+
|             2|          chips|
|             3|air conditioner|
+--------------+---------------+



## Create unmanaged tables

In [17]:
# Small (id, letter) dataset used for the table written to an explicit path.
letter_rows = [
    (1, "a"),
    (2, "b"),
    (3, "c"),
]
letters = spark.createDataFrame(letter_rows, ["id", "letter"])

In [21]:
# Register a table named "letters" whose Parquet files are stored at the
# explicit location /tmp/letters (an unmanaged/external table).
# mode("overwrite") lets the cell be re-run; the default mode raises when
# the table already exists.
letters.write.format("parquet").mode("overwrite").saveAsTable("letters", path="/tmp/letters")

## Create managed volumes

## Write data to a path

In [22]:
# Small (id, first_name) dataset used for the path-based write below.
people_rows = [
    (1, "li"),
    (2, "chung"),
]
people = spark.createDataFrame(people_rows, ["id", "first_name"])

In [23]:
# Write the rows as Parquet files directly to /tmp/people; no table is registered.
# mode("overwrite") lets the cell be re-run; the default mode raises when
# the target path already exists.
people.write.format("parquet").mode("overwrite").save("/tmp/people")

In [24]:
# Load the Parquet files back from /tmp/people; `people` now refers to the
# DataFrame read from disk rather than the in-memory one created earlier.
people = spark.read.parquet("/tmp/people")

In [25]:
# Print the rows to confirm the data round-tripped through Parquet.
people.show()

+---+----------+
| id|first_name|
+---+----------+
|  1|        li|
|  2|     chung|
+---+----------+



In [None]:
# Read CSV data from /tmp/countries with the reader's default options.
# NOTE(review): no visible cell writes /tmp/countries and this cell has not
# been executed — confirm the input exists before relying on it.
countries = spark.read.csv("/tmp/countries")