In [4]:
import os
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
# Fail fast with an accurate message when the JAVA_HOME environment variable is missing.
assert "JAVA_HOME" in os.environ, "JAVA_HOME must be set in the environment before starting the notebook"

# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("local-app")
    # Single source of truth for the master URL. Setting it via both
    # .master(...) and .config("spark.master", ...) is contradictory: the
    # later call silently overrides the earlier one.
    .master("local[*]")
    .config("spark.driver.host", "localhost")
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.sql.shuffle.partitions", "1")
    .getOrCreate()
)

# Absolute path of the "data" directory that sits beside the current working directory.
DATA_DIR = os.path.abspath(os.path.join(os.getcwd(), "..", "data"))

# Loading a Basic CSV into a Spark DataFrame

In [10]:
# Read simple.csv, using its first row as the column names, then print it.
simple_csv_df = spark.read.csv(f"{DATA_DIR}/simple.csv", header=True)
simple_csv_df.show()

+----------+-----+---+
|      Date| Name|Age|
+----------+-----+---+
|2022-02-03|Billy| 10|
|2022-01-02|Jonny| 77|
+----------+-----+---+



# Loading JSON Data into a Spark DataFrame

Here the JSON file is multiline (a record can span several lines), so the reader needs the `multiline` option.


In [11]:
# Read simple.json with the multiline option switched on, then print it.
simple_json_df = (
    spark.read.format("json")
    .option("multiline", "true")
    .load(f"{DATA_DIR}/simple.json")
)
simple_json_df.show()

+---------------+------+-------+
|        hobbies|  name|  place|
+---------------+------+-------+
|      [writing]| Ayaan|Somalia|
|[writing, yoga]|Briget|    USA|
+---------------+------+-------+



# Loading a List of Dicts into a Spark DataFrame

We'll need to add a schema, which introduces some of the Spark SQL types (`StructType`, `StructField`, `StringType`).

In [5]:
from pyspark.sql.types import StructType, StructField, StringType

# Source records: one plain dict per state.
data_dict_list = [
    {"state": "New York", "capitol": "Albany"},
    {"state": "Virginia", "capitol": "Richmond"},
]

# Explicit schema: both columns are nullable strings.
states_schema = StructType(
    [StructField(column, StringType(), True) for column in ("state", "capitol")]
)

# Build the DataFrame from the records and print it.
dict_df = spark.createDataFrame(data=data_dict_list, schema=states_schema)
dict_df.show()

+--------+--------+
|   state| capitol|
+--------+--------+
|New York|  Albany|
|Virginia|Richmond|
+--------+--------+

