In [3]:
import os
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
# assert "JAVA_HOME" in os.environ, "Need to start the notebook with JAVA_HOME on path"
# Build (or reuse) the SparkSession shared by every cell below.
spark = (
    SparkSession.builder
    .appName("local-app")
    # Set the master exactly once: "local[*]" runs Spark in-process with one
    # worker thread per logical core.
    .master("local[*]")
    .config("spark.driver.host", "localhost")
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.sql.shuffle.partitions", "1")
    .getOrCreate()
)
# Absolute path of the `data` directory that sits beside the current working directory.
DATA_DIR = os.path.abspath(os.path.join(os.getcwd(), "..", "data"))

# Loading CSV into Spark Data Frame

In [4]:
# Read the sample CSV, treating its first line as the column names.
simple_csv_df = (
    spark.read
    .option("header", "true")
    .csv(f"{DATA_DIR}/simple.csv")
)
simple_csv_df.show()

+----------+-----+---+
|      Date| Name|Age|
+----------+-----+---+
|2022-02-03|Billy| 10|
|2022-01-02|Jonny| 77|
+----------+-----+---+



# Loading JSON data into Spark Data Frame

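The JSON file spans multiple lines, so the reader needs the `multiline` option (by default Spark expects one JSON object per line).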


In [5]:
# Configure the reader for multi-line JSON first, then load the sample file.
simple_json_reader = spark.read.option("multiline", "true")
simple_json_df = simple_json_reader.json(f"{DATA_DIR}/simple.json")
simple_json_df.show()

+---------------+------+-------+
|        hobbies|  name|  place|
+---------------+------+-------+
|      [writing]| Ayaan|Somalia|
|[writing, yoga]|Briget|    USA|
+---------------+------+-------+



# Loading a Dict into a Spark Data Frame

We'll need to add a schema, which introduces some of the Spark types

In [6]:
from pyspark.sql.types import StructType, StructField, StringType

# Plain Python dicts, one per row.
data_dict_list = [
    {"state": "New York", "capitol": "Albany"},
    {"state": "Virginia", "capitol": "Richmond"},
]
# Explicit schema: both columns are nullable strings.
states_schema = StructType(
    [StructField(column, StringType(), True) for column in ("state", "capitol")]
)
dict_df = spark.createDataFrame(data=data_dict_list, schema=states_schema)
dict_df.show()

+--------+--------+
|   state| capitol|
+--------+--------+
|New York|  Albany|
|Virginia|Richmond|
+--------+--------+



# Loading Typed Rows

Using the schema above

In [7]:
from pyspark.sql.types import Row

# Build one Row per (state, capitol) pair, then apply the schema defined earlier.
state_capitol_pairs = [("Alaska", "Anchorage"), ("Georgia", "Savannah")]
rows = [Row(state=state, capitol=capitol) for state, capitol in state_capitol_pairs]
rows_df = spark.createDataFrame(data=rows, schema=states_schema)
# Show up to four rows without truncating long cell values.
rows_df.show(n=4, truncate=False)

+-------+---------+
|state  |capitol  |
+-------+---------+
|Alaska |Anchorage|
|Georgia|Savannah |
+-------+---------+



# Loading a Table

With an intermediate step that registers the DataFrame as a temporary view, which can then be read back by name

In [8]:
# Register the DataFrame as a temporary view, then load it back by that name.
temp_view_name = 'tmpTable'
dict_df.createOrReplaceTempView(temp_view_name)
table_df = spark.table(temp_view_name)
table_df.show()


+--------+--------+
|   state| capitol|
+--------+--------+
|New York|  Albany|
|Virginia|Richmond|
+--------+--------+



# Reading a downloaded CSV file

Thanks to [this](https://stackoverflow.com/a/47845360/821169) SO answer

In [9]:
import requests

# Download the CSV over HTTP. The timeout stops the cell from hanging forever on
# a stalled connection, and raise_for_status() fails loudly on a 4xx/5xx reply
# instead of letting an error page be parsed as CSV rows.
dl = requests.get(
    "https://raw.githubusercontent.com/washingtonpost/data-school-shootings/master/school-shootings-data.csv",
    timeout=30,
)
dl.raise_for_status()
lines = dl.text.splitlines()
# The reader is handed an RDD of strings, one CSV line per element.
dl_df = spark.read.option("header", "true").csv(spark.sparkContext.parallelize(lines))
# Count rows per city and show the ten largest counts without truncation.
(
    dl_df
    .groupby("city")
    .count()
    .orderBy(col("count").desc())
    .show(10, False)
)

+------------+-----+
|city        |count|
+------------+-----+
|Los Angeles |5    |
|Chicago     |5    |
|Philadelphia|5    |
|Jacksonville|4    |
|New Orleans |4    |
|Portsmouth  |4    |
|Baltimore   |4    |
|Charlotte   |3    |
|Gary        |2    |
|Oxnard      |2    |
+------------+-----+
only showing top 10 rows



In [10]:
# dl_df.printSchema()

# Other `read` options

* [Text](https://spark.apache.org/docs/3.0.1/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader.text)
* [ORC](https://spark.apache.org/docs/3.0.1/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader.orc). Also see [Apache ORC](https://orc.apache.org/docs/) format


In [11]:
text = """Date,Name,Age
2022-02-03,Billy,10
2022-01-02,Jonny,77"""
# The first line is the header; every remaining line is one data record.
header_line, *record_lines = text.splitlines()
# Take the column names from the header so they cannot drift from the data.
column_names = header_line.split(",")
rdd = spark.sparkContext.parallelize([record.split(",") for record in record_lines])
ddf = spark.createDataFrame(rdd, column_names)
ddf.show()

+----------+-----+---+
|      Date| Name|Age|
+----------+-----+---+
|2022-02-03|Billy| 10|
|2022-01-02|Jonny| 77|
+----------+-----+---+

