# Intro
This notebook contains various goodies on how Apache Spark works and how to use it in Microsoft Fabric.

## Dataframe (with a schema)

In [None]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql.utils import AnalysisException

# Sample rows, one tuple per person:
# (firstname, middlename, lastname, id, gender, salary)
data2 = [
    ("Olena", "", "Eldridge", "36636", "M", 90000),
    ("Matthew", "J", "Munro", "28832", "M", 45400),
    ("Joffrey", "Oway", "Roberts", "12114", "F", 64000),
    ("Lennert", "", "Dushane", "32192", "F", 141000),
    ("Jane", "Rebecca", "Jones", "99482", "F", 56000),
]

# Explicit schema: every column is nullable; all are strings except the integer salary
schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

# creation of the dataframe using the above defined schema
df = spark.createDataFrame(data=data2, schema=schema)

# printing the schema of the dataframe
df.printSchema()

# showing the datatypes of the columns
# (a bare `df.dtypes` is only rendered when it is the last expression of a cell,
#  so it has to be printed explicitly here)
print(df.dtypes)

# displaying the dataframe (keep in mind there is a limit in number of rows that is displayed)
df.show(truncate=False)

## Reading in files / Writing away to files

- When reading CSV files without specifying a schema and without letting Spark infer one, every column will default to type String
- Better to use `inferSchema=True` or specify your own schema

#### Reading in CSV

In [None]:
# Location of the source file (e.g. the Files section of a Fabric lakehouse)
csv_path = 'Files/property-sales.csv'

# Plain read: the first line is used as the header, no schema inference
df_csv = spark.read.options(header=True).csv(csv_path)

# Same read, but additionally let Spark work out the data type of each column
# and build the schema from that
df_csv = spark.read.options(header=True, inferSchema=True).csv(csv_path)

#### Writing dataframes to Json files

In [None]:
# Write the dataframe out as JSON.
# mode 'overwrite' replaces whatever already exists at the target path.
df_csv.write.mode('overwrite').json("Files/json/property-sales.json")

#### Reading in Json file

In [None]:
# Load the JSON output written in the previous step back into a dataframe
df_json = spark.read.format("json").load('Files/json/property-sales.json')

#### Writing dataframes to Parquet

In [None]:
# Persist the JSON-sourced dataframe as Parquet, replacing any previous output at this path
df_json.write.mode('overwrite').parquet('Files/parquet/property-sales2.parquet')

#### Reading in multiple parquet files (with metadata)

- Spark provides us with all the file metadata in a 'hidden' column that we can add to our dataframe using _metadata. This metadata contains:
    - file_modification_time
    - row_index
    - file_name
    - file_size
    - file_path

In [None]:
# Read everything matching the glob, then select all regular columns
# plus the hidden _metadata column so it becomes part of the dataframe
df_all_parquet_plus_metadata = (
    spark.read
    .parquet('Files/parquet/*.parquet')
    .select("*", "_metadata")
)

# Writing to Fabric Lakehouse tables

- No spaces or special characters in column names! -> use ``` .withColumnRenamed ``` to rename them


In [None]:
# changing column names to allow write to Lakehouse tables
# Start from the property-sales data (df_csv): at this point `df` still holds the
# sample people dataframe created at the top of the notebook, which has none of
# these columns, so renaming on it would silently do nothing and the table cells
# below would store the wrong data.
df = (
    df_csv
    .withColumnRenamed("SalePrice ($)", "SalePrice_USD")
    .withColumnRenamed("Address ", "Address")
    .withColumnRenamed("City ", "City")
)

#### Writing DF to Table, with different 'modes'

- Using ``` .saveAsTable ```, we save the DataFrame as a 'Managed Table' (Spark terminology) - meaning both the metadata and the data are managed by Spark.
- With a managed table, a SQL command such as DROP TABLE table_name deletes both the metadata and the data. 
- With an unmanaged table, the same command will delete only the metadata, not the actual data.

In [None]:
# Name under which the dataframe is saved as a table
delta_table_name = 'PropertySales'

# saveAsTable stores the dataframe as a Managed Table in delta format;
# 'overwrite' replaces the table if it is already there
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(delta_table_name)
)

In [None]:
# these are four different write 'modes'
# (the table was already created by the previous cell, so each mode below
#  is exercised against an existing table)

# append the new dataframe to the existing Table
df.write.mode("append").format("delta").saveAsTable(delta_table_name)

# overwrite existing Table with new DataFrame
df.write.mode("overwrite").format("delta").saveAsTable(delta_table_name)

# Throw error if data already exists
# The table exists here, so this write is expected to raise. Catch and show the
# error so the demonstration does not abort the cell (or a "Run all") before the
# 'ignore' example below gets a chance to run.
try:
    df.write.mode("error").format("delta").saveAsTable(delta_table_name)
except AnalysisException as exc:
    print(f"mode('error') raised: {exc}")

# Fail silently if data already exists
df.write.mode("ignore").format("delta").saveAsTable(delta_table_name)

#### Writing an unmanaged table

- use ``` .save ``` instead

In [None]:
# unmanaged table: .save writes the delta data to an explicit path
# instead of saving it under a table name
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("Files/delta/unmanaged.delta")
)

# Reading from a table into a DataFrame

In [None]:
# Query the table with Spark SQL (capped at 1000 rows) and render the result
top_rows_query = "SELECT * FROM SparkSeptember.propertysales LIMIT 1000"
df = spark.sql(top_rows_query)
display(df)