# Spark Basic Syntax Exploration

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

from datetime import datetime, timedelta
import random

import os
import sys

In [None]:
# Point PySpark at the interpreter that is running this notebook.
# Needed when Spark fails to start Python with an error such as:
# 'java.io.IOException: Cannot run program "python3": CreateProcess error=2, The system cannot find the file specified'
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

Open spark session

In [None]:
# Reuse the active session if one exists, otherwise start a new one
# under the application name "Spark SQL".
spark = (
    SparkSession.builder
    .appName("Spark SQL")
    .getOrCreate()
)

Build dataframe from JSON

In [None]:
# Location of the loan data set, relative to the notebook's working directory.
path = "data/spark_loan.jsonl"

# No schema is passed explicitly, so the JSON reader derives it from the file.
loan_df = spark.read.json(path=path)

## Build View for Spark SQL

Register the dataframe as a temporary view named `loan_table` so it can be queried with Spark SQL.

In [None]:
# Expose `loan_df` to Spark SQL as `loan_table`; an existing view with the
# same name is replaced, so this cell is safe to re-run.
loan_df.createOrReplaceTempView(name="loan_table")

Count the data frame rows.

In [None]:
# COUNT(*) counts every row; COUNT(loan_id) would silently skip rows whose
# loan_id is NULL and therefore under-report the size of the data frame.
spark.sql("SELECT COUNT(*) AS row_count FROM loan_table").show()

These two lines disable line wrapping in the output area, so that wide tables printed by `show()` stay on a single line.

In [None]:
# Use the public `IPython.display` API (not the internal `IPython.core.display`
# module) and import `display` explicitly instead of relying on the name that
# IPython injects into the notebook namespace.
from IPython.display import HTML, display

# Inject a CSS rule so <pre> blocks (where `show()` output is rendered) do not wrap.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

Select all fields. You can use multi line for spark SQL.

In [None]:
# A triple-quoted string lets the SQL statement span several lines.
select_all_sql = """
    SELECT * 
      FROM loan_table
"""

# truncate=False prints full cell values instead of shortening long ones.
spark.sql(select_all_sql).show(truncate=False)

Or select just a few fields to display.

In [None]:
# Project only the borrower name and the loan amount.
name_and_amount_sql = """
    SELECT full_name, loan_amount
      FROM loan_table
"""

spark.sql(name_and_amount_sql).show(truncate=False)

Drop duplicates, and show the distinct records, ordered by certain field.

In [None]:
# DISTINCT removes duplicate (payment_period, loan_period_weeks) pairs;
# ORDER BY 1, 2 sorts by the first, then the second selected column.
distinct_periods_sql = """
    SELECT DISTINCT payment_period, loan_period_weeks
      FROM loan_table
     ORDER BY 1, 2
"""

spark.sql(distinct_periods_sql).show(truncate=False)

Collect all data into local python memory. Careful, if the data is very large, this might trigger an out-of-memory error.

In [None]:
# collect() materialises every row of the result in the local Python process.
all_loans_sql = "SELECT * FROM loan_table"
all_loan_rows = spark.sql(all_loans_sql).collect()
all_loan_rows

Alternatively, fetch only *n* rows by adding a `LIMIT` clause.

In [None]:
# LIMIT caps the result at 10 rows before it is collected locally.
limited_loans_sql = "SELECT * FROM loan_table LIMIT 10"
limited_loan_rows = spark.sql(limited_loans_sql).collect()
limited_loan_rows

The query result is a Spark dataframe and can be processed like any other Spark dataframe, including conversion to a pandas dataframe.

**Note**: `toPandas` collects all data into the local Python process, which can cause an out-of-memory error when the data is too large to fit on one machine.

In [None]:
# Keep the result small (5 rows) before it is pulled into local memory.
first_five_sql = """
    SELECT *
      FROM loan_table 
     LIMIT 5
"""
loan_df_from_sql = spark.sql(first_five_sql)

# Convert the Spark dataframe into a pandas dataframe and display it.
pandas_loan_df = loan_df_from_sql.toPandas()
pandas_loan_df

Filtering data, show only loan amount between 500-700

In [None]:
# `show()` only prints and returns None, so the query result is bound to its
# own name first; chaining `.show()` onto the assignment would store None.
mid_range_loans_df = spark.sql("""
    SELECT *
      FROM loan_table
     WHERE loan_amount >= 500 AND loan_amount <= 700
""")

mid_range_loans_df.show(truncate=False)

Sort by multiple columns, ascending or descending. For example, keep only loans with an amount below 500 or above 700, then sort by `loan_approved_date` (ascending) and by `loan_amount` (descending).

In [None]:
# `show()` only prints and returns None, so the query result is bound to its
# own name first; chaining `.show()` onto the assignment would store None.
# ORDER BY defaults to ascending; DESC applies to loan_amount only.
outlying_loans_sorted_df = spark.sql("""
    SELECT *
      FROM loan_table
     WHERE loan_amount < 500 OR loan_amount > 700
  ORDER BY loan_approved_date, loan_amount DESC
""")

outlying_loans_sorted_df.show(truncate=False)

## Grouping & Aggregation

Aggregation using SQL-like syntax

In [None]:
# `show()` only prints and returns None, so the query result is bound to its
# own name first; chaining `.show()` onto the assignment would store None.
# One output row per loan_rating with the number of loans in that rating.
loans_per_rating_df = spark.sql("""
    SELECT loan_rating, COUNT(loan_id) AS count_loan_rating
      FROM loan_table
  GROUP BY loan_rating
  ORDER BY loan_rating
""")

loans_per_rating_df.show(truncate=False)

Nested functions like this (rounding the average to 2 decimals) are also supported.

In [None]:
# `show()` only prints and returns None, so the query result is bound to its
# own name first; chaining `.show()` onto the assignment would store None.
# Average loan amount per payment_period, rounded to 2 decimals.
avg_amount_per_period_df = spark.sql("""
    SELECT payment_period, ROUND(AVG(loan_amount), 2) AS avg_amount
      FROM loan_table
  GROUP BY payment_period
  ORDER BY payment_period
""")

avg_amount_per_period_df.show(truncate=False)

## UDF (User-Defined-Function)

Calculate and create new column `loan_end_date` based on user-defined-function 
(`udf`) with formula 

`loan_end_date = loan_approved_date + loan_period_weeks`

In [None]:
def _loan_end_date(approved_date, period_weeks):
    """Return the loan end date as a 'YYYY-MM-DD' string.

    Args:
        approved_date: Approval date as an ISO-8601 string
            (parsed with `datetime.fromisoformat`).
        period_weeks: Loan duration in weeks.

    Returns:
        `approved_date + period_weeks` formatted as 'YYYY-MM-DD', or None when
        either input is missing.
    """
    # Spark passes SQL NULL as None; without this guard a single NULL value
    # raises inside the worker and fails the whole job.
    if approved_date is None or period_weeks is None:
        return None
    end_date = datetime.fromisoformat(approved_date) + timedelta(weeks=period_weeks)
    return end_date.strftime('%Y-%m-%d')


# Wrap the plain Python function as a Spark UDF (default return type: string).
get_loan_end_date = udf(_loan_end_date)

The UDF must be registered before it can be called from Spark SQL.

In [None]:
# Make the UDF callable from SQL text under the name `get_loan_end_date`.
spark.udf.register(name="get_loan_end_date", f=get_loan_end_date)

Now, we can use the registered udf as function on Spark SQL.

In [None]:
# Call the registered UDF like any built-in SQL function to derive loan_end_date.
loan_end_date_sql = """
    SELECT loan_id, loan_approved_date, loan_period_weeks, 
           get_loan_end_date(loan_approved_date, loan_period_weeks) AS loan_end_date
      FROM loan_table
"""

spark.sql(loan_end_date_sql).show(truncate=False)

A UDF can also be built from a regular named Python function.

In [None]:
def just_repeat(str):
    """Repeat a string two or three times, joined by " & ".

    The number of repetitions is picked at random on every call, so the
    result is not deterministic.

    Args:
        str: The text to repeat. The name shadows the built-in `str`; it is
            kept unchanged so existing keyword callers keep working.

    Returns:
        "s & s" or "s & s & s" for input s, or None when the input is None.
    """
    # Spark passes SQL NULL as None; propagate it instead of raising TypeError.
    if str is None:
        return None
    repetitions = 2 if random.choice([True, False]) else 3
    return " & ".join([str] * repetitions)

Register the UDF

In [None]:
# `just_repeat` uses `random`, but Spark treats UDFs as deterministic by default
# and may then drop or duplicate invocations during optimisation. Mark the UDF
# as non-deterministic before registering it for use in SQL.
just_repeat_udf = spark.udf.register(
    "just_repeat",
    udf(just_repeat).asNondeterministic(),
)

Use the UDF on Spark SQL

In [None]:
# Apply the registered `just_repeat` UDF to every full_name.
repeat_name_sql = """
    SELECT full_name, just_repeat(full_name) AS just_repeating_column
      FROM loan_table
"""

spark.sql(repeat_name_sql).show(truncate=False)

## SQL Reference

Spark SQL function reference [available here](https://spark.apache.org/docs/latest/api/sql)