# Spark Basic Syntax Exploration

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

from datetime import datetime, timedelta
import random

import os
import sys

In [None]:
# Required if at some point you got 
# 'java.io.IOException: Cannot run program "python3": CreateProcess error=2, The system cannot find the file specified'
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

Open spark session

In [None]:
spark = SparkSession.builder.appName("Spark Dataframe").getOrCreate()

Build dataframe from JSON

In [None]:
path = "data/spark_loan.jsonl"
loan_df = spark.read.json(path)

## Data Exploration

Print schema

In [None]:
loan_df.printSchema()

Count the data frame rows.

In [None]:
loan_df.count()

Describe all fields

In [None]:
loan_df.describe().show()

The next two lines disable text wrapping in the output, so wide tables are not broken across lines.

In [None]:
# Use the public IPython.display module and import display explicitly, so the
# cell does not rely on the notebook injecting display as a builtin.
from IPython.display import HTML, display

# Stop <pre> output from wrapping so each table row stays on a single line.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

Or, just select few fields to describe.

In [None]:
loan_df.describe("full_name", "loan_amount").show()

Drop duplicates, and show the distinct records, ordered by certain field.

In [None]:
loan_df.select("payment_period", "loan_period_weeks").drop_duplicates().sort("payment_period", "loan_period_weeks").show()

Collect all data into local python memory. Careful, if the data is very large, this might trigger an out-of-memory error.

In [None]:
loan_df.collect()

Alternatively, just take the first *n* data

In [None]:
loan_df.take(10)

Or, the last *n* data.
  
**Note:** Running `tail` moves the data to the local machine, so a very large `num` can cause an out-of-memory error.

In [None]:
loan_df.tail(10)

Pyspark DataFrame also provides the conversion to pandas DataFrame to leverage pandas APIs.  

**Note:** `toPandas` collects all data into the local Python process, which can cause an out-of-memory error when the data is too large to fit on one machine.

In [None]:
pandas_loan_df = loan_df.toPandas()
pandas_loan_df

A column can be accessed in several ways

In [None]:
col_loan_id = loan_df.loan_id
col_full_name = loan_df["full_name"]

Selecting column from data frame.  

By default, spark will truncate long values. So we use optional parameter `truncate` to show un-truncated values.

In [None]:
loan_df.select(col_loan_id, col_full_name, loan_df.loan_amount, loan_df["loan_approved_date"]).show(truncate=False)

Filtering data, show only loan amount between 500-700

In [None]:
loan_df.filter( (loan_df.loan_amount >= 500) & (loan_df.loan_amount <= 700) ).show(truncate=False)

Or, filtering using SQL-like syntax

In [None]:
loan_df.filter( "loan_amount < 500 OR loan_amount > 700").show(truncate=False)

Pyspark provides `where()` as an alias to `filter()`.  
The example below also sorts the filtered rows by `loan_amount`.

In [None]:
loan_df.where( (loan_df.loan_amount >= 500) & (loan_df.loan_amount <= 700) ).sort(loan_df.loan_amount).show(truncate=False)

Sort descending. Spark also provides `orderBy()` as alias to `sort()`

In [None]:
loan_df.where( "loan_amount < 500 OR loan_amount > 700").orderBy("loan_amount", ascending=False).show(truncate=False)

Sort by multiple columns & ascending / descending. For example, sort by `loan_approved_date` (ascending), then by `loan_amount` (descending)

In [None]:
loan_df.orderBy(["loan_approved_date", "loan_amount"], ascending=[True, False]).show(truncate=False)

## Grouping & Aggregation

Several built-in aggregation functions

In [None]:
# Count the loans per rating, ordered by rating, then print each result row.
rating_counts = loan_df.groupby("loan_rating").count().orderBy("loan_rating")
for rating_row in rating_counts.collect():
    print(rating_row)

In [None]:
# Average loan amount per payment period, ordered by payment period.
avg_by_period = loan_df.groupby("payment_period").avg("loan_amount").orderBy("payment_period")
# Each result row holds two values: the grouping key and the average.
for period, avg_amount in avg_by_period.collect():
    print("Payment period {} has average loan amount {}".format(period, avg_amount))

## UDF (User-Defined-Function)

Calculate and create new column `loan_end_date` based on user-defined-function 
(`udf`) with formula 

`loan_end_date = loan_approved_date + loan_period_weeks`

In [None]:
def _loan_end_date(approved_date, period_weeks):
    """Return the loan end date as a 'YYYY-MM-DD' string.

    Args:
        approved_date: Approval date as an ISO-format string, or None.
        period_weeks: Loan period in weeks, or None.

    Returns:
        approved_date plus period_weeks weeks, formatted as '%Y-%m-%d',
        or None when either input is None.
    """
    # Spark hands SQL NULL to a Python UDF as None; propagate it instead of
    # failing the whole job inside fromisoformat() / timedelta().
    if approved_date is None or period_weeks is None:
        return None
    end_date = datetime.fromisoformat(approved_date) + timedelta(weeks=period_weeks)
    return end_date.strftime('%Y-%m-%d')


# No return type is given, so the UDF uses Spark's default (string).
get_loan_end_date = udf(_loan_end_date)

Put into new dataframe

In [None]:
loan_udf = loan_df.withColumn("loan_end_date", get_loan_end_date(loan_df.loan_approved_date, loan_df.loan_period_weeks))

In [None]:
loan_udf.select("loan_id", "loan_approved_date", "loan_period_weeks", "loan_end_date").show(truncate=False)

UDF using function

In [None]:
def just_repeat(str):
    """Repeat the input text two or three times, joined by " & ".

    The number of repetitions is chosen at random on every call, with equal
    probability for two and three.

    Args:
        str: Text to repeat. The name shadows the builtin but is kept so the
            signature stays unchanged for existing callers. None is accepted
            because Spark passes SQL NULL to a Python UDF as None.

    Returns:
        The repeated text, or None when the input is None.
    """
    if str is None:
        # Propagate NULL instead of raising TypeError on None + " & ".
        return None
    repetitions = random.choice((2, 3))
    return " & ".join([str] * repetitions)

Need to register the UDF

In [None]:
# just_repeat picks its repetition count at random, so flag the UDF as
# non-deterministic; Spark treats UDFs as deterministic by default and its
# optimizer may otherwise eliminate or duplicate invocations.
just_repeat_udf = spark.udf.register("just_repeat", udf(just_repeat).asNondeterministic())

Then, use the registered UDF

In [None]:
# show() prints the frame and returns None, so keep the DataFrame itself in
# loan_udf_repeat and display it as a separate step.
loan_udf_repeat = loan_df.select("full_name").withColumn(
    "justRepeatingColumn", just_repeat_udf(loan_df.full_name))
loan_udf_repeat.show(truncate=False)

## API Reference

Spark dataframe API reference [available here](http://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#dataframe-apis)