In [1]:
import findspark

In [2]:
# SPARK_HOME on this machine is /opt/manual/spark; hand it to findspark explicitly.
findspark.init(spark_home="/opt/manual/spark")

In [3]:
from pyspark.sql import SparkSession, functions as F

In [4]:
# Create (or reuse) the session for this notebook; "local[2]" runs Spark
# locally instead of against a cluster.
spark = (
    SparkSession.builder
    .appName("User Defined Functions")
    .master("local[2]")
    .getOrCreate()
)

In [5]:
! ls -l ~/datasets

total 473232
-rw-rw-r--. 1 train train  42658497 Nov  3 22:07 201508_trip_data.csv
-rw-rw-r--. 1 train train      4556 Jul 21 18:58 Advertising.csv
drwxrwxr-x. 2 train train      4096 Nov 10 22:02 cat_images
-rw-rw-r--. 1 train train    674857 Sep 17 22:49 Churn_Modelling.csv
drwxrwxr-x. 3 train train        96 Oct  6 12:18 churn-telecom
-rw-rw-r--. 1 train train  41002480 Oct  6 12:18 Fire_Incidents.csv.gz
drwxrwxr-x. 7 train train        67 Oct  6 12:18 flight-data
-rw-rw-r--. 1 train train  46401315 Oct  6 12:18 Hotel_Reviews.csv.gz
drwxrwxr-x. 2 train train       198 Sep 17 20:53 hotel_reviews_parquet
-rw-rw-r--. 1 train train       180 Sep  2 11:40 insanlar.csv
-rw-rw-r--. 1 train train      4611 Sep  1 16:13 iris.csv
-rw-rw-r--. 1 train train     15802 Sep  1 16:13 iris.json
-rw-rw-r--. 1 train train    325145 Sep  1 16:14 kuruyemisler.txt
-rw-rw-r--. 1 train train  44525776 Oct  6 12:18 market1mil.csv.gz
drwxrwxr-x. 2 train train       198 Nov 10 22:17 market1mil_

In [6]:
# Load every CSV file in the flight-data directory into one DataFrame.
# The first line of each file is the header and column types are inferred.
df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("file:///home/train/datasets/flight-data/csv/")
)

In [7]:
# Bring only five rows to the driver as a pandas DataFrame for a quick look.
df.limit(5).toPandas()

Unnamed: 0,DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
0,United States,Romania,1
1,United States,Ireland,264
2,United States,India,69
3,Egypt,United States,24
4,Equatorial Guinea,United States,1


In [8]:
# Expose df to Spark SQL under the view name "flight_table".
df.createOrReplaceTempView("flight_table")

In [14]:
# Total flight count per origin country, keeping the 15 largest totals
# (sorted descending). Queries the "flight_table" temp view with Spark SQL.
df_grp_by = spark.sql("""

SELECT ORIGIN_COUNTRY_NAME, SUM(count) AS Total_Count 
FROM flight_table 
GROUP BY ORIGIN_COUNTRY_NAME
ORDER BY Total_Count DESC
LIMIT 15


""")

In [15]:
# Print the aggregated result (the query already limits it to 15 rows).
df_grp_by.show()

+-------------------+-----------+
|ORIGIN_COUNTRY_NAME|Total_Count|
+-------------------+-----------+
|      United States|    2352430|
|             Canada|      49695|
|             Mexico|      38225|
|     United Kingdom|      10358|
|              Japan|       8643|
|            Germany|       8380|
| Dominican Republic|       7194|
|        The Bahamas|       5775|
|             France|       5290|
|           Colombia|       4981|
|        South Korea|       4253|
|            Jamaica|       4087|
|              China|       4021|
|        Netherlands|       3779|
|             Brazil|       3427|
+-------------------+-----------+



In [16]:
from pyspark.sql.types import *

# Define UDF

In [17]:
def lower_case(c):
    """Return ``c`` converted to lower case.

    This function is wrapped as a Spark UDF further down. Spark hands a
    null cell to a Python UDF as ``None``, so ``None`` is passed through
    unchanged instead of raising ``AttributeError`` on ``None.lower()``.

    Args:
        c: The string to convert, or ``None``.

    Returns:
        The lower-cased string, or ``None`` when ``c`` is ``None``.
    """
    if c is None:
        return None
    return c.lower()

In [18]:
# Sanity-check the plain Python function on a single value before using it as a UDF.
lower_case("ALi")

'ali'

# Register UDF

In [19]:
# Build a string-returning UDF for the DataFrame API. Because the function is
# wrapped in a lambda, Spark labels the resulting column "<lambda>(...)".
lower_case_func = F.udf(f=lambda x: lower_case(x), returnType=StringType())

# Make the same UDF callable from Spark SQL as "lower_case_func".
# Signature: spark.udf.register(name, f, returnType=None)
spark.udf.register(name="lower_case_func", f=lower_case_func)

<function __main__.<lambda>(x)>

# Use the UDF

In [20]:
# Apply the UDF to the destination-country column and preview five rows.
df.select(
    lower_case_func("DEST_COUNTRY_NAME"),
).show(n=5)

+---------------------------+
|<lambda>(DEST_COUNTRY_NAME)|
+---------------------------+
|              united states|
|              united states|
|              united states|
|                      egypt|
|          equatorial guinea|
+---------------------------+
only showing top 5 rows



# Pandas UDFs

    One of the previous prevailing issues with using PySpark UDFs was that they had
    slower performance than Scala UDFs. This was because the PySpark UDFs required
    data movement between the JVM and Python, which was quite expensive. To resolve
    this problem, Pandas UDFs (also known as vectorized UDFs) were introduced as part
    of Apache Spark 2.3. A Pandas UDF uses Apache Arrow to transfer data and Pandas
    to work with the data. You define a Pandas UDF using the keyword pandas_udf as
    the decorator, or to wrap the function itself. Once the data is in Apache Arrow format,
    there is no longer the need to serialize/pickle the data as it is already in a format
    consumable by the Python process. Instead of operating on individual inputs row by
    row, you are operating on a Pandas Series or DataFrame (i.e., vectorized execution).

In [22]:
import pandas as pd

In [23]:
def upper_case(col: pd.Series) -> pd.Series:
    """Upper-case every string in a pandas Series.

    Written to be wrapped with ``pandas_udf``, so it receives a whole batch
    of values at once rather than a single value.

    Args:
        col: Series of strings; entries may be missing (``None``/``NaN``).

    Returns:
        A Series of the same length and index with each string upper-cased.
        Missing entries stay missing.
    """
    # The vectorized ``.str`` accessor skips missing values, whereas calling
    # ``x.upper()`` per element raises AttributeError on ``None``.
    return col.str.upper()

In [None]:
# Format
# F.pandas_udf(function, returnType)

In [25]:
# Wrap upper_case as a pandas (vectorized) UDF that produces a string column.
upper_case_pdudf = F.pandas_udf(f=upper_case, returnType=StringType())

In [26]:
# ! pip install pyarrow

In [28]:
# Show the original column next to its upper-cased version for five rows.
df.select(
    "DEST_COUNTRY_NAME",
    upper_case_pdudf("DEST_COUNTRY_NAME"),
).show(n=5)

+-----------------+-----------------------------+
|DEST_COUNTRY_NAME|upper_case(DEST_COUNTRY_NAME)|
+-----------------+-----------------------------+
|    United States|                UNITED STATES|
|    United States|                UNITED STATES|
|    United States|                UNITED STATES|
|            Egypt|                        EGYPT|
|Equatorial Guinea|            EQUATORIAL GUINEA|
+-----------------+-----------------------------+
only showing top 5 rows

