In [51]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from functools import reduce

# Starting a basic SparkSession

In [2]:
# Reuse the active session if there is one; otherwise start one named "basic_app".
spark = (
    SparkSession.builder
    .appName("basic_app")
    .getOrCreate()
)

23/11/01 19:23:55 WARN Utils: Your hostname, luan-Dell-G15-5520 resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp0s20f3)
23/11/01 19:23:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/01 19:24:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# First example

In [3]:
myRange = spark.range(1000).toDF("number")

In [4]:
myRange.show(3)

+------+
|number|
+------+
|     0|
|     1|
|     2|
+------+
only showing top 3 rows



## Decomposing myRange variable

### Docs: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/index.html

### Spark functions: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html

In [5]:
# spark.range() names its single column "id" by default
spark.range(1000).show(3)

+---+
| id|
+---+
|  0|
|  1|
|  2|
+---+
only showing top 3 rows



In [6]:
# Renaming id as number
spark.range(1000).toDF("number").show(3)

+------+
|number|
+------+
|     0|
|     1|
|     2|
+------+
only showing top 3 rows



# Basic operations

In [7]:
# where() accepts a SQL expression string; "number % 2 = 0" keeps the even numbers
divisBy2 = myRange.where("number % 2 = 0")
divisBy2.show(3)

+------+
|number|
+------+
|     0|
|     2|
|     4|
+------+
only showing top 3 rows



## Alternatives

### using F.col spark function


In [8]:
divisBy2 = myRange.where(F.col('number') % 2 == 0)
divisBy2.show(3)

+------+
|number|
+------+
|     0|
|     2|
|     4|
+------+
only showing top 3 rows



### using dataframe structure

#### As an attribute

In [9]:
divisBy2 = myRange.where(myRange.number % 2 == 0)
divisBy2.show(3)

+------+
|number|
+------+
|     0|
|     2|
|     4|
+------+
only showing top 3 rows



#### When the column name has spaces, use the form below

In [10]:
divisBy2 = myRange.where(myRange['number'] % 2 == 0)
divisBy2.show(3)

+------+
|number|
+------+
|     0|
|     2|
|     4|
+------+
only showing top 3 rows



# Dataframe actions. Docs: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html

In [11]:
divisBy2.count()

500

In [12]:
divisBy2.printSchema()

root
 |-- number: long (nullable = false)



In [13]:
divisBy2.dtypes

[('number', 'bigint')]

## Quick tip: it is useful to have dtypes as a dict

In [14]:
# dtypes is a list of (column name, type name) pairs, which dict() accepts directly.
dtype_dict = dict(divisBy2.dtypes)

## To check the jobs, go to: http://localhost:4040/jobs/ . Ignore the clock

# An end-to-end example

In [15]:
# Load the 2015 flight summary CSV: the first line is the header and the
# column types are inferred from the data.
flightData2015 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("../../../spark_data_examples/flight-data/csv/2015-summary.csv")
)

## Checking the types

In [16]:
flightData2015.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



## Getting a dict of column types

In [17]:
# Map each column name to its type name; dtypes already yields (name, type) pairs.
dtype_dict = dict(flightData2015.dtypes)
dtype_dict

{'DEST_COUNTRY_NAME': 'string',
 'ORIGIN_COUNTRY_NAME': 'string',
 'count': 'int'}

## Use of take

In [18]:
flightData2015.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

## Sorting data

In [19]:
# sort() just returns a new DataFrame (note the repr below); rows only appear
# once an action such as take() or show() is called on it.
flightData2015.sort("count")

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: int]

## Explaining the operations

In [20]:
flightData2015.sort("count").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#70 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(count#70 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=171]
      +- FileScan csv [DEST_COUNTRY_NAME#68,ORIGIN_COUNTRY_NAME#69,count#70] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/spark_data_examples/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




## Changing the number of shuffle partitions

In [21]:
# The Exchange step of the sort used 200 partitions in the plan above; from now
# on shuffles use 5, which is visible when the plan is explained again.
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [22]:
flightData2015.sort("count").take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

## Explaining again

In [23]:
flightData2015.sort("count").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#70 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(count#70 ASC NULLS FIRST, 5), ENSURE_REQUIREMENTS, [plan_id=187]
      +- FileScan csv [DEST_COUNTRY_NAME#68,ORIGIN_COUNTRY_NAME#69,count#70] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/spark_data_examples/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




## Check http://localhost:4040/jobs/ again and see what happened

## Turning a SparkDataFrame into a SQL table

In [24]:
flightData2015.createOrReplaceTempView("flight_data_2015")

### SQL queries in the wild

In [109]:
# Total flights per destination, five largest first. `count` here is the
# view's column (see the schema), so SUM(count) adds up the flight counts.
spark.sql("""SELECT DEST_COUNTRY_NAME,
                    SUM(count) AS total_flights
             FROM flight_data_2015
             GROUP BY DEST_COUNTRY_NAME
             ORDER BY total_flights DESC 
             LIMIT 5;""").show(5)

+-----------------+-------------+
|DEST_COUNTRY_NAME|total_flights|
+-----------------+-------------+
|    United States|       411352|
|           Canada|         8399|
|           Mexico|         7140|
|   United Kingdom|         2025|
|            Japan|         1548|
+-----------------+-------------+



### A simple example

In [26]:
# count(1) counts rows, so `counter` is the number of rows per destination,
# not the sum of the view's `count` column.
sqlWay = spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1) AS counter
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
""")

In [27]:
# Largest counters first.
sqlWay.orderBy(F.desc("counter")).show(3)

+-----------------+-------+
|DEST_COUNTRY_NAME|counter|
+-----------------+-------+
|    United States|    125|
|          Moldova|      1|
|          Bolivia|      1|
+-----------------+-------+
only showing top 3 rows



In [28]:
# DataFrame-API version of the same aggregation: number of rows per destination.
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

## Checking the differences between queries and DataFrames

In [29]:
sqlWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#68], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#68, 5), ENSURE_REQUIREMENTS, [plan_id=293]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#68], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#68] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/spark_data_examples/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>




In [30]:
dataFrameWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#68], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#68, 5), ENSURE_REQUIREMENTS, [plan_id=306]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#68], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#68] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/spark_data_examples/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>




## Getting statistics

In [36]:
# One row of aggregates over the whole view; `count` is the view's column, not COUNT().
# Passing an array to percentile_approx yields one value per requested percentile.
spark.sql("""SELECT max(count),
                    min(count),
                    mean(count),
                    median(count),
                    std(count),
                    mode(count),
                    percentile_approx(count, array(0.25, 0.5, 0.75))
             FROM flight_data_2015""")\
.show()

+----------+----------+-----------+-------------+------------------+-----------+-------------------------------------------------------+
|max(count)|min(count)|mean(count)|median(count)|        std(count)|mode(count)|percentile_approx(count, array(0.25, 0.5, 0.75), 10000)|
+----------+----------+-----------+-------------+------------------+-----------+-------------------------------------------------------+
|    370002|         1|1770.765625|         63.5|23126.516918551915|          1|                                          [14, 63, 268]|
+----------+----------+-----------+-------------+------------------+-----------+-------------------------------------------------------+



## Recipe: extracting the statistics of all fields 

In [122]:
def get_stats(table = 'flight_data_2015'):
    """Compute one row of summary statistics for every column of a table or view.

    One aggregate query is issued per column through the notebook-level
    ``spark`` session, and the single-row results are stacked with ``union``.

    Args:
        table: Name of a table or temporary view known to ``spark``. It is
            interpolated into the SQL text as-is, so pass trusted names only.

    Returns:
        A Spark DataFrame with one row per column of ``table`` and the columns
        field_name, count_distinct, max, min, mean, median, std, cv, kurt,
        skew, mode, percentiles and null_count.

    Raises:
        TypeError: If ``table`` exposes no columns (``reduce`` then receives an
            empty list).
    """
    # Take the names straight from the DataFrame instead of parsing
    # "name:type" strings, which truncated any name containing ':'.
    column_names = spark.sql(f"""SELECT * FROM {table} LIMIT 1""").columns

    per_column_stats = []
    for column_name in column_names:
        # Backtick-quote the identifier (doubling embedded backticks) so names
        # with spaces, punctuation or reserved words still parse.
        col = "`" + column_name.replace("`", "``") + "`"
        # Escape backslashes and single quotes for the literal that labels the row.
        label = "'" + column_name.replace("\\", "\\\\").replace("'", "\\'") + "'"
        stats_row = spark.sql(f"""SELECT
                        {label} AS field_name,
                        approx_count_distinct({col}) AS count_distinct,
                        max({col}) AS max,
                        min({col}) AS min,
                        mean({col}) AS mean,
                        median({col}) AS median,
                        std({col}) AS std,
                        std({col}) / mean({col}) AS cv,
                        kurtosis({col}) AS kurt,
                        skewness({col}) AS skew,
                        mode({col}) AS mode,
                        percentile_approx({col}, array(0.25, 0.5, 0.75)) AS percentiles,
                        sum(nvl2({col},0,1)) AS null_count
                 FROM {table}""")
        per_column_stats.append(stats_row)

    # cv is spelled out as std(...) / mean(...) rather than reusing the `std`
    # and `mean` aliases, which would bind to real columns with those names.
    return reduce(lambda stacked, following: stacked.union(following), per_column_stats)

In [123]:
get_stats().toPandas()

Unnamed: 0,field_name,count_distinct,max,min,mean,median,std,cv,kurt,skew,mode,percentiles,null_count
0,DEST_COUNTRY_NAME,121,Zambia,Algeria,,,,,,,United States,,0
1,ORIGIN_COUNTRY_NAME,116,Vietnam,Angola,,,,,,,United States,,0
2,count,164,370002,1,1770.765625,63.5,23126.516919,13.06018,250.047098,15.861327,1,"[14.0, 63.0, 268.0]",0


In [119]:
# nvl2(expr, a, b) returns a when expr is NOT NULL and b otherwise, so this
# marks null rows with 1. The backticks make `count` a column reference; in
# single quotes it is a string literal that is never null, so the result
# would be 0 for every row regardless of the data.
spark.sql("""SELECT nvl2(`count`, 0, 1) FROM flight_data_2015""").show()

+-----------------+
|nvl2(count, 0, 1)|
+-----------------+
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
|                0|
+-----------------+
only showing top 20 rows

