## SQL vs Spark

Examples borrowed from:
    * https://github.com/sbartek/intro-to-pyspark
    * https://github.com/carloapp2/SparkPOT.git
    
See doc on: http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#

Data comes from 
<https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data>

In [1]:
## Check if spark session is defined
## Otherwise create one with
#from pyspark.sql import SparkSession
# spark = SparkSession.builder.appName("PySparkShell").getOrCreate()
spark

In [None]:
!wget https://raw.githubusercontent.com/JulienCojan/pyspark_kschool/master/data/competitive-data-science-predict-future-sales/sales_train.csv.gz -P data/competitive-data-science-predict-future-sales/

# Dataframe

In [6]:
cities_rdd = sc.parallelize([
    ("MAD", "Madrid", "ES", 40.4165, -3.70256),
    ("BCN", "Barcelona", "ES", 41.297078, 2.078464),
    ("PAR", "Paris", "FR", 48.85341, 2.3488),
    ("ROM", "Rome", "IT", 41.89193, 12.51133)])
cities_rdd.collect()

[('MAD', 'Madrid', 'ES', 40.4165, -3.70256),
 ('BCN', 'Barcelona', 'ES', 41.297078, 2.078464),
 ('PAR', 'Paris', 'FR', 48.85341, 2.3488),
 ('ROM', 'Rome', 'IT', 41.89193, 12.51133)]

In [8]:
cities_df = cities_rdd.toDF(["city_code","city_name","country_code","latitude","longitude"])
cities_df

DataFrame[city_code: string, city_name: string, country_code: string, latitude: double, longitude: double]

In [10]:
cities_df.show()

+---------+---------+------------+---------+---------+
|city_code|city_name|country_code| latitude|longitude|
+---------+---------+------------+---------+---------+
|      MAD|   Madrid|          ES|  40.4165| -3.70256|
|      BCN|Barcelona|          ES|41.297078| 2.078464|
|      PAR|    Paris|          FR| 48.85341|   2.3488|
|      ROM|     Rome|          IT| 41.89193| 12.51133|
+---------+---------+------------+---------+---------+



In [15]:
import pyspark.sql.functions as F

In [24]:
cities_df.select(cities_df.city_code,F.col('city_name')).show()

+---------+---------+
|city_code|city_name|
+---------+---------+
|      MAD|   Madrid|
|      BCN|Barcelona|
|      PAR|    Paris|
|      ROM|     Rome|
+---------+---------+



In [21]:
cities_df.select(F.lower(cities_df.city_code),F.col('city_name'))

DataFrame[lower(city_code): string, city_name: string]

In [25]:
cities_df.select(F.lower(cities_df.city_code),F.col('city_name')).toPandas()

Unnamed: 0,lower(city_code),city_name
0,mad,Madrid
1,bcn,Barcelona
2,par,Paris
3,rom,Rome


In [3]:
sales_sdf = spark\
    .read\
    .option("header", "true")\
    .csv("data/competitive-data-science-predict-future-sales/sales_train.csv.gz")

In [4]:
sales_sdf.printSchema()

root
 |-- date: string (nullable = true)
 |-- date_block_num: string (nullable = true)
 |-- shop_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- item_price: string (nullable = true)
 |-- item_cnt_day: string (nullable = true)



In [19]:
sales_sdf.show(30,truncate=False)

+----------+--------------+-------+-------+----------+------------+
|date      |date_block_num|shop_id|item_id|item_price|item_cnt_day|
+----------+--------------+-------+-------+----------+------------+
|02.01.2013|0             |59     |22154  |999.0     |1.0         |
|03.01.2013|0             |25     |2552   |899.0     |1.0         |
|05.01.2013|0             |25     |2552   |899.0     |-1.0        |
|06.01.2013|0             |25     |2554   |1709.05   |1.0         |
|15.01.2013|0             |25     |2555   |1099.0    |1.0         |
|10.01.2013|0             |25     |2564   |349.0     |1.0         |
|02.01.2013|0             |25     |2565   |549.0     |1.0         |
|04.01.2013|0             |25     |2572   |239.0     |1.0         |
|11.01.2013|0             |25     |2572   |299.0     |1.0         |
|03.01.2013|0             |25     |2573   |299.0     |3.0         |
|03.01.2013|0             |25     |2574   |399.0     |2.0         |
|05.01.2013|0             |25     |2574   |399.0

In [29]:
sales_sdf.count()

2935849

In [27]:
sales_sdf.select(sales_sdf.date,sales_sdf.item_price).limit(20).show(30)

+----------+----------+
|      date|item_price|
+----------+----------+
|02.01.2013|     999.0|
|03.01.2013|     899.0|
|05.01.2013|     899.0|
|06.01.2013|   1709.05|
|15.01.2013|    1099.0|
|10.01.2013|     349.0|
|02.01.2013|     549.0|
|04.01.2013|     239.0|
|11.01.2013|     299.0|
|03.01.2013|     299.0|
|03.01.2013|     399.0|
|05.01.2013|     399.0|
|07.01.2013|     399.0|
|08.01.2013|     399.0|
|10.01.2013|     399.0|
|11.01.2013|     399.0|
|13.01.2013|     399.0|
|16.01.2013|     399.0|
|26.01.2013|     399.0|
|27.01.2013|     399.0|
+----------+----------+



In [28]:
sales_sdf.select(sales_sdf.date,sales_sdf.item_price).limit(20).toPandas()

Unnamed: 0,date,item_price
0,02.01.2013,999.0
1,03.01.2013,899.0
2,05.01.2013,899.0
3,06.01.2013,1709.05
4,15.01.2013,1099.0
5,10.01.2013,349.0
6,02.01.2013,549.0
7,04.01.2013,239.0
8,11.01.2013,299.0
9,03.01.2013,299.0


In [39]:
sales_sdf[['shop_id','item_id']].head()

Row(shop_id='59', item_id='22154')

In [30]:
sales_sdf.createOrReplaceTempView("sales")

In [34]:
sales_10 = spark.sql(
"""
SELECT *
FROM sales
LIMIT 10
""")
sales_10

DataFrame[date: string, date_block_num: string, shop_id: string, item_id: string, item_price: string, item_cnt_day: string]

In [35]:
sales_10.show()

+----------+--------------+-------+-------+----------+------------+
|      date|date_block_num|shop_id|item_id|item_price|item_cnt_day|
+----------+--------------+-------+-------+----------+------------+
|02.01.2013|             0|     59|  22154|     999.0|         1.0|
|03.01.2013|             0|     25|   2552|     899.0|         1.0|
|05.01.2013|             0|     25|   2552|     899.0|        -1.0|
|06.01.2013|             0|     25|   2554|   1709.05|         1.0|
|15.01.2013|             0|     25|   2555|    1099.0|         1.0|
|10.01.2013|             0|     25|   2564|     349.0|         1.0|
|02.01.2013|             0|     25|   2565|     549.0|         1.0|
|04.01.2013|             0|     25|   2572|     239.0|         1.0|
|11.01.2013|             0|     25|   2572|     299.0|         1.0|
|03.01.2013|             0|     25|   2573|     299.0|         3.0|
+----------+--------------+-------+-------+----------+------------+



In [72]:
from pyspark.sql.types import DateType

In [73]:
sales_sdf2 = sales_sdf\
    .withColumnRenamed('date', 'date_str')\
    .withColumn('date', (F.from_unixtime(F.unix_timestamp(F.col('date_str'), 'dd.MM.yyyy'))).cast(DateType()))
sales_sdf2.show()

+----------+--------------+-------+-------+----------+------------+----------+
|  date_str|date_block_num|shop_id|item_id|item_price|item_cnt_day|      date|
+----------+--------------+-------+-------+----------+------------+----------+
|02.01.2013|             0|     59|  22154|     999.0|         1.0|2013-01-02|
|03.01.2013|             0|     25|   2552|     899.0|         1.0|2013-01-03|
|05.01.2013|             0|     25|   2552|     899.0|        -1.0|2013-01-05|
|06.01.2013|             0|     25|   2554|   1709.05|         1.0|2013-01-06|
|15.01.2013|             0|     25|   2555|    1099.0|         1.0|2013-01-15|
|10.01.2013|             0|     25|   2564|     349.0|         1.0|2013-01-10|
|02.01.2013|             0|     25|   2565|     549.0|         1.0|2013-01-02|
|04.01.2013|             0|     25|   2572|     239.0|         1.0|2013-01-04|
|11.01.2013|             0|     25|   2572|     299.0|         1.0|2013-01-11|
|03.01.2013|             0|     25|   2573|     299.

In [71]:
sales_sdf2.printSchema()

root
 |-- date_str: string (nullable = true)
 |-- date_block_num: string (nullable = true)
 |-- shop_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- item_price: string (nullable = true)
 |-- item_cnt_day: string (nullable = true)
 |-- date: date (nullable = true)



In [77]:
sales_sdf2.createOrReplaceTempView("sales2")

## SELECT ~ select

In [38]:
spark.sql("""
SELECT shop_id, item_id
FROM sales
""").show()

+-------+-------+
|shop_id|item_id|
+-------+-------+
|     59|  22154|
|     25|   2552|
|     25|   2552|
|     25|   2554|
|     25|   2555|
|     25|   2564|
|     25|   2565|
|     25|   2572|
|     25|   2572|
|     25|   2573|
|     25|   2574|
|     25|   2574|
|     25|   2574|
|     25|   2574|
|     25|   2574|
|     25|   2574|
|     25|   2574|
|     25|   2574|
|     25|   2574|
|     25|   2574|
+-------+-------+
only showing top 20 rows



In [36]:
sales_sdf\
   .select("shop_id", "item_id")\
   .head()

Row(shop_id='59', item_id='22154')

In [40]:
spark.sql("""
SELECT DISTINCT shop_id
FROM sales
""").show()

+-------+
|shop_id|
+-------+
|      7|
|     51|
|     15|
|     54|
|     11|
|     29|
|     42|
|      3|
|     30|
|     34|
|     59|
|      8|
|     22|
|     28|
|     16|
|     35|
|     52|
|      0|
|     47|
|     43|
+-------+
only showing top 20 rows



In [41]:
spark.sql("""
SELECT DISTINCT shop_id, item_id
FROM sales
""").show()

+-------+-------+
|shop_id|item_id|
+-------+-------+
|     25|   2462|
|     25|   5274|
|     25|   4910|
|     25|   5592|
|     24|   5848|
|     23|  21336|
|     24|   3158|
|     25|  13903|
|     25|  16890|
|     25|  16644|
|     25|  16562|
|     25|  16457|
|     25|  15701|
|     25|   8034|
|     25|   8831|
|     25|   7276|
|     25|  11236|
|     25|  12133|
|     19|  15100|
|     19|  14939|
+-------+-------+
only showing top 20 rows



In [42]:
sales_sdf\
    .select("shop_id","item_id")\
    .distinct()\
    .show()

+-------+-------+
|shop_id|item_id|
+-------+-------+
|     25|   2462|
|     25|   5274|
|     25|   4910|
|     25|   5592|
|     24|   5848|
|     23|  21336|
|     24|   3158|
|     25|  13903|
|     25|  16890|
|     25|  16644|
|     25|  16562|
|     25|  16457|
|     25|  15701|
|     25|   8034|
|     25|   8831|
|     25|   7276|
|     25|  11236|
|     25|  12133|
|     19|  15100|
|     19|  14939|
+-------+-------+
only showing top 20 rows



__Exercise__ List distinct values for `date` and `date_block_num`.

In [43]:
sales_sdf\
    .select("date","date_block_num")\
    .distinct()\
    .show()

+----------+--------------+
|      date|date_block_num|
+----------+--------------+
|13.02.2013|             1|
|02.04.2013|             3|
|22.01.2014|            12|
|10.10.2014|            21|
|20.07.2015|            30|
|27.05.2013|             4|
|09.07.2013|             6|
|28.01.2014|            12|
|28.02.2014|            13|
|26.03.2014|            14|
|11.07.2013|             6|
|01.11.2013|            10|
|08.03.2014|            14|
|02.04.2014|            15|
|06.06.2014|            17|
|13.10.2014|            21|
|06.12.2014|            23|
|28.03.2015|            26|
|15.10.2015|            33|
|14.07.2013|             6|
+----------+--------------+
only showing top 20 rows



## WHERE ~ filter

In [44]:
spark.sql("""
SELECT *
FROM sales
WHERE shop_id = 25
""").show(truncate=False)

+----------+--------------+-------+-------+----------+------------+
|date      |date_block_num|shop_id|item_id|item_price|item_cnt_day|
+----------+--------------+-------+-------+----------+------------+
|03.01.2013|0             |25     |2552   |899.0     |1.0         |
|05.01.2013|0             |25     |2552   |899.0     |-1.0        |
|06.01.2013|0             |25     |2554   |1709.05   |1.0         |
|15.01.2013|0             |25     |2555   |1099.0    |1.0         |
|10.01.2013|0             |25     |2564   |349.0     |1.0         |
|02.01.2013|0             |25     |2565   |549.0     |1.0         |
|04.01.2013|0             |25     |2572   |239.0     |1.0         |
|11.01.2013|0             |25     |2572   |299.0     |1.0         |
|03.01.2013|0             |25     |2573   |299.0     |3.0         |
|03.01.2013|0             |25     |2574   |399.0     |2.0         |
|05.01.2013|0             |25     |2574   |399.0     |1.0         |
|07.01.2013|0             |25     |2574   |399.0

In [45]:
sales_sdf\
   .filter(F.col("shop_id") == 25)\
   .show()

+----------+--------------+-------+-------+----------+------------+
|      date|date_block_num|shop_id|item_id|item_price|item_cnt_day|
+----------+--------------+-------+-------+----------+------------+
|03.01.2013|             0|     25|   2552|     899.0|         1.0|
|05.01.2013|             0|     25|   2552|     899.0|        -1.0|
|06.01.2013|             0|     25|   2554|   1709.05|         1.0|
|15.01.2013|             0|     25|   2555|    1099.0|         1.0|
|10.01.2013|             0|     25|   2564|     349.0|         1.0|
|02.01.2013|             0|     25|   2565|     549.0|         1.0|
|04.01.2013|             0|     25|   2572|     239.0|         1.0|
|11.01.2013|             0|     25|   2572|     299.0|         1.0|
|03.01.2013|             0|     25|   2573|     299.0|         3.0|
|03.01.2013|             0|     25|   2574|     399.0|         2.0|
|05.01.2013|             0|     25|   2574|     399.0|         1.0|
|07.01.2013|             0|     25|   2574|     

__Exercise__ List different items from shop 25.

In [46]:
spark.sql("""
SELECT DISTINCT item_id
FROM sales
WHERE shop_id = 25
""").show(truncate=False)

+-------+
|item_id|
+-------+
|2088   |
|3210   |
|829    |
|14899  |
|13610  |
|17506  |
|6613   |
|10096  |
|11332  |
|20158  |
|18130  |
|18947  |
|6194   |
|18634  |
|19338  |
|21331  |
|15555  |
|17401  |
|6240   |
|3959   |
+-------+
only showing top 20 rows



In [51]:
tmp_df = sales_sdf\
   .filter(F.col("shop_id") == 25)\
   .select('item_id')\
   .distinct()
tmp_df.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[item_id#35], functions=[])
+- Exchange hashpartitioning(item_id#35, 200)
   +- *(1) HashAggregate(keys=[item_id#35], functions=[])
      +- *(1) Project [item_id#35]
         +- *(1) Filter (isnotnull(shop_id#34) && (cast(shop_id#34 as int) = 25))
            +- *(1) FileScan csv [shop_id#34,item_id#35] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/mnt/c/Users/jcojan/Documents/Perso/Curso/KSchool/Spark/pyspark_kschool/da..., PartitionFilters: [], PushedFilters: [IsNotNull(shop_id)], ReadSchema: struct<shop_id:string,item_id:string>


In [53]:
tmp_df.show()

+-------+
|item_id|
+-------+
|   2088|
|   3210|
|    829|
|  14899|
|  13610|
|  17506|
|   6613|
|  10096|
|  11332|
|  20158|
|  18130|
|  18947|
|   6194|
|  18634|
|  19338|
|  21331|
|  15555|
|  17401|
|   6240|
|   3959|
+-------+
only showing top 20 rows



In [52]:
tmp_df2 = sales_sdf\
    .select(F.col('item_id'))\
    .distinct()\
    .filter(F.col("shop_id") == 25)
tmp_df2.explain()

== Physical Plan ==
*(3) Project [item_id#35]
+- *(3) Filter (isnotnull(shop_id#34) && (cast(shop_id#34 as int) = 25))
   +- SortAggregate(key=[item_id#35], functions=[first(shop_id#34, false)])
      +- *(2) Sort [item_id#35 ASC NULLS FIRST], false, 0
         +- Exchange hashpartitioning(item_id#35, 200)
            +- SortAggregate(key=[item_id#35], functions=[partial_first(shop_id#34, false)])
               +- *(1) Sort [item_id#35 ASC NULLS FIRST], false, 0
                  +- *(1) Project [item_id#35, shop_id#34]
                     +- *(1) FileScan csv [shop_id#34,item_id#35] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/mnt/c/Users/jcojan/Documents/Perso/Curso/KSchool/Spark/pyspark_kschool/da..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<shop_id:string,item_id:string>


In [54]:
tmp_df2.show()

+-------+
|item_id|
+-------+
|  10096|
|  11332|
|  11563|
|  13610|
|  14899|
|  16576|
|  17506|
|  20428|
|  20512|
|   2088|
|   2136|
|   3210|
|   6613|
|    829|
|  10272|
|  10309|
|  10603|
|  11236|
|  12542|
|  14218|
+-------+
only showing top 20 rows



## ORDER BY ~ orderBy

In [57]:
spark.sql("""
SELECT *
FROM sales
WHERE shop_id = 25 AND item_id = 2252
ORDER BY date desc
""").show()

+----------+--------------+-------+-------+----------+------------+
|      date|date_block_num|shop_id|item_id|item_price|item_cnt_day|
+----------+--------------+-------+-------+----------+------------+
|31.12.2013|            11|     25|   2252|     599.0|         1.0|
|30.12.2013|            11|     25|   2252|     599.0|         3.0|
|30.08.2013|             7|     25|   2252|     599.0|         1.0|
|30.05.2013|             4|     25|   2252|     599.0|         1.0|
|30.03.2013|             2|     25|   2252|     599.0|         3.0|
|30.01.2013|             0|     25|   2252|     599.0|         1.0|
|29.10.2013|             9|     25|   2252|     599.0|         1.0|
|29.09.2013|             8|     25|   2252|     599.0|         1.0|
|29.08.2013|             7|     25|   2252|     599.0|         1.0|
|29.03.2015|            26|     25|   2252|     399.0|         3.0|
|29.03.2013|             2|     25|   2252|     599.0|         1.0|
|28.12.2013|            11|     25|   2252|     

In [56]:
sales_sdf\
   .filter((F.col("shop_id") == 25) & (F.col("item_id") == 2252))\
   .orderBy("date")\
   .show()

+----------+--------------+-------+-------+----------+------------+
|      date|date_block_num|shop_id|item_id|item_price|item_cnt_day|
+----------+--------------+-------+-------+----------+------------+
|01.02.2013|             1|     25|   2252|     599.0|         1.0|
|01.04.2013|             3|     25|   2252|     598.5|         1.0|
|01.07.2013|             6|     25|   2252|     599.0|         1.0|
|01.07.2015|            30|     25|   2252|     399.0|         1.0|
|01.08.2013|             7|     25|   2252|     599.0|         1.0|
|02.01.2015|            24|     25|   2252|     399.0|         1.0|
|02.05.2013|             4|     25|   2252|     599.0|         1.0|
|02.06.2014|            17|     25|   2252|     419.0|         1.0|
|02.09.2013|             8|     25|   2252|     599.0|         1.0|
|02.11.2013|            10|     25|   2252|     599.0|         1.0|
|02.12.2014|            23|     25|   2252|     279.0|         1.0|
|03.03.2013|             2|     25|   2252|     

In [58]:
sales_sdf\
   .filter((F.col("shop_id") == 25) & (F.col("item_id") == 2252))\
   .orderBy(F.desc("item_cnt_day"))\
   .show()

+----------+--------------+-------+-------+----------+------------+
|      date|date_block_num|shop_id|item_id|item_price|item_cnt_day|
+----------+--------------+-------+-------+----------+------------+
|30.12.2013|            11|     25|   2252|     599.0|         3.0|
|29.03.2015|            26|     25|   2252|     399.0|         3.0|
|30.03.2013|             2|     25|   2252|     599.0|         3.0|
|26.05.2013|             4|     25|   2252|     599.0|         3.0|
|16.11.2014|            22|     25|   2252|     419.0|         2.0|
|15.03.2013|             2|     25|   2252|     599.0|         2.0|
|05.10.2013|             9|     25|   2252|     599.0|         2.0|
|10.10.2014|            21|     25|   2252|     419.0|         2.0|
|16.08.2015|            31|     25|   2252|     399.0|         2.0|
|21.07.2013|             6|     25|   2252|     599.0|         2.0|
|03.03.2013|             2|     25|   2252|     599.0|         2.0|
|18.04.2014|            15|     25|   2252|     

__Exercise__ List different items and its price that were sold on 20th or 21st of August 2015 ordered by price starting from the most expensive.

In [60]:
sales_sdf\
   .filter((F.col("date") == "21.08.2015") | (F.col("date") == "22.08.2015"))\
   .orderBy(F.desc("item_price"))\
   .show()

+----------+--------------+-------+-------+-------------+------------+
|      date|date_block_num|shop_id|item_id|   item_price|item_cnt_day|
+----------+--------------+-------+-------+-------------+------------+
|21.08.2015|            31|     31|   1540|       999.75|         2.0|
|22.08.2015|            31|     42|   1540|999.666666667|         3.0|
|22.08.2015|            31|     14|   1540|        999.5|         1.0|
|22.08.2015|            31|     39|   3464|        999.5|         1.0|
|22.08.2015|            31|     39|   1557|        999.5|         1.0|
|21.08.2015|            31|     57|   2430|        999.5|         1.0|
|22.08.2015|            31|     35|   1540|        999.5|         1.0|
|22.08.2015|            31|     52|   1540|        999.5|         1.0|
|21.08.2015|            31|     57|   2431|        999.5|         1.0|
|21.08.2015|            31|     52|   2423|        999.5|         1.0|
|22.08.2015|            31|     42|   6503|        999.5|         2.0|
|22.08

In [64]:
sales_sdf\
   .filter(F.col("date").isin(["21.08.2015","22.08.2015"]))\
   .orderBy(F.desc("item_price"))\
   .show()

+----------+--------------+-------+-------+-------------+------------+
|      date|date_block_num|shop_id|item_id|   item_price|item_cnt_day|
+----------+--------------+-------+-------+-------------+------------+
|21.08.2015|            31|     31|   1540|       999.75|         2.0|
|22.08.2015|            31|     42|   1540|999.666666667|         3.0|
|22.08.2015|            31|     14|   1540|        999.5|         1.0|
|22.08.2015|            31|     39|   3464|        999.5|         1.0|
|22.08.2015|            31|     39|   1557|        999.5|         1.0|
|21.08.2015|            31|     57|   2430|        999.5|         1.0|
|22.08.2015|            31|     35|   1540|        999.5|         1.0|
|22.08.2015|            31|     52|   1540|        999.5|         1.0|
|21.08.2015|            31|     57|   2431|        999.5|         1.0|
|21.08.2015|            31|     52|   2423|        999.5|         1.0|
|22.08.2015|            31|     42|   6503|        999.5|         2.0|
|22.08

In [74]:
sales_sdf2\
   .filter(F.col("date").isin(["2015-08-21","2015-08-22"]))\
   .orderBy(F.desc("item_price"))\
   .show()

+----------+--------------+-------+-------+-------------+------------+----------+
|  date_str|date_block_num|shop_id|item_id|   item_price|item_cnt_day|      date|
+----------+--------------+-------+-------+-------------+------------+----------+
|21.08.2015|            31|     31|   1540|       999.75|         2.0|2015-08-21|
|22.08.2015|            31|     42|   1540|999.666666667|         3.0|2015-08-22|
|22.08.2015|            31|     14|   1540|        999.5|         1.0|2015-08-22|
|22.08.2015|            31|     39|   3464|        999.5|         1.0|2015-08-22|
|22.08.2015|            31|     39|   1557|        999.5|         1.0|2015-08-22|
|21.08.2015|            31|     57|   2430|        999.5|         1.0|2015-08-21|
|22.08.2015|            31|     35|   1540|        999.5|         1.0|2015-08-22|
|22.08.2015|            31|     52|   1540|        999.5|         1.0|2015-08-22|
|21.08.2015|            31|     57|   2431|        999.5|         1.0|2015-08-21|
|21.08.2015|    

In [75]:
sales_sdf2\
   .filter((F.year(F.col("date"))==2015) & (F.month(F.col("date"))==8))\
   .orderBy(F.desc("item_price"))\
   .show()

+----------+--------------+-------+-------+-------------+------------+----------+
|  date_str|date_block_num|shop_id|item_id|   item_price|item_cnt_day|      date|
+----------+--------------+-------+-------+-------------+------------+----------+
|01.08.2015|            31|     25|  10213|       9999.0|         1.0|2015-08-01|
|04.08.2015|            31|     16|  13398|       9990.0|         1.0|2015-08-04|
|13.08.2015|            31|     57|  13398|       9990.0|         1.0|2015-08-13|
|12.08.2015|            31|     57|  13398|       9990.0|         1.0|2015-08-12|
|04.08.2015|            31|     49|  13398|       9990.0|         1.0|2015-08-04|
|30.08.2015|            31|     48|   7935|       9990.0|         1.0|2015-08-30|
|19.08.2015|            31|     26|   1540|999.833333333|         3.0|2015-08-19|
|20.08.2015|            31|     35|   3445|       999.75|         2.0|2015-08-20|
|19.08.2015|            31|      6|   1540|       999.75|         2.0|2015-08-19|
|20.08.2015|    

In [80]:
spark.sql(
"""
select *
from sales2
where year(date)=2015 and month(date)=8
"""
).show()

+----------+--------------+-------+-------+----------+------------+----------+
|  date_str|date_block_num|shop_id|item_id|item_price|item_cnt_day|      date|
+----------+--------------+-------+-------+----------+------------+----------+
|31.08.2015|            31|     42|   4261|     299.0|         1.0|2015-08-31|
|10.08.2015|            31|     46|  12752|     499.0|         1.0|2015-08-10|
|09.08.2015|            31|     46|  13556|    2499.0|         1.0|2015-08-09|
|22.08.2015|            31|     46|  12805|     169.0|         1.0|2015-08-22|
|08.08.2015|            31|     46|  12828|      99.0|         1.0|2015-08-08|
|19.08.2015|            31|     46|  12828|      99.0|         1.0|2015-08-19|
|24.08.2015|            31|     46|  12830|     199.0|         1.0|2015-08-24|
|21.08.2015|            31|     46|  13594|    2299.0|         1.0|2015-08-21|
|15.08.2015|            31|     46|  12889|     229.0|         1.0|2015-08-15|
|31.08.2015|            31|     46|  12899|     399.

## AS ~ alias

In [83]:
spark.sql("""
SELECT item_cnt_day
,   item_price
,   item_cnt_day * item_price AS revenue
FROM sales
""").show()

+------------+----------+-------+
|item_cnt_day|item_price|revenue|
+------------+----------+-------+
|         1.0|     999.0|  999.0|
|         1.0|     899.0|  899.0|
|        -1.0|     899.0| -899.0|
|         1.0|   1709.05|1709.05|
|         1.0|    1099.0| 1099.0|
|         1.0|     349.0|  349.0|
|         1.0|     549.0|  549.0|
|         1.0|     239.0|  239.0|
|         1.0|     299.0|  299.0|
|         3.0|     299.0|  897.0|
|         2.0|     399.0|  798.0|
|         1.0|     399.0|  399.0|
|         1.0|     399.0|  399.0|
|         2.0|     399.0|  798.0|
|         1.0|     399.0|  399.0|
|         2.0|     399.0|  798.0|
|         1.0|     399.0|  399.0|
|         1.0|     399.0|  399.0|
|         1.0|     399.0|  399.0|
|         1.0|     399.0|  399.0|
+------------+----------+-------+
only showing top 20 rows



In [84]:
sales_sdf\
    .select(
        F.col("item_cnt_day"),
        F.col("item_price"),
        (F.col("item_cnt_day") * F.col("item_price")).alias("revenue")
    ).show()

+------------+----------+-------+
|item_cnt_day|item_price|revenue|
+------------+----------+-------+
|         1.0|     999.0|  999.0|
|         1.0|     899.0|  899.0|
|        -1.0|     899.0| -899.0|
|         1.0|   1709.05|1709.05|
|         1.0|    1099.0| 1099.0|
|         1.0|     349.0|  349.0|
|         1.0|     549.0|  549.0|
|         1.0|     239.0|  239.0|
|         1.0|     299.0|  299.0|
|         3.0|     299.0|  897.0|
|         2.0|     399.0|  798.0|
|         1.0|     399.0|  399.0|
|         1.0|     399.0|  399.0|
|         2.0|     399.0|  798.0|
|         1.0|     399.0|  399.0|
|         2.0|     399.0|  798.0|
|         1.0|     399.0|  399.0|
|         1.0|     399.0|  399.0|
|         1.0|     399.0|  399.0|
|         1.0|     399.0|  399.0|
+------------+----------+-------+
only showing top 20 rows



## aggregators

In [None]:
spark.sql("""
SELECT AVG(item_cnt_day) AS mean_sale
,   STDDEV(item_cnt_day) AS sd_sales
,   SUM(item_cnt_day) AS sum_sales
,   COUNT(*) AS nitems
FROM sales
""").show()

In [None]:
sales_sdf\
    .select(
        F.mean(F.col("item_cnt_day")).alias("mean_sales"),
        F.stddev(F.col("item_cnt_day")).alias("sd_sales"),
        F.sum(F.col("item_cnt_day")).alias("sum_sales"),
        F.count(F.col("item_cnt_day")).alias("n_items")
    ).show()

__Exercise__ What is mean, standard deviation and median of the number of sold items?

## GROUP BY

In [None]:
spark.sql("""
SELECT date_format(from_unixtime(unix_timestamp(date, 'dd.MM.yyyy')), "yyyy-MM-dd") AS date
,   SUM(item_cnt_day) AS items_sold
FROM sales
GROUP BY date
""").show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

In [None]:
matplotlib.rcParams['figure.figsize'] = [20, 10]
matplotlib.rcParams['font.size'] = 20

In [None]:
spark.sql("""
SELECT date_format(from_unixtime(unix_timestamp(date, 'dd.MM.yyyy')), "yyyy-MM-dd") AS date
,   SUM(item_cnt_day) AS items_sold
FROM sales
GROUP BY date
""").toPandas()\
    .set_index("date")['items_sold'].plot()
plt.show()

In [None]:
sales_sdf\
    .withColumn("date", F.unix_timestamp(F.col("date"), 'dd.MM.yyyy'))\
    .groupBy(F.col("date"))\
    .agg(F.sum(F.col("item_cnt_day")).alias("items_sold"))\
    .show()

__Exercise__ For each day what is total daily revenue? Output table should have columns `date`, `total_revenue`.

## Joins

In [None]:
shops_sdf = spark.read.option("header", "true").csv("data/shops.csv")

In [None]:
shops_sdf.show(truncate=False)

In [None]:
shops_sdf.createOrReplaceTempView("shops")

In [None]:
spark.sql("""
SELECT sales.shop_id
,   shops.shop_name
FROM (
    SELECT DISTINCT shop_id
    FROM sales)  sales
LEFT JOIN shops
ON sales.shop_id == shops.shop_id
""").show(truncate=False)

In [None]:
sales_sdf\
    .select("shop_id")\
    .distinct()\
    .join(shops_sdf, "shop_id", how="left")\
    .select("shop_id", "shop_name")\
    .show(truncate=False)

__Exercise__ Find a product with highest sell. What is its name? (hint use `items.csv`)

In [None]:
# example with cache
# example using struct data
# example using subqueries (explicit or as two dataframes)

__Exercise__ For each day in August 2015 find shop name with maximal sales.

## Window functions

First let's correct the date.

In [None]:
spark.sql("""
SELECT *
,   date_format(from_unixtime(unix_timestamp(date, 'dd.MM.yyyy')), "yyyy-MM-dd") AS date_iso
FROM sales
""").createOrReplaceTempView("sales_iso")

In [None]:
sales_iso_sdf = sales_sdf\
    .withColumn(
    "date_iso", 
    F.unix_timestamp(F.col("date"), 'dd.MM.yyyy'))
sales_iso_sdf.show()

In [None]:
spark.sql("""
SELECT shop_id
,   item_id
,   date_iso
,   item_cnt_day
,   LEAD(item_cnt_day) OVER 
        (PARTITION BY shop_id, item_id ORDER BY date_iso) as lead_item_cnt_day
FROM sales_iso
ORDER BY shop_id
,   item_id
,   date_iso
""").show()

In [None]:
from pyspark.sql import Window

sales_iso_sdf.select(
    F.col("shop_id"),   
    F.col("item_id"),   
    F.col("date_iso"),   
    F.col("item_cnt_day"),   
    F.lead(F.col("item_cnt_day"))\
        .over(Window.partitionBy("shop_id", "item_id").orderBy('date_iso'))\
        .alias("lead_item_cnt_day")
).orderBy("shop_id", "item_id", "date_iso").show()


In [None]:
window = Window.partitionBy("shop_id", "item_id").orderBy('date_iso')

sales_iso_sdf.select(
    F.col("shop_id"),   
    F.col("item_id"),   
    F.col("date_iso"),   
    F.col("item_cnt_day"),   
    F.lead(F.col("item_cnt_day"))\
        .over(window)\
        .alias("lead_item_cnt_day")
).orderBy("shop_id", "item_id", "date_iso").show()


__Exercise__ What is moving average (+-3 days) of total daily revenue?

## Extra Exercises

* Which shop had the highest sells in August 2015. What was his name?
* What is the name of category of with the highest monthly sells. 

# DataFrame as an RDD of Rows

In [None]:
type(sales_sdf)

In [None]:
row = sales_sdf.first()
row

In [None]:
print(row.date)
print(row['date_block_num'])
row.asDict()

In [None]:
sales_sdf.rdd.take(5)

In [None]:
print(sales_sdf.rdd.toDebugString().decode())

In [None]:
sales_per_day = sales_sdf\
    .groupBy(F.col("date"))\
    .agg(F.sum(F.col("item_cnt_day")))

print(sales_per_day.rdd.toDebugString().decode())