## SQL vs Spark

Examples borrowed from:

* https://github.com/sbartek/intro-to-pyspark
* https://github.com/carloapp2/SparkPOT.git
    
See doc on: http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#

Data comes from 
<https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data>

In [2]:
## Sanity check: display the active Spark session object.
## If `spark` is not already defined in this environment, create it first with:
# from pyspark.sql import SparkSession
# spark = SparkSession.builder.appName("PySparkShell").getOrCreate()
spark

In [3]:
# Download the gzip-compressed sales CSV; -P sets the local target directory.
!wget https://raw.githubusercontent.com/JulienCojan/pyspark_kschool/master/data/competitive-data-science-predict-future-sales/sales_train.csv.gz -P data/competitive-data-science-predict-future-sales/

# Dataframe

In [5]:
# Build a small RDD of (city_code, city_name, country_code, latitude, longitude) tuples.
# NOTE(review): relies on a pre-defined SparkContext `sc` (as the PySpark shell provides);
# if it is missing, `spark.sparkContext` should be usable instead — confirm for your environment.
cities_rdd = sc.parallelize([
    ("MAD", "Madrid", "ES", 40.4165, -3.70256),
    ("BCN", "Barcelona", "ES", 41.297078, 2.078464),
    ("PAR", "Paris", "FR", 48.85341, 2.3488),
    ("ROM", "Rome", "IT", 41.89193, 12.51133)])
# collect() brings every element back to the driver as a Python list.
cities_rdd.collect()

In [6]:
# Turn the RDD of tuples into a DataFrame by naming each tuple position.
cities_df = cities_rdd.toDF(["city_code","city_name","country_code","latitude","longitude"])
# Displaying the object itself (no action called) shows its repr, not the rows.
cities_df

In [7]:
# show() is the action that actually prints the rows as a text table.
cities_df.show()

In [3]:
import pyspark.sql.functions as F

In [9]:
# Three interchangeable ways to reference a column: attribute access, F.col(...), or a plain name string.
cities_df.select(cities_df.city_code,F.col('city_name'),'latitude').show()

In [10]:
# Apply F.lower to city_code; no action is called, so the cell output is the resulting DataFrame object.
cities_df.select(F.lower(cities_df.city_code),F.col('city_name'))

In [11]:
# Same projection, materialized on the driver as a pandas DataFrame.
cities_df.select(F.lower(cities_df.city_code),F.col('city_name')).toPandas()

In [1]:
# Location of the sales file: matches the directory used by the wget download cell.
sales_path="data/competitive-data-science-predict-future-sales/sales_train.csv.gz"
# Alternative location (a /FileStore/tables path) — uncomment if the file lives there instead.
#sales_path="/FileStore/tables/sales_train_csv.gz"

In [2]:
# Load the comma-separated sales file; the first line is used as the header.
# No schema is supplied or inferred here, so check printSchema() for the resulting column types.
sales_sdf = (
    spark.read
    .option("header", "true")
    .option("sep", ",")
    .csv(sales_path)
)

In [14]:
# Inspect the column names and types Spark assigned when reading the CSV.
sales_sdf.printSchema()

In [15]:
# Print the first 30 rows without truncating long cell values.
sales_sdf.show(30,truncate=False)

In [16]:
# Total number of rows in the sales DataFrame.
sales_sdf.count()

In [17]:
# Collect (date, item_price) for 20 rows onto the driver as a list of Row objects.
v = sales_sdf.select(sales_sdf.date,sales_sdf.item_price).limit(20).collect()

In [18]:
# Display the collected rows.
v

In [19]:
# Same 20-row sample, but converted to a pandas DataFrame for rich display.
pdf = sales_sdf.select(sales_sdf.date,sales_sdf.item_price).limit(20).toPandas()
pdf

Unnamed: 0,date,item_price
0,02.01.2013,999.0
1,03.01.2013,899.0
2,05.01.2013,899.0
3,06.01.2013,1709.05
4,15.01.2013,1099.0
5,10.01.2013,349.0
6,02.01.2013,549.0
7,04.01.2013,239.0
8,11.01.2013,299.0
9,03.01.2013,299.0


In [20]:
# Confirm that toPandas() produced a pandas object rather than a Spark one.
type(pdf)

In [21]:
# pandas-style bracket selection with a list of column names; head() returns the first row.
sales_sdf[['shop_id','item_id']].head()

In [5]:
# Register the DataFrame under the name "sales" so spark.sql(...) queries can reference it.
sales_sdf.createOrReplaceTempView("sales")

In [23]:
# spark.sql returns a DataFrame; here, the first 10 rows of the "sales" view.
sales_10 = spark.sql(
"""
SELECT *
FROM sales
LIMIT 10
""")
# No action called yet: the cell output is the DataFrame object.
sales_10

In [24]:
# Print the rows selected by the SQL query.
sales_10.show()

In [25]:
# Try to extract the year from the raw `date` column.
# NOTE(review): `date` values look like 02.01.2013 (dd.MM.yyyy strings), which year() is unlikely
# to parse — presumably this motivates the date conversion done for sales_sdf2; verify the output.
sales_sdf.select(F.year(F.col('date'))).show()

In [8]:
from pyspark.sql.types import DateType

In [9]:
# Parse the dd.MM.yyyy strings into a proper DateType column.
# Step order matters: the original string is first renamed to `date_str`, the new `date` column is
# derived from it, and only then is `date_str` overwritten with its timestamp-formatted version.
sales_sdf2 = sales_sdf\
    .withColumnRenamed('date', 'date_str')\
    .withColumn('date', (F.from_unixtime(F.unix_timestamp(F.col('date_str'), 'dd.MM.yyyy'))).cast(DateType()))\
    .withColumn('date_str', F.from_unixtime(F.unix_timestamp(F.col('date_str'), 'dd.MM.yyyy')))
sales_sdf2.show()

+-------------------+--------------+-------+-------+----------+------------+----------+
|           date_str|date_block_num|shop_id|item_id|item_price|item_cnt_day|      date|
+-------------------+--------------+-------+-------+----------+------------+----------+
|2013-01-02 00:00:00|             0|     59|  22154|     999.0|         1.0|2013-01-02|
|2013-01-03 00:00:00|             0|     25|   2552|     899.0|         1.0|2013-01-03|
|2013-01-05 00:00:00|             0|     25|   2552|     899.0|        -1.0|2013-01-05|
|2013-01-06 00:00:00|             0|     25|   2554|   1709.05|         1.0|2013-01-06|
|2013-01-15 00:00:00|             0|     25|   2555|    1099.0|         1.0|2013-01-15|
|2013-01-10 00:00:00|             0|     25|   2564|     349.0|         1.0|2013-01-10|
|2013-01-02 00:00:00|             0|     25|   2565|     549.0|         1.0|2013-01-02|
|2013-01-04 00:00:00|             0|     25|   2572|     239.0|         1.0|2013-01-04|
|2013-01-11 00:00:00|           

In [28]:
# Check the column types after the date conversion.
sales_sdf2.printSchema()

In [29]:
# Extract the year from the reformatted `date_str` column (now 'yyyy-MM-dd HH:mm:ss' text).
sales_sdf2.select(F.year(F.col('date_str'))).show()

In [10]:
# Register the date-corrected DataFrame as the "sales2" view for SQL queries.
sales_sdf2.createOrReplaceTempView("sales2")

## SELECT ~ select

In [32]:
# SQL form: project two columns from the "sales" view.
spark.sql("""
SELECT shop_id, item_id
FROM sales
""").show()

In [33]:
# DataFrame form of the same projection; head() returns the first row.
(
    sales_sdf
    .select("shop_id", "item_id")
    .head()
)

In [34]:
# SQL form: distinct shop ids.
spark.sql("""
SELECT DISTINCT shop_id
FROM sales
""").show()

In [35]:
# SQL form: distinct (shop_id, item_id) pairs.
spark.sql("""
SELECT DISTINCT shop_id, item_id
FROM sales
""").show()

In [36]:
# DataFrame form: SELECT DISTINCT becomes select(...) followed by distinct().
(
    sales_sdf
    .select("shop_id", "item_id")
    .distinct()
    .show()
)

__Exercise__ List distinct values for `date` and `date_block_num`.

In [38]:
# Exercise solution: distinct (date, date_block_num) pairs.
(
    sales_sdf
    .select("date", "date_block_num")
    .distinct()
    .show()
)

## WHERE ~ filter

In [40]:
# SQL form: keep only the rows of shop 25.
spark.sql("""
SELECT *
FROM sales
WHERE shop_id = 25
""").show(truncate=False)

In [41]:
# DataFrame form: WHERE becomes filter(...) with a column expression.
(
    sales_sdf
    .filter(F.col("shop_id") == 25)
    .show()
)

__Exercise__ List the distinct items sold in shop 25.

In [43]:
# SQL solution: distinct item ids among the rows of shop 25.
spark.sql("""
SELECT DISTINCT item_id
FROM sales
WHERE shop_id = 25
""").show(truncate=False)

In [44]:
# pandas-style boolean indexing also works as a row filter on a Spark DataFrame.
tmp_df = sales_sdf[(sales_sdf["shop_id"] == 25)]
tmp_df.show()

In [45]:
# DataFrame solution: filter on the shop first, then project and de-duplicate the item ids.
tmp_df = (
    sales_sdf
    .filter(sales_sdf["shop_id"] == 25)
    .select('item_id')
    .distinct()
)
tmp_df.show()
# Print the execution plan so it can be compared with the variant in the next cell.
tmp_df.explain()

In [46]:
# Variant with the steps reordered: project + distinct first, filter on shop_id afterwards.
# NOTE(review): `shop_id` is no longer among the selected columns when the filter is applied;
# whether Spark resolves it (and what the plan then looks like) may depend on the version —
# compare the explain() output with the previous cell and confirm the result is as intended.
tmp_df2 = sales_sdf\
    .select(F.col('item_id'))\
    .distinct()\
    .filter(F.col("shop_id") == 25)
tmp_df2.show()
tmp_df2.explain()

In [47]:
# Show the result of the reordered variant again.
tmp_df2.show()

## ORDER BY ~ orderBy

In [49]:
# SQL form: rows for one shop/item pair, sorted by `date` in descending order.
spark.sql("""
SELECT *
FROM sales
WHERE shop_id = 25 AND item_id = 2252
ORDER BY date desc
""").show()

In [50]:
# DataFrame form of the SQL query for shop 25 / item 2252.
# `ORDER BY date desc` maps to orderBy(F.desc("date")); a bare orderBy("date") sorts ascending
# and would not match the SQL result.
# NOTE(review): in sales_sdf `date` still holds dd.MM.yyyy text, so the ordering is presumably
# textual rather than chronological — use sales_sdf2 for a true date ordering.
(
    sales_sdf
    .filter((F.col("shop_id") == 25) & (F.col("item_id") == 2252))
    .orderBy(F.desc("date"))
    .show()
)

In [51]:
# Same shop/item filter, now sorted by the daily item count, largest first.
(
    sales_sdf
    .filter((F.col("shop_id") == 25) & (F.col("item_id") == 2252))
    .orderBy(F.desc("item_cnt_day"))
    .show()
)

__Exercise__ List the distinct items and their prices that were sold on the 21st or 22nd of August 2015, ordered by price starting from the most expensive.

In [53]:
# Rows sold on either of two days (raw dd.MM.yyyy strings), combined with |, most expensive first.
(
    sales_sdf
    .filter((F.col("date") == "21.08.2015") | (F.col("date") == "22.08.2015"))
    .orderBy(F.desc("item_price"))
    .show()
)

In [54]:
# Same filter expressed with isin(...) instead of chained equality tests.
(
    sales_sdf
    .filter(F.col("date").isin(["21.08.2015","22.08.2015"]))
    .orderBy(F.desc("item_price"))
    .show()
)

In [55]:
# Same question against the date-corrected DataFrame, using yyyy-MM-dd literals.
(
    sales_sdf2
    .filter(F.col("date").isin(["2015-08-21","2015-08-22"]))
    .orderBy(F.desc("item_price"))
    .show()
)

In [56]:
# With a real date column, year()/month() can select the whole of August 2015.
(
    sales_sdf2
    .filter((F.year(F.col("date"))==2015) & (F.month(F.col("date"))==8))
    .orderBy(F.desc("item_price"))
    .show()
)

In [57]:
# SQL form of the August 2015 filter, run against the date-corrected "sales2" view.
spark.sql(
"""
select *
from sales2
where year(date)=2015 and month(date)=8
"""
).show()

## AS ~ alias

In [59]:
# SQL form: name a computed column with AS.
spark.sql("""
SELECT item_cnt_day
,   item_price
,   item_cnt_day * item_price AS revenue
FROM sales
""").show()

In [60]:
# DataFrame form: AS becomes .alias(...) on the column expression.
(
    sales_sdf
    .select(
        F.col("item_cnt_day"),
        F.col("item_price"),
        (F.col("item_cnt_day") * F.col("item_price")).alias("revenue"),
    )
    .show()
)

## aggregators

In [11]:
# SQL form: global aggregates (no GROUP BY) over the whole "sales" view.
spark.sql("""
SELECT AVG(item_cnt_day) AS mean_sale
,   STDDEV(item_cnt_day) AS sd_sales
,   SUM(item_cnt_day) AS sum_sales
,   COUNT(*) AS nitems
FROM sales
""").show()

+-----------------+------------------+---------+-------+
|        mean_sale|          sd_sales|sum_sales| nitems|
+-----------------+------------------+---------+-------+
|1.242640885140891|2.6188344308954035|3648206.0|2935849|
+-----------------+------------------+---------+-------+



In [63]:
# DataFrame form of the SQL aggregate query: same aggregates and the same column names.
# COUNT(*) counts rows, so use F.count("*"); F.count(<column>) would skip rows where that
# column is null and could therefore differ from the SQL result.
(
    sales_sdf
    .select(
        F.mean(F.col("item_cnt_day")).alias("mean_sale"),
        F.stddev(F.col("item_cnt_day")).alias("sd_sales"),
        F.sum(F.col("item_cnt_day")).alias("sum_sales"),
        F.count("*").alias("nitems"),
    )
    .show()
)

__Exercise__ What are the mean, standard deviation and median of the number of sold items?

In [65]:
# Exercise solution: the same aggregates plus the median, taken as the 0.5 percentile.
spark.sql("""
SELECT AVG(item_cnt_day) AS mean_sale
,   STDDEV(item_cnt_day) AS sd_sales
,   SUM(item_cnt_day) AS sum_sales
,   COUNT(*) AS nitems
,   PERCENTILE(item_cnt_day, 0.5) as median
FROM sales
""").show()

In [66]:
# Print the first 50 rows for a broader look at the raw data.
sales_sdf.show(50)

## GROUP BY

In [68]:
# Per (date, shop): total and average number of items sold, listed by shop and then date.
(
    sales_sdf
    .groupby("date", "shop_id")
    .agg(
        F.sum(sales_sdf.item_cnt_day).alias("items_sold"),
        F.avg(sales_sdf.item_cnt_day).alias("avg_items_sold"),
    )
    .orderBy("shop_id", "date")
    .show()
)

In [69]:
# SQL form: items sold per day, with the dd.MM.yyyy text reformatted as yyyy-MM-dd in the output.
# NOTE(review): the output alias `date` shadows the input column of the same name; GROUP BY date
# presumably refers to the original column — confirm.
spark.sql("""
SELECT 
    date_format(from_unixtime(unix_timestamp(date, 'dd.MM.yyyy')), "yyyy-MM-dd") AS date
,   SUM(item_cnt_day) AS items_sold
FROM sales
GROUP BY date
""").show()

In [13]:
# Per (date, shop): total number of items sold, listed by shop and then date.
(
    sales_sdf
    .groupby("date", "shop_id")
    .agg(F.sum(sales_sdf.item_cnt_day).alias("items_sold"))
    .orderBy("shop_id", "date")
    .show()
)

+----------+-------+----------+
|      date|shop_id|items_sold|
+----------+-------+----------+
|01.02.2013|      0|     228.0|
|02.02.2013|      0|     317.0|
|03.01.2013|      0|     203.0|
|03.02.2013|      0|     200.0|
|04.01.2013|      0|     352.0|
|04.02.2013|      0|     121.0|
|05.01.2013|      0|     292.0|
|05.02.2013|      0|     173.0|
|06.01.2013|      0|     209.0|
|06.02.2013|      0|     194.0|
|07.01.2013|      0|     132.0|
|07.02.2013|      0|     135.0|
|08.01.2013|      0|     263.0|
|08.02.2013|      0|     287.0|
|09.01.2013|      0|     181.0|
|09.02.2013|      0|     313.0|
|10.01.2013|      0|     184.0|
|10.02.2013|      0|     205.0|
|11.01.2013|      0|     139.0|
|11.02.2013|      0|     162.0|
+----------+-------+----------+
only showing top 20 rows



In [71]:
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

In [72]:
# Global plot defaults: large figures and a readable font size.
matplotlib.rcParams.update({
    'figure.figsize': [20, 10],
    'font.size': 20,
})

In [73]:
# Daily totals, computed in Spark and brought to the driver as a pandas DataFrame.
pdf = spark.sql("""
SELECT date_format(from_unixtime(unix_timestamp(date, 'dd.MM.yyyy')), "yyyy-MM-dd") AS date
,   SUM(item_cnt_day) AS items_sold
FROM sales
GROUP BY date
""").toPandas()
# The query has no ORDER BY, so row order is not guaranteed; sort by the yyyy-MM-dd date
# before plotting so the line is drawn chronologically.
pdf.sort_values("date").set_index("date")['items_sold'].plot()
plt.show()

In [74]:
import pandas as pd

In [75]:
# `date` arrives from Spark as yyyy-MM-dd text (date_format output), so it must be converted
# to datetimes before the .dt accessor can be used. Converting an already-converted column is
# harmless, which keeps this cell safe to re-run.
pdf = pdf.assign(date=pd.to_datetime(pdf["date"]))
# Keep only the 2014 rows and plot them in chronological order.
pdf = pdf[pdf["date"].dt.year == 2014]
pdf.sort_values("date").set_index("date")['items_sold'].plot()
plt.show()

In [76]:
# DataFrame form: replace `date` by its unix timestamp (parsed as dd.MM.yyyy), then sum items per day.
(
    sales_sdf
    .withColumn("date", F.unix_timestamp(F.col("date"), 'dd.MM.yyyy'))
    .groupBy(F.col("date"))
    .agg(F.sum(F.col("item_cnt_day")).alias("items_sold"))
    .show()
)

__Exercise__ For each day what is total daily revenue? Output table should have columns `date`, `total_revenue`.

In [78]:
# Exercise solution: revenue per row is count * price; sum it per day.
(
    sales_sdf
    .groupBy(F.col("date"))
    .agg(F.sum(F.col("item_cnt_day") * F.col('item_price')).alias("total_revenue"))
    .show()
)

## Joins

In [80]:
# Reminder of what the sales rows look like before joining.
sales_sdf.show()

In [81]:
# Load the shops lookup table (first line used as header).
# NOTE(review): this is a hard-coded /FileStore/... path, whereas the sales file is read from the
# local data/ directory and no download step for shops.csv is visible — confirm the file location.
shops_sdf = spark.read.option("header", "true").csv("/FileStore/tables/shops.csv")

In [82]:
# Print the shops table without truncating the shop names.
shops_sdf.show(truncate=False)

In [83]:
# Register the shops table as the "shops" view for SQL joins.
shops_sdf.createOrReplaceTempView("shops")

In [84]:
# SQL form: take the distinct shop ids that occur in sales (subquery) and left-join the shop names.
spark.sql("""
SELECT sales.shop_id
,   shops.shop_name
FROM (
    SELECT DISTINCT shop_id
    FROM sales)  sales
LEFT JOIN shops
ON sales.shop_id == shops.shop_id
""").show(truncate=False)

In [85]:
# Sanity check on the join key: list shop ids that appear more than once in "shops".
spark.sql("""
select
  shop_id,
  count(*) as nb
from shops
group by shop_id
having nb > 1
order by nb desc
limit 10
"""
).show()

In [86]:
# SQL form: total items sold per shop, with the shop name attached through LEFT JOIN ... ON.
spark.sql("""
SELECT 
    sales.shop_id
,   shops.shop_name
,   sum(sales.item_cnt_day)
FROM sales
LEFT JOIN shops
ON sales.shop_id = shops.shop_id
GROUP BY sales.shop_id, shops.shop_name
""").show(truncate=False)

In [87]:
# Same query, with the join condition written as USING (shop_id) instead of ON.
spark.sql("""
SELECT 
    sales.shop_id
,   shops.shop_name
,   sum(sales.item_cnt_day)
FROM sales
LEFT JOIN shops
USING (shop_id)
GROUP BY sales.shop_id, shops.shop_name
""").show(truncate=False)

In [88]:
# DataFrame form: left join on the shared `shop_id` column, then total items per shop.
(
    sales_sdf
    .join(shops_sdf, "shop_id", how="left")
    .groupBy("shop_id", "shop_name")
    .agg(F.sum(sales_sdf.item_cnt_day))
    .show()
)

In [89]:
# DataFrame form of the subquery join: distinct shop ids from sales, left-joined to their names.
(
    sales_sdf
    .select("shop_id")
    .distinct()
    .join(shops_sdf, "shop_id", how="left")
    .select("shop_id", "shop_name")
    .show(truncate=False)
)

__Exercise__ Find the product with the highest sales.

In [91]:
# Techniques illustrated in the following cells:
# - caching an intermediate DataFrame
# - aggregating over struct columns
# - subqueries (written explicitly, or expressed as two DataFrames joined together)

In [41]:
# Total items sold per (date, item); cached because several later cells reuse this DataFrame.
item_daily_sales = (
    sales_sdf2
    .groupBy("date", "item_id")
    .agg(F.sum(F.col("item_cnt_day")).alias("item_cnt_day"))
    .cache()
)
item_daily_sales.show()

+----------+-------+------------+
|      date|item_id|item_cnt_day|
+----------+-------+------------+
|2013-01-02|   3193|         6.0|
|2013-01-12|   1924|         1.0|
|2013-01-03|   1802|         4.0|
|2013-01-03|   5198|         5.0|
|2013-01-09|   3686|         6.0|
|2013-01-31|   3686|         4.0|
|2013-01-23|   3320|         8.0|
|2013-01-11|   4443|         2.0|
|2013-01-23|   5822|         4.0|
|2013-01-20|   3468|         7.0|
|2013-01-11|  16594|         1.0|
|2013-01-22|    482|         6.0|
|2013-01-20|     32|         7.0|
|2013-01-03|  10774|         1.0|
|2013-01-21|   9043|         7.0|
|2013-01-02|  12829|         2.0|
|2013-01-13|  14217|         3.0|
|2013-01-21|  13418|         3.0|
|2013-01-04|  14164|         2.0|
|2013-01-14|  12874|         2.0|
+----------+-------+------------+
only showing top 20 rows



In [42]:
# For each date, the largest per-item daily total.
max_daily_sale = (
    item_daily_sales
    .groupBy("date")
    .agg(F.max("item_cnt_day").alias("max_item_cnt_day"))
)
max_daily_sale.show()

+----------+----------------+
|      date|max_item_cnt_day|
+----------+----------------+
|2013-01-22|            32.0|
|2013-03-26|           527.0|
|2013-05-21|           140.0|
|2013-09-09|           155.0|
|2014-09-26|           288.0|
|2014-11-12|           133.0|
|2015-03-09|           134.0|
|2015-05-19|          2005.0|
|2013-09-19|           452.0|
|2015-03-06|           101.0|
|2013-02-02|           112.0|
|2014-08-01|           228.0|
|2015-04-09|           100.0|
|2015-09-02|            87.0|
|2014-06-03|           140.0|
|2013-02-04|            37.0|
|2013-09-12|           181.0|
|2014-01-24|           231.0|
|2014-02-16|           316.0|
|2014-06-11|           207.0|
+----------+----------------+
only showing top 20 rows



In [43]:
# Join the daily maximum back onto the per-item totals to recover which item(s) reached it.
# Items tied for the maximum on the same date each produce a row.
(
    max_daily_sale.alias("ms")
    .join(
        item_daily_sales.alias("ds"),
        F.expr("(ms.date = ds.date) AND (ms.max_item_cnt_day = ds.item_cnt_day)"),
    )
    .select("ds.date", "item_id", "item_cnt_day")
    .orderBy("date")
    .show()
)

+----------+-------+------------+
|      date|item_id|item_cnt_day|
+----------+-------+------------+
|2013-01-01|  16450|        39.0|
|2013-01-02|  16450|       120.0|
|2013-01-03|  16450|       103.0|
|2013-01-04|  16450|        91.0|
|2013-01-05|  16450|        83.0|
|2013-01-06|  16450|        77.0|
|2013-01-07|  16450|        57.0|
|2013-01-08|  16450|        53.0|
|2013-01-09|   3432|        31.0|
|2013-01-10|   2808|        23.0|
|2013-01-11|   2808|        50.0|
|2013-01-12|   2808|        65.0|
|2013-01-12|  16450|        65.0|
|2013-01-13|   3432|        47.0|
|2013-01-14|   3432|        34.0|
|2013-01-15|   2973|       134.0|
|2013-01-16|   2973|        80.0|
|2013-01-17|   2973|        43.0|
|2013-01-18|   2973|        41.0|
|2013-01-19|  14346|        79.0|
+----------+-------+------------+
only showing top 20 rows



In [44]:
# Join-free alternative: pack (item_cnt_day, item_id) into a struct and take max() per date.
# The count is the first struct field, so the maximum struct presumably carries the item with the
# highest count — confirm against the join-based result; note it yields one row per date.
item_and_sales = (
    item_daily_sales
    .select("date", F.struct(F.col("item_cnt_day"), F.col("item_id")).alias("par"))
    .groupBy("date")
    .agg(F.max("par").alias("best_sale"))
    .select("date", "best_sale.item_cnt_day", "best_sale.item_id")
    .orderBy("date")
)
item_and_sales.show()

+----------+------------+-------+
|      date|item_cnt_day|item_id|
+----------+------------+-------+
|2013-01-01|        39.0|  16450|
|2013-01-02|       120.0|  16450|
|2013-01-03|       103.0|  16450|
|2013-01-04|        91.0|  16450|
|2013-01-05|        83.0|  16450|
|2013-01-06|        77.0|  16450|
|2013-01-07|        57.0|  16450|
|2013-01-08|        53.0|  16450|
|2013-01-09|        31.0|   3432|
|2013-01-10|        23.0|   2808|
|2013-01-11|        50.0|   2808|
|2013-01-12|        65.0|   2808|
|2013-01-13|        47.0|   3432|
|2013-01-14|        34.0|   3432|
|2013-01-15|       134.0|   2973|
|2013-01-16|        80.0|   2973|
|2013-01-17|        43.0|   2973|
|2013-01-18|        41.0|   2973|
|2013-01-19|        79.0|  14346|
|2013-01-20|        78.0|  14346|
+----------+------------+-------+
only showing top 20 rows



In [96]:
# Check the column types after unpacking the struct fields.
item_and_sales.printSchema()

Bonus: add the product name (hint: `items.csv`).

__Exercise__ For each day in August 2015 find shop name with maximal sales.

## Window functions

First let's correct the date.

In [19]:
from pyspark.sql import Window

In [18]:
# Tiny two-column DataFrame used to demonstrate window functions.
df = spark.createDataFrame([("a", 1), ("b",1), ("b", 2), ("c",  3)], ["Col1", "Col2"])

In [26]:
# Window: rows grouped by Col1 and ordered by Col2 within each group.
w = Window.partitionBy("Col1").orderBy('Col2')
# lead(col, 1) returns the value of the next row inside the window (null on the last row).
# F.rank() accepts no arguments, so rank("Col2", 1) cannot be called; the saved output header
# (lead(Col2, 1, NULL) OVER ...) shows that lead was the function actually run here.
df.select("Col1","Col2",F.lead("Col2", 1).over(w)).show()

+----+----+----------------------------------------------------------------------------------------------+
|Col1|Col2|lead(Col2, 1, NULL) OVER (PARTITION BY Col1 ORDER BY Col2 ASC NULLS FIRST unspecifiedframe$())|
+----+----+----------------------------------------------------------------------------------------------+
|   c|   3|                                                                                          null|
|   b|   1|                                                                                             2|
|   b|   2|                                                                                          null|
|   a|   1|                                                                                          null|
+----+----+----------------------------------------------------------------------------------------------+



In [28]:
# Same window definition; rank() numbers the rows of each Col1 group by ascending Col2.
w = Window.partitionBy("Col1").orderBy('Col2')
df.select("Col1","Col2",F.rank().over(w)).show()

+----+----+---------------------------------------------------------------------------------+
|Col1|Col2|RANK() OVER (PARTITION BY Col1 ORDER BY Col2 ASC NULLS FIRST unspecifiedframe$())|
+----+----+---------------------------------------------------------------------------------+
|   c|   3|                                                                                1|
|   b|   1|                                                                                1|
|   b|   2|                                                                                2|
|   a|   1|                                                                                1|
+----+----+---------------------------------------------------------------------------------+



In [31]:
# Keep only the first-ranked row of each Col1 group (smallest Col2).
w = Window.partitionBy("Col1").orderBy('Col2')
(
    df
    .select("Col1", "Col2", F.rank().over(w).alias("rank"))
    .filter(F.col("rank") == 1)
    .show()
)

+----+----+----+
|Col1|Col2|rank|
+----+----+----+
|   c|   3|   1|
|   b|   1|   1|
|   a|   1|   1|
+----+----+----+



In [46]:
# Expose the cached per-item daily totals to SQL as the "item_daily_sales" view.
item_daily_sales.createOrReplaceTempView("item_daily_sales")

In [45]:
# Reminder of the per-item daily totals used by the window queries.
item_daily_sales.show()

+----------+-------+------------+
|      date|item_id|item_cnt_day|
+----------+-------+------------+
|2013-01-02|   3193|         6.0|
|2013-01-12|   1924|         1.0|
|2013-01-03|   1802|         4.0|
|2013-01-03|   5198|         5.0|
|2013-01-09|   3686|         6.0|
|2013-01-31|   3686|         4.0|
|2013-01-23|   3320|         8.0|
|2013-01-11|   4443|         2.0|
|2013-01-23|   5822|         4.0|
|2013-01-20|   3468|         7.0|
|2013-01-11|  16594|         1.0|
|2013-01-22|    482|         6.0|
|2013-01-20|     32|         7.0|
|2013-01-03|  10774|         1.0|
|2013-01-21|   9043|         7.0|
|2013-01-02|  12829|         2.0|
|2013-01-13|  14217|         3.0|
|2013-01-21|  13418|         3.0|
|2013-01-04|  14164|         2.0|
|2013-01-14|  12874|         2.0|
+----------+-------+------------+
only showing top 20 rows



In [47]:
# Best-selling item per day with a window function: rank items within each date by descending
# count in the subquery, then keep rank 1. Ties share rank 1, so a date can yield several rows.
spark.sql("""
SELECT
    date,
    item_id,
    item_cnt_day
FROM(
SELECT
    date,
    item_id,
    item_cnt_day,
    RANK() OVER (partition by date order by item_cnt_day desc) as rank
FROM item_daily_sales)
WHERE rank=1
ORDER BY date
""").show()

+----------+-------+------------+
|      date|item_id|item_cnt_day|
+----------+-------+------------+
|2013-01-01|  16450|        39.0|
|2013-01-02|  16450|       120.0|
|2013-01-03|  16450|       103.0|
|2013-01-04|  16450|        91.0|
|2013-01-05|  16450|        83.0|
|2013-01-06|  16450|        77.0|
|2013-01-07|  16450|        57.0|
|2013-01-08|  16450|        53.0|
|2013-01-09|   3432|        31.0|
|2013-01-10|   2808|        23.0|
|2013-01-11|   2808|        50.0|
|2013-01-12|  16450|        65.0|
|2013-01-12|   2808|        65.0|
|2013-01-13|   3432|        47.0|
|2013-01-14|   3432|        34.0|
|2013-01-15|   2973|       134.0|
|2013-01-16|   2973|        80.0|
|2013-01-17|   2973|        43.0|
|2013-01-18|   2973|        41.0|
|2013-01-19|  14346|        79.0|
+----------+-------+------------+
only showing top 20 rows



In [51]:
# SQL form: for each (shop, item), pair every row with the count from its next row by date.
# LEAD yields null on the last row of each partition.
spark.sql("""
SELECT shop_id
,   item_id
,   date
,   item_cnt_day
,   LEAD(item_cnt_day) OVER 
        (PARTITION BY shop_id, item_id ORDER BY date) as lead_item_cnt_day
FROM sales2
ORDER BY shop_id
,   item_id
,   date
""").show()

+-------+-------+----------+------------+-----------------+
|shop_id|item_id|      date|item_cnt_day|lead_item_cnt_day|
+-------+-------+----------+------------+-----------------+
|      0|   1000|2013-01-03|         1.0|              1.0|
|      0|   1000|2013-01-08|         1.0|              1.0|
|      0|   1000|2013-01-09|         1.0|              1.0|
|      0|   1000|2013-01-10|         1.0|              1.0|
|      0|   1000|2013-01-20|         1.0|              1.0|
|      0|   1000|2013-02-15|         1.0|              1.0|
|      0|   1000|2013-02-16|         1.0|              1.0|
|      0|   1000|2013-02-17|         1.0|              1.0|
|      0|   1000|2013-02-27|         1.0|             null|
|      0|  10004|2013-02-09|         1.0|             null|
|      0|   1001|2013-01-03|         1.0|              1.0|
|      0|   1001|2013-01-12|         1.0|             null|
|      0|  10012|2013-01-22|         1.0|              1.0|
|      0|  10012|2013-02-14|         1.0

In [52]:
from pyspark.sql import Window

# DataFrame form of the LEAD query, with the window specification written inline.
(
    sales_sdf2
    .select(
        F.col("shop_id"),
        F.col("item_id"),
        F.col("date"),
        F.col("item_cnt_day"),
        F.lead(F.col("item_cnt_day"))
        .over(Window.partitionBy("shop_id", "item_id").orderBy('date'))
        .alias("lead_item_cnt_day"),
    )
    .orderBy("shop_id", "item_id", "date")
    .show()
)


+-------+-------+----------+------------+-----------------+
|shop_id|item_id|      date|item_cnt_day|lead_item_cnt_day|
+-------+-------+----------+------------+-----------------+
|      0|   1000|2013-01-03|         1.0|              1.0|
|      0|   1000|2013-01-08|         1.0|              1.0|
|      0|   1000|2013-01-09|         1.0|              1.0|
|      0|   1000|2013-01-10|         1.0|              1.0|
|      0|   1000|2013-01-20|         1.0|              1.0|
|      0|   1000|2013-02-15|         1.0|              1.0|
|      0|   1000|2013-02-16|         1.0|              1.0|
|      0|   1000|2013-02-17|         1.0|              1.0|
|      0|   1000|2013-02-27|         1.0|             null|
|      0|  10004|2013-02-09|         1.0|             null|
|      0|   1001|2013-01-03|         1.0|              1.0|
|      0|   1001|2013-01-12|         1.0|             null|
|      0|  10012|2013-01-22|         1.0|              1.0|
|      0|  10012|2013-02-14|         1.0

In [54]:
# Same query, with the window specification given a name so it can be reused.
window = Window.partitionBy("shop_id", "item_id").orderBy('date')

(
    sales_sdf2
    .select(
        F.col("shop_id"),
        F.col("item_id"),
        F.col("date"),
        F.col("item_cnt_day"),
        F.lead(F.col("item_cnt_day")).over(window).alias("lead_item_cnt_day"),
    )
    .orderBy("shop_id", "item_id", "date")
    .show()
)


+-------+-------+----------+------------+-----------------+
|shop_id|item_id|      date|item_cnt_day|lead_item_cnt_day|
+-------+-------+----------+------------+-----------------+
|      0|   1000|2013-01-03|         1.0|              1.0|
|      0|   1000|2013-01-08|         1.0|              1.0|
|      0|   1000|2013-01-09|         1.0|              1.0|
|      0|   1000|2013-01-10|         1.0|              1.0|
|      0|   1000|2013-01-20|         1.0|              1.0|
|      0|   1000|2013-02-15|         1.0|              1.0|
|      0|   1000|2013-02-16|         1.0|              1.0|
|      0|   1000|2013-02-17|         1.0|              1.0|
|      0|   1000|2013-02-27|         1.0|             null|
|      0|  10004|2013-02-09|         1.0|             null|
|      0|   1001|2013-01-03|         1.0|              1.0|
|      0|   1001|2013-01-12|         1.0|             null|
|      0|  10012|2013-01-22|         1.0|              1.0|
|      0|  10012|2013-02-14|         1.0

__Exercise__ What is the moving average (±3 days) of the total daily revenue?

## Extra Exercises

* Which shop had the highest sales in August 2015? What is its name?
* What is the name of the category with the highest monthly sales?

# DataFrame as an RDD of Rows

In [113]:
# Show the Python type of the Spark DataFrame object.
type(sales_sdf)

In [114]:
# Fetch the first record of the DataFrame and display it.
row = sales_sdf.first()
row

In [115]:
# A row's fields can be read by attribute, by key, or all at once through asDict().
print(row.date)
print(row['date_block_num'])
row.asDict()

In [116]:
# The underlying RDD is reachable through .rdd; take(5) returns its first five elements.
sales_sdf.rdd.take(5)

In [117]:
# Print the RDD's debug string; the value is decoded with .decode() before printing.
print(sales_sdf.rdd.toDebugString().decode())

In [118]:
# Build a per-day aggregate, then print the debug string of its underlying RDD
# for comparison with the un-aggregated DataFrame printed in the previous cell.
sales_per_day = (
    sales_sdf
    .groupBy(F.col("date"))
    .agg(F.sum(F.col("item_cnt_day")))
)

print(sales_per_day.rdd.toDebugString().decode())