In [None]:
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar -xvf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

In [None]:
# Mount Google Drive at /content/gdrive (forcing a re-mount) and switch the
# working directory to the course data folder; later cells read their CSV
# files through paths relative to this directory.
# NOTE(review): the folder below is specific to one user's Drive layout.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_4/data_day_7'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_4/data_day_7


In [None]:
# findspark.init() is called before any pyspark import; presumably it uses the
# SPARK_HOME set in the setup cell to make pyspark importable -- confirm.
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
# NOTE(review): the two wildcard imports below are where unqualified names such
# as `collect_set`, `col` and `StringType` used by later cells come from.
# pyspark.sql.functions also exports names like `sum`, `min` and `max`, which
# shadow the Python builtins after this cell runs.
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Plotting / numeric libraries (not used in the visible part of the notebook).
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
import pandas as pd

%matplotlib inline

In [None]:
# Create (or reuse) a local-mode Spark session named "New-Spark".
# The memory, off-heap and shuffle-partition settings are handed to Spark as-is.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("New-Spark")
    .config("spark.memory.fraction", 0.8)
    .config("spark.executor.memory", "10g")
    .config("spark.driver.memory", "10g")
    .config("spark.sql.shuffle.partitions", "800")
    .config("spark.memory.offHeap.enabled", 'true')
    .config("spark.memory.offHeap.size", "10g")
    .getOrCreate()
)
# Display the session summary as the cell output.
spark

In [None]:
# Load the order/product line items; the first row is the header and column
# types are inferred from the data.
orders = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("instacart_2017_05_01/order_products__train.csv")
)
orders.show(5)

+--------+----------+-----------------+---------+
|order_id|product_id|add_to_cart_order|reordered|
+--------+----------+-----------------+---------+
|       1|     49302|                1|        1|
|       1|     11109|                2|        1|
|       1|     10246|                3|        0|
|       1|     49683|                4|        0|
|       1|     43633|                5|        1|
+--------+----------+-----------------+---------+
only showing top 5 rows



In [None]:
# Total number of rows (order line items) in `orders`.
orders.count()

1384617

In [None]:
# Number of distinct product ids that appear in `orders`.
orders.select('product_id').distinct().count()

39123

In [None]:
# Number of distinct order ids, i.e. the number of transactions.
orders.select('order_id').distinct().count()

131209

In [None]:
# Build one basket per order: the set of distinct product ids bought together,
# stored in an `items` column.
df = (
    orders
    .groupBy('order_id')
    .agg(collect_set('product_id').alias('items'))
)
df.show(5)

+--------+--------------------+
|order_id|               items|
+--------+--------------------+
|    4519|             [29270]|
|   40011|[27292, 35213, 21...|
|   46266|[38558, 48642, 13...|
|   61793|[26348, 6184, 433...|
|   91937|[20708, 38200, 26...|
+--------+--------------------+
only showing top 5 rows



In [None]:
# Sanity check: number of distinct order ids after grouping into baskets.
df.select('order_id').distinct().count()

131209

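- Có 131.209 giao dịch.
- Dựa trên số lượng giao dịch, lựa chọn ngưỡng một cách tương đối:
    - Các items xuất hiện trong ít nhất 5.000 giao dịch: minSupport = 5.000 / 131.209 ≈ 0.038
    - Các kết hợp có trên 5% xác suất xảy ra (cùng mua thêm B khi đã mua A): minConfidence = 0.05
- Lưu ý: code bên dưới thực tế chạy với minSupport = 0.003 (≈ 394 giao dịch) và minConfidence = 0.003, thấp hơn các ngưỡng nêu trên.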

In [None]:
from pyspark.ml.fpm import FPGrowth

# Mine frequent itemsets and association rules from the product-id baskets,
# then use the fitted model to add a `prediction` column to every basket.
fpgrowth = FPGrowth(
    itemsCol='items',
    minSupport=0.003,
    minConfidence=0.003,
)
model = fpgrowth.fit(df)
predictions = model.transform(df)
predictions.show()

+--------+--------------------+--------------------+
|order_id|               items|          prediction|
+--------+--------------------+--------------------+
|    4519|             [29270]|                  []|
|   40011|[27292, 35213, 21...|[21137, 13176, 24...|
|   46266|[38558, 48642, 13...|[47626, 47766, 47...|
|   61793|[26348, 6184, 433...|[21137, 16797, 39...|
|   91937|[20708, 38200, 26...|                  []|
|  120988|[3798, 16797, 326...|[21137, 39275, 13...|
|  128389|[41220, 21903, 39...|[21137, 16797, 13...|
|  141737|             [14032]|                  []|
|  147958|[12567, 34644, 40...|[21137, 47766, 45...|
|  154034|[30450, 31717, 31...|[26209, 21137, 47...|
|  198430|[49683, 20842, 12...|[21137, 47766, 21...|
|  219523|[28842, 40174, 44...|[21137, 13176, 21...|
|  222710|[27651, 37895, 49...|                  []|
|  227470|[47766, 49683, 34...|[21137, 13176, 21...|
|  237810|      [17275, 37687]|                  []|
|  246944|[41400, 26914, 13...|[21137, 21903, 

In [None]:
# Number of rows in `predictions` (one row per basket passed to transform).
predictions.count()

131209

In [None]:
# Show the frequent itemsets found by the model together with their counts.
model.freqItemsets.show()

+--------------------+-----+
|               items| freq|
+--------------------+-----+
|             [24852]|18726|
|             [13176]|15480|
|             [21137]|10894|
|      [21137, 13176]| 3074|
|      [21137, 24852]| 2174|
|             [21903]| 9784|
|      [21903, 21137]| 1639|
|[21903, 21137, 13...|  587|
|      [21903, 13176]| 2236|
|      [21903, 24852]| 2000|
|             [47626]| 8135|
|      [47626, 21137]| 1017|
|      [47626, 13176]| 1057|
|      [47626, 21903]| 1238|
|      [47626, 24852]| 2158|
|             [47766]| 7409|
|      [47766, 21137]| 1088|
|      [47766, 47626]| 1349|
|[47766, 47626, 24...|  477|
|      [47766, 13176]| 1070|
+--------------------+-----+
only showing top 20 rows



In [None]:
# Load the product catalogue; the first row is the header and column types are
# inferred from the data.
products = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("instacart_2017_05_01/products.csv")
)
products.show(5)

+----------+--------------------+--------+-------------+
|product_id|        product_name|aisle_id|department_id|
+----------+--------------------+--------+-------------+
|         1|Chocolate Sandwic...|      61|           19|
|         2|    All-Seasons Salt|     104|           13|
|         3|Robust Golden Uns...|      94|            7|
|         4|Smart Ones Classi...|      38|            1|
|         5|Green Chile Anyti...|       5|           13|
+----------+--------------------+--------+-------------+
only showing top 5 rows



In [None]:
# Attach the product catalogue columns (product_name, aisle_id, department_id)
# to every order line via a left join on product_id.
# The result is stored back into `orders`, so without the guard a second run of
# this cell would join `products` again and duplicate those columns, which
# makes later references to `product_name` ambiguous.
if 'product_name' not in orders.columns:
    orders = orders.join(products, on=['product_id'], how='left')
orders.show()

+----------+--------+-----------------+---------+--------------------+--------+-------------+
|product_id|order_id|add_to_cart_order|reordered|        product_name|aisle_id|department_id|
+----------+--------+-----------------+---------+--------------------+--------+-------------+
|     49302|       1|                1|        1|    Bulgarian Yogurt|     120|           16|
|     11109|       1|                2|        1|Organic 4% Milk F...|     108|           16|
|     10246|       1|                3|        0|Organic Celery He...|      83|            4|
|     49683|       1|                4|        0|      Cucumber Kirby|      83|            4|
|     43633|       1|                5|        1|Lightly Smoked Sa...|      95|           15|
|     13176|       1|                6|        0|Bag of Organic Ba...|      24|            4|
|     47209|       1|                7|        0|Organic Hass Avocado|      24|            4|
|     22035|       1|                8|        1|Organic Who

In [None]:
# Rebuild the baskets using product names instead of ids: one row per order
# with the set of distinct product names in `items`.
df = (
    orders
    .groupBy('order_id')
    .agg(collect_set('product_name').alias('items'))
)
df.show(5)

+--------+--------------------+
|order_id|               items|
+--------+--------------------+
|    4519|[Beet Apple Carro...|
|   40011|[Organic Baby Spi...|
|   46266|[Uncured Beef Hot...|
|   61793|[Raspberries, Gre...|
|   91937|[No. 485 Gin, Mon...|
+--------+--------------------+
only showing top 5 rows



In [None]:
# Re-fit FP-Growth on the product-name baskets with the same thresholds as the
# id-based run, then add a `prediction` column to every basket.
fpgrowth = FPGrowth(
    itemsCol='items',
    minSupport=0.003,
    minConfidence=0.003,
)
model = fpgrowth.fit(df)
predictions = model.transform(df)
predictions.show()

+--------+--------------------+--------------------+
|order_id|               items|          prediction|
+--------+--------------------+--------------------+
|    4519|[Beet Apple Carro...|                  []|
|   40011|[Organic Baby Spi...|[Organic Strawber...|
|   46266|[Uncured Beef Hot...|[Organic Hass Avo...|
|   61793|[Raspberries, Gre...|[Organic Strawber...|
|   91937|[No. 485 Gin, Mon...|                  []|
|  120988|[Raspberries, Clu...|[Organic Avocado,...|
|  128389|[Caramel Almondmi...|[Organic Strawber...|
|  141737|    [Tortilla Chips]|                  []|
|  147958|[Limon, Organic G...|[Organic Strawber...|
|  154034|[Roasted Garlic H...|[Limes, Organic S...|
|  198430|[Organic Fuji App...|[Limes, Organic S...|
|  219523|[Firm Tofu, Organ...|[Banana, Limes, O...|
|  222710|[Real Mayonnaise,...|                  []|
|  227470|[Organic Shredded...|[Bag of Organic B...|
|  237810|[Egg Pappardelle ...|                  []|
|  246944|[Crunchy Oats 'n ...|[Organic Strawb

In [None]:
# Frequent itemsets ordered by frequency, most frequent first.
model.freqItemsets.orderBy('freq', ascending=False).show()

+--------------------+----+
|               items|freq|
+--------------------+----+
|[Squeeze Tomato K...| 394|
|[Goldfish Cheddar...| 394|
|[Pesto Tortellini...| 394|
|[Organic Greek Wh...| 395|
|[Jalapeno Peppers...| 395|
|[Vitamin D Whole ...| 395|
|[Yellow Bell Pepp...| 396|
|[Cheddar Bunnies ...| 397|
|[Fresh CA Grown E...| 397|
|[Organic Grape To...| 397|
|[Original Rice Pi...| 397|
|[Scoops! Tortilla...| 398|
|[Organic Mixed Ve...| 398|
|[Blueberries, Str...| 398|
|[Organic Cilantro...| 399|
|[Sliced White Mus...| 399|
|[Organic Dijon Mu...| 400|
|[Organic Garnet S...| 400|
|[Organic Whole Wh...| 400|
|[Organic Garnet S...| 400|
+--------------------+----+
only showing top 20 rows



In [None]:
# Inspect the column types of `predictions`.
predictions.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- items: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- prediction: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [None]:
# Convert the two array columns to plain strings while keeping `order_id`
# unchanged, then confirm the resulting column types.
df_cast = (
    predictions
    .withColumn('items', col('items').cast(StringType()))
    .withColumn('prediction', col('prediction').cast(StringType()))
)
df_cast.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- items: string (nullable = false)
 |-- prediction: string (nullable = true)



In [None]:
# Preview the first rows of the string-cast result.
df_cast.show(5)

+--------+--------------------+--------------------+
|order_id|               items|          prediction|
+--------+--------------------+--------------------+
|    4519|[Beet Apple Carro...|                  []|
|   40011|[Organic Baby Spi...|[Organic Strawber...|
|   46266|[Uncured Beef Hot...|[Organic Hass Avo...|
|   61793|[Raspberries, Gre...|[Organic Strawber...|
|   91937|[No. 485 Gin, Mon...|                  []|
+--------+--------------------+--------------------+
only showing top 5 rows

