In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=63fa3b93a541ec14f3571437ee9cab957ed62d0fed826c95b87d4315f139d142
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


####Install pyspark, import the libraries and create a spark session.

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import *

In [3]:
spark = SparkSession.builder.appName("Session").getOrCreate()

####Import the train data and join them with the products dataset in order to get the product names.

[Data link](https://www.kaggle.com/c/instacart-market-basket-analysis/data)

In [4]:
train_data  = spark.read.csv("order_products__train.csv", header=True, inferSchema=True)
products = spark.read.csv("products.csv", header=True, inferSchema=True)
df = train_data.join(products, products.product_id == train_data.product_id, "inner")
df.show()

+--------+----------+-----------------+---------+----------+--------------------+--------+-------------+
|order_id|product_id|add_to_cart_order|reordered|product_id|        product_name|aisle_id|department_id|
+--------+----------+-----------------+---------+----------+--------------------+--------+-------------+
|       1|     49302|                1|        1|     49302|    Bulgarian Yogurt|     120|           16|
|       1|     11109|                2|        1|     11109|Organic 4% Milk F...|     108|           16|
|       1|     10246|                3|        0|     10246|Organic Celery He...|      83|            4|
|       1|     49683|                4|        0|     49683|      Cucumber Kirby|      83|            4|
|       1|     43633|                5|        1|     43633|Lightly Smoked Sa...|      95|           15|
|       1|     13176|                6|        0|     13176|Bag of Organic Ba...|      24|            4|
|       1|     47209|                7|        0|     4

####Group the products in lists by order_id

In [5]:
transactional_data = df.groupBy("order_id").agg({"product_name": "collect_list"})\
  .withColumnRenamed("collect_list(product_name)", "products")
transactional_data.show()

+--------+--------------------+
|order_id|            products|
+--------+--------------------+
|       1|[Bulgarian Yogurt...|
|      96|[Roasted Turkey, ...|
|     112|[Fresh Cauliflowe...|
|     218|[Natural Artisan ...|
|     456|[Chorizo Pork, Pe...|
|     473|[Organic Whole Mi...|
|     631|[Organic Strawber...|
|     762|[Organic Strawber...|
|     774|[Ice Cream Variet...|
|     844|[Green Beans, Org...|
|     904|[Cup Noodles Chic...|
|     988|[Natural Vanilla ...|
|    1032|[Clover Org Greek...|
|    1077|[Bag of Organic B...|
|    1119|[Boneless Skinles...|
|    1139|[Banana, Organic ...|
|    1143|[Natural Premium ...|
|    1145|[Banana, Original...|
|    1275|[Boneless Skinles...|
|    1280|[Lactose Free Hal...|
+--------+--------------------+
only showing top 20 rows



####Get the min support by setting it initially to 1 and then divide the number of rows by 100 and at the same time divide the minsupport by 10. When the x reaches bellow 1, we create an instance of the fpgrowth algorithm, we put our min support and set min confidence to 0.3 because we have a huge dataset.

In [6]:
x = transactional_data.count()
mins = 1
while x > 1:
  x = x / 100
  mins = mins / 10
fp_growth = FPGrowth(itemsCol="products", minSupport=mins, minConfidence=0.3)

####We create and train the model of the train dataset

In [7]:
model = fp_growth.fit(transactional_data)

####We get a dataset of the most frequent itemsets along with their frequency

In [8]:
frequent_itemsets = model.freqItemsets
print("Frequent Itemsets:")
frequent_itemsets.show(truncate=False)

Frequent Itemsets:
+-------------------------------------------------------+----+
|items                                                  |freq|
+-------------------------------------------------------+----+
|[White Cheddar Popcorn]                                |370 |
|[Organic YoKids Very Berry Smoothies]                  |259 |
|[Total 0% Nonfat Greek Yogurt]                         |993 |
|[Total 0% Nonfat Greek Yogurt, Organic Avocado]        |137 |
|[Total 0% Nonfat Greek Yogurt, Bag of Organic Bananas] |157 |
|[Total 0% Nonfat Greek Yogurt, Organic Baby Spinach]   |143 |
|[Total 0% Nonfat Greek Yogurt, Banana]                 |258 |
|[Organic Large Green Asparagus]                        |730 |
|[Organic Large Green Asparagus, Organic Strawberries]  |185 |
|[Organic Large Green Asparagus, Bag of Organic Bananas]|263 |
|[Organic Large Green Asparagus, Organic Baby Spinach]  |136 |
|[Organic Large Green Asparagus, Organic Hass Avocado]  |164 |
|[Organic Cream Cheese Bar]         

####We generate and show the association rules

In [9]:
association_rules = model.associationRules

print("Association Rules:")
association_rules.orderBy(col("confidence").desc()).show()

Association Rules:
+--------------------+--------------------+-------------------+------------------+--------------------+
|          antecedent|          consequent|         confidence|              lift|             support|
+--------------------+--------------------+-------------------+------------------+--------------------+
|[Organic Raspberr...|[Bag of Organic B...| 0.5984251968503937| 5.072272070642333|0.001737685677049...|
|[Organic Cucumber...|[Bag of Organic B...|           0.546875| 4.635330870478036|0.001066999977135...|
|[Organic Kiwi, Or...|[Bag of Organic B...| 0.5459770114942529| 4.627719489738336|0.001448071397541327|
|[Organic Navel Or...|[Bag of Organic B...| 0.5412186379928315| 4.587387356098284|0.001150835689624...|
|[Yellow Onions, S...|            [Banana]| 0.5357142857142857|3.7536332219526702|0.001143214261216...|
|[Organic Whole St...|[Bag of Organic B...| 0.5314685314685315| 4.504745125675359|0.001158457118033...|
|[Organic Navel Or...|[Bag of Organic B...| 0

####We upload and transform our order data

In [10]:
order_products  = spark.read.csv("order_products__prior.csv", header=True, inferSchema=True)
df2 = order_products.join(products, products.product_id == order_products.product_id, "inner")
order_data = df.groupBy("order_id").agg({"product_name": "collect_list"})\
  .withColumnRenamed("collect_list(product_name)", "products")

####Apply the model to the orders dataset in order to make predictions and show the predicted associations for the orders dataset.

In [11]:
predictions = model.transform(order_data)

In [13]:
predictions.show(truncate=False)

+--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------