In [1]:
# Install pyspark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=7b92c42a9e21064a10560c459cc906b40db0fd997690b647861962a66e03e05f
  Stored in directory: /root/.cache/pip/wheels/9f/34/a4/159aa12d0a510d5ff7c8f0220abbea42e5d81ecf588c4fd884
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [2]:
# Import Apache Spark SQL
from pyspark.sql import SparkSession

# Create Spark Session/Context
# We are using local machine with all the CPU cores [*]
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Hello Pyspark") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [3]:
# Check spark session
spark

In [4]:
# Mounting google drive so we can access any file inside it
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
# Reading a csv file with a semicolon(;) delimiter
import pandas as pd
data = pd.read_csv('/content/gdrive/My Drive/market-basket.csv', sep=";")
data.head()

  data = pd.read_csv('/content/gdrive/My Drive/market-basket.csv', sep=";")


Unnamed: 0,BillNo,Itemname,Quantity,Date,Price,CustomerID,Country
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,6,01.12.2010 08:26,255,17850.0,United Kingdom
1,536365,WHITE METAL LANTERN,6,01.12.2010 08:26,339,17850.0,United Kingdom
2,536365,CREAM CUPID HEARTS COAT HANGER,8,01.12.2010 08:26,275,17850.0,United Kingdom
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,6,01.12.2010 08:26,339,17850.0,United Kingdom
4,536365,RED WOOLLY HOTTIE WHITE HEART.,6,01.12.2010 08:26,339,17850.0,United Kingdom


In [6]:
from pyspark.sql import functions as F

sparkdata = spark.createDataFrame(data) # Create a dataframe
basketdata = sparkdata.dropDuplicates(['BillNo', 'Itemname']).sort('BillNo') # Drop any duplicates BillNo and Itemname, then sort the dataframe according to BillNo
basketdata = basketdata.groupBy("BillNo").agg(F.collect_list("Itemname")).sort('BillNo') # Group the items with the same BillNo and then sort them according to BillNo

In [9]:
from pyspark.ml.fpm import FPGrowth

#Frequent Pattern Growth – FP Growth is a method of mining frequent itemsets
fpGrowth = FPGrowth(itemsCol="collect_list(Itemname)", minSupport=0.06, minConfidence=0.06) 
model = fpGrowth.fit(basketdata)

# Display frequent itemsets.
model.freqItemsets.show()

+--------------------+----+
|               items|freq|
+--------------------+----+
|[REGENCY CAKESTAN...|1904|
|[LUNCH BAG RED RE...|1541|
|[WHITE HANGING HE...|2202|
|[ASSORTED COLOUR ...|1431|
|               [NaN]|1455|
|[SET OF 3 CAKE TI...|1346|
|[JUMBO BAG RED RE...|2064|
|     [PARTY BUNTING]|1656|
+--------------------+----+



In [11]:
frequent_itemsets = model.freqItemsets.select("*").toPandas() # changing the frequent itemsets dataframe into Pandas
frequent_itemsets.to_excel('market_basket_frequent_itemsets.xlsx') # saving the Pandas frequent itemsets into excel

In [12]:
# Display generated association rules.
model.associationRules.show()

+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+



In [None]:
rules = model.associationRules.select("*").toPandas()
rules.to_excel('market_basket_rules.xlsx')

In [13]:
# Initializing different minSupport and minConfidence
minSupp = [0.01, 0.03, 0.05, 0.07, 0.1] 
minConfi = [0.01, 0.03, 0.05, 0.07, 0.1]

#Frequent Pattern Growth – FP Growth is a method of mining frequent itemsets with a looping
for minConf in minConfi:
  for minSup in minSupp: 
    fpGrowth = FPGrowth(itemsCol="collect_list(Itemname)", minSupport=minSup, minConfidence=minConf) 
    model = fpGrowth.fit(basketdata)

    # Display association rules.
    print("associatiobn rules for frequent itemsets with minSupport=" + str(minSup) + " and minConfidence=" + str(minConf) +"\n")
    model.associationRules.show()

associatiobn rules for frequent itemsets with minSupport=0.01 and minConfidence=0.01

+--------------------+--------------------+-------------------+------------------+--------------------+
|          antecedent|          consequent|         confidence|              lift|             support|
+--------------------+--------------------+-------------------+------------------+--------------------+
|[JUMBO BAG ALPHAB...|  [JUMBO BAG APPLES]| 0.5669099756690997|12.739596268588908| 0.01075566634353506|
|[SET OF 6 SPICE T...|[RECIPE BOX PANTR...| 0.2952029520295203|5.8939922118115184| 0.01107879795042238|
|[SET OF 6 SPICE T...|[SET OF 3 CAKE TI...| 0.4944649446494465| 7.958093681976939|0.018556986566957485|
|[SET OF 6 SPICE T...|[JAM MAKING SET P...|0.26691266912669126| 5.181119311193112|  0.0100170798135069|
|[SET OF 6 SPICE T...|[JAM MAKING SET W...|0.31857318573185733| 6.337236843442815|0.011955869454830818|
|[PICNIC BASKET WI...|    [DOTCOM POSTAGE]|                0.5|15.298728813559322|

The best result in these associations are that even if the Support is low, it doesn't mean that it will have a bad lift, and a low confidence doesn't mean a bad lift too, but a low confidence and a low support usually result in a low lift