# Market Basket Analysis

In [1]:
# Mount Google Drive at /content/gdrive so files stored on Drive can be read by path.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
# Load the source workbook from the mounted Drive folder into a pandas DataFrame.
# Later cells rely on its 'SalesTransactionID' and 'SalesItem' columns.
data = pd.read_excel('/content/gdrive/My Drive/DDDDFolder/DDDD.xlsx')
# Preview the first rows as a quick sanity check.
data.head()

In [3]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=b6a728ec4dab551f67b6a84075a3192e1eb4cbda1237375703d935c270412525
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [4]:
# Spark SQL entry point.
from pyspark.sql import SparkSession

# Build (or reuse) a session on the local machine; "local[*]" uses all CPU cores.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting —
# confirm whether it is actually needed.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Hello Pyspark")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [6]:
# Display the session object to confirm the Spark session was created.
spark

In [7]:
from google.colab import files
from pyspark.sql import functions as F
from pyspark.ml.fpm import FPGrowth
import pandas


# Hand the pandas frame over to Spark.
sparkdata = spark.createDataFrame(data)

# Build one row per transaction holding the list of items bought in it.
# - dropDuplicates: FP-Growth needs each item at most once per basket.
# - No sort before the groupBy: the aggregation re-shuffles the rows, so an
#   earlier sort costs a full shuffle without affecting the result; only the
#   final sort (for readable, ordered output) is kept.
# - The aggregated column keeps Spark's auto-generated name
#   "collect_list(SalesItem)", which later cells reference.
basketdata = (
    sparkdata
    .dropDuplicates(['SalesTransactionID', 'SalesItem'])
    .groupBy("SalesTransactionID")
    .agg(F.collect_list("SalesItem"))
    .sort('SalesTransactionID')
)

In [9]:
# FP-Growth (Frequent Pattern Growth) mines frequent itemsets from the baskets.
# The item lists are read from the "collect_list(SalesItem)" column.
fpGrowth = FPGrowth(
    itemsCol="collect_list(SalesItem)",
    minSupport=0.006,
    minConfidence=0.006,
)
model = fpGrowth.fit(basketdata)

# Keep a handle on the frequent itemsets and preview them.
items = model.freqItemsets
items.show()

+----------+----+
|     items|freq|
+----------+----+
|     [315]| 553|
|     [274]| 407|
|     [137]| 730|
|    [1491]| 432|
|     [295]| 471|
|     [565]| 368|
|     [363]| 512|
|     [159]| 671|
|[159, 161]| 319|
| [159, 20]| 296|
|     [131]| 599|
|[131, 132]| 364|
|      [19]| 382|
|     [135]| 690|
|     [302]| 622|
|      [22]| 449|
|     [205]| 484|
|     [152]| 761|
|     [146]| 524|
|     [128]| 929|
+----------+----+
only showing top 20 rows



In [10]:
# Keep a handle on the generated association rules and preview them.
rules = model.associationRules
rules.show()

+----------+----------+-------------------+------------------+--------------------+
|antecedent|consequent|         confidence|              lift|             support|
+----------+----------+-------------------+------------------+--------------------+
|     [132]|     [131]|  0.603648424543947| 48.80180067940764|0.007516623301532235|
| [63, 104]|      [20]| 0.7111650485436893|12.139188805349558|0.006050468756453145|
|     [285]|     [102]| 0.5182291666666666|28.713690646453088|0.008218725478048982|
|     [285]|     [514]| 0.3997395833333333|13.777785809608542|0.006339569652665924|
|     [285]|      [83]| 0.4700520833333333| 11.52543908227848| 0.00745467310948664|
|     [285]|       [8]|         0.41015625|16.860973312818334|0.006504770164787511|
|     [285]|     [120]| 0.3802083333333333|10.390501551918735|0.006029818692437...|
|     [285]|      [20]| 0.4322916666666667| 7.378976471624956|0.006855821253045884|
| [67, 103]|     [312]|0.41866330390920553|19.072614445067906|0.006855821253

In [11]:
# transform() checks each basket's items against the association rules and
# summarizes the consequents of the matching rules in a "prediction" column.
# It is called once and the result reused, instead of invoking transform()
# a second time just to keep a reference.
transformed = model.transform(basketdata)
transformed.show()

+------------------+-----------------------+--------------------+
|SalesTransactionID|collect_list(SalesItem)|          prediction|
+------------------+-----------------------+--------------------+
|                 0|                    [0]|                  []|
|                 1|              [0, 1, 2]|                  []|
|                 2|                    [1]|                  []|
|                 3|                    [0]|                  []|
|                 4|                    [0]|                  []|
|                 5|                    [0]|                  []|
|                 6|                    [2]|                  []|
|                 7|                    [2]|                  []|
|                 8|                    [0]|                  []|
|                10|                 [1, 0]|                  []|
|                11|                    [0]|                  []|
|                12|                 [4, 3]|                  []|
|         

In [12]:
# Convert the frequent itemsets from a Spark DataFrame to a pandas DataFrame.
# NOTE: the Arrow-accelerated path is only used when
# spark.sql.execution.arrow.pyspark.enabled is set, which the session builder
# in this notebook does not do, so this is the regular conversion.
# select("*") was a no-op and is omitted.
result_pdf = items.toPandas()
result_pdf.head()

Unnamed: 0,items,freq
0,[315],553
1,[274],407
2,[137],730
3,[1491],432
4,[295],471


In [13]:
# Save the frequent itemsets table as an Excel workbook (relative path, so it is
# written to the current working directory).
result_pdf.to_excel('result_pdfItemsFreq.xlsx')

In [14]:
# Materialise the association rules as a pandas DataFrame and preview them.
rules_pdf = rules.toPandas()
rules_pdf.head()

Unnamed: 0,antecedent,consequent,confidence,lift,support
0,[132],[131],0.603648,48.801801,0.007517
1,"[63, 104]",[20],0.711165,12.139189,0.00605
2,[285],[102],0.518229,28.713691,0.008219
3,[285],[514],0.39974,13.777786,0.00634
4,[285],[83],0.470052,11.525439,0.007455


In [15]:
# Save the association rules table as an Excel workbook (relative path, so it is
# written to the current working directory).
rules_pdf.to_excel('rules_pdfAnteConseConfLift.xlsx')

In [16]:
# Materialise the per-transaction predictions as a pandas DataFrame and preview them.
transformed_pdf = transformed.toPandas()
transformed_pdf.head()

Unnamed: 0,SalesTransactionID,collect_list(SalesItem),prediction
0,0,[0],[]
1,1,"[0, 1, 2]",[]
2,2,[1],[]
3,3,[0],[]
4,4,[0],[]


In [17]:
# Save the per-transaction predictions as an Excel workbook (relative path, so it
# is written to the current working directory).
transformed_pdf.to_excel('transformed_pdfSalesTransactionIDCollectListPred.xlsx')