In [1]:
%%capture
!sudo apt-get update --fix-missing

!apt-get install openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
#!wget -q https://downloads.apache.org/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

!mv spark-3.0.0-bin-hadoop3.2.tgz sparkkk
!tar xf sparkkk
!pip install -q findspark
     

#pip install spark
     

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"

import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName('fpgrowth') \
    .getOrCreate()

spark 

In [10]:
# To be able to use your data stored in your Google Drive you first need to mount your Google Drive so you can load and save files to it. 
from google.colab import drive
drive.mount('/content/gdrive')
#You'll need to put in a token which Google will generate for you as soon as you click on the link

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [19]:
import pandas as pd

data = pd.read_excel('/content/gdrive/My Drive/market-basketFolder/market-basket.xlsx', dtype=str)
data.head()

Unnamed: 0,BillNo;Itemname;Quantity;Date;Price;CustomerID;Country,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,536365;WHITE HANGING HEART T-LIGHT HOLDER;6;01...,55;17850;United Kingdom,,
1,536365;WHITE METAL LANTERN;6;01.12.2010 08:26;3,39;17850;United Kingdom,,
2,536365;CREAM CUPID HEARTS COAT HANGER;8;01.12....,75;17850;United Kingdom,,
3,536365;KNITTED UNION FLAG HOT WATER BOTTLE;6;0...,39;17850;United Kingdom,,
4,536365;RED WOOLLY HOTTIE WHITE HEART.;6;01.12....,39;17850;United Kingdom,,


In [None]:
df1 = data[['BillNo','CustomerID','Quantity','Price']]
df1.head()

In [None]:
df=df1.groupby ('BillNo') ['CustomerID']. apply (lambda x: x.reset_index (drop = True)). unstack (). reset_index ()
df.drop('BillNo',axis=1, inplace=True)
df.head()

In [None]:
items = (df[0].unique())
items

In [None]:
encoded_vals = []
def custom():
    for index, row in df.iterrows():
        labels = {}
        uncommons = list(set(items) - set(row))
        commons = list(set(items).intersection(row))
        for uc in uncommons:
            labels[uc] = 0
        for com in commons:
            labels[com] = 1
        encoded_vals.append(labels)
custom()
ohe_df = pd.DataFrame(encoded_vals)
print(ohe_df)

In [None]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [None]:
freq_items = apriori(ohe_df, min_support=0.006, use_colnames=True) #, verbose=1 #min_support=0.006, 
freq_items

In [None]:
freq_items.to_excel('Export&Supportvalues.xlsx')

In [None]:
rules = association_rules(freq_items, metric="confidence", min_threshold=0.006)
rules

In [None]:
import matplotlib.pyplot as plt
plt.scatter(rules['support'], rules['confidence'], alpha=0.2,color='brown')
plt.xlabel('support')
plt.ylabel('confidence')
plt.title('Support vs Confidence')
plt.show()

In [None]:
plt.scatter(rules['support'], rules['lift'], alpha=0.2,color='orange')
plt.xlabel('support')
plt.ylabel('lift')
plt.title('Support vs Lift')
plt.show()

In [None]:
from google.colab import files
from pyspark.sql import functions as F
from pyspark.ml.fpm import FPGrowth
import pandas


sparkdata = spark.createDataFrame(data)
basketdata = sparkdata.dropDuplicates(['BillNo', 'CustomerID']).sort('BillNo')
basketdata = basketdata.groupBy("BillNo").agg(F.collect_list("CustomerID")).sort('BillNo')  

In [None]:
#Frequent Pattern Growth â€“ FP Growth is a method of mining frequent itemsets
fpGrowth = FPGrowth(itemsCol="collect_list(SalesItem)", minSupport=0.006, minConfidence=0.006) 
model = fpGrowth.fit(basketdata)

# Display frequent itemsets.
model.freqItemsets.show()
items = model.freqItemsets
# Display generated association rules.
model.associationRules.show()
rules = model.associationRules
# transform examines the input items against all the association rules and summarize the
# consequents as prediction
model.transform(basketdata).show()
transformed = model.transform(basketdata)

In [None]:
# Convert the Spark DataFrame back to a Pandas DataFrame using Arrow
result_pdf = items.select("*").toPandas()
result_pdf.head()

In [None]:
result_pdf.to_excel('result_pdfItemsFreq.xlsx')

In [None]:
rules_pdf = rules.select("*").toPandas()
rules_pdf.head()

In [None]:
rules_pdf.to_excel('rules_pdfAnteConseConfLift.xlsx')

In [None]:
transformed_pdf = transformed.select("*").toPandas()
transformed_pdf.head()

In [None]:
transformed_pdf.to_excel('transformed_pdfSalesTransactionIDCollectListPred.xlsx')

In a conclusion, a **higher minSupport value** will result in **fewer frequent** itemsets being generated, while a **lower minSupport value** will result in **more frequent** itemsets being generated.

On the other hand, a **higher minConfidence value** will result in **fewer strong rules** being generated, while a **lower minConfidence value** will result in **more strong rules** being generated.
