In [None]:
# Install a headless OpenJDK 8 quietly; apt's standard output is discarded.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Alternative source for the Spark 3.1.1 archive: download it instead of using the Drive copy below.
# !wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
# Copy the Spark archive from the Drive folder into the current working directory.
# NOTE(review): assumes Google Drive is already mounted under ./drive — confirm.
!cp drive/MyDrive/MMDS-data/spark-3.1.1-bin-hadoop3.2.tgz .
# Unpack the archive in place (produces the spark-3.1.1-bin-hadoop3.2 directory used as SPARK_HOME).
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
# Install findspark, which is imported later to make PySpark importable.
!pip install -q findspark

In [None]:
import os
# Register the Java 8 install and the unpacked Spark distribution in the
# process environment in a single call.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
    }
)

In [None]:
# Sanity check: print SPARK_HOME as seen by a shell spawned from this kernel.
!echo $SPARK_HOME

/content/spark-3.1.1-bin-hadoop3.2


In [None]:
# Initialise findspark before any pyspark import.
# NOTE(review): init() is called without arguments, so it presumably locates
# the installation through the SPARK_HOME environment variable — confirm.
import findspark
findspark.init()

In [None]:
# SparkSession is the entry point to the DataFrame API.
from pyspark.sql import SparkSession

# Build a session named "PySpark Example"; getOrCreate() hands back an
# already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("PySpark Example")
    .getOrCreate()
)

In [None]:
# Load the semicolon-delimited transactions file; its first line supplies the
# column names.
df = spark.read.csv(path="baskets01.csv", sep=";", header=True)

# Preview the first five rows.
df.show(n=5)

+------+--------------------+--------+----------------+-----+----------+--------------+
|BillNo|            Itemname|Quantity|            Date|Price|CustomerID|       Country|
+------+--------------------+--------+----------------+-----+----------+--------------+
|536365|WHITE HANGING HEA...|       6|01.12.2010 08:26| 2,55|     17850|United Kingdom|
|536365| WHITE METAL LANTERN|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
|536365|CREAM CUPID HEART...|       8|01.12.2010 08:26| 2,75|     17850|United Kingdom|
|536365|KNITTED UNION FLA...|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
|536365|RED WOOLLY HOTTIE...|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
+------+--------------------+--------+----------------+-----+----------+--------------+
only showing top 5 rows



In [None]:
from pyspark.sql import functions as F

# Build one row per bill whose "products" column is a comma-joined string of
# the item names on that bill.
#
# The string is later split on "," again, so a comma inside an item name would
# break that item into several bogus products (the frequent-itemset output
# contains fragments such as " FRONT  DOOR", and item pairs with identical
# frequencies, which points to exactly that). To keep one item == one token,
# every embedded comma, together with the whitespace around it, is collapsed
# to a single space before the names are collected and joined.
df2 = (
    df.groupBy("BillNo")
    .agg(
        F.collect_list(
            F.regexp_replace("Itemname", r"\s*,\s*", " ")
        ).alias("products")
    )
    .withColumn("products", F.concat_ws(",", "products"))
)

# Preview the baskets (show() prints up to 20 rows by default).
df2.show()

+------+--------------------+
|BillNo|            products|
+------+--------------------+
|536596|VINTAGE UNION JAC...|
|536938|JUMBO BAG PINK PO...|
|537252|SMALL POPCORN HOLDER|
|537691|T-LIGHT GLASS FLU...|
|538041|                    |
|538184|PACK OF 6 BIRDY G...|
|538517|PACK OF 12 COLOUR...|
|538879|DANISH ROSE ROUND...|
|539275|SET OF 20 VINTAGE...|
|539630|CHICK GREY HOT WA...|
|540499|POTTING SHED TEA ...|
|540540|HOME SWEET HOME M...|
|540976|PAPERWEIGHT KINGS...|
|541432|RETROSPOT HEART H...|
|541518|RED RETROSPOT TAP...|
|541783|REGENCY CAKESTAND...|
|542026|HOME BUILDING BLO...|
|542375|RED TOADSTOOL LED...|
|543641|WHITE HANGING HEA...|
|544303|DOORMAT I LOVE LO...|
+------+--------------------+
only showing top 20 rows



In [None]:
# Persist the baskets as CSV. Overwrite mode makes Spark replace the output
# directory left by a previous run, so the cell can be re-executed without a
# separate shell `rm -rf` step.
df2.write.mode("overwrite").csv("baskets-df.csv")


In [None]:
# Read the saved baskets back through the generic format()/load() API.
df1 = spark.read.format("csv").load("baskets-df.csv")

# The same file read through the csv() shortcut.
# NOTE(review): this rebinds df2, which earlier held the grouped baskets that
# were written to this path — confirm nothing later relies on the old value.
df2 = spark.read.csv("baskets-df.csv")
# Header-aware variant: the first line of each file is taken as column names.
# NOTE(review): the write step does not request a header row, so this
# presumably consumes a data row as the header — confirm before using df3.
df3 = spark.read.csv("baskets-df.csv", header=True)

# Replace the positional column names (_c0, _c1) with meaningful ones.
df4 = (
    df1
    .withColumnRenamed("_c0", "BillNo")
    .withColumnRenamed("_c1", "products")
)
df4.show()

+------+--------------------+
|BillNo|            products|
+------+--------------------+
|536778|PORCELAIN ROSE SM...|
|537240|SET/5 RED RETROSP...|
|537382|ALARM CLOCK BAKEL...|
|537692|RED HANGING HEART...|
|538072|              faulty|
|538126|ENAMEL MEASURING ...|
|538527|PARTY CONE CHRIST...|
|538534|HAND WARMER BABUS...|
|538935|CREAM HEART CARD ...|
|539286|CHRISTMAS LIGHTS ...|
|539296|CLOTHES PEGS RETR...|
|539374|SET OF 6 SPICE TI...|
|540413|PARTY BUNTING,SMA...|
|541012|WRAP ALPHABET DES...|
|541218|HEARTS GIFT TAPE,...|
|541703|LUNCH BAG CARS BL...|
|541808|              Manual|
|542001|PINK/YELLOW FLOWE...|
|542145|SET/4 WHITE RETRO...|
|542432|DOORMAT FANCY FON...|
+------+--------------------+
only showing top 20 rows



In [None]:
from pyspark.sql import functions as F

# Turn the comma-joined product string into an array column and drop repeated
# items within each basket, in a single column expression.
df = df4.withColumn(
    "products",
    F.array_distinct(F.split("products", ",")),
)

# Inspect the resulting schema and preview the rows (show() prints up to 20).
df.printSchema()
df.show()

root
 |-- BillNo: string (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: string (containsNull = true)

+------+--------------------+
|BillNo|            products|
+------+--------------------+
|536778|[PORCELAIN ROSE S...|
|537240|[SET/5 RED RETROS...|
|537382|[ALARM CLOCK BAKE...|
|537692|[RED HANGING HEAR...|
|538072|            [faulty]|
|538126|[ENAMEL MEASURING...|
|538527|[PARTY CONE CHRIS...|
|538534|[HAND WARMER BABU...|
|538935|[CREAM HEART CARD...|
|539286|[CHRISTMAS LIGHTS...|
|539296|[CLOTHES PEGS RET...|
|539374|[SET OF 6 SPICE T...|
|540413|[PARTY BUNTING, S...|
|541012|[WRAP ALPHABET DE...|
|541218|[HEARTS GIFT TAPE...|
|541703|[LUNCH BAG CARS B...|
|541808|            [Manual]|
|542001|[PINK/YELLOW FLOW...|
|542145|[SET/4 WHITE RETR...|
|542432|[DOORMAT FANCY FO...|
+------+--------------------+
only showing top 20 rows



In [None]:
from pyspark.ml.fpm import FPGrowth

# Configure FP-Growth over the "products" array column with a minimum support
# of 0.01 and a minimum rule confidence of 0.3. No limit on pattern length is
# configured here.
fpGrowth = (
    FPGrowth()
    .setItemsCol("products")
    .setMinSupport(0.01)
    .setMinConfidence(0.3)
)
model = fpGrowth.fit(df)

# Frequent itemsets first, then the association rules derived from them.
model.freqItemsets.show()
model.associationRules.show()

+--------------------+----+
|               items|freq|
+--------------------+----+
|[PACK OF 60 SPACE...| 503|
|[PACK OF 60 SPACE...| 204|
|[PACK OF 60 SPACE...| 291|
|[PACK OF 60 SPACE...| 218|
|[JAM JAR WITH GRE...| 298|
|[JAM JAR WITH GRE...| 221|
|[ALARM CLOCK BAKE...| 435|
|[ALARM CLOCK BAKE...| 269|
|[ALARM CLOCK BAKE...| 205|
|[ALARM CLOCK BAKE...| 209|
|[ALARM CLOCK BAKE...| 295|
|[COLOUR GLASS T-L...| 606|
|[QUEENS GUARD COF...| 272|
|[PLAYING CARDS KE...| 333|
|      [ FRONT  DOOR]| 209|
|[ FRONT  DOOR, KE...| 209|
|[ROUND CAKE TIN V...| 205|
|[FANCY FONT BIRTH...| 320|
|[FANCY FONT BIRTH...| 320|
|[ROUND SNACK BOXE...| 723|
+--------------------+----+
only showing top 20 rows

+--------------------+--------------------+-------------------+------------------+--------------------+
|          antecedent|          consequent|         confidence|              lift|             support|
+--------------------+--------------------+-------------------+------------------+------------