In [1]:

!rm -rf spark-3* spark-*hadoop*.tgz


# **Spark Context Creation**

In [1]:
# Install PySpark
!pip install -q pyspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Set JAVA_HOME
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# Start SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("Market Basket").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)




In [2]:
sc = spark.sparkContext


# **Importing the Data set from Kaggle**

In [3]:
# Kaggle credentials
os.environ['KAGGLE_USERNAME'] = "elahezohdi"
os.environ['KAGGLE_KEY'] = "b4d81db0a12258d41bbdde98d1131209"

# Download and unzip dataset
!kaggle datasets download -d mohamedbakhet/amazon-books-reviews
!unzip -q amazon-books-reviews.zip -d data

Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0
Downloading amazon-books-reviews.zip to /content
 96% 1.02G/1.06G [00:07<00:00, 87.2MB/s]
100% 1.06G/1.06G [00:07<00:00, 145MB/s] 


# **EDA**

In [4]:
df = spark.read.csv("data/Books_rating.csv", header=True, inferSchema=True)
# Check schema to see column names and types
df.printSchema()
df.show(5)
df.describe().show()

root
 |-- Id: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- User_id: string (nullable = true)
 |-- profileName: string (nullable = true)
 |-- review/helpfulness: string (nullable = true)
 |-- review/score: string (nullable = true)
 |-- review/time: string (nullable = true)
 |-- review/summary: string (nullable = true)
 |-- review/text: string (nullable = true)

+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|        Id|               Title|Price|       User_id|         profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|
+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|1882931173|Its Only Art If I...| NULL| AVCGYZL8FQQTD|"Jim of Oz ""jim-...|               7/

In [5]:
# Keep only User_id and Title and drop nulls
df = df.select("User_id", "Title").dropna()
df.show(5)


+--------------+--------------------+
|       User_id|               Title|
+--------------+--------------------+
| AVCGYZL8FQQTD|Its Only Art If I...|
|A30TK6U7DNS82R|Dr. Seuss: Americ...|
|A3UH4UZ4RSVO82|Dr. Seuss: Americ...|
|A2MVUWT453QH61|Dr. Seuss: Americ...|
|A22X4XUPKF66MR|Dr. Seuss: Americ...|
+--------------+--------------------+
only showing top 5 rows



# **Creating the baskets**

In [7]:
from pyspark.sql.functions import lower, regexp_replace, trim, collect_set, size


# Clean and sample
df_clean = df.select("User_id", "Title").dropna()
df_clean = df_clean.withColumn("Title", lower(regexp_replace("Title", r"[^a-zA-Z0-9 ]", "")))
df_clean = df_clean.withColumn("Title", regexp_replace("Title", r"\s+", " "))
df_clean = df_clean.withColumn("Title", trim(df_clean["Title"]))
df_clean = df_clean.sample(False, 0.01, seed=42)

# Group by user
df_grouped = df_clean.groupBy("User_id").agg(collect_set("Title").alias("basket"))
df_grouped = df_grouped.filter(size("basket") > 1)

# Print a few rows
df_grouped.show(5, truncate=False)

# Extract baskets
basket_rows = df_grouped.select("basket").limit(10000).toLocalIterator()
baskets = [row.basket for row in basket_rows]

# Show samples and stats
print("First basket:", baskets[0])
print("Number of baskets:", len(baskets))
lengths = [len(b) for b in baskets]
print("Max basket size:", max(lengths))
print("Min basket size:", min(lengths))
print("Average basket size:", sum(lengths)/len(lengths))


+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|User_id       |basket                                                                                                                                                                                                              |
+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|A103U0Q3IKSXHE|[what clients love a field guide to growing your business, thunder and lightning cracking open the writers craft]                                                                                                   |
|A10872FHIJAKKD|[the lord of the rings the fellowship of the ring bbc audio coll

In [9]:
baskets_rdd = sc.parallelize(baskets)


# **The hash table**

In [10]:
#Create Hash Table
hash = baskets_rdd.flatMap(lambda line: line).distinct()
hash = hash.zipWithIndex()
hash_table = hash.collectAsMap()


In [11]:
#Hash baskets
def hashing(basket):
    return {hash_table[title] for title in basket if title in hash_table}

baskets_hashed = baskets_rdd.map(hashing)

# **A-PRIORI ALGORITHM**

In [12]:
from itertools import combinations

def a_priori(baskets_collection, support, hash_table):
    print("Frequent singletons")
    first_pass = baskets_collection.flatMap(lambda basket: [(item, 1) for item in basket]) \
                                   .reduceByKey(lambda x, y: x + y) \
                                   .filter(lambda x: x[1] >= support)

    if first_pass.isEmpty():
        print("⚠️ No frequent singletons — try lowering support")
        return

    print("Number of frequent singletons:", first_pass.count())
    most_common = first_pass.max(lambda x: x[1])[0]
    print("Most frequent singleton:", [k for k, v in hash_table.items() if v == most_common][0])
    print()

    freq_itemsets = set(first_pass.map(lambda x: (x[0],)).collect())
    k = 2

    while True:
        print(f"Itemsets of size {k}")
        candidates = baskets_collection.flatMap(lambda basket:
            [(combo, 1) for combo in combinations(sorted(basket), k)
             if all(sub in freq_itemsets for sub in combinations(combo, k - 1))]) \
            .reduceByKey(lambda x, y: x + y) \
            .filter(lambda x: x[1] >= support)

        if candidates.isEmpty():
            print(f"No frequent itemsets of size {k}")
            print("✅ Apriori terminated — no more frequent itemsets.")
            break

        print(f"Number of frequent itemsets of size {k}:", candidates.count())

        # Display the most frequent itemset of this size
        top_itemset = candidates.max(lambda x: x[1])
        top_readable = [k for k, v in hash_table.items() if v in top_itemset[0]]
        print("Most frequent itemset:", top_readable)

        freq_itemsets = set(candidates.map(lambda x: x[0]).collect())
        print()
        k += 1


In [13]:
#Run Apriori with support = 2 or 3 to start
support = 2
a_priori(baskets_hashed, support, hash_table)

Frequent singletons
Number of frequent singletons: 381
Most frequent singleton: the hobbit

Itemsets of size 2
Number of frequent itemsets of size 2: 24
Most frequent itemset: ['the hobbit there and back again', 'the hobbitt or there and back again illustrated by the author']

Itemsets of size 3
Number of frequent itemsets of size 3: 2
Most frequent itemset: ['pride and prejudice', 'emma signet classics', 'sense sensibility']

Itemsets of size 4
No frequent itemsets of size 4
✅ Apriori terminated — no more frequent itemsets.


In [14]:
for i in range(5):
    print(baskets[i])


['what clients love a field guide to growing your business', 'thunder and lightning cracking open the writers craft']
['the lord of the rings the fellowship of the ring bbc audio collection', 'the hobbit']
['the count of monte cristo', 'antisemitism myth and hate from antiquity to the present', 'road unseen', 'reckless disregard how liberal democrats undercut our military endanger our soldiers and jeopardize our security']
['cast a yellow shadow', 'charlotte gray', 'shilling for candlesaud csst', 'shutter island']
['the wind in the willows', 'jesus christ and mythology', 'wuthering heights']
