In [None]:
!pip install pyspark



In [None]:
import timeit
# Start the clock before the heavy imports so the reported time covers the whole cell.
start_time = timeit.default_timer()

# Import necessary libraries
import pandas as pd
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import (
    broadcast,
    col,
    collect_list,
    collect_set,
    concat_ws,
    monotonically_increasing_id,
    row_number,
)
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Fraction of orders (whole baskets) to analyse. 1.0 uses every order; lower it
# (e.g. 0.1) for a quick test run.
SAMPLE_FRACTION = 1.0
SAMPLE_SEED = 42

# Set Pandas options for easier debugging (optional)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Initialize Spark session with optimized settings
spark = SparkSession.builder \
    .appName("MarketBasketAnalysis") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.cores", "4") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.sql.adaptive.enabled", "true") \
    .getOrCreate()

# Define schema for CSV files (explicit schemas avoid a costly inference pass)
order_products_schema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("product_id", IntegerType(), True),
    StructField("add_to_cart_order", IntegerType(), True),
    StructField("reordered", IntegerType(), True)
])

products_schema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("product_name", StringType(), True),
    StructField("aisle_id", IntegerType(), True),
    StructField("department_id", IntegerType(), True)
])

# Load CSV files into Spark DataFrames
# Convert CSV to Parquet for faster loading in future runs
order_products_prior = spark.read.csv('/content/order_products__prior.csv', header=True, schema=order_products_schema)
products = spark.read.csv('/content/products.csv', header=True, schema=products_schema)

order_products_prior.write.parquet('/content/order_products_prior.parquet', mode='overwrite')
products.write.parquet('/content/products.parquet', mode='overwrite')

# Load data from Parquet
order_products_prior = spark.read.parquet('/content/order_products_prior.parquet')
products = spark.read.parquet('/content/products.parquet')

# Merge DataFrames using broadcast join (products is the small side)
merged_df = order_products_prior.join(broadcast(products), on='product_id')

# Optional down-sampling. Sample whole orders rather than individual line
# items: dropping random rows would remove items from inside a basket and
# distort the co-occurrence counts that FPGrowth mines.
if SAMPLE_FRACTION < 1.0:
    sampled_orders = merged_df.select("order_id").distinct() \
        .sample(fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED)
    sampled_df = merged_df.join(sampled_orders, on="order_id", how="left_semi")
else:
    sampled_df = merged_df

# No explicit repartition here: the groupBy below already shuffles the rows by
# order_id, so repartitioning first would only add a second full shuffle.

# Create a product-level dataset
product_df = sampled_df.select("order_id", "product_name")

# Create transaction data. collect_set (not collect_list) because FPGrowth
# rejects a transaction that contains the same item more than once.
transactions = product_df.groupBy("order_id").agg(collect_set("product_name").alias("products"))

# Cache transactions for reuse
transactions.persist()

try:
    # Apply FPGrowth algorithm with refined parameters
    fpGrowth = FPGrowth(itemsCol="products", minSupport=0.001, minConfidence=0.01)
    model = fpGrowth.fit(transactions)

    # Extract frequent itemsets and association rules
    frequent_itemsets = model.freqItemsets
    association_rules = model.associationRules

    # Convert array columns to strings for easier visualization
    frequent_itemsets = frequent_itemsets.withColumn("items", concat_ws(",", col("items")))
    association_rules = association_rules.withColumn("antecedent", concat_ws(",", col("antecedent"))) \
                                         .withColumn("consequent", concat_ws(",", col("consequent")))

    # Save results to CSV files
    frequent_itemsets.coalesce(1).write.csv('frequent_itemsets_product.csv', header=True, mode='overwrite')
    association_rules.coalesce(1).write.csv('association_rules_product.csv', header=True, mode='overwrite')

    # Optional: Read and sort frequent itemsets for debugging
    frequent_itemsets_df = spark.read.csv('frequent_itemsets_product.csv', header=True, inferSchema=True)
    frequent_itemsets_df.orderBy('freq', ascending=False).show(n=20, truncate=False)

    # Sort association rules by lift and add an index column
    association_rules_df = spark.read.csv('association_rules_product.csv', header=True, inferSchema=True)
    ordered_df = association_rules_df.orderBy('lift', ascending=False)

    # monotonically_increasing_id() encodes the partition ID in its upper bits,
    # so it is not a consecutive rank and "index % 2" would not pick every
    # second rule. row_number() over a global window gives a true 0-based
    # position; the extra sort keys make the order of equal-lift rules stable.
    # The unpartitioned window pulls all rules into one partition, which is
    # acceptable for a rule table of this size.
    lift_rank = Window.orderBy(col("lift").desc(), col("antecedent"), col("consequent"))
    ordered_df_with_index = association_rules_df.withColumn("index", row_number().over(lift_rank) - 1)

    # Keep every second rule (odd 0-based positions) -- presumably to thin out
    # mirrored A->B / B->A pairs, which share the same lift; verify intent.
    odd_index_df = ordered_df_with_index.filter(col("index") % 2 == 1).orderBy("index")

    # Show the results
    odd_index_df.show(n=50, truncate=False)
finally:
    # Unpersist cached DataFrame, even if mining or writing failed
    transactions.unpersist()

end_time = timeit.default_timer()
print(f"Execution time: {end_time - start_time} seconds")


+------------------------+-----+
|items                   |freq |
+------------------------+-----+
|Banana                  |53707|
|Bag of Organic Bananas  |42885|
|Organic Strawberries    |29855|
|Organic Baby Spinach    |27414|
|Organic Hass Avocado    |24094|
|Organic Avocado         |19848|
|Large Lemon             |17356|
|Strawberries            |16160|
|Limes                   |15939|
|Organic Whole Milk      |15514|
|Organic Raspberries     |15344|
|Organic Yellow Onion    |12851|
|Organic Garlic          |12427|
|Organic Zucchini        |11901|
|Organic Blueberries     |11326|
|Cucumber Kirby          |11018|
|Organic Fuji Apple      |10126|
|Organic Lemon           |9897 |
|Apple Honeycrisp Organic|9700 |
|Organic Grape Tomatoes  |9551 |
+------------------------+-----+
only showing top 20 rows

+----------------------------------------------------------------------------------------------------------+-----------------------------------------------------+--------------------

## Analysis

### High Confidence and Lift Values

- **Lemon Sparkling Water → Grapefruit Sparkling Water**: High confidence (0.35) and extremely high lift (79.28) suggest a strong association. About 35% of orders containing Lemon Sparkling Water also contain Grapefruit Sparkling Water, roughly 79 times the rate expected if the two purchases were independent.
- **Non Fat Raspberry Yogurt → Icelandic Style Skyr Blueberry Non-fat Yogurt**: High confidence (0.45) and lift (75.44) indicate a strong relationship. Promoting these together could be effective.
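- Extremely high lift values are observed for most of these rules. The data may include items that are already promoted together or sold as a set, which would produce lift values like these. Lift is also naturally inflated for items with low individual support, so these values should be read alongside support and confidence.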

### Yogurt Products

Multiple yogurt products show strong associations, such as Non Fat Raspberry Yogurt with Nonfat Icelandic Style Strawberry Yogurt and Vanilla Skyr Nonfat Yogurt. This suggests a pattern where customers buying one type of yogurt are likely to buy another.

### Sparkling Water

Several sparkling water combinations, like Sparkling Lemon Water with Lime Sparkling Water and Sparkling Water Grapefruit, show high confidence and lift values. This indicates a strong preference for variety among sparkling water buyers.

### Greek Yogurt

Products like Total 2% Lowfat Greek Strained Yogurt with different flavors (Blueberry, Strawberry, Peach) have strong associations. Customers buying one flavor are likely to buy others.

## Marketing Promotions

Based on these insights, here are some suitable marketing promotions:

### Bundle Offers

- **Sparkling Water Variety Packs**: Create bundles that include Lemon, Grapefruit, and Lime Sparkling Water. Offer a discount on the bundle to encourage customers to try multiple flavors.
- **Yogurt Packs**: Offer mixed packs of Non Fat Raspberry Yogurt, Icelandic Style Skyr Blueberry Non-fat Yogurt, and Nonfat Icelandic Style Strawberry Yogurt. This can attract customers who enjoy variety in their yogurt choices.

### Cross-Promotions

- **Yogurt and Sparkling Water**: Promote a discount when customers buy both yogurt and sparkling water. For example, "Buy any 3 yogurts and get a sparkling water for free."
- **Healthy Snack Combos**: Pair items like Clementines with Greek Yogurt or Sparkling Water to promote healthy snacking options.

### Loyalty Programs

- **Frequent Buyer Rewards**: Implement a loyalty program where customers earn points for purchasing specific combinations, such as different flavors of Greek yogurt or sparkling water. Points can be redeemed for discounts or free products.

### Seasonal Promotions

- **Summer Refreshment Packs**: During summer, promote sparkling water combinations as refreshing drinks. Offer limited-time discounts on variety packs.
- **Back-to-School Snacks**: Promote yogurt and fruit combinations as healthy snacks for students. Offer special deals for parents buying these items in bulk.