In [1]:
import random
import csv

# Define a pool of 30 unique supermarket items
ITEM_POOL = [
    "Milk", "Bread", "Eggs", "Cheese", "Butter", "Chicken Breast", "Ground Beef", "Apples",
    "Bananas", "Oranges", "Tomatoes", "Potatoes", "Onions", "Carrots", "Lettuce", "Spinach",
    "Yogurt", "Cereal", "Rice", "Pasta", "Coffee", "Tea", "Juice", "Soda", "Water",
    "Chocolate", "Cookies", "Ice Cream", "Laundry Detergent", "Toothpaste"
]

def simulate_transactions(num_transactions=3000, item_pool=ITEM_POOL, min_items=2, max_items=7, seed=None):
    """Simulate shopping baskets by sampling distinct items from a pool.

    Each transaction draws a basket size uniformly from
    ``[min_items, max_items]`` and then samples that many items from
    ``item_pool`` without replacement.

    Args:
        num_transactions: Number of transactions to generate.
        item_pool: Sequence of item names to sample from.
        min_items: Smallest basket size (inclusive).
        max_items: Largest basket size (inclusive).
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the result is reproducible and the global RNG state
            is left untouched. When ``None`` the module-level ``random``
            generator is used, exactly as before.

    Returns:
        A list of dicts with keys ``"TransactionID"`` (1-based int) and
        ``"Items"`` (item names joined with ``", "``).

    Raises:
        ValueError: If the basket-size bounds are inconsistent or exceed the
            size of ``item_pool``.
    """
    # Validate up front: otherwise random.sample only fails on the iterations
    # whose random basket size happens to exceed the pool, which is flaky.
    if not 0 <= min_items <= max_items:
        raise ValueError(
            f"Expected 0 <= min_items <= max_items, got min_items={min_items}, max_items={max_items}"
        )
    if max_items > len(item_pool):
        raise ValueError(
            f"max_items={max_items} exceeds the item pool size ({len(item_pool)})"
        )

    # Fall back to the global generator when no seed is supplied so callers
    # that already rely on random.seed(...) keep getting the same draws.
    rng = random.Random(seed) if seed is not None else random

    transactions = []
    for txn_id in range(1, num_transactions + 1):
        num_items = rng.randint(min_items, max_items)
        items = rng.sample(item_pool, num_items)
        transactions.append({
            "TransactionID": txn_id,
            "Items": ", ".join(items)
        })
    return transactions

def save_transactions_csv(transactions, filename="supermarket_transactions.csv"):
    """Write transactions to ``filename`` as a two-column CSV.

    Args:
        transactions: Iterable of dicts keyed by ``"TransactionID"`` and
            ``"Items"``.
        filename: Destination path; the file is overwritten if it exists.
    """
    columns = ["TransactionID", "Items"]
    # newline='' lets the csv module control line endings itself.
    with open(filename, "w", newline='', encoding="utf-8") as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(transactions)

if __name__ == "__main__":
    # Seed the global RNG before simulating so that a fresh
    # "Restart Kernel -> Run All" regenerates the same dataset; without a seed
    # every run produces different transactions and different supports
    # in the cells below. 42 is an arbitrary fixed value.
    random.seed(42)
    transactions = simulate_transactions()
    save_transactions_csv(transactions)
    print(f"Simulated {len(transactions)} supermarket transactions and saved to 'supermarket_transactions.csv'.")

Simulated 3000 supermarket transactions and saved to 'supermarket_transactions.csv'.


In [2]:
#Generated frequent itemsets with min_support=0.05 and saved top 10 to 'frequent_itemsets.csv'.
import pandas as pd
from mlxtend.frequent_patterns import apriori

df = pd.read_csv("supermarket_transactions.csv")
# Split the comma-joined "Items" string back into a list of item names.
df['ItemList'] = df['Items'].apply(lambda x: [item.strip() for item in x.split(",")])

# One-hot encode every basket over the full item pool. Booleans are used
# (rather than 0/1 ints) because that is the input type mlxtend's apriori
# recommends; the resulting supports are the same.
encoded_rows = []
for items in df['ItemList']:
    basket = set(items)  # set membership instead of scanning the list per item
    row = {item: item in basket for item in ITEM_POOL}
    encoded_rows.append(row)

df_basket = pd.DataFrame(encoded_rows)

frequent_itemsets = apriori(df_basket, min_support=0.05, use_colnames=True)
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)

print(frequent_itemsets.head(10))
# Serialize each itemset as a comma-separated string. Sorting the members
# makes the text deterministic; a frozenset has no defined iteration order.
frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: ', '.join(sorted(x)))
# NOTE(review): only the top 10 rows are written. The later closed/maximal
# cells read this file, so they can only compare against these 10 itemsets,
# not the full apriori result - confirm that this truncation is intended.
frequent_itemsets.head(10).to_csv("frequent_itemsets.csv", index=False)

     support             itemsets
24  0.162000              (Water)
29  0.158667         (Toothpaste)
5   0.156000     (Chicken Breast)
13  0.156000            (Carrots)
18  0.155667               (Rice)
11  0.154667           (Potatoes)
28  0.154667  (Laundry Detergent)
0   0.153667               (Milk)
7   0.153333             (Apples)
20  0.153333             (Coffee)




In [3]:
# === Closed Frequent Itemsets Identification ===
# [Student: Claire]
# Logic: An itemset is closed if there is no proper superset among the frequent itemsets with the same support.

# Load the frequent itemsets CSV
import pandas as pd
frequent_itemsets = pd.read_csv("frequent_itemsets.csv")

# Parse each comma-separated string back into a frozenset so that subset and
# superset comparisons can be used on the itemsets.
frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(
    lambda text: frozenset(name.strip() for name in text.split(','))
)

# Plain Python lists (same row order) used by the closedness check
all_itemsets = frequent_itemsets['itemsets'].tolist()
all_supports = frequent_itemsets['support'].tolist()

# Check if a given itemset is closed
def is_closed(idx, itemsets, supports):
    """Tell whether ``itemsets[idx]`` is closed with respect to ``itemsets``.

    An itemset is closed when no other entry is a strict superset of it
    with exactly the same support.

    Args:
        idx: Position of the itemset to test.
        itemsets: Sequence of set-like itemsets (e.g. frozensets).
        supports: Sequence of support values aligned with ``itemsets``.

    Returns:
        True if no strict superset with equal support exists, else False.
    """
    target = itemsets[idx]
    target_support = supports[idx]
    has_equal_support_superset = any(
        pos != idx and target < other and supports[pos] == target_support
        for pos, other in enumerate(itemsets)
    )
    return not has_equal_support_superset

# Flag every frequent itemset as closed / not closed
closed_flags = [
    is_closed(position, all_itemsets, all_supports)
    for position in range(len(frequent_itemsets))
]
frequent_itemsets['closed'] = closed_flags

# Keep only the rows flagged as closed
closed_itemsets = frequent_itemsets[frequent_itemsets['closed'].eq(True)]

print("Closed frequent itemsets:")
print(closed_itemsets[['support', 'itemsets']])

# Persist the closed itemsets (all columns, without the index)
closed_itemsets.to_csv('closed_frequent_itemsets.csv', index=False)

Closed frequent itemsets:
    support             itemsets
0  0.162000              (Water)
1  0.158667         (Toothpaste)
2  0.156000     (Chicken Breast)
3  0.156000            (Carrots)
4  0.155667               (Rice)
5  0.154667           (Potatoes)
6  0.154667  (Laundry Detergent)
7  0.153667               (Milk)
8  0.153333             (Apples)
9  0.153333             (Coffee)


## Identifying Maximal Frequent Itemsets
Student: Esther

In [5]:

def is_maximal(itemset, all_itemsets):
    """Tell whether ``itemset`` has no strict superset in ``all_itemsets``.

    Args:
        itemset: Set-like itemset to test (e.g. a frozenset).
        all_itemsets: Mapping or DataFrame whose ``'itemsets'`` entry is
            iterable and yields the itemsets to compare against.

    Returns:
        True if none of the compared itemsets strictly contains ``itemset``.
    """
    candidates = all_itemsets['itemsets']
    return not any(itemset < candidate for candidate in candidates)

def _row_is_maximal(row):
    """Row-wise adapter: test one row's itemset against the whole frame."""
    return is_maximal(row['itemsets'], frequent_itemsets)

# Boolean mask, one entry per row of frequent_itemsets
maximal_mask = frequent_itemsets.apply(_row_is_maximal, axis=1)
maximal_itemsets = frequent_itemsets[maximal_mask]

# Save the maximal itemsets to CSV (all columns, without the index)
maximal_itemsets.to_csv('maximal_itemsets.csv', index=False)

print(maximal_itemsets)

    support             itemsets  closed
0  0.162000              (Water)    True
1  0.158667         (Toothpaste)    True
2  0.156000     (Chicken Breast)    True
3  0.156000            (Carrots)    True
4  0.155667               (Rice)    True
5  0.154667           (Potatoes)    True
6  0.154667  (Laundry Detergent)    True
7  0.153667               (Milk)    True
8  0.153333             (Apples)    True
9  0.153333             (Coffee)    True


- This table summarizes the top 10 frequent itemsets found in the simulated supermarket transactions; each of them is flagged as closed (the `closed` column) and is also maximal among these 10 itemsets.
- Each item (e.g., Water, Toothpaste) appears in over 15% of the 3,000 transactions, meaning these are commonly purchased items. 
- They are marked as closed because no larger itemset containing them appears in the same number of transactions — indicating that customers often buy these items on their own or in different combinations.