### Code reviewed and edited by Mitchel

## Simulate Transactions for a Supermarket Dataset
### Student: Julie

In [6]:
# Loading necessary libraries - Mitchel
import random
import csv
import pandas as pd

# Define a pool of 30 unique supermarket items - Julie
ITEM_POOL = [
    "Milk", "Bread", "Eggs", "Cheese", "Butter", "Chicken Breast", "Ground Beef", "Apples",
    "Bananas", "Oranges", "Tomatoes", "Potatoes", "Onions", "Carrots", "Lettuce", "Spinach",
    "Yogurt", "Cereal", "Rice", "Pasta", "Coffee", "Tea", "Juice", "Soda", "Water",
    "Chocolate", "Cookies", "Ice Cream", "Laundry Detergent", "Toothpaste"
]

# Simulation parameters, named so they can be tuned in one place
NUM_TRANSACTIONS = 3000  # number of baskets to simulate
MIN_ITEMS = 2            # smallest basket size (inclusive)
MAX_ITEMS = 7            # largest basket size (inclusive)
RANDOM_SEED = 42         # fixed seed so every run produces the same dataset

# generate transactions
# A dedicated, seeded generator keeps the simulated data (and everything derived
# from it further down) reproducible without touching the global `random` state.
rng = random.Random(RANDOM_SEED)
transactions = [
    # draw the basket size first, then sample that many distinct items
    rng.sample(ITEM_POOL, rng.randint(MIN_ITEMS, MAX_ITEMS))
    for _ in range(NUM_TRANSACTIONS)
]

transactions[:5]  # display first 5 transactions for verification

[['Tomatoes', 'Spinach'],
 ['Spinach', 'Bread', 'Tomatoes', 'Tea'],
 ['Bananas', 'Tomatoes'],
 ['Ground Beef', 'Butter', 'Cookies', 'Lettuce'],
 ['Rice', 'Pasta']]

# 2. Preprocessing: one-hot encoding

In [14]:
# convert transactions to a one-hot encoded format(rows=transactions, columns=ITEM_POOL)
# this format is required for the apriori algorithm
def create_one_hot_encoded_df(transactions, ITEM_POOL):
    """Build a 0/1 indicator table from a list of transactions.

    Args:
        transactions: Sequence of transactions, each an iterable of item names.
        ITEM_POOL: Item names used as the column labels of the result.

    Returns:
        A DataFrame with one row per transaction (indexed 0..n-1) and one
        column per entry of ``ITEM_POOL``; a cell is 1 when the transaction
        contains that item and 0 otherwise.
    """
    # Start from an all-zero grid: rows = transactions, columns = items
    encoded = pd.DataFrame(0, index=range(len(transactions)), columns=ITEM_POOL)

    # Flatten the nested structure into (row number, item) pairs ...
    purchases = (
        (row_number, product)
        for row_number, basket in enumerate(transactions)
        for product in basket
    )
    # ... and switch on the matching cell for each purchase
    for row_number, product in purchases:
        encoded.at[row_number, product] = 1

    return encoded
# One-hot encode the simulated transactions against the full item pool
df = create_one_hot_encoded_df(transactions=transactions, ITEM_POOL=ITEM_POOL)
df.head()  # preview the encoded table

# Write the encoded table to disk (row index omitted) for the later steps
df.to_csv(path_or_buf='supermarket_transactions.csv', index=False)
df.head()  # preview the DataFrame that was just written

Unnamed: 0,Milk,Bread,Eggs,Cheese,Butter,Chicken Breast,Ground Beef,Apples,Bananas,Oranges,...,Coffee,Tea,Juice,Soda,Water,Chocolate,Cookies,Ice Cream,Laundry Detergent,Toothpaste
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Generate Frequent Itemsets
###  Student: Kyra

In [9]:
# Generate frequent itemsets with min_support=0.05 and save the top 10 to 'frequent_itemsets.csv'.
import pandas as pd
from mlxtend.frequent_patterns import apriori

df = pd.read_csv("supermarket_transactions.csv")

# Use the one-hot encoded DataFrame directly.
# The CSV round-trip yields 0/1 integers; cast to bool for apriori, since mlxtend
# warns on non-boolean input (see the mlxtend apriori documentation).
frequent_itemsets = apriori(df.astype(bool), min_support=0.05, use_colnames=True)
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)

print(frequent_itemsets.head(10))

# Serialise each frozenset as a comma-separated string. Sorting the members makes
# the text deterministic; plain frozenset iteration order can change between runs.
frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: ', '.join(sorted(x)))

# NOTE(review): only the 10 highest-support rows are written, so any step that
# reads this file works on a truncated list of frequent itemsets - confirm that
# this is intended before relying on closed/maximal results derived from it.
frequent_itemsets.head(10).to_csv("frequent_itemsets.csv", index=False)

     support             itemsets
27  0.157000          (Ice Cream)
21  0.156000                (Tea)
1   0.155333              (Bread)
28  0.154333  (Laundry Detergent)
5   0.154333     (Chicken Breast)
16  0.154000             (Yogurt)
14  0.154000            (Lettuce)
4   0.153000             (Butter)
29  0.152333         (Toothpaste)
17  0.152333             (Cereal)




## Identify Closed Frequent Itemsets
### Student: Claire

In [12]:
# === Closed Frequent Itemsets Identification ===
# [Student: Claire]
# Logic: An itemset is closed if there is no proper superset among the frequent itemsets with the same support.

# Read the frequent itemsets back from their CSV file
import pandas as pd
frequent_itemsets = pd.read_csv("frequent_itemsets.csv")

# Each 'itemsets' cell is a comma-separated string; turn it into a frozenset of
# whitespace-trimmed item names so subset/superset comparisons can be used
frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(
    lambda text: frozenset(name.strip() for name in text.split(','))
)

# Parallel lists: all_supports[k] is the support of all_itemsets[k]
all_supports = list(frequent_itemsets['support'])
all_itemsets = list(frequent_itemsets['itemsets'])

# Check if a given itemset is closed
def is_closed(idx, itemsets, supports):
    """Return True if itemsets[idx] is closed among all itemsets.

    An itemset is closed when no strict superset in ``itemsets`` has the same
    support.

    Args:
        idx: Position of the itemset to test.
        itemsets: Sequence of sets (e.g. frozensets), parallel to ``supports``.
        supports: Sequence of support values; ``supports[k]`` is the support
            of ``itemsets[k]``.

    Returns:
        False if some strict superset has a matching support, True otherwise.
    """
    import math  # local import so this definition works on its own

    current_set = itemsets[idx]
    current_support = supports[idx]
    # `<` is a strict-subset test, so the itemset never matches itself (or a
    # duplicate of itself) and no explicit index check is needed.
    # Supports are floats, so compare with a tolerance: exact `==` can miss
    # values that differ only by floating-point rounding.
    return not any(
        current_set < candidate_set and math.isclose(candidate_support, current_support)
        for candidate_set, candidate_support in zip(itemsets, supports)
    )

# Evaluate closedness for every row position of the DataFrame
closed_flags = [
    is_closed(position, all_itemsets, all_supports)
    for position in range(len(frequent_itemsets))
]

frequent_itemsets['closed'] = closed_flags

# Keep only the rows flagged as closed
closed_mask = frequent_itemsets['closed'].eq(True)
closed_itemsets = frequent_itemsets[closed_mask]

print("Closed frequent itemsets:")
print(closed_itemsets[['support', 'itemsets']])

# Write the closed itemsets to their own CSV file (row index omitted)
closed_itemsets.to_csv('closed_frequent_itemsets.csv', index=False)

Closed frequent itemsets:
    support             itemsets
0  0.157000          (Ice Cream)
1  0.156000                (Tea)
2  0.155333              (Bread)
3  0.154333  (Laundry Detergent)
4  0.154333     (Chicken Breast)
5  0.154000             (Yogurt)
6  0.154000            (Lettuce)
7  0.153000             (Butter)
8  0.152333         (Toothpaste)
9  0.152333             (Cereal)


## Identifying Maximal Frequent Itemsets
### Student: Esther

In [13]:

def is_maximal(itemset, all_itemsets):
    """Return True if no entry of all_itemsets['itemsets'] strictly contains itemset.

    Args:
        itemset: The set to test.
        all_itemsets: Object whose 'itemsets' entry is an iterable of sets
            to compare against.

    Returns:
        False as soon as a strict superset of ``itemset`` is found, else True.
    """
    # `<` is the strict-subset operator, so an equal set does not count
    return not any(itemset < candidate for candidate in all_itemsets['itemsets'])

# Row-wise test: True where the row's itemset has no strict superset in the table
maximal_mask = frequent_itemsets.apply(
    lambda row: is_maximal(row['itemsets'], frequent_itemsets),
    axis=1,
)
maximal_itemsets = frequent_itemsets[maximal_mask]

# Write the maximal itemsets to CSV (row index omitted)
maximal_itemsets.to_csv('maximal_itemsets.csv', index=False)

print(maximal_itemsets)

    support             itemsets  closed
0  0.157000          (Ice Cream)    True
1  0.156000                (Tea)    True
2  0.155333              (Bread)    True
3  0.154333  (Laundry Detergent)    True
4  0.154333     (Chicken Breast)    True
5  0.154000             (Yogurt)    True
6  0.154000            (Lettuce)    True
7  0.153000             (Butter)    True
8  0.152333         (Toothpaste)    True
9  0.152333             (Cereal)    True


- This table summarizes the top 10 closed frequent itemsets found in the simulated supermarket transactions.
- Each item (e.g., Ice Cream, Toothpaste) appears in over 15% of the 3,000 transactions, meaning these are commonly purchased items.
- They are marked as closed because no larger itemset containing them appears in the same number of transactions — indicating that customers often buy these items on their own or in different combinations.