# Frequent Itemsets Analysis: Closed vs Maximal

## Group Assignment for Data Mining / Warehousing

---

### Group Members
- [Student: Alice]
- [Student: Brian]
- [Student: Carol]

## 1. Simulate Transaction Data


In [ ]:
# [Student: Alice] Import libraries
import pandas as pd  # Data manipulation
import numpy as np   # Numerical operations
import random        # Random sampling
from mlxtend.frequent_patterns import apriori  # Apriori algorithm
from mlxtend.frequent_patterns import association_rules  # For possible extension


In [ ]:
# [Student: Alice] Pin both random number generators so every run of the
# notebook produces the same simulated data. NumPy and the stdlib `random`
# module are seeded separately, each with the same value.
np.random.seed(42)
random.seed(42)


In [ ]:
# [Student: Alice] Pool of 30 distinct supermarket items.
# The list order is kept exactly as originally defined so that seeded
# sampling from it stays reproducible.
item_pool = [
    'milk', 'bread', 'eggs', 'cheese', 'butter',
    'apples', 'bananas', 'oranges', 'chicken', 'beef',
    'fish', 'rice', 'pasta', 'tomatoes', 'potatoes',
    'onions', 'lettuce', 'carrots', 'cereal', 'yogurt',
    'juice', 'soda', 'coffee', 'tea', 'cookies',
    'chips', 'waffles', 'candles', 'hard cheese', 'ice cream',
]


In [ ]:
# [Student: Alice] Simulate 3000 supermarket transactions
n_transactions = 3000  # How many baskets to generate
# For every basket the size (2-7 items) is drawn first, then that many
# distinct items are sampled from the pool without replacement.
transactions = [
    random.sample(item_pool, random.randint(2, 7))
    for _ in range(n_transactions)
]
# Export the raw baskets: one row per transaction, items joined by ", "
transactions_df = pd.DataFrame({'Transaction': list(map(', '.join, transactions))})
transactions_df.to_csv('supermarket_transactions.csv', index=False)


## 2. Preprocessing: One-Hot Encoding


In [ ]:
# [Student: Brian] One-hot encode the transactions
# Start from an all-zero table: one row per transaction, one column per item
onehot = pd.DataFrame(0, index=np.arange(n_transactions), columns=item_pool)
for row_id, basket in enumerate(transactions):
    # Flag every item of this basket in a single row-wise assignment
    onehot.loc[row_id, basket] = 1
# Persist the encoded table (row index is not written)
onehot.to_csv('onehot_transactions.csv', index=False)


## 3. Generate Frequent Itemsets (Apriori)


In [ ]:
# [Student: Carol] Generate frequent itemsets using Apriori
min_support = 0.05  # Minimum support threshold (fraction of transactions)
# mlxtend expects a boolean one-hot table; 0/1 integer input still works but
# triggers a DeprecationWarning and a slower code path, so cast at the call.
# The cast returns a new frame: `onehot` itself and the supports are unchanged.
frequent_itemsets = apriori(onehot.astype(bool), min_support=min_support, use_colnames=True)
# Order the result so the most frequent itemsets come first
frequent_itemsets = frequent_itemsets.sort_values('support', ascending=False)
frequent_itemsets.to_csv('frequent_itemsets.csv', index=False)  # Save all frequent itemsets
# Display top 10 frequent itemsets
frequent_itemsets.head(10)


## 4. Identify Closed Frequent Itemsets


In [ ]:
# [Student: Brian] Identify closed frequent itemsets
def is_closed(row, all_itemsets):
    """Return True if the itemset in `row` is closed within `all_itemsets`.

    An itemset is closed when no proper superset in the table has exactly
    the same support.

    Parameters:
        row: mapping-like record with an 'itemsets' entry (a set type that
            supports the proper-subset operator `<`) and a 'support' entry.
        all_itemsets: DataFrame with 'itemsets' and 'support' columns that
            holds every candidate superset.

    Returns:
        bool: False as soon as one equally supported proper superset is
        found, True otherwise.
    """
    candidate = row['itemsets']
    candidate_support = row['support']
    # Supports are compared with exact equality, as in the original check.
    return not any(
        candidate < other and candidate_support == other_support
        for other, other_support in zip(all_itemsets['itemsets'], all_itemsets['support'])
    )
# Normalise every itemset to a frozenset so the subset operator `<` applies
frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(frozenset)
# Evaluate is_closed row by row, comparing each itemset against the full table
closed_mask = frequent_itemsets.apply(is_closed, axis=1, args=(frequent_itemsets,))
# Keep only the rows flagged as closed; copy so later edits stay independent
closed_itemsets = frequent_itemsets.loc[closed_mask].copy()
closed_itemsets.to_csv('closed_itemsets.csv', index=False)
closed_itemsets.head(10)


## 5. Identify Maximal Frequent Itemsets


In [ ]:
# [Student: Carol] Identify maximal frequent itemsets
def is_maximal(row, all_itemsets):
    """Return True if the itemset in `row` is maximal within `all_itemsets`.

    An itemset is maximal when the table contains no proper superset of it;
    supports are not consulted.

    Parameters:
        row: mapping-like record with an 'itemsets' entry (a set type that
            supports the proper-subset operator `<`).
        all_itemsets: DataFrame whose 'itemsets' column lists every frequent
            itemset to compare against.

    Returns:
        bool: False if any listed itemset strictly contains the row's
        itemset, True otherwise.
    """
    candidate = row['itemsets']
    return not any(candidate < other for other in all_itemsets['itemsets'])
# Evaluate is_maximal row by row, comparing each itemset against the full table
maximal_mask = frequent_itemsets.apply(is_maximal, axis=1, args=(frequent_itemsets,))
# Keep only the rows flagged as maximal; copy so later edits stay independent
maximal_itemsets = frequent_itemsets.loc[maximal_mask].copy()
maximal_itemsets.to_csv('maximal_itemsets.csv', index=False)
maximal_itemsets.head(10)


## 6. Export Results to CSV
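All results have already been exported as CSV files in the steps above: `supermarket_transactions.csv`, `onehot_transactions.csv`, `frequent_itemsets.csv`, `closed_itemsets.csv`, and `maximal_itemsets.csv`.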
