In [2]:
# [Student: All] Import necessary libraries for data manipulation and mining
import pandas as pd
import numpy as np
import random
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori


# [Student: Member A]
# 1. Simulate Transaction Data

# Catalogue of 32 distinct supermarket products that baskets are drawn from.
# The order matters: together with the seed below it determines which items
# each simulated basket receives.
items_pool = [
    'Milk', 'Bread', 'Butter', 'Eggs',
    'Cheese', 'Yogurt', 'Apples', 'Bananas',
    'Oranges', 'Grapes', 'Chicken', 'Beef',
    'Pork', 'Fish', 'Rice', 'Pasta',
    'Tomato Sauce', 'Cereal', 'Oats', 'Sugar',
    'Salt', 'Coffee', 'Tea', 'Juice',
    'Soda', 'Water', 'Chips', 'Cookies',
    'Chocolate', 'Soap', 'Shampoo', 'Toothpaste',
]

# Fix the random seed so every run produces the same simulated baskets
random.seed(42)

# [Student: Member A] Build 3000 baskets. For each one, first draw its size
# (2 to 7 items, both ends inclusive), then sample that many distinct items
# from the pool without replacement.
transactions = [
    random.sample(items_pool, random.randint(2, 7))
    for _ in range(3000)
]

# Wrap the simulated baskets in a single-column DataFrame; each cell holds one
# transaction's list of items (convenient for previewing and exporting)
df_raw = pd.DataFrame(data={'Transaction': transactions})

# Write the raw simulated data to CSV as required, leaving out the row index
df_raw.to_csv(
    'C:/Users/HP/Documents/edu/data mining/mining frequent itemsets/Mining_Frequent_Itemsets/supermarket_transactions.csv',
    index=False,
)

# Preview the first rows (heading on its own line) to sanity-check the simulation
print("Raw Transactions Sample:", df_raw.head(), sep="\n")

# [Student: Member B]
# 2. Preprocessing: One-Hot Encoding

# mlxtend's TransactionEncoder is fitted on the transactions first, then used
# to transform the same transactions into a boolean array
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)

# [Student: Member B] Label the encoded array with the encoder's column names
# to obtain the one-hot encoded DataFrame
df_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)

# Report (rows, columns) to confirm 3000 rows and 30+ columns
print(f"\nEncoded Data Shape: {df_encoded.shape}")

# [Student: Member C]
# 3. Generate Frequent Itemsets (Apriori)
# Run apriori on the one-hot table with min_support = 0.05;
# use_colnames=True reports item names rather than column indices.
frequent_itemsets = apriori(df_encoded, min_support=0.05, use_colnames=True)

# Record how many items each itemset contains (useful for filtering later)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# Order the itemsets from highest to lowest support
frequent_itemsets = frequent_itemsets.sort_values('support', ascending=False)

# Export every frequent itemset to CSV, leaving out the row index
frequent_itemsets.to_csv(
    "C:/Users/HP/Documents/edu/data mining/mining frequent itemsets/Mining_Frequent_Itemsets/frequent_itemsets.csv",
    index=False,
)

# Show the ten itemsets with the highest support under a heading
print("\nTop 10 Frequent Itemsets:", frequent_itemsets.head(10), sep="\n")

# [Student: Member D]
# 4. Identify Closed Frequent Itemsets
# Logic for Closed Itemsets: an itemset X is closed if NO proper superset of X
# has the SAME support. A superset with the same support as a frequent X is
# itself frequent, so searching among the frequent itemsets is sufficient.

# Pull the two relevant columns out once as plain Python lists. Iterating lists
# avoids constructing a pandas Series for every pair of rows, which the nested
# iterrows() approach does.
all_itemsets = frequent_itemsets['itemsets'].tolist()
all_supports = frequent_itemsets['support'].tolist()

# One flag per frequent itemset, in the DataFrame's current row order
closed_mask = []
for current_itemset, current_support in zip(all_itemsets, all_supports):
    # `<` between frozensets is the strict-subset test (subset AND not equal).
    # [Student: Member C] If a superset exists with the SAME support, 'current_itemset' is NOT closed
    # Exact equality on support is kept from the original check.
    has_equal_support_superset = any(
        current_itemset < check_itemset and current_support == check_support
        for check_itemset, check_support in zip(all_itemsets, all_supports)
    )
    closed_mask.append(not has_equal_support_superset)

# Select the closed rows with a boolean mask instead of rebuilding a DataFrame
# from per-row Series: this keeps the original column dtypes and index, and
# still yields the expected columns when no itemset qualifies.
closed_itemsets = frequent_itemsets.loc[np.asarray(closed_mask, dtype=bool)]

# Save closed itemsets to CSV
closed_itemsets.to_csv("C:/Users/HP/Documents/edu/data mining/mining frequent itemsets/Mining_Frequent_Itemsets/closed_itemsets.csv", index=False)

print(f"\nNumber of Closed Itemsets: {len(closed_itemsets)}")



Raw Transactions Sample:
                                         Transaction
0  [Bananas, Milk, Juice, Oranges, Toothpaste, Co...
1  [Apples, Coffee, Juice, Chocolate, Cereal, But...
2                [Butter, Bread, Yogurt, Fish, Rice]
3      [Bread, Cereal, Apples, Tea, Salt, Chocolate]
4       [Chips, Bananas, Rice, Oats, Oranges, Water]

Encoded Data Shape: (3000, 32)

Top 10 Frequent Itemsets:
     support     itemsets  length
10  0.150000     (Coffee)       1
31  0.150000     (Yogurt)       1
20  0.149000       (Pork)       1
22  0.147667       (Salt)       1
0   0.147000     (Apples)       1
11  0.146000    (Cookies)       1
7   0.146000    (Chicken)       1
6   0.145333     (Cheese)       1
30  0.145000      (Water)       1
9   0.144667  (Chocolate)       1

Number of Closed Itemsets: 32
