# Importing Necessary Libraries and Packages

In [6]:
# [Student:Nathan] Import necessary libraries
import numpy as np #[Student:Nathan] For random number generation(our simulated records) and array operations
import pandas as pd #[Student:Nathan] For data manipulation and analysis (Working with CSV files)

from mlxtend.preprocessing import TransactionEncoder  # [Student:Nathan] For encoding (one hot encode) transaction data into a format we can use for association rule mining
from mlxtend.frequent_patterns import apriori, association_rules #[Student:Nathan] For generating frequent itemsets and association rules

# Ensuring Reproducibility
# [Student:Nathan] Seed NumPy's global random generator once, up front, so every
# later np.random call in the notebook produces the same values on each run
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Simulating Transaction Data

In [7]:
# [Zakariya] Catalogue of 30 distinct supermarket products that baskets are drawn from.
# The order of this list matters: it determines which items a seeded draw returns.
item_pool = [
    "Milk", "Bread", "Eggs", "Butter", "Cheese", "Yogurt",
    "Apples", "Bananas", "Oranges", "Grapes",
    "Tomatoes", "Onions", "Potatoes", "Carrots",
    "Chicken", "Beef", "Fish",
    "Rice", "Pasta", "Sugar", "Salt", "Flour", "Oil", "Cereal",
    "Juice", "Soda", "Coffee", "Tea",
    "Biscuits", "Chocolate",
]

# [Zakariya] How many random supermarket transactions to simulate
n_transactions = 3000


def _draw_transaction():
    """Sample one transaction: between 2 and 7 distinct items from item_pool."""
    # randint excludes its upper bound, so 8 here allows at most 7 items
    basket_size = np.random.randint(2, 8)
    # replace=False guarantees an item appears at most once per transaction
    picked = np.random.choice(item_pool, size=basket_size, replace=False)
    return list(picked)


# [Zakariya] One list of item names per simulated transaction
transactions = [_draw_transaction() for _ in range(n_transactions)]

# [Zakariya] Flatten each transaction to a single comma-separated string for the raw CSV
transaction_strings = list(map(", ".join, transactions))

# [Zakariya] Pair every transaction string with a sequential ID starting at 1
transactions_df = pd.DataFrame(
    {"transaction_id": range(1, n_transactions + 1), "items": transaction_strings}
)

# [Zakariya] Export the raw transactions, leaving out the DataFrame row index
transactions_df.to_csv("supermarket_transactions.csv", index=False)

# One-Hot Encoding

In [8]:
# [Student: Nathan] Create the encoder and let it learn the set of items present in the transactions
te = TransactionEncoder()
te.fit(transactions)

# [Student: Nathan] Encode every transaction as one flag per learned item
te_array = te.transform(transactions)

# [Student: Nathan] Wrap the encoded array in a DataFrame: one row per transaction,
# one column per item, labelled with the item names the encoder learned
one_hot_df = pd.DataFrame(data=te_array, columns=te.columns_)

# [Student: Nathan] Preview the first rows as a quick sanity check
one_hot_df.head()


Unnamed: 0,Apples,Bananas,Beef,Biscuits,Bread,Butter,Carrots,Cereal,Cheese,Chicken,...,Oranges,Pasta,Potatoes,Rice,Salt,Soda,Sugar,Tea,Tomatoes,Yogurt
0,False,False,True,False,False,False,False,False,False,False,...,True,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
2,False,False,False,False,False,False,False,False,True,False,...,True,False,False,False,False,False,False,False,True,False
3,False,False,False,False,True,True,False,False,True,False,...,False,True,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


# Generating Frequent Itemsets using Apriori

In [9]:
# [Student: Nathan] Minimum share of transactions an itemset must appear in to count as frequent
min_support = 0.05

# [Student: Nathan] Mine the frequent itemsets, annotate each with its item count,
# and rank them with the highest support (then the longest itemsets) first
frequent_itemsets = (
    apriori(one_hot_df, min_support=min_support, use_colnames=True)  # use_colnames: item names, not column indices
    .assign(length=lambda mined: mined["itemsets"].apply(len))
    .sort_values(by=["support", "length"], ascending=[False, False])
)

# [Student: Nathan] Show the ten best-ranked frequent itemsets
print("\nTop 10 frequent itemsets:", frequent_itemsets.head(10), sep="\n")

# [Student: Nathan] Persist the full ranked table to CSV
frequent_itemsets.to_csv("frequent_itemsets.csv", index=False)


Top 10 frequent itemsets:
     support    itemsets  length
20  0.157333   (Oranges)       1
14  0.156333     (Flour)       1
17  0.156333      (Milk)       1
8   0.154333    (Cheese)       1
11  0.154000    (Coffee)       1
22  0.154000  (Potatoes)       1
18  0.153333       (Oil)       1
21  0.153333     (Pasta)       1
16  0.152333     (Juice)       1
26  0.152333     (Sugar)       1


# Identifying Closed Frequent Itemsets

In [10]:
# ===== IDENTIFYING CLOSED FREQUENT ITEMSETS =====
# Section handled by Catherine
def find_closed_itemsets(itemsets_df):
    """Return one record per closed itemset, in the row order of `itemsets_df`.

    An itemset is treated as closed when no other row of `itemsets_df` holds a
    proper superset of it with exactly the same support.

    Parameters
    ----------
    itemsets_df : pandas.DataFrame
        Must provide an 'itemsets' column (iterables of items) and a
        'support' column.

    Returns
    -------
    list of dict
        One dict per closed itemset with keys 'itemset' (a set), 'support'
        and 'length' (number of items). Empty when `itemsets_df` has no rows.
    """
    # Read both columns once; indexing rows with .iloc inside nested loops
    # rebuilds a row object for every single comparison.
    candidates = [
        (set(items), support)
        for items, support in zip(itemsets_df['itemsets'], itemsets_df['support'])
    ]

    # Only itemsets with an identical support can rule each other out, so
    # bucket them by support and compare within a bucket only.
    same_support = {}
    for items, support in candidates:
        same_support.setdefault(support, []).append(items)

    records = []
    for items, support in candidates:
        # `<` is the proper-subset test, so an itemset never disqualifies itself.
        has_equal_support_superset = any(items < other for other in same_support[support])
        if not has_equal_support_superset:
            records.append({'itemset': items, 'support': support, 'length': len(items)})
    return records


closed_itemsets = find_closed_itemsets(frequent_itemsets)

# Convert to DataFrame for readability; naming the columns explicitly keeps the
# schema (and the CSV header) intact even when no itemset qualifies.
closed_itemsets_df = pd.DataFrame(closed_itemsets, columns=['itemset', 'support', 'length'])

# Display the top 10 closed frequent itemsets
print("\nTop 10 Closed Frequent Itemsets:")
print(closed_itemsets_df.head(10))

# Save the closed itemsets to CSV
closed_itemsets_df.to_csv("closed_frequent_itemsets.csv", index=False)




Top 10 Closed Frequent Itemsets:
      itemset   support  length
0   {Oranges}  0.157333       1
1     {Flour}  0.156333       1
2      {Milk}  0.156333       1
3    {Cheese}  0.154333       1
4    {Coffee}  0.154000       1
5  {Potatoes}  0.154000       1
6       {Oil}  0.153333       1
7     {Pasta}  0.153333       1
8     {Juice}  0.152333       1
9     {Sugar}  0.152333       1


# Identifying Maximal Frequent Itemsets

# Summary Prints