In [42]:
### Step 1a:Import necessary libraries
import pandas as pd
import numpy as np
import random

# --- 1. Load the raw Groceries data ---
# Path to the Groceries dataset CSV file.
# 'Groceries_dataset.csv' must be in the same directory as this script/notebook.
data_path = 'Groceries_dataset.csv'

# Load the dataset into a pandas DataFrame.
# Only the read itself is guarded, so an unrelated failure in the display code below
# can never be reported as a missing file.
try:
    df_raw = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"Error: The file '{data_path}' was not found.")
    print("Please download 'Groceries_dataset.csv' from https://www.kaggle.com/datasets/balajikartheek/groceries-dataset")
    print("and place it in the correct directory.")
    # Re-raise instead of calling exit(): exit() is an interactive-shell helper that
    # ends the whole session, whereas the exception stops execution at this cell and
    # keeps the traceback visible.
    raise

# Display the first few rows and the column summary to confirm the structure of the raw data
print("--- Raw Groceries Data Head ---")
print(df_raw.head())
print("\n--- Raw Groceries Data Info ---")
df_raw.info()
    

--- Raw Groceries Data Head ---
   Member_number        Date   itemDescription
0           1808  21-07-2015    tropical fruit
1           2552  05-01-2015        whole milk
2           2300  19-09-2015         pip fruit
3           1187  12-12-2015  other vegetables
4           3037  01-02-2015        whole milk

--- Raw Groceries Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38765 non-null  int64 
 1   Date             38765 non-null  object
 2   itemDescription  38765 non-null  object
dtypes: int64(1), object(2)
memory usage: 908.7+ KB


 ### Step 1b: Create a pool of 30 unique items
After loading the dataset:

Extract unique items from itemDescription.

Randomly select 30 items for our simulation pool.

In [43]:
#  Extract unique items from the dataset
unique_items = df_raw['itemDescription'].unique()
print(f"\nTotal unique items in dataset: {len(unique_items)}")

#  Randomly select 30 unique items for the simulation.
#  A dedicated generator with a fixed seed is used so that re-running this cell
#  (or the whole notebook) always yields the same pool; with the unseeded module-level
#  generator the pool changed on every run and the written commentary went stale.
ITEM_POOL_SIZE = 30
ITEM_POOL_SEED = 42
item_pool = random.Random(ITEM_POOL_SEED).sample(list(unique_items), ITEM_POOL_SIZE)
print(f"\n--- {ITEM_POOL_SIZE} Unique Items Selected for Simulation ---")
print(item_pool)



Total unique items in dataset: 167

--- 30 Unique Items Selected for Simulation ---
['fruit/vegetable juice', 'packaged fruit/vegetables', 'rubbing alcohol', 'specialty cheese', 'rum', 'female sanitary products', 'sliced cheese', 'male cosmetics', 'soft cheese', 'specialty chocolate', 'rolls/buns', 'dishes', 'herbs', 'salad dressing', 'hard cheese', 'instant coffee', 'mayonnaise', 'abrasive cleaner', 'bottled beer', 'beverages', 'make up remover', 'other vegetables', 'jam', 'honey', 'processed cheese', 'canned fish', 'popcorn', 'canned vegetables', 'canned fruit', 'cling film/bags']


We looked at real shopping data from a supermarket that included 167 products. For our project, we picked 30 random products from this list to create simulated shopping trips. These 30 products include things like rolls/buns, bottled beer, other vegetables, and honey, and even non-food items like dishes and rubbing alcohol. This smaller group allows us to focus on finding shopping patterns, like which items are often bought together.

### Step 1c: Simulate 3000 transactions
For each transaction:

Randomly choose 2–7 items from the 30-item pool.

Append these as rows.

In [44]:
#  Simulate 3000 transactions
num_transactions = 3000
min_items, max_items = 2, 7  # each transaction has 2–7 items

#  A dedicated generator with a fixed seed makes this cell idempotent: re-running it
#  reproduces exactly the same baskets instead of silently drawing new ones.
SIMULATION_SEED = 2024
simulation_rng = random.Random(SIMULATION_SEED)

simulated_transactions = []
for _ in range(num_transactions):
    transaction_size = simulation_rng.randint(min_items, max_items)
    # sample() draws without replacement, so an item appears at most once per basket
    transaction_items = simulation_rng.sample(item_pool, transaction_size)
    simulated_transactions.append(transaction_items)

#  Convert to DataFrame: one row per transaction, items stored as a ", "-joined string
df_simulated = pd.DataFrame({
    'TransactionID': range(1, num_transactions + 1),
    'Items': [", ".join(items) for items in simulated_transactions]
})

# Preview the simulated transactions
print("\n--- Simulated Transactions Head ---")
print(df_simulated.head())

# Save simulated transactions to CSV
df_simulated.to_csv('supermarket_transactions.csv', index=False)
print(" Simulated transactions saved to 'supermarket_transactions.csv'")



--- Simulated Transactions Head ---
   TransactionID                                              Items
0              1  processed cheese, salad dressing, specialty ch...
1              2                 jam, bottled beer, cling film/bags
2              3                        rolls/buns, cling film/bags
3              4         bottled beer, canned vegetables, beverages
4              5              processed cheese, male cosmetics, rum
 Simulated transactions saved to 'supermarket_transactions.csv'


We created 3,000 imaginary shopping trips using the 30 products we selected earlier. Each trip is like a customer filling their basket with 2–7 random items. For example, in the second trip, the shopper bought jam, bottled beer, and cling film/bags, while another shopper picked bottled beer, canned vegetables, and beverages. These transactions are saved in a file so we can analyze which items are frequently bought together.

### Step 2: One-Hot Encoding 
After simulation, we need to one-hot encode these transactions to prepare them for the Apriori algorithm.

In [45]:
# Import TransactionEncoder from mlxtend to perform one-hot encoding
from mlxtend.preprocessing import TransactionEncoder

# Reload the simulated baskets from the CSV written by the simulation step
df_simulated = pd.read_csv('supermarket_transactions.csv')

# Every 'Items' cell is a single ", "-separated string; turn it into a list of item names
df_simulated['Items'] = df_simulated['Items'].map(lambda basket: basket.split(', '))

# Learn the item vocabulary and build the basket-by-item indicator array in one pass
encoder = TransactionEncoder()
onehot_array = encoder.fit_transform(df_simulated['Items'])

# Wrap the array in a DataFrame whose column labels are the item names found by the encoder
df_onehot = pd.DataFrame(data=onehot_array, columns=encoder.columns_)

# Quick look at the encoded table
print("\n--- One-Hot Encoded Transactions Head ---")
print(df_onehot.head())

# Persist the encoded table (row index omitted) for the following steps
df_onehot.to_csv('onehot_transactions.csv', index=False)
print(" One-hot encoded transactions saved to 'onehot_transactions.csv'.")



--- One-Hot Encoded Transactions Head ---
   abrasive cleaner  beverages  bottled beer  canned fish  canned fruit  \
0             False      False         False        False         False   
1             False      False          True        False         False   
2             False      False         False        False         False   
3             False       True          True        False         False   
4             False      False         False        False         False   

   canned vegetables  cling film/bags  dishes  female sanitary products  \
0              False            False   False                     False   
1              False             True   False                     False   
2              False             True   False                     False   
3               True            False   False                     False   
4              False            False   False                     False   

   fruit/vegetable juice  ...  popcorn  processed chees

We transformed all shopping trips into a table of checkboxes so a computer can understand them. Each row represents one shopper’s trip, and each column shows if they bought a specific product. For example, the second shopper bought bottled beer and cling film/bags but not beverages or canned fish. This new format makes it easier for us to find patterns, like which items are often bought together. We saved this table for the next step of analysis.

 ### Step 3: Generate Frequent Itemsets

In [46]:

# File: step3_generate_frequent_itemsets.py

# Import necessary libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori

# Read back the one-hot encoded basket table saved by the encoding step
onehot_df = pd.read_csv('onehot_transactions.csv')

# Run Apriori, keeping every itemset present in at least 5% of transactions
# (min_support=0.05); use_colnames=True reports item names rather than column indices
frequent_itemsets = apriori(onehot_df, use_colnames=True, min_support=0.05)

# Record how many items each itemset contains
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# Show the ten itemsets with the highest support
top_itemsets = frequent_itemsets.sort_values(by='support', ascending=False).head(10)
print("\n--- Top 10 Frequent Itemsets ---")
print(top_itemsets)

# Write the full result to disk (row index omitted)
frequent_itemsets.to_csv('frequent_itemsets.csv', index=False)
print(" Frequent itemsets saved to 'frequent_itemsets.csv'.")



--- Top 10 Frequent Itemsets ---
     support               itemsets  length
3   0.162333          (canned fish)       1
7   0.161333               (dishes)       1
11  0.159667                (herbs)       1
29  0.159000  (specialty chocolate)       1
12  0.155333                (honey)       1
26  0.155000        (sliced cheese)       1
0   0.154667     (abrasive cleaner)       1
20  0.152667              (popcorn)       1
23  0.151333      (rubbing alcohol)       1
14  0.151333                  (jam)       1
 Frequent itemsets saved to 'frequent_itemsets.csv'.


We analyzed the shopping trips to find the most popular products. For example, canned fish was bought in about 16% of all trips, and dishes in about 16% too. These are the items that customers buy most often. This helps us understand customer preferences and which products show up in baskets most frequently. We saved all frequent product patterns in a file for further analysis.

###  Step 4: Identify Closed Frequent Itemsets

In [47]:
# Import necessary libraries
from mlxtend.frequent_patterns import apriori  # Used to apply the Apriori algorithm

# Load the one-hot encoded transaction dataset
# This is the preprocessed dataset we saved in Step 2
onehot_df = pd.read_csv('onehot_transactions.csv')

# Apply the Apriori algorithm to find frequent itemsets
# Set min_support=0.05, meaning items must appear in at least 5% of transactions to be considered frequent
frequent_itemsets = apriori(onehot_df, min_support=0.05, use_colnames=True)

# Add a 'length' column to show the number of items in each itemset
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)


def is_closed(itemset, support, all_itemsets):
    """Return True if `itemset` is closed within `all_itemsets`.

    An itemset is closed when no proper superset has the same support.
    Only frequent itemsets need to be examined: a superset with the same
    support as a frequent itemset is itself frequent, so it is present in
    `all_itemsets`.

    Parameters:
        itemset: the set/frozenset of items being tested.
        support: the support value of `itemset`.
        all_itemsets: table with 'itemsets' and 'support' columns.
    """
    for other_itemset, other_support in zip(all_itemsets['itemsets'], all_itemsets['support']):
        # Tiny tolerance instead of ==, so float round-off cannot hide an equal support
        if itemset < other_itemset and abs(other_support - support) < 1e-12:
            return False
    return True


# Keep only the closed itemsets (previously this step just repeated Step 3)
closed_mask = [
    is_closed(itemset, support, frequent_itemsets)
    for itemset, support in zip(frequent_itemsets['itemsets'], frequent_itemsets['support'])
]
closed_itemsets = frequent_itemsets.loc[closed_mask]

# Display the top 10 closed frequent itemsets sorted by their support
# Higher support means the itemset appears in more transactions
print("\n--- Top 10 Closed Frequent Itemsets ---")
print(closed_itemsets.sort_values(by='support', ascending=False).head(10))

# Save the closed itemsets to their own CSV file
closed_itemsets.to_csv('closed_itemsets.csv', index=False)
print(" Closed itemsets saved to 'closed_itemsets.csv'.")

# Still write the full frequent-itemset table: the maximal-itemset step reads this file
frequent_itemsets.to_csv('frequent_itemsets.csv', index=False)  # Write DataFrame to CSV without row indices
print(" Frequent itemsets saved to 'frequent_itemsets.csv'.")



--- Top 10 Frequent Itemsets ---
     support               itemsets  length
3   0.162333          (canned fish)       1
7   0.161333               (dishes)       1
11  0.159667                (herbs)       1
29  0.159000  (specialty chocolate)       1
12  0.155333                (honey)       1
26  0.155000        (sliced cheese)       1
0   0.154667     (abrasive cleaner)       1
20  0.152667              (popcorn)       1
23  0.151333      (rubbing alcohol)       1
14  0.151333                  (jam)       1
 Frequent itemsets saved to 'frequent_itemsets.csv'.


We analyzed 3,000 simulated shopping trips to find the most popular products. Canned fish was the most common, appearing in 16.2% of baskets. Dishes and herbs were also very frequent. This helps us see what customers tend to buy most often. We saved these popular products and combinations for further analysis.

### Step 5: Identify Maximal Frequent Itemsets

In [48]:
# Import necessary libraries

# Read back the frequent itemsets that an earlier step wrote to 'frequent_itemsets.csv'
frequent_itemsets = pd.read_csv('frequent_itemsets.csv')

# The CSV stores each itemset as text, so rebuild real Python sets from it;
# sets make the subset comparisons used below straightforward.
# NOTE(review): eval() executes whatever text the file contains. That is acceptable only
# while the CSV is produced by this notebook itself; do not point it at untrusted files.
frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(
    lambda text: set(eval(text))
)

# An itemset is "maximal" when none of the frequent itemsets strictly contains it
def is_maximal(itemset, all_itemsets):
    """Tell whether `itemset` has no proper superset among the frequent itemsets.

    Parameters:
        itemset: the set of items being tested.
        all_itemsets: table whose 'itemsets' entry yields the sets to compare against.

    Returns:
        False if any entry of all_itemsets['itemsets'] is a proper superset of
        `itemset`, True otherwise.
    """
    # `<` on sets is the proper-subset test, so an itemset never disqualifies itself
    has_frequent_superset = any(
        itemset < candidate for candidate in all_itemsets['itemsets']
    )
    return not has_frequent_superset

# Flag each row whose itemset has no frequent proper superset, then keep only flagged rows
row_is_maximal = frequent_itemsets.apply(
    lambda record: is_maximal(record['itemsets'], frequent_itemsets),
    axis=1,
)
maximal_itemsets = frequent_itemsets[row_is_maximal]

# Preview the first few maximal itemsets
print("\n--- Maximal Frequent Itemsets ---")
print(maximal_itemsets.head())

# Persist the maximal itemsets (row index omitted)
maximal_itemsets.to_csv('maximal_itemsets.csv', index=False)
print(" Maximal itemsets saved to 'maximal_itemsets.csv'.")



--- Maximal Frequent Itemsets ---
    support            itemsets  length
0  0.154667  {abrasive cleaner}       1
1  0.148667         {beverages}       1
2  0.143000      {bottled beer}       1
3  0.162333       {canned fish}       1
4  0.149333      {canned fruit}       1
 Maximal itemsets saved to 'maximal_itemsets.csv'.


We found the largest and most important shopping patterns in our data. For example, ‘canned fish’ appeared in 16% of all shopping trips, and no larger group of products containing it was frequent enough to make the list. These ‘maximal’ patterns are useful because they show us the strongest and most unique buying habits without any redundancy. We saved them for future insights.