# 1️⃣ Library imports & read excel data

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import math
import warnings
warnings.filterwarnings('ignore')
from itertools import combinations

In [7]:
dataFile = pd.read_excel('Horizontal_Format.xlsx')

### Data Shape

In [8]:
dataFile.shape

(5, 2)

In [9]:
dataFile.head()

Unnamed: 0,TiD,items
0,1,"M,O,N,K,E,Y"
1,2,"D,O,N,K,E,Y"
2,3,"M,A,K,E"
3,4,"M,U,C,K,Y"
4,5,"C,O,O,K,I,E"


### Transform Data

In [10]:
df = pd.DataFrame(dataFile)

# Splitting the items
rows = []

for index, row in df.iterrows():
    tid = row['TiD']
    items = row['items'].split(",")  # Split the items into a list
    for item in items:
        rows.append({'item': item, 'TiD': tid}) # each item appended in a single row, then after that i can form grouping 

# Group by the item
expanded_df = pd.DataFrame(rows)
Data = expanded_df.groupby('item')['TiD'].apply(lambda x: ",".join(map(str, sorted(x)))).reset_index()

Data.columns = ['item', 'TIDs']
print(Data)

   item       TIDs
0     A          3
1     C        4,5
2     D          2
3     E    1,2,3,5
4     I          5
5     K  1,2,3,4,5
6     M      1,3,4
7     N        1,2
8     O    1,2,5,5
9     U          4
10    Y      1,2,4


# 2️⃣ Algorithm

In [11]:
# Total number of unique transactions
total_transactions = expanded_df['TiD'].nunique()
print(f"Total transactions: {total_transactions}")


Total transactions: 5


In [12]:
def eclat(prefix, items):
    """
    Recursive function for ECLAT algorithm.

    Parameters:
        prefix (list): The current prefix (partial itemset).
        items (list): List of tuples (item, TID set) for the current level.

    Returns:
        list of tuples: Frequent itemsets and their support counts.
    """
    frequent = []
    for i, (item, tids) in enumerate(items):
        support = len(tids)
        if support >= min_support:
            # Generate a new frequent itemset
            new_prefix = prefix + [item]
            frequent.append((new_prefix, support))

            # Generate new candidates by intersecting TID sets
            new_items = [(other_item, other_tids & tids)
                         for other_item, other_tids in items[i + 1:]]

            # Recursive call
            frequent.extend(eclat(new_prefix, new_items))
    return frequent

In [13]:
# Define the ECLAT function
def generate_frequent_itemsets(vertical_db, min_support, total_transactions):
    """
    Generate all frequent itemsets using the ECLAT algorithm.

    Parameters:
        vertical_db (dict): A dictionary where keys are items and values are sets of transaction IDs (TIDs).
        min_support (int): Minimum support threshold.
        total_transactions (int): Total number of transactions (optional, for additional metrics like support percentage).

    Returns:
        dict: A dictionary where keys are itemset sizes (1, 2, ..., K), and values are lists of (itemset, support).
    """

    # Create a dictionary from the DataFrame for efficient processing
    vertical_db = {}
    for index, row in Data.iterrows():
        vertical_db[row['item']] = set(map(int, row['TIDs'].split(',')))

    # Sort items lexicographically (optional for consistent output)
    sorted_items = sorted(vertical_db.items())

    # Get all frequent itemsets
    all_itemsets = eclat([], sorted_items)

    # Group by size
    grouped_itemsets = {}
    for itemset, support in all_itemsets:
        size = len(itemset)
        if size not in grouped_itemsets:
            grouped_itemsets[size] = []
        grouped_itemsets[size].append((itemset, support))

    return grouped_itemsets

In [14]:
# Input minimum support
min_support = float(input("Enter minimum support (fractional or count): "))


Enter minimum support (fractional or count):  2


In [15]:
if min_support < 1:  # Fractional support provided
    min_support = math.ceil(min_support * total_transactions)

print(f"Using minimum support: {min_support}")

# Generate frequent itemsets
frequent_itemsets = generate_frequent_itemsets(Data, min_support, total_transactions)

# Output the results grouped by size
for k, itemsets in sorted(frequent_itemsets.items()):
    print(f"\nFrequent itemsets of size {k} (L{k}):")
    for itemset, support in itemsets:
        print(f"  Itemset: {itemset}, Support: {support}")

Using minimum support: 2.0

Frequent itemsets of size 1 (L1):
  Itemset: ['C'], Support: 2
  Itemset: ['E'], Support: 4
  Itemset: ['K'], Support: 5
  Itemset: ['M'], Support: 3
  Itemset: ['N'], Support: 2
  Itemset: ['O'], Support: 3
  Itemset: ['Y'], Support: 3

Frequent itemsets of size 2 (L2):
  Itemset: ['C', 'K'], Support: 2
  Itemset: ['E', 'K'], Support: 4
  Itemset: ['E', 'M'], Support: 2
  Itemset: ['E', 'N'], Support: 2
  Itemset: ['E', 'O'], Support: 3
  Itemset: ['E', 'Y'], Support: 2
  Itemset: ['K', 'M'], Support: 3
  Itemset: ['K', 'N'], Support: 2
  Itemset: ['K', 'O'], Support: 3
  Itemset: ['K', 'Y'], Support: 3
  Itemset: ['M', 'Y'], Support: 2
  Itemset: ['N', 'O'], Support: 2
  Itemset: ['N', 'Y'], Support: 2
  Itemset: ['O', 'Y'], Support: 2

Frequent itemsets of size 3 (L3):
  Itemset: ['E', 'K', 'M'], Support: 2
  Itemset: ['E', 'K', 'N'], Support: 2
  Itemset: ['E', 'K', 'O'], Support: 3
  Itemset: ['E', 'K', 'Y'], Support: 2
  Itemset: ['E', 'N', 'O'], Suppo

# 3️⃣

In [16]:
# Minimum confidence threshold
min_confidence = float(input("Enter minimum confidence (as a decimal, e.g., 0.6 for 60%): "))

Enter minimum confidence (as a decimal, e.g., 0.6 for 60%):  0.6


In [17]:
# Function to generate association rules and calculate confidence
def generate_association_rules(frequent_itemsets):
    """
    Generate and print association rules from frequent itemsets.

    Parameters:
        frequent_itemsets (dict): Dictionary of frequent itemsets grouped by size.

    Returns:
        strong_rules (list): List of strong association rules with support and confidence.
    """
    strong_rules = []  # To store strong rules

    # Iterate over all sizes of itemsets
    for size, itemsets in frequent_itemsets.items():
        for itemset, support in itemsets:
            # Generate all possible non-empty subsets of the itemset
            for i in range(1, len(itemset)):
                subsets = list(combinations(itemset, i))  # Generate subsets of length i
                for antecedent in subsets:
                    consequent = tuple(sorted(set(itemset) - set(antecedent)))  # Consequent is the remaining items
                    
                    if consequent:
                        # Calculate support and confidence
                        antecedent_support = get_support(antecedent, frequent_itemsets)
                        confidence = support / antecedent_support

                        # Check if the rule is strong
                        if confidence >= min_confidence:
                            strong_rules.append({
                                "rule": f"{antecedent} => {consequent}",
                                "support": support,
                                "confidence": confidence
                            })
    
    return strong_rules

In [18]:
# Helper function to get the support of a specific itemset
def get_support(itemset, frequent_itemsets):
    """
    Get the support count of an itemset from the frequent itemsets.

    Parameters:
        itemset (tuple): The itemset whose support is to be retrieved.
        frequent_itemsets (dict): Dictionary of frequent itemsets grouped by size.

    Returns:
        int: Support count of the itemset.
    """
    itemset_size = len(itemset)
    for frequent_item, support in frequent_itemsets[itemset_size]:
        if sorted(frequent_item) == sorted(itemset):
            return support
    return 0

In [21]:
# Generate and display strong association rules
strong_rules = generate_association_rules(frequent_itemsets)

print("\nStrong Association Rules:")
for rule in strong_rules:
    print(f"Rule: {rule['rule']}, Support: {rule['support']}, Confidence: {rule['confidence']:.2f}")


Strong Association Rules:
Rule: ('C',) => ('K',), Support: 2, Confidence: 1.00
Rule: ('E',) => ('K',), Support: 4, Confidence: 1.00
Rule: ('K',) => ('E',), Support: 4, Confidence: 0.80
Rule: ('M',) => ('E',), Support: 2, Confidence: 0.67
Rule: ('N',) => ('E',), Support: 2, Confidence: 1.00
Rule: ('E',) => ('O',), Support: 3, Confidence: 0.75
Rule: ('O',) => ('E',), Support: 3, Confidence: 1.00
Rule: ('Y',) => ('E',), Support: 2, Confidence: 0.67
Rule: ('K',) => ('M',), Support: 3, Confidence: 0.60
Rule: ('M',) => ('K',), Support: 3, Confidence: 1.00
Rule: ('N',) => ('K',), Support: 2, Confidence: 1.00
Rule: ('K',) => ('O',), Support: 3, Confidence: 0.60
Rule: ('O',) => ('K',), Support: 3, Confidence: 1.00
Rule: ('K',) => ('Y',), Support: 3, Confidence: 0.60
Rule: ('Y',) => ('K',), Support: 3, Confidence: 1.00
Rule: ('M',) => ('Y',), Support: 2, Confidence: 0.67
Rule: ('Y',) => ('M',), Support: 2, Confidence: 0.67
Rule: ('N',) => ('O',), Support: 2, Confidence: 1.00
Rule: ('O',) => ('N