DMML Assignment 1 | Example runs on the Enron dataset

### ENRON dataset



In [None]:
import time
from collections import defaultdict


def read_vocab(file_path):
    """Load the vocabulary file into a wordID -> word lookup table.

    The file is read line by line: the text on line n (counting from 1),
    with surrounding whitespace stripped, becomes the word for ID n.

    Args:
        file_path: path of the vocabulary text file.

    Returns:
        dict mapping 1-based line number (word ID) to the stripped line.
    """
    with open(file_path, 'r') as handle:
        return {word_id: raw.strip() for word_id, raw in enumerate(handle, start=1)}

def read_docword(file_path):
    """Build an inverted index (wordID -> document IDs) from a docword file.

    Layout read by this function:
      * line 1: number of documents (parsed, not otherwise used)
      * line 2: number of words (parsed, not otherwise used)
      * line 3: number of nonzero entries, NNZ
      * next NNZ lines: three whitespace-separated integers
        ``docID wordID count``

    Only the presence of a word in a document is recorded; the per-document
    count is parsed and then discarded. Lines after the first NNZ entries
    are never read.

    Args:
        file_path: path of the docword text file.

    Returns:
        defaultdict(set) mapping each word ID to the set of document IDs
        in which it occurs.
    """
    postings = defaultdict(set)  # wordID -> set of docIDs containing it

    with open(file_path, 'r') as handle:
        # Header: the first two values are validated as integers but unused.
        n_docs = int(handle.readline().strip())
        n_words = int(handle.readline().strip())
        remaining = int(handle.readline().strip())

        # Consume exactly as many entry lines as the header announced.
        while remaining > 0:
            fields = handle.readline().strip().split()
            doc_id, word_id, _count = map(int, fields)
            postings[word_id].add(doc_id)
            remaining -= 1

    return postings

def apriori(word_docs, K, F):
    """Mine all frequent K-itemsets with Apriori over per-word document sets.

    Each frequent itemset carries the set of documents containing all of its
    words, so the support of a size-k candidate is obtained from a single
    intersection of its two size-(k-1) parents instead of re-intersecting
    every word's document set.

    Args:
        word_docs: mapping wordID -> set of document IDs containing the word.
            It is not modified.
        K: size of the itemsets to return; must be >= 1.
        F: minimum support, i.e. the minimum number of documents that must
            contain every word of an itemset.

    Returns:
        A list of ``(itemset, support)`` pairs where ``itemset`` is a tuple
        of word IDs in ascending order. The list is sorted by decreasing
        support, ties broken by ascending itemset so the order is
        deterministic. Returns ``[]`` as soon as some level k <= K has no
        frequent itemset.

    Raises:
        ValueError: if K is smaller than 1.
    """
    if K < 1:
        raise ValueError("K must be a positive integer")

    # Level 1: single words that occur in at least F documents.
    freq_itemsets = {(word,): docs for word, docs in word_docs.items() if len(docs) >= F}

    # Levels 2..K: join (k-1)-itemsets that share their first k-2 items.
    for _ in range(2, K + 1):
        # Bucket the current itemsets by prefix so only joinable pairs are
        # ever compared (instead of testing every pair of itemsets).
        tails_by_prefix = {}
        for itemset in freq_itemsets:
            tails_by_prefix.setdefault(itemset[:-1], []).append(itemset[-1])

        new_freq_itemsets = {}
        for prefix, tails in tails_by_prefix.items():
            tails.sort()  # keeps every generated candidate tuple ascending
            for pos, first in enumerate(tails):
                docs_first = freq_itemsets[prefix + (first,)]
                for second in tails[pos + 1:]:
                    candidate = prefix + (first, second)

                    # Apriori pruning: every (k-1)-subset must be frequent.
                    # The two parents are frequent by construction, so only
                    # the subsets that drop a prefix element need checking.
                    if any(candidate[:drop] + candidate[drop + 1:] not in freq_itemsets
                           for drop in range(len(prefix))):
                        continue

                    # The parents' document sets already are the exact
                    # intersections of their words, so one more intersection
                    # yields the candidate's supporting documents.
                    docs = docs_first & freq_itemsets[prefix + (second,)]
                    if len(docs) >= F:
                        new_freq_itemsets[candidate] = docs

        # No frequent itemset at this level means none at any larger size.
        if not new_freq_itemsets:
            return []

        freq_itemsets = new_freq_itemsets

    # Report supports as counts; the secondary key makes ties reproducible.
    return sorted(((itemset, len(docs)) for itemset, docs in freq_itemsets.items()),
                  key=lambda pair: (-pair[1], pair[0]))

def main(vocab_file, docword_file, K, F):
    """Load a bag-of-words dataset, mine frequent K-itemsets and print them.

    Reads the docword and vocabulary files, runs ``apriori`` with the given
    parameters, and prints the elapsed mining time followed by either the
    frequent itemsets (word IDs translated to words, with their document
    counts) or a message that none were found. File loading is not included
    in the reported time.

    Args:
        vocab_file: path of the vocabulary file passed to ``read_vocab``.
        docword_file: path of the docword file passed to ``read_docword``.
        K: itemset size passed to ``apriori``.
        F: minimum support passed to ``apriori``.

    Returns:
        None; all results are written to standard output.
    """
    print("Reading dataset...")
    word_docs = read_docword(docword_file)
    vocab = read_vocab(vocab_file)

    print(f"Running Apriori for K={K}, F={F}")
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments as time.time()
    # can be.
    start_time = time.perf_counter()
    frequent_itemsets = apriori(word_docs, K, F)
    elapsed = time.perf_counter() - start_time

    print(f"\nTime taken: {elapsed:.2f} seconds")

    if not frequent_itemsets:
        print("\nNo itemsets found.")
    else:
        print(f"\nTotal Frequent K-itemsets Found: {len(frequent_itemsets)}")
        print("Frequent K-itemsets:")
        for itemset, count in frequent_itemsets:
            # Translate word IDs back to their vocabulary strings for display.
            print(f"Itemset: {tuple(vocab[word] for word in itemset)}, Count: {count}")

## K = 2 and F = 2000

In [12]:
main("vocab.enron.txt", "docword.enron.txt", K=2, F=2000)

Reading dataset...
Running Apriori for K=2, F=2000

Time taken: 9.55 seconds

Total Frequent K-itemsets Found: 18
Frequent K-itemsets:
Itemset: ('market', 'power'), Count: 2545
Itemset: ('energy', 'power'), Count: 2536
Itemset: ('energy', 'market'), Count: 2481
Itemset: ('market', 'price'), Count: 2295
Itemset: ('business', 'market'), Count: 2291
Itemset: ('california', 'energy'), Count: 2280
Itemset: ('california', 'power'), Count: 2274
Itemset: ('california', 'market'), Count: 2258
Itemset: ('company', 'market'), Count: 2169
Itemset: ('going', 'think'), Count: 2151
Itemset: ('business', 'company'), Count: 2124
Itemset: ('cost', 'market'), Count: 2095
Itemset: ('group', 'market'), Count: 2087
Itemset: ('market', 'prices'), Count: 2078
Itemset: ('group', 'meeting'), Count: 2077
Itemset: ('customer', 'market'), Count: 2062
Itemset: ('market', 'month'), Count: 2037
Itemset: ('energy', 'price'), Count: 2008


## K = 2 and F = 1900

In [13]:
main("vocab.enron.txt", "docword.enron.txt", K=2, F=1900)

Reading dataset...
Running Apriori for K=2, F=1900

Time taken: 11.16 seconds

Total Frequent K-itemsets Found: 26
Frequent K-itemsets:
Itemset: ('market', 'power'), Count: 2545
Itemset: ('energy', 'power'), Count: 2536
Itemset: ('energy', 'market'), Count: 2481
Itemset: ('market', 'price'), Count: 2295
Itemset: ('business', 'market'), Count: 2291
Itemset: ('california', 'energy'), Count: 2280
Itemset: ('california', 'power'), Count: 2274
Itemset: ('california', 'market'), Count: 2258
Itemset: ('company', 'market'), Count: 2169
Itemset: ('going', 'think'), Count: 2151
Itemset: ('business', 'company'), Count: 2124
Itemset: ('cost', 'market'), Count: 2095
Itemset: ('group', 'market'), Count: 2087
Itemset: ('market', 'prices'), Count: 2078
Itemset: ('group', 'meeting'), Count: 2077
Itemset: ('customer', 'market'), Count: 2062
Itemset: ('market', 'month'), Count: 2037
Itemset: ('energy', 'price'), Count: 2008
Itemset: ('business', 'group'), Count: 1978
Itemset: ('cost', 'power'), Count: 19

## K = 2 and F = 1800

In [14]:
main("vocab.enron.txt", "docword.enron.txt", K=2, F=1800)

Reading dataset...
Running Apriori for K=2, F=1800

Time taken: 12.90 seconds

Total Frequent K-itemsets Found: 40
Frequent K-itemsets:
Itemset: ('market', 'power'), Count: 2545
Itemset: ('energy', 'power'), Count: 2536
Itemset: ('energy', 'market'), Count: 2481
Itemset: ('market', 'price'), Count: 2295
Itemset: ('business', 'market'), Count: 2291
Itemset: ('california', 'energy'), Count: 2280
Itemset: ('california', 'power'), Count: 2274
Itemset: ('california', 'market'), Count: 2258
Itemset: ('company', 'market'), Count: 2169
Itemset: ('going', 'think'), Count: 2151
Itemset: ('business', 'company'), Count: 2124
Itemset: ('cost', 'market'), Count: 2095
Itemset: ('group', 'market'), Count: 2087
Itemset: ('market', 'prices'), Count: 2078
Itemset: ('group', 'meeting'), Count: 2077
Itemset: ('customer', 'market'), Count: 2062
Itemset: ('market', 'month'), Count: 2037
Itemset: ('energy', 'price'), Count: 2008
Itemset: ('business', 'group'), Count: 1978
Itemset: ('cost', 'power'), Count: 19

## K = 2 and F = 1700

In [15]:
main("vocab.enron.txt", "docword.enron.txt", K=2, F=1700)

Reading dataset...
Running Apriori for K=2, F=1700

Time taken: 14.23 seconds

Total Frequent K-itemsets Found: 69
Frequent K-itemsets:
Itemset: ('market', 'power'), Count: 2545
Itemset: ('energy', 'power'), Count: 2536
Itemset: ('energy', 'market'), Count: 2481
Itemset: ('market', 'price'), Count: 2295
Itemset: ('business', 'market'), Count: 2291
Itemset: ('california', 'energy'), Count: 2280
Itemset: ('california', 'power'), Count: 2274
Itemset: ('california', 'market'), Count: 2258
Itemset: ('company', 'market'), Count: 2169
Itemset: ('going', 'think'), Count: 2151
Itemset: ('business', 'company'), Count: 2124
Itemset: ('cost', 'market'), Count: 2095
Itemset: ('group', 'market'), Count: 2087
Itemset: ('market', 'prices'), Count: 2078
Itemset: ('group', 'meeting'), Count: 2077
Itemset: ('customer', 'market'), Count: 2062
Itemset: ('market', 'month'), Count: 2037
Itemset: ('energy', 'price'), Count: 2008
Itemset: ('business', 'group'), Count: 1978
Itemset: ('cost', 'power'), Count: 19

## K = 2 and F = 1600

In [24]:
main("vocab.enron.txt", "docword.enron.txt", K=2, F=1600)

Reading dataset...
Running Apriori for K=2, F=1600

Time taken: 17.29 seconds

Total Frequent K-itemsets Found: 117
Frequent K-itemsets:
Itemset: ('market', 'power'), Count: 2545
Itemset: ('energy', 'power'), Count: 2536
Itemset: ('energy', 'market'), Count: 2481
Itemset: ('market', 'price'), Count: 2295
Itemset: ('business', 'market'), Count: 2291
Itemset: ('california', 'energy'), Count: 2280
Itemset: ('california', 'power'), Count: 2274
Itemset: ('california', 'market'), Count: 2258
Itemset: ('company', 'market'), Count: 2169
Itemset: ('going', 'think'), Count: 2151
Itemset: ('business', 'company'), Count: 2124
Itemset: ('cost', 'market'), Count: 2095
Itemset: ('group', 'market'), Count: 2087
Itemset: ('market', 'prices'), Count: 2078
Itemset: ('group', 'meeting'), Count: 2077
Itemset: ('customer', 'market'), Count: 2062
Itemset: ('market', 'month'), Count: 2037
Itemset: ('energy', 'price'), Count: 2008
Itemset: ('business', 'group'), Count: 1978
Itemset: ('cost', 'power'), Count: 1

## K = 2 and F = 1500

In [4]:
main("vocab.enron.txt", "docword.enron.txt", K=2, F=1500)

Reading dataset...
Running Apriori for K=2, F=1500

Time taken: 19.34 seconds

Total Frequent K-itemsets Found: 202
Frequent K-itemsets:
Itemset: ('market', 'power'), Count: 2545
Itemset: ('energy', 'power'), Count: 2536
Itemset: ('energy', 'market'), Count: 2481
Itemset: ('market', 'price'), Count: 2295
Itemset: ('business', 'market'), Count: 2291
Itemset: ('california', 'energy'), Count: 2280
Itemset: ('california', 'power'), Count: 2274
Itemset: ('california', 'market'), Count: 2258
Itemset: ('company', 'market'), Count: 2169
Itemset: ('going', 'think'), Count: 2151
Itemset: ('business', 'company'), Count: 2124
Itemset: ('cost', 'market'), Count: 2095
Itemset: ('group', 'market'), Count: 2087
Itemset: ('market', 'prices'), Count: 2078
Itemset: ('group', 'meeting'), Count: 2077
Itemset: ('customer', 'market'), Count: 2062
Itemset: ('market', 'month'), Count: 2037
Itemset: ('energy', 'price'), Count: 2008
Itemset: ('business', 'group'), Count: 1978
Itemset: ('cost', 'power'), Count: 1

## K = 2 and F = 1400

In [9]:
main("vocab.enron.txt", "docword.enron.txt", K=2, F=1400)

Reading dataset...
Running Apriori for K=2, F=1400

Time taken: 20.74 seconds

Total Frequent K-itemsets Found: 347
Frequent K-itemsets:
Itemset: ('market', 'power'), Count: 2545
Itemset: ('energy', 'power'), Count: 2536
Itemset: ('energy', 'market'), Count: 2481
Itemset: ('market', 'price'), Count: 2295
Itemset: ('business', 'market'), Count: 2291
Itemset: ('california', 'energy'), Count: 2280
Itemset: ('california', 'power'), Count: 2274
Itemset: ('california', 'market'), Count: 2258
Itemset: ('company', 'market'), Count: 2169
Itemset: ('going', 'think'), Count: 2151
Itemset: ('business', 'company'), Count: 2124
Itemset: ('cost', 'market'), Count: 2095
Itemset: ('group', 'market'), Count: 2087
Itemset: ('market', 'prices'), Count: 2078
Itemset: ('group', 'meeting'), Count: 2077
Itemset: ('customer', 'market'), Count: 2062
Itemset: ('market', 'month'), Count: 2037
Itemset: ('energy', 'price'), Count: 2008
Itemset: ('business', 'group'), Count: 1978
Itemset: ('cost', 'power'), Count: 1

## K = 2 and F = 1300

In [10]:
main("vocab.enron.txt", "docword.enron.txt", K=2, F=1300)

Reading dataset...
Running Apriori for K=2, F=1300

Time taken: 23.30 seconds

Total Frequent K-itemsets Found: 576
Frequent K-itemsets:
Itemset: ('market', 'power'), Count: 2545
Itemset: ('energy', 'power'), Count: 2536
Itemset: ('energy', 'market'), Count: 2481
Itemset: ('market', 'price'), Count: 2295
Itemset: ('business', 'market'), Count: 2291
Itemset: ('california', 'energy'), Count: 2280
Itemset: ('california', 'power'), Count: 2274
Itemset: ('california', 'market'), Count: 2258
Itemset: ('company', 'market'), Count: 2169
Itemset: ('going', 'think'), Count: 2151
Itemset: ('business', 'company'), Count: 2124
Itemset: ('cost', 'market'), Count: 2095
Itemset: ('group', 'market'), Count: 2087
Itemset: ('market', 'prices'), Count: 2078
Itemset: ('group', 'meeting'), Count: 2077
Itemset: ('customer', 'market'), Count: 2062
Itemset: ('market', 'month'), Count: 2037
Itemset: ('energy', 'price'), Count: 2008
Itemset: ('business', 'group'), Count: 1978
Itemset: ('cost', 'power'), Count: 1

## K = 2 and F = 1200

In [11]:
main("vocab.enron.txt", "docword.enron.txt", K=2, F=1200)

Reading dataset...
Running Apriori for K=2, F=1200

Time taken: 32.79 seconds

Total Frequent K-itemsets Found: 964
Frequent K-itemsets:
Itemset: ('market', 'power'), Count: 2545
Itemset: ('energy', 'power'), Count: 2536
Itemset: ('energy', 'market'), Count: 2481
Itemset: ('market', 'price'), Count: 2295
Itemset: ('business', 'market'), Count: 2291
Itemset: ('california', 'energy'), Count: 2280
Itemset: ('california', 'power'), Count: 2274
Itemset: ('california', 'market'), Count: 2258
Itemset: ('company', 'market'), Count: 2169
Itemset: ('going', 'think'), Count: 2151
Itemset: ('business', 'company'), Count: 2124
Itemset: ('cost', 'market'), Count: 2095
Itemset: ('group', 'market'), Count: 2087
Itemset: ('market', 'prices'), Count: 2078
Itemset: ('group', 'meeting'), Count: 2077
Itemset: ('customer', 'market'), Count: 2062
Itemset: ('market', 'month'), Count: 2037
Itemset: ('energy', 'price'), Count: 2008
Itemset: ('business', 'group'), Count: 1978
Itemset: ('cost', 'power'), Count: 1

## K = 2 and F = 1100

In [16]:
main("vocab.enron.txt", "docword.enron.txt", K=2, F=1100)

Reading dataset...
Running Apriori for K=2, F=1100

Time taken: 38.40 seconds

Total Frequent K-itemsets Found: 1531
Frequent K-itemsets:
Itemset: ('market', 'power'), Count: 2545
Itemset: ('energy', 'power'), Count: 2536
Itemset: ('energy', 'market'), Count: 2481
Itemset: ('market', 'price'), Count: 2295
Itemset: ('business', 'market'), Count: 2291
Itemset: ('california', 'energy'), Count: 2280
Itemset: ('california', 'power'), Count: 2274
Itemset: ('california', 'market'), Count: 2258
Itemset: ('company', 'market'), Count: 2169
Itemset: ('going', 'think'), Count: 2151
Itemset: ('business', 'company'), Count: 2124
Itemset: ('cost', 'market'), Count: 2095
Itemset: ('group', 'market'), Count: 2087
Itemset: ('market', 'prices'), Count: 2078
Itemset: ('group', 'meeting'), Count: 2077
Itemset: ('customer', 'market'), Count: 2062
Itemset: ('market', 'month'), Count: 2037
Itemset: ('energy', 'price'), Count: 2008
Itemset: ('business', 'group'), Count: 1978
Itemset: ('cost', 'power'), Count: 

## K = 2 and F = 1000

In [17]:
main("vocab.enron.txt", "docword.enron.txt", K=2, F=1000)

Reading dataset...
Running Apriori for K=2, F=1000

Time taken: 44.42 seconds

Total Frequent K-itemsets Found: 2578
Frequent K-itemsets:
Itemset: ('market', 'power'), Count: 2545
Itemset: ('energy', 'power'), Count: 2536
Itemset: ('energy', 'market'), Count: 2481
Itemset: ('market', 'price'), Count: 2295
Itemset: ('business', 'market'), Count: 2291
Itemset: ('california', 'energy'), Count: 2280
Itemset: ('california', 'power'), Count: 2274
Itemset: ('california', 'market'), Count: 2258
Itemset: ('company', 'market'), Count: 2169
Itemset: ('going', 'think'), Count: 2151
Itemset: ('business', 'company'), Count: 2124
Itemset: ('cost', 'market'), Count: 2095
Itemset: ('group', 'market'), Count: 2087
Itemset: ('market', 'prices'), Count: 2078
Itemset: ('group', 'meeting'), Count: 2077
Itemset: ('customer', 'market'), Count: 2062
Itemset: ('market', 'month'), Count: 2037
Itemset: ('energy', 'price'), Count: 2008
Itemset: ('business', 'group'), Count: 1978
Itemset: ('cost', 'power'), Count: 

## K = 2 and F = 900

In [18]:
main("vocab.enron.txt", "docword.enron.txt", K=2, F=900)

Reading dataset...
Running Apriori for K=2, F=900

Time taken: 56.21 seconds

Total Frequent K-itemsets Found: 4424
Frequent K-itemsets:
Itemset: ('market', 'power'), Count: 2545
Itemset: ('energy', 'power'), Count: 2536
Itemset: ('energy', 'market'), Count: 2481
Itemset: ('market', 'price'), Count: 2295
Itemset: ('business', 'market'), Count: 2291
Itemset: ('california', 'energy'), Count: 2280
Itemset: ('california', 'power'), Count: 2274
Itemset: ('california', 'market'), Count: 2258
Itemset: ('company', 'market'), Count: 2169
Itemset: ('going', 'think'), Count: 2151
Itemset: ('business', 'company'), Count: 2124
Itemset: ('cost', 'market'), Count: 2095
Itemset: ('group', 'market'), Count: 2087
Itemset: ('market', 'prices'), Count: 2078
Itemset: ('group', 'meeting'), Count: 2077
Itemset: ('customer', 'market'), Count: 2062
Itemset: ('market', 'month'), Count: 2037
Itemset: ('energy', 'price'), Count: 2008
Itemset: ('business', 'group'), Count: 1978
Itemset: ('cost', 'power'), Count: 1

## K = 3 and F = 1500

In [19]:
main("vocab.enron.txt", "docword.enron.txt", K=3, F=1500)

Reading dataset...
Running Apriori for K=3, F=1500

Time taken: 22.21 seconds

Total Frequent K-itemsets Found: 3
Frequent K-itemsets:
Itemset: ('energy', 'market', 'power'), Count: 1764
Itemset: ('california', 'energy', 'power'), Count: 1667
Itemset: ('california', 'market', 'power'), Count: 1602


## K = 4 and F = 1200

In [20]:
main("vocab.enron.txt", "docword.enron.txt", K=4, F=1200)

Reading dataset...
Running Apriori for K=4, F=1200

Time taken: 40.47 seconds

Total Frequent K-itemsets Found: 2
Frequent K-itemsets:
Itemset: ('california', 'energy', 'market', 'power'), Count: 1256
Itemset: ('energy', 'market', 'power', 'price'), Count: 1210


## K = 5 and F = 800

In [25]:
main("vocab.enron.txt", "docword.enron.txt", K=5, F=800)

Reading dataset...
Running Apriori for K=5, F=800

Time taken: 221.84 seconds

Total Frequent K-itemsets Found: 162
Frequent K-itemsets:
Itemset: ('california', 'electricity', 'energy', 'market', 'power'), Count: 976
Itemset: ('california', 'energy', 'market', 'power', 'price'), Count: 972
Itemset: ('california', 'energy', 'market', 'power', 'prices'), Count: 959
Itemset: ('california', 'cost', 'electricity', 'energy', 'power'), Count: 958
Itemset: ('california', 'cost', 'energy', 'market', 'power'), Count: 953
Itemset: ('california', 'electricity', 'energy', 'power', 'states'), Count: 945
Itemset: ('energy', 'market', 'power', 'price', 'prices'), Count: 936
Itemset: ('california', 'market', 'power', 'price', 'prices'), Count: 934
Itemset: ('california', 'electricity', 'energy', 'power', 'prices'), Count: 922
Itemset: ('cost', 'energy', 'market', 'power', 'price'), Count: 922
Itemset: ('electricity', 'energy', 'market', 'power', 'price'), Count: 915
Itemset: ('california', 'electricity

## K = 7 and F = 700

In [31]:
main("vocab.enron.txt", "docword.enron.txt", K=7, F=700)

Reading dataset...
Running Apriori for K=7, F=700

Time taken: 514.36 seconds

Total Frequent K-itemsets Found: 15
Frequent K-itemsets:
Itemset: ('california', 'electricity', 'energy', 'market', 'power', 'price', 'prices'), Count: 746
Itemset: ('california', 'cost', 'electricity', 'energy', 'market', 'power', 'price'), Count: 731
Itemset: ('california', 'cost', 'electricity', 'energy', 'market', 'power', 'prices'), Count: 730
Itemset: ('california', 'cost', 'energy', 'market', 'power', 'price', 'prices'), Count: 724
Itemset: ('cost', 'electricity', 'energy', 'market', 'power', 'price', 'prices'), Count: 723
Itemset: ('california', 'cost', 'electricity', 'energy', 'market', 'power', 'states'), Count: 718
Itemset: ('california', 'cost', 'electricity', 'energy', 'power', 'price', 'prices'), Count: 717
Itemset: ('california', 'cost', 'electricity', 'energy', 'power', 'prices', 'states'), Count: 711
Itemset: ('basis', 'contract', 'corp', 'create', 'offer', 'party', 'review'), Count: 709
Ite

## K = 10 and F = 900

In [23]:
main("vocab.enron.txt", "docword.enron.txt", K=10, F=900)

Reading dataset...
Running Apriori for K=10, F=900

Time taken: 119.89 seconds

No itemsets found.
