# DMML(2025) Assignment 1 submission for

1. Lucky Mathias Kispotta (MCS202411)
2. Hemadri Shekhar Das (MCS 202405)

# Mounting Dataset from Google Drive

The entire dataset was first downloaded and then uploaded to Google Drive, from where it is accessed when the code is run.

In [None]:
# Make the Google Drive contents visible to this Colab runtime under /content/drive,
# which is where the dataset path used by the cells below points.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The entire code lives in a single cell. That cell has been copied several times below so that the code can be run on different inputs and the output of each run is stored.

# Code run for k = 2, f = 500 with output on kos dataset

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import combinations
import time
from pathlib import Path
import os
import errno
from tqdm import tqdm  # Import tqdm for the progress bar

class File:
    """Locate and load a bag-of-words dataset stored as two files.

    The dataset ``<name>`` is expected as ``vocab.<name>.txt`` and
    ``docword.<name>.txt.gz`` inside one directory.  The constructor only
    resolves and validates the paths; ``load_data`` reads the files.
    """

    def __init__(self, path, name):
        """Resolve both dataset files under *path*.

        Raises:
            FileNotFoundError: if either file is missing or is a directory.
        """
        self.name = str(name)
        self.vocabFile = "vocab." + self.name + ".txt"
        self.vocabPath = Path(path) / self.vocabFile
        self.docWordFile = "docword." + self.name + ".txt.gz"
        self.docWordPath = Path(path) / self.docWordFile

        for required in (self.vocabPath, self.docWordPath):
            if not required.exists() or required.is_dir():
                raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), required)

    def __read_csv_chunks(self, lines, path, **params):
        """Read *path* with ``pd.read_csv`` chunk by chunk, printing progress.

        Args:
            lines: expected number of data rows; used only for the percentage.
            path: file to read.
            **params: forwarded to ``pd.read_csv``; must contain ``chunksize``.

        Returns:
            One DataFrame with all chunks concatenated.  When no chunk is
            produced, an empty frame with the requested column names.
        """
        chunk_size = params["chunksize"]
        chunks = []
        for i, chunk in enumerate(pd.read_csv(path, **params)):
            # Guard against a zero row count in the header (division by zero).
            prog = min((i + 1) / lines * 100.0 * chunk_size, 100) if lines else 100
            print(f"Data Load Progress : {prog:.2f} %.", end="\r", flush=True)
            chunks.append(chunk)
        print("\n")
        if not chunks:
            # pd.concat([]) raises ValueError; return an empty table instead.
            return pd.DataFrame(columns=params.get("names"))
        return pd.concat(chunks, axis=0)

    def load_data(self):
        """Read the vocabulary, the three header values and the docword table.

        Sets ``vocab`` (1-based index), ``docCount``, ``wordCount``, ``NNZ``
        and ``docWord`` (columns ``docID``, ``wordID``, ``count``).
        """
        start_time = time.time()
        print("Reading vocabulary file...")
        # keep_default_na=False: words such as "null" or "NA" must stay strings
        # instead of being converted to NaN by the parser.
        self.vocab = pd.read_csv(self.vocabPath, header=None, names=["word"],
                                 keep_default_na=False)
        print(f"Vocabulary file loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        # wordIDs in the docword file are matched against a 1-based vocabulary index.
        self.vocab.index += 1

        start_time = time.time()
        print("Reading docword file (metadata)...")
        # The first three lines of the docword file hold one value each.
        tmp = pd.read_csv(self.docWordPath, compression='gzip', header=None, nrows=3)
        self.docCount = tmp[0].values[0]
        self.wordCount = tmp[0].values[1]
        self.NNZ = tmp[0].values[2]
        print(f"Docword file metadata loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        start_time = time.time()
        print("Reading docword file (data)...")
        self.docWord = self.__read_csv_chunks(self.NNZ, self.docWordPath, compression='gzip', header=None, sep=' ',
                                              quotechar='"', on_bad_lines="skip", skiprows=3, chunksize=10000,
                                              names=["docID", "wordID", "count"])
        print(f"Docword file data loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        # Drop lazily built lookups so they are rebuilt from the fresh data.
        for stale in ("docIDS", "docWordList"):
            self.__dict__.pop(stale, None)

    def get_unique_docs(self):
        """Return the sorted array of distinct docIDs (computed once, then cached)."""
        doc_ids = getattr(self, "docIDS", None)
        if doc_ids is None:
            doc_ids = self.docWord["docID"].unique()
            doc_ids.sort()
            self.docIDS = doc_ids
        return doc_ids

    def get_words_by_docID(self, _id):
        """Return the wordIDs of document *_id* in file order (``[]`` if unknown).

        The docID -> wordIDs index is built with a single ``groupby`` on the
        first call, so later lookups do not rescan the whole table.
        """
        index = getattr(self, "docWordList", None)
        if index is None:
            grouped = self.docWord.groupby("docID", sort=False)["wordID"]
            index = {doc_id: words.tolist() for doc_id, words in grouped}
            self.docWordList = index
        # Hand out a copy so callers cannot corrupt the cached index.
        return list(index.get(_id, ()))


# Optimized Apriori algorithm to find frequent itemsets of size k
def apriori(dataset, min_support, k):
    """Return the frequent itemsets of exactly size *k* with their supports.

    Args:
        dataset: iterable of transactions, each an iterable of hashable items.
        min_support: minimum number of transactions that must contain an
            itemset for it to count as frequent.
        k: size of the itemsets to report.

    Returns:
        ``defaultdict(int)`` mapping each frequent ``frozenset`` of size *k*
        to the number of transactions containing it; empty if none qualify.
    """
    # The progress bar is cosmetic: fall back to a plain iterator without tqdm.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    # Single pass: count 1-itemsets and keep every transaction as a frozenset
    # so the subset tests below do not rebuild a set for each candidate.
    counts = defaultdict(int)
    transactions = []
    for doc in progress(dataset, desc="Generating 1-itemsets", colour="green"):
        items = dict.fromkeys(doc)  # de-duplicated, original order kept
        for word in items:
            counts[frozenset([word])] += 1
        transactions.append(frozenset(items))

    # Prune infrequent 1-itemsets
    level = {itemset: count for itemset, count in counts.items() if count >= min_support}

    frequent_itemsets = defaultdict(int)
    if k == 1:
        frequent_itemsets.update(level)
        return frequent_itemsets

    current_itemsets = list(level)

    # Only size-k itemsets are reported, so generation stops at level k.
    for k_itemsets in range(2, k + 1):
        print(f"\nGenerating {k_itemsets}-itemsets...")

        # Frequent itemsets of the previous level: the prune step must test
        # the (k_itemsets - 1)-subsets against these, not against 1-itemsets.
        previous_level = set(current_itemsets)
        candidate_itemsets = {}
        seen = set()  # a candidate can arise from several joins; count it once

        for i in progress(range(len(current_itemsets)),
                          desc=f"Generating {k_itemsets}-itemsets candidates", colour="yellow"):
            left = current_itemsets[i]
            for j in range(i + 1, len(current_itemsets)):
                candidate = left | current_itemsets[j]
                if len(candidate) != k_itemsets or candidate in seen:
                    continue
                seen.add(candidate)
                # Every (k_itemsets - 1)-subset of a frequent itemset is frequent.
                if not all(frozenset(comb) in previous_level
                           for comb in combinations(candidate, k_itemsets - 1)):
                    continue
                support = sum(1 for doc in transactions if candidate <= doc)
                # Candidates that never occur are not recorded at all.
                if support and support >= min_support:
                    candidate_itemsets[candidate] = support

        if not candidate_itemsets:
            break  # No more frequent itemsets can be generated

        if k_itemsets == k:
            frequent_itemsets.update(candidate_itemsets)

        current_itemsets = list(candidate_itemsets)

    return frequent_itemsets


def main(path, name, k, min_support):
    """Load dataset *name* from *path*, mine size-*k* frequent itemsets, print them.

    Args:
        path: directory holding the dataset files.
        name: dataset name used to build the file names.
        k: size of the itemsets to report.
        min_support: minimum number of documents an itemset must occur in.
    """
    started = time.time()

    # Locate the dataset files and read them into memory.
    loader = File(path, name)
    loader.load_data()

    # One transaction (list of wordIDs) per document.
    documents = []
    for doc_id in loader.get_unique_docs():
        documents.append(loader.get_words_by_docID(doc_id))

    print("Running Apriori algorithm...")
    result = apriori(documents, min_support, k)

    # Report every itemset with its support, or say that none was found.
    print("\nFrequent Itemsets of size", k, ":")
    if not result:
        print(f"No frequent itemsets of size {k} found.")
    else:
        for itemset, count in tqdm(result.items(), desc="Printing itemsets", colour="blue"):
            print(f"Itemset: {set(itemset)}, Support: {count}")

    finished = time.time()
    print(f"Total Time taken: {finished - started:.2f} seconds")


if __name__ == "__main__":
    # Interactive entry point: prompt for the run parameters, then mine the dataset.
    # The dataset name selects vocab.<name>.txt / docword.<name>.txt.gz under `path`.
    dataset_name = input("Enter dataset name (e.g., enron): ")
    # int() raises ValueError if the reply is not a whole number.
    k = int(input("Enter k (size of itemsets): "))
    min_support = int(input("Enter min_support (minimum frequency threshold): "))
    path = r"/content/drive/MyDrive/bag+of+words"  # Adjust path if needed
    main(path, dataset_name, k, min_support)


Enter dataset name (e.g., enron): kos
Enter k (size of itemsets): 2
Enter min_support (minimum frequency threshold): 500
Reading vocabulary file...
Vocabulary file loaded. Time taken: 0.01 seconds.
Reading docword file (metadata)...
Docword file metadata loaded. Time taken: 0.01 seconds.
Reading docword file (data)...


Docword file data loaded. Time taken: 0.23 seconds.
Running Apriori algorithm...


Generating 1-itemsets: 100%|[32m██████████[0m| 3430/3430 [00:00<00:00, 10513.66it/s]



Generating 2-itemsets...


Generating 2-itemsets candidates: 100%|[33m██████████[0m| 50/50 [00:24<00:00,  2.06it/s]



Frequent Itemsets of size 2 :


Printing itemsets: 100%|[34m██████████[0m| 101/101 [00:00<00:00, 90848.10it/s]

Itemset: {841, 89}, Support: 748
Itemset: {2640, 841}, Support: 1250
Itemset: {841, 3005}, Support: 873
Itemset: {841, 3282}, Support: 654
Itemset: {841, 3350}, Support: 556
Itemset: {841, 3420}, Support: 1195
Itemset: {841, 4143}, Support: 513
Itemset: {841, 4196}, Support: 514
Itemset: {841, 4735}, Support: 834
Itemset: {5185, 841}, Support: 527
Itemset: {841, 6659}, Support: 593
Itemset: {841, 6689}, Support: 994
Itemset: {841, 879}, Support: 648
Itemset: {1664, 841}, Support: 937
Itemset: {841, 1666}, Support: 783
Itemset: {841, 2030}, Support: 766
Itemset: {841, 3858}, Support: 721
Itemset: {4632, 841}, Support: 876
Itemset: {841, 4635}, Support: 660
Itemset: {841, 4761}, Support: 610
Itemset: {841, 5186}, Support: 794
Itemset: {5552, 841}, Support: 652
Itemset: {6296, 841}, Support: 726
Itemset: {841, 847}, Support: 611
Itemset: {841, 4494}, Support: 654
Itemset: {841, 5891}, Support: 560
Itemset: {5896, 841}, Support: 516
Itemset: {841, 4627}, Support: 556
Itemset: {841, 4093}, 




# Code run for k = 2, f = 700 with output on kos dataset

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import combinations
import time
from pathlib import Path
import os
import errno
from tqdm import tqdm  # Import tqdm for the progress bar

class File:
    """Locate and load a bag-of-words dataset stored as two files.

    The dataset ``<name>`` is expected as ``vocab.<name>.txt`` and
    ``docword.<name>.txt.gz`` inside one directory.  The constructor only
    resolves and validates the paths; ``load_data`` reads the files.
    """

    def __init__(self, path, name):
        """Resolve both dataset files under *path*.

        Raises:
            FileNotFoundError: if either file is missing or is a directory.
        """
        self.name = str(name)
        self.vocabFile = "vocab." + self.name + ".txt"
        self.vocabPath = Path(path) / self.vocabFile
        self.docWordFile = "docword." + self.name + ".txt.gz"
        self.docWordPath = Path(path) / self.docWordFile

        for required in (self.vocabPath, self.docWordPath):
            if not required.exists() or required.is_dir():
                raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), required)

    def __read_csv_chunks(self, lines, path, **params):
        """Read *path* with ``pd.read_csv`` chunk by chunk, printing progress.

        Args:
            lines: expected number of data rows; used only for the percentage.
            path: file to read.
            **params: forwarded to ``pd.read_csv``; must contain ``chunksize``.

        Returns:
            One DataFrame with all chunks concatenated.  When no chunk is
            produced, an empty frame with the requested column names.
        """
        chunk_size = params["chunksize"]
        chunks = []
        for i, chunk in enumerate(pd.read_csv(path, **params)):
            # Guard against a zero row count in the header (division by zero).
            prog = min((i + 1) / lines * 100.0 * chunk_size, 100) if lines else 100
            print(f"Data Load Progress : {prog:.2f} %.", end="\r", flush=True)
            chunks.append(chunk)
        print("\n")
        if not chunks:
            # pd.concat([]) raises ValueError; return an empty table instead.
            return pd.DataFrame(columns=params.get("names"))
        return pd.concat(chunks, axis=0)

    def load_data(self):
        """Read the vocabulary, the three header values and the docword table.

        Sets ``vocab`` (1-based index), ``docCount``, ``wordCount``, ``NNZ``
        and ``docWord`` (columns ``docID``, ``wordID``, ``count``).
        """
        start_time = time.time()
        print("Reading vocabulary file...")
        # keep_default_na=False: words such as "null" or "NA" must stay strings
        # instead of being converted to NaN by the parser.
        self.vocab = pd.read_csv(self.vocabPath, header=None, names=["word"],
                                 keep_default_na=False)
        print(f"Vocabulary file loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        # wordIDs in the docword file are matched against a 1-based vocabulary index.
        self.vocab.index += 1

        start_time = time.time()
        print("Reading docword file (metadata)...")
        # The first three lines of the docword file hold one value each.
        tmp = pd.read_csv(self.docWordPath, compression='gzip', header=None, nrows=3)
        self.docCount = tmp[0].values[0]
        self.wordCount = tmp[0].values[1]
        self.NNZ = tmp[0].values[2]
        print(f"Docword file metadata loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        start_time = time.time()
        print("Reading docword file (data)...")
        self.docWord = self.__read_csv_chunks(self.NNZ, self.docWordPath, compression='gzip', header=None, sep=' ',
                                              quotechar='"', on_bad_lines="skip", skiprows=3, chunksize=10000,
                                              names=["docID", "wordID", "count"])
        print(f"Docword file data loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        # Drop lazily built lookups so they are rebuilt from the fresh data.
        for stale in ("docIDS", "docWordList"):
            self.__dict__.pop(stale, None)

    def get_unique_docs(self):
        """Return the sorted array of distinct docIDs (computed once, then cached)."""
        doc_ids = getattr(self, "docIDS", None)
        if doc_ids is None:
            doc_ids = self.docWord["docID"].unique()
            doc_ids.sort()
            self.docIDS = doc_ids
        return doc_ids

    def get_words_by_docID(self, _id):
        """Return the wordIDs of document *_id* in file order (``[]`` if unknown).

        The docID -> wordIDs index is built with a single ``groupby`` on the
        first call, so later lookups do not rescan the whole table.
        """
        index = getattr(self, "docWordList", None)
        if index is None:
            grouped = self.docWord.groupby("docID", sort=False)["wordID"]
            index = {doc_id: words.tolist() for doc_id, words in grouped}
            self.docWordList = index
        # Hand out a copy so callers cannot corrupt the cached index.
        return list(index.get(_id, ()))


# Optimized Apriori algorithm to find frequent itemsets of size k
def apriori(dataset, min_support, k):
    """Return the frequent itemsets of exactly size *k* with their supports.

    Args:
        dataset: iterable of transactions, each an iterable of hashable items.
        min_support: minimum number of transactions that must contain an
            itemset for it to count as frequent.
        k: size of the itemsets to report.

    Returns:
        ``defaultdict(int)`` mapping each frequent ``frozenset`` of size *k*
        to the number of transactions containing it; empty if none qualify.
    """
    # The progress bar is cosmetic: fall back to a plain iterator without tqdm.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    # Single pass: count 1-itemsets and keep every transaction as a frozenset
    # so the subset tests below do not rebuild a set for each candidate.
    counts = defaultdict(int)
    transactions = []
    for doc in progress(dataset, desc="Generating 1-itemsets", colour="green"):
        items = dict.fromkeys(doc)  # de-duplicated, original order kept
        for word in items:
            counts[frozenset([word])] += 1
        transactions.append(frozenset(items))

    # Prune infrequent 1-itemsets
    level = {itemset: count for itemset, count in counts.items() if count >= min_support}

    frequent_itemsets = defaultdict(int)
    if k == 1:
        frequent_itemsets.update(level)
        return frequent_itemsets

    current_itemsets = list(level)

    # Only size-k itemsets are reported, so generation stops at level k.
    for k_itemsets in range(2, k + 1):
        print(f"\nGenerating {k_itemsets}-itemsets...")

        # Frequent itemsets of the previous level: the prune step must test
        # the (k_itemsets - 1)-subsets against these, not against 1-itemsets.
        previous_level = set(current_itemsets)
        candidate_itemsets = {}
        seen = set()  # a candidate can arise from several joins; count it once

        for i in progress(range(len(current_itemsets)),
                          desc=f"Generating {k_itemsets}-itemsets candidates", colour="yellow"):
            left = current_itemsets[i]
            for j in range(i + 1, len(current_itemsets)):
                candidate = left | current_itemsets[j]
                if len(candidate) != k_itemsets or candidate in seen:
                    continue
                seen.add(candidate)
                # Every (k_itemsets - 1)-subset of a frequent itemset is frequent.
                if not all(frozenset(comb) in previous_level
                           for comb in combinations(candidate, k_itemsets - 1)):
                    continue
                support = sum(1 for doc in transactions if candidate <= doc)
                # Candidates that never occur are not recorded at all.
                if support and support >= min_support:
                    candidate_itemsets[candidate] = support

        if not candidate_itemsets:
            break  # No more frequent itemsets can be generated

        if k_itemsets == k:
            frequent_itemsets.update(candidate_itemsets)

        current_itemsets = list(candidate_itemsets)

    return frequent_itemsets


def main(path, name, k, min_support):
    """Load dataset *name* from *path*, mine size-*k* frequent itemsets, print them.

    Args:
        path: directory holding the dataset files.
        name: dataset name used to build the file names.
        k: size of the itemsets to report.
        min_support: minimum number of documents an itemset must occur in.
    """
    started = time.time()

    # Locate the dataset files and read them into memory.
    loader = File(path, name)
    loader.load_data()

    # One transaction (list of wordIDs) per document.
    documents = []
    for doc_id in loader.get_unique_docs():
        documents.append(loader.get_words_by_docID(doc_id))

    print("Running Apriori algorithm...")
    result = apriori(documents, min_support, k)

    # Report every itemset with its support, or say that none was found.
    print("\nFrequent Itemsets of size", k, ":")
    if not result:
        print(f"No frequent itemsets of size {k} found.")
    else:
        for itemset, count in tqdm(result.items(), desc="Printing itemsets", colour="blue"):
            print(f"Itemset: {set(itemset)}, Support: {count}")

    finished = time.time()
    print(f"Total Time taken: {finished - started:.2f} seconds")


if __name__ == "__main__":
    # Interactive entry point: prompt for the run parameters, then mine the dataset.
    # The dataset name selects vocab.<name>.txt / docword.<name>.txt.gz under `path`.
    dataset_name = input("Enter dataset name (e.g., enron): ")
    # int() raises ValueError if the reply is not a whole number.
    k = int(input("Enter k (size of itemsets): "))
    min_support = int(input("Enter min_support (minimum frequency threshold): "))
    path = r"/content/drive/MyDrive/bag+of+words"  # Adjust path if needed
    main(path, dataset_name, k, min_support)


Enter dataset name (e.g., enron): kos
Enter k (size of itemsets): 2
Enter min_support (minimum frequency threshold): 700
Reading vocabulary file...
Vocabulary file loaded. Time taken: 0.01 seconds.
Reading docword file (metadata)...
Docword file metadata loaded. Time taken: 0.01 seconds.
Reading docword file (data)...


Docword file data loaded. Time taken: 0.21 seconds.
Running Apriori algorithm...


Generating 1-itemsets: 100%|[32m██████████[0m| 3430/3430 [00:00<00:00, 17943.76it/s]



Generating 2-itemsets...


Generating 2-itemsets candidates: 100%|[33m██████████[0m| 31/31 [00:09<00:00,  3.25it/s]



Frequent Itemsets of size 2 :


Printing itemsets: 100%|[34m██████████[0m| 23/23 [00:00<00:00, 45439.94it/s]

Itemset: {841, 89}, Support: 748
Itemset: {2640, 841}, Support: 1250
Itemset: {841, 3005}, Support: 873
Itemset: {841, 3420}, Support: 1195
Itemset: {841, 4735}, Support: 834
Itemset: {841, 6689}, Support: 994
Itemset: {1664, 841}, Support: 937
Itemset: {841, 1666}, Support: 783
Itemset: {841, 2030}, Support: 766
Itemset: {841, 3858}, Support: 721
Itemset: {4632, 841}, Support: 876
Itemset: {841, 5186}, Support: 794
Itemset: {6296, 841}, Support: 726
Itemset: {2640, 3420}, Support: 1064
Itemset: {2640, 6689}, Support: 745
Itemset: {2640, 1664}, Support: 763
Itemset: {2640, 2030}, Support: 715
Itemset: {2640, 4632}, Support: 756
Itemset: {1664, 3420}, Support: 922
Itemset: {4632, 3420}, Support: 845
Itemset: {1664, 1666}, Support: 740
Itemset: {1664, 4632}, Support: 755
Itemset: {1664, 4761}, Support: 894
Total Time taken: 12.06 seconds





# Code run for k = 3, f = 500 with output on kos dataset

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import combinations
import time
from pathlib import Path
import os
import errno
from tqdm import tqdm  # Import tqdm for the progress bar

class File:
    """Locate and load a bag-of-words dataset stored as two files.

    The dataset ``<name>`` is expected as ``vocab.<name>.txt`` and
    ``docword.<name>.txt.gz`` inside one directory.  The constructor only
    resolves and validates the paths; ``load_data`` reads the files.
    """

    def __init__(self, path, name):
        """Resolve both dataset files under *path*.

        Raises:
            FileNotFoundError: if either file is missing or is a directory.
        """
        self.name = str(name)
        self.vocabFile = "vocab." + self.name + ".txt"
        self.vocabPath = Path(path) / self.vocabFile
        self.docWordFile = "docword." + self.name + ".txt.gz"
        self.docWordPath = Path(path) / self.docWordFile

        for required in (self.vocabPath, self.docWordPath):
            if not required.exists() or required.is_dir():
                raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), required)

    def __read_csv_chunks(self, lines, path, **params):
        """Read *path* with ``pd.read_csv`` chunk by chunk, printing progress.

        Args:
            lines: expected number of data rows; used only for the percentage.
            path: file to read.
            **params: forwarded to ``pd.read_csv``; must contain ``chunksize``.

        Returns:
            One DataFrame with all chunks concatenated.  When no chunk is
            produced, an empty frame with the requested column names.
        """
        chunk_size = params["chunksize"]
        chunks = []
        for i, chunk in enumerate(pd.read_csv(path, **params)):
            # Guard against a zero row count in the header (division by zero).
            prog = min((i + 1) / lines * 100.0 * chunk_size, 100) if lines else 100
            print(f"Data Load Progress : {prog:.2f} %.", end="\r", flush=True)
            chunks.append(chunk)
        print("\n")
        if not chunks:
            # pd.concat([]) raises ValueError; return an empty table instead.
            return pd.DataFrame(columns=params.get("names"))
        return pd.concat(chunks, axis=0)

    def load_data(self):
        """Read the vocabulary, the three header values and the docword table.

        Sets ``vocab`` (1-based index), ``docCount``, ``wordCount``, ``NNZ``
        and ``docWord`` (columns ``docID``, ``wordID``, ``count``).
        """
        start_time = time.time()
        print("Reading vocabulary file...")
        # keep_default_na=False: words such as "null" or "NA" must stay strings
        # instead of being converted to NaN by the parser.
        self.vocab = pd.read_csv(self.vocabPath, header=None, names=["word"],
                                 keep_default_na=False)
        print(f"Vocabulary file loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        # wordIDs in the docword file are matched against a 1-based vocabulary index.
        self.vocab.index += 1

        start_time = time.time()
        print("Reading docword file (metadata)...")
        # The first three lines of the docword file hold one value each.
        tmp = pd.read_csv(self.docWordPath, compression='gzip', header=None, nrows=3)
        self.docCount = tmp[0].values[0]
        self.wordCount = tmp[0].values[1]
        self.NNZ = tmp[0].values[2]
        print(f"Docword file metadata loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        start_time = time.time()
        print("Reading docword file (data)...")
        self.docWord = self.__read_csv_chunks(self.NNZ, self.docWordPath, compression='gzip', header=None, sep=' ',
                                              quotechar='"', on_bad_lines="skip", skiprows=3, chunksize=10000,
                                              names=["docID", "wordID", "count"])
        print(f"Docword file data loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        # Drop lazily built lookups so they are rebuilt from the fresh data.
        for stale in ("docIDS", "docWordList"):
            self.__dict__.pop(stale, None)

    def get_unique_docs(self):
        """Return the sorted array of distinct docIDs (computed once, then cached)."""
        doc_ids = getattr(self, "docIDS", None)
        if doc_ids is None:
            doc_ids = self.docWord["docID"].unique()
            doc_ids.sort()
            self.docIDS = doc_ids
        return doc_ids

    def get_words_by_docID(self, _id):
        """Return the wordIDs of document *_id* in file order (``[]`` if unknown).

        The docID -> wordIDs index is built with a single ``groupby`` on the
        first call, so later lookups do not rescan the whole table.
        """
        index = getattr(self, "docWordList", None)
        if index is None:
            grouped = self.docWord.groupby("docID", sort=False)["wordID"]
            index = {doc_id: words.tolist() for doc_id, words in grouped}
            self.docWordList = index
        # Hand out a copy so callers cannot corrupt the cached index.
        return list(index.get(_id, ()))


# Optimized Apriori algorithm to find frequent itemsets of size k
def apriori(dataset, min_support, k):
    """Return the frequent itemsets of exactly size *k* with their supports.

    Args:
        dataset: iterable of transactions, each an iterable of hashable items.
        min_support: minimum number of transactions that must contain an
            itemset for it to count as frequent.
        k: size of the itemsets to report.

    Returns:
        ``defaultdict(int)`` mapping each frequent ``frozenset`` of size *k*
        to the number of transactions containing it; empty if none qualify.
    """
    # The progress bar is cosmetic: fall back to a plain iterator without tqdm.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    # Single pass: count 1-itemsets and keep every transaction as a frozenset
    # so the subset tests below do not rebuild a set for each candidate.
    counts = defaultdict(int)
    transactions = []
    for doc in progress(dataset, desc="Generating 1-itemsets", colour="green"):
        items = dict.fromkeys(doc)  # de-duplicated, original order kept
        for word in items:
            counts[frozenset([word])] += 1
        transactions.append(frozenset(items))

    # Prune infrequent 1-itemsets
    level = {itemset: count for itemset, count in counts.items() if count >= min_support}

    frequent_itemsets = defaultdict(int)
    if k == 1:
        frequent_itemsets.update(level)
        return frequent_itemsets

    current_itemsets = list(level)

    # Only size-k itemsets are reported, so generation stops at level k.
    for k_itemsets in range(2, k + 1):
        print(f"\nGenerating {k_itemsets}-itemsets...")

        # Frequent itemsets of the previous level: the prune step must test
        # the (k_itemsets - 1)-subsets against these, not against 1-itemsets.
        previous_level = set(current_itemsets)
        candidate_itemsets = {}
        seen = set()  # a candidate can arise from several joins; count it once

        for i in progress(range(len(current_itemsets)),
                          desc=f"Generating {k_itemsets}-itemsets candidates", colour="yellow"):
            left = current_itemsets[i]
            for j in range(i + 1, len(current_itemsets)):
                candidate = left | current_itemsets[j]
                if len(candidate) != k_itemsets or candidate in seen:
                    continue
                seen.add(candidate)
                # Every (k_itemsets - 1)-subset of a frequent itemset is frequent.
                if not all(frozenset(comb) in previous_level
                           for comb in combinations(candidate, k_itemsets - 1)):
                    continue
                support = sum(1 for doc in transactions if candidate <= doc)
                # Candidates that never occur are not recorded at all.
                if support and support >= min_support:
                    candidate_itemsets[candidate] = support

        if not candidate_itemsets:
            break  # No more frequent itemsets can be generated

        if k_itemsets == k:
            frequent_itemsets.update(candidate_itemsets)

        current_itemsets = list(candidate_itemsets)

    return frequent_itemsets


def main(path, name, k, min_support):
    """Load dataset *name* from *path*, mine size-*k* frequent itemsets, print them.

    Args:
        path: directory holding the dataset files.
        name: dataset name used to build the file names.
        k: size of the itemsets to report.
        min_support: minimum number of documents an itemset must occur in.
    """
    started = time.time()

    # Locate the dataset files and read them into memory.
    loader = File(path, name)
    loader.load_data()

    # One transaction (list of wordIDs) per document.
    documents = []
    for doc_id in loader.get_unique_docs():
        documents.append(loader.get_words_by_docID(doc_id))

    print("Running Apriori algorithm...")
    result = apriori(documents, min_support, k)

    # Report every itemset with its support, or say that none was found.
    print("\nFrequent Itemsets of size", k, ":")
    if not result:
        print(f"No frequent itemsets of size {k} found.")
    else:
        for itemset, count in tqdm(result.items(), desc="Printing itemsets", colour="blue"):
            print(f"Itemset: {set(itemset)}, Support: {count}")

    finished = time.time()
    print(f"Total Time taken: {finished - started:.2f} seconds")


if __name__ == "__main__":
    # Interactive entry point: prompt for the run parameters, then mine the dataset.
    # The dataset name selects vocab.<name>.txt / docword.<name>.txt.gz under `path`.
    dataset_name = input("Enter dataset name (e.g., enron): ")
    # int() raises ValueError if the reply is not a whole number.
    k = int(input("Enter k (size of itemsets): "))
    min_support = int(input("Enter min_support (minimum frequency threshold): "))
    path = r"/content/drive/MyDrive/bag+of+words"  # Adjust path if needed
    main(path, dataset_name, k, min_support)


Enter dataset name (e.g., enron): kos
Enter k (size of itemsets): 3
Enter min_support (minimum frequency threshold): 500
Reading vocabulary file...
Vocabulary file loaded. Time taken: 0.01 seconds.
Reading docword file (metadata)...
Docword file metadata loaded. Time taken: 0.01 seconds.
Reading docword file (data)...


Docword file data loaded. Time taken: 0.28 seconds.
Running Apriori algorithm...


Generating 1-itemsets: 100%|[32m██████████[0m| 3430/3430 [00:00<00:00, 18067.75it/s]



Generating 2-itemsets...


Generating 2-itemsets candidates: 100%|[33m██████████[0m| 50/50 [00:24<00:00,  2.06it/s]



Generating 3-itemsets...


Generating 3-itemsets candidates: 100%|[33m██████████[0m| 101/101 [00:00<00:00, 19939.03it/s]


Frequent Itemsets of size 3 :
No frequent itemsets of size 3 found.
Total Time taken: 27.42 seconds





# Code run for k = 2, f = 800 with output on nips dataset

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import combinations
import time
from pathlib import Path
import os
import errno
from tqdm import tqdm  # Import tqdm for the progress bar

class File:
    """Locate and load a bag-of-words dataset stored as two files.

    The dataset ``<name>`` is expected as ``vocab.<name>.txt`` and
    ``docword.<name>.txt.gz`` inside one directory.  The constructor only
    resolves and validates the paths; ``load_data`` reads the files.
    """

    def __init__(self, path, name):
        """Resolve both dataset files under *path*.

        Raises:
            FileNotFoundError: if either file is missing or is a directory.
        """
        self.name = str(name)
        self.vocabFile = "vocab." + self.name + ".txt"
        self.vocabPath = Path(path) / self.vocabFile
        self.docWordFile = "docword." + self.name + ".txt.gz"
        self.docWordPath = Path(path) / self.docWordFile

        for required in (self.vocabPath, self.docWordPath):
            if not required.exists() or required.is_dir():
                raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), required)

    def __read_csv_chunks(self, lines, path, **params):
        """Read *path* with ``pd.read_csv`` chunk by chunk, printing progress.

        Args:
            lines: expected number of data rows; used only for the percentage.
            path: file to read.
            **params: forwarded to ``pd.read_csv``; must contain ``chunksize``.

        Returns:
            One DataFrame with all chunks concatenated.  When no chunk is
            produced, an empty frame with the requested column names.
        """
        chunk_size = params["chunksize"]
        chunks = []
        for i, chunk in enumerate(pd.read_csv(path, **params)):
            # Guard against a zero row count in the header (division by zero).
            prog = min((i + 1) / lines * 100.0 * chunk_size, 100) if lines else 100
            print(f"Data Load Progress : {prog:.2f} %.", end="\r", flush=True)
            chunks.append(chunk)
        print("\n")
        if not chunks:
            # pd.concat([]) raises ValueError; return an empty table instead.
            return pd.DataFrame(columns=params.get("names"))
        return pd.concat(chunks, axis=0)

    def load_data(self):
        """Read the vocabulary, the three header values and the docword table.

        Sets ``vocab`` (1-based index), ``docCount``, ``wordCount``, ``NNZ``
        and ``docWord`` (columns ``docID``, ``wordID``, ``count``).
        """
        start_time = time.time()
        print("Reading vocabulary file...")
        # keep_default_na=False: words such as "null" or "NA" must stay strings
        # instead of being converted to NaN by the parser.
        self.vocab = pd.read_csv(self.vocabPath, header=None, names=["word"],
                                 keep_default_na=False)
        print(f"Vocabulary file loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        # wordIDs in the docword file are matched against a 1-based vocabulary index.
        self.vocab.index += 1

        start_time = time.time()
        print("Reading docword file (metadata)...")
        # The first three lines of the docword file hold one value each.
        tmp = pd.read_csv(self.docWordPath, compression='gzip', header=None, nrows=3)
        self.docCount = tmp[0].values[0]
        self.wordCount = tmp[0].values[1]
        self.NNZ = tmp[0].values[2]
        print(f"Docword file metadata loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        start_time = time.time()
        print("Reading docword file (data)...")
        self.docWord = self.__read_csv_chunks(self.NNZ, self.docWordPath, compression='gzip', header=None, sep=' ',
                                              quotechar='"', on_bad_lines="skip", skiprows=3, chunksize=10000,
                                              names=["docID", "wordID", "count"])
        print(f"Docword file data loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        # Drop lazily built lookups so they are rebuilt from the fresh data.
        for stale in ("docIDS", "docWordList"):
            self.__dict__.pop(stale, None)

    def get_unique_docs(self):
        """Return the sorted array of distinct docIDs (computed once, then cached)."""
        doc_ids = getattr(self, "docIDS", None)
        if doc_ids is None:
            doc_ids = self.docWord["docID"].unique()
            doc_ids.sort()
            self.docIDS = doc_ids
        return doc_ids

    def get_words_by_docID(self, _id):
        """Return the wordIDs of document *_id* in file order (``[]`` if unknown).

        The docID -> wordIDs index is built with a single ``groupby`` on the
        first call, so later lookups do not rescan the whole table.
        """
        index = getattr(self, "docWordList", None)
        if index is None:
            grouped = self.docWord.groupby("docID", sort=False)["wordID"]
            index = {doc_id: words.tolist() for doc_id, words in grouped}
            self.docWordList = index
        # Hand out a copy so callers cannot corrupt the cached index.
        return list(index.get(_id, ()))


# Optimized Apriori algorithm to find frequent itemsets of size k
def apriori(dataset, min_support, k):
    """Return the frequent itemsets of exactly size *k* with their supports.

    Args:
        dataset: iterable of transactions, each an iterable of hashable items.
        min_support: minimum number of transactions that must contain an
            itemset for it to count as frequent.
        k: size of the itemsets to report.

    Returns:
        ``defaultdict(int)`` mapping each frequent ``frozenset`` of size *k*
        to the number of transactions containing it; empty if none qualify.
    """
    # The progress bar is cosmetic: fall back to a plain iterator without tqdm.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    # Single pass: count 1-itemsets and keep every transaction as a frozenset
    # so the subset tests below do not rebuild a set for each candidate.
    counts = defaultdict(int)
    transactions = []
    for doc in progress(dataset, desc="Generating 1-itemsets", colour="green"):
        items = dict.fromkeys(doc)  # de-duplicated, original order kept
        for word in items:
            counts[frozenset([word])] += 1
        transactions.append(frozenset(items))

    # Prune infrequent 1-itemsets
    level = {itemset: count for itemset, count in counts.items() if count >= min_support}

    frequent_itemsets = defaultdict(int)
    if k == 1:
        frequent_itemsets.update(level)
        return frequent_itemsets

    current_itemsets = list(level)

    # Only size-k itemsets are reported, so generation stops at level k.
    for k_itemsets in range(2, k + 1):
        print(f"\nGenerating {k_itemsets}-itemsets...")

        # Frequent itemsets of the previous level: the prune step must test
        # the (k_itemsets - 1)-subsets against these, not against 1-itemsets.
        previous_level = set(current_itemsets)
        candidate_itemsets = {}
        seen = set()  # a candidate can arise from several joins; count it once

        for i in progress(range(len(current_itemsets)),
                          desc=f"Generating {k_itemsets}-itemsets candidates", colour="yellow"):
            left = current_itemsets[i]
            for j in range(i + 1, len(current_itemsets)):
                candidate = left | current_itemsets[j]
                if len(candidate) != k_itemsets or candidate in seen:
                    continue
                seen.add(candidate)
                # Every (k_itemsets - 1)-subset of a frequent itemset is frequent.
                if not all(frozenset(comb) in previous_level
                           for comb in combinations(candidate, k_itemsets - 1)):
                    continue
                support = sum(1 for doc in transactions if candidate <= doc)
                # Candidates that never occur are not recorded at all.
                if support and support >= min_support:
                    candidate_itemsets[candidate] = support

        if not candidate_itemsets:
            break  # No more frequent itemsets can be generated

        if k_itemsets == k:
            frequent_itemsets.update(candidate_itemsets)

        current_itemsets = list(candidate_itemsets)

    return frequent_itemsets


def main(path, name, k, min_support):
    """Load dataset *name* from *path*, mine size-*k* frequent itemsets, print them.

    Args:
        path: directory holding the dataset files.
        name: dataset name used to build the file names.
        k: size of the itemsets to report.
        min_support: minimum number of documents an itemset must occur in.
    """
    started = time.time()

    # Locate the dataset files and read them into memory.
    loader = File(path, name)
    loader.load_data()

    # One transaction (list of wordIDs) per document.
    documents = []
    for doc_id in loader.get_unique_docs():
        documents.append(loader.get_words_by_docID(doc_id))

    print("Running Apriori algorithm...")
    result = apriori(documents, min_support, k)

    # Report every itemset with its support, or say that none was found.
    print("\nFrequent Itemsets of size", k, ":")
    if not result:
        print(f"No frequent itemsets of size {k} found.")
    else:
        for itemset, count in tqdm(result.items(), desc="Printing itemsets", colour="blue"):
            print(f"Itemset: {set(itemset)}, Support: {count}")

    finished = time.time()
    print(f"Total Time taken: {finished - started:.2f} seconds")


if __name__ == "__main__":
    # Interactive entry point: prompt for the run parameters, then mine the dataset.
    # The dataset name selects vocab.<name>.txt / docword.<name>.txt.gz under `path`.
    dataset_name = input("Enter dataset name (e.g., enron): ")
    # int() raises ValueError if the reply is not a whole number.
    k = int(input("Enter k (size of itemsets): "))
    min_support = int(input("Enter min_support (minimum frequency threshold): "))
    path = r"/content/drive/MyDrive/bag+of+words"  # Adjust path if needed
    main(path, dataset_name, k, min_support)


Enter dataset name (e.g., enron): nips
Enter k (size of itemsets): 2
Enter min_support (minimum frequency threshold): 800
Reading vocabulary file...
Vocabulary file loaded. Time taken: 0.01 seconds.
Reading docword file (metadata)...
Docword file metadata loaded. Time taken: 0.01 seconds.
Reading docword file (data)...


Docword file data loaded. Time taken: 0.45 seconds.
Running Apriori algorithm...


Generating 1-itemsets: 100%|[32m██████████[0m| 1500/1500 [00:00<00:00, 2064.29it/s]



Generating 2-itemsets...


Generating 2-itemsets candidates: 100%|[33m██████████[0m| 75/75 [01:53<00:00,  1.51s/it]



Frequent Itemsets of size 2 :


Printing itemsets: 100%|[34m██████████[0m| 724/724 [00:00<00:00, 108755.68it/s]

Itemset: {316, 39}, Support: 1071
Itemset: {428, 39}, Support: 893
Itemset: {532, 39}, Support: 878
Itemset: {540, 39}, Support: 976
Itemset: {1482, 39}, Support: 1128
Itemset: {2056, 39}, Support: 866
Itemset: {2087, 39}, Support: 838
Itemset: {2175, 39}, Support: 811
Itemset: {2574, 39}, Support: 1128
Itemset: {2676, 39}, Support: 823
Itemset: {3603, 39}, Support: 967
Itemset: {3750, 39}, Support: 810
Itemset: {4146, 39}, Support: 1055
Itemset: {4180, 39}, Support: 858
Itemset: {4270, 39}, Support: 1355
Itemset: {4372, 39}, Support: 920
Itemset: {4810, 39}, Support: 835
Itemset: {5399, 39}, Support: 1192
Itemset: {5554, 39}, Support: 1291
Itemset: {6056, 39}, Support: 1077
Itemset: {6120, 39}, Support: 1132
Itemset: {6245, 39}, Support: 923
Itemset: {7011, 39}, Support: 1242
Itemset: {7358, 39}, Support: 1277
Itemset: {7365, 39}, Support: 1343
Itemset: {7484, 39}, Support: 905
Itemset: {7579, 39}, Support: 1276
Itemset: {7623, 39}, Support: 823
Itemset: {7787, 39}, Support: 1092
Item




# Code run for k = 3, f = 900 with output on nips dataset

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import combinations
import time
from pathlib import Path
import os
import errno
from tqdm import tqdm  # Import tqdm for the progress bar

class File:
    """Load a bag-of-words dataset made of a vocabulary file and a gzipped docword file.

    The files are looked up as ``vocab.NAME.txt`` and ``docword.NAME.txt.gz``
    inside ``path``. The first three lines of the docword file are read as the
    document count, the word count and the number of data rows (NNZ); every
    later line is parsed as a space-separated ``docID wordID count`` row.
    """

    def __init__(self, path, name):
        """Resolve both file paths and fail fast when one of them is missing.

        Args:
            path: Directory containing the dataset files.
            name: Dataset name used to build the two file names.

        Raises:
            FileNotFoundError: If either file does not exist or is a directory.
        """
        self.name = str(name)
        self.vocabFile = "vocab." + self.name + ".txt"
        self.vocabPath = Path(path) / self.vocabFile
        self.docWordFile = "docword." + self.name + ".txt.gz"
        self.docWordPath = Path(path) / self.docWordFile

        # The vocabulary file is checked first, then the docword file.
        for required in (self.vocabPath, self.docWordPath):
            if not required.exists() or required.is_dir():
                raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), required)

    def __read_csv_chunks(self, lines, path, **params):
        """Read ``path`` chunk by chunk, printing progress, and return one DataFrame.

        Args:
            lines: Expected number of data rows; only used for the progress figure.
            path: File handed to ``pd.read_csv``.
            **params: Keyword arguments for ``pd.read_csv``; must include ``chunksize``.

        Returns:
            The concatenation of all chunks, or an empty DataFrame (with the
            requested column names, if any) when the reader yields no chunk.
        """
        # Guard the denominator so an empty or unparsable row count cannot
        # break the progress computation.
        total = lines if lines > 0 else 1
        step = params["chunksize"]
        chunks = []
        # The context manager closes the underlying file handle even if
        # parsing fails part-way through.
        with pd.read_csv(path, **params) as reader:
            for i, chunk in enumerate(reader):
                prog = min((i + 1) * step / total * 100.0, 100)
                print(f"Data Load Progress : {prog:.2f} %.", end="\r", flush=True)
                chunks.append(chunk)
        print("\n")
        if not chunks:
            # pd.concat raises ValueError on an empty list.
            return pd.DataFrame(columns=params.get("names"))
        return pd.concat(chunks, axis=0)

    def load_data(self):
        """Read the vocabulary, the docword header and the docword rows.

        Sets ``vocab`` (1-based index), ``docCount``, ``wordCount``, ``NNZ``
        and ``docWord``, and discards any per-document caches built from a
        previous load.
        """
        start_time = time.time()
        print("Reading vocabulary file...")
        # na_filter=False keeps words such as "null" or "nan" as text instead
        # of letting pandas turn them into missing values.
        self.vocab = pd.read_csv(self.vocabPath, header=None, names=["word"], na_filter=False)
        print(f"Vocabulary file loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        # Shift the index so that row labels start at 1 instead of 0.
        self.vocab.index += 1

        start_time = time.time()
        print("Reading docword file (metadata)...")
        tmp = pd.read_csv(self.docWordPath, compression='gzip', header=None, nrows=3)
        self.docCount = tmp[0].values[0]
        self.wordCount = tmp[0].values[1]
        self.NNZ = tmp[0].values[2]
        print(f"Docword file metadata loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        start_time = time.time()
        print("Reading docword file (data)...")
        self.docWord = self.__read_csv_chunks(self.NNZ, self.docWordPath, compression='gzip', header=None, sep=' ',
                                              quotechar='"', on_bad_lines="skip", skiprows=3, chunksize=10000,
                                              names=["docID", "wordID", "count"])
        print(f"Docword file data loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        # Caches derived from an earlier load would no longer match docWord.
        for cached in ("docIDS", "docWordList"):
            self.__dict__.pop(cached, None)

    def get_unique_docs(self):
        """Return the sorted array of distinct document IDs (computed once, then cached)."""
        try:
            return self.docIDS
        except AttributeError:
            doc_ids = self.docWord["docID"].unique()
            doc_ids.sort()
            self.docIDS = doc_ids
            return self.docIDS

    def get_words_by_docID(self, _id):
        """Return the word IDs of document ``_id`` in file order.

        The docID -> word-ID index is built in a single pass on first use and
        stored in ``docWordList``, so later calls are dictionary lookups
        instead of a scan over the whole table.

        Args:
            _id: Document ID to look up.

        Returns:
            A new list of word IDs; empty when the document ID is unknown.
        """
        try:
            index = self.docWordList
        except AttributeError:
            grouped = defaultdict(list)
            pairs = zip(self.docWord["docID"].tolist(), self.docWord["wordID"].tolist())
            for doc_id, word_id in pairs:
                grouped[doc_id].append(word_id)
            index = self.docWordList = dict(grouped)
        try:
            # Copy so callers cannot mutate the cached index.
            return list(index[_id])
        except KeyError:
            return []


# Optimized Apriori algorithm to find frequent itemsets of size k
def apriori(dataset, min_support, k):
    """Return the frequent itemsets of exactly size ``k`` with their support.

    Support is the number of transactions that contain every item of the
    itemset. For each frequent itemset the set of transaction indices
    containing it is kept, so the support of a joined candidate is the size of
    the intersection of its two parents' index sets and no candidate needs a
    rescan of the dataset.

    Args:
        dataset: Sequence of transactions, each an iterable of hashable items.
            An item repeated inside one transaction counts once.
        min_support: Minimum number of transactions an itemset must occur in.
        k: Size of the itemsets to return.

    Returns:
        A ``defaultdict(int)`` mapping each frequent ``frozenset`` of size
        ``k`` to its support. Empty when ``k`` is less than 1 or when some
        level up to ``k`` has no frequent itemset.
    """
    frequent_itemsets = defaultdict(int)
    if k < 1:
        return frequent_itemsets

    # Level 1: item -> indices of the transactions that contain it.
    tidsets = defaultdict(set)
    for tid, doc in enumerate(tqdm(dataset, desc="Generating 1-itemsets", colour="green")):
        for word in doc:
            tidsets[frozenset([word])].add(tid)

    # Prune infrequent 1-itemsets.
    current = {itemset: tids for itemset, tids in tidsets.items() if len(tids) >= min_support}

    for k_itemsets in range(2, k + 1):
        print(f"\nGenerating {k_itemsets}-itemsets...")

        # Frequent itemsets of the previous level; the pruning test below must
        # look at this level, not at the 1-itemsets.
        previous = current
        previous_keys = list(previous)
        current = {}
        # Each candidate of size >= 3 is produced by several joins; evaluate it once.
        seen = set()

        for i in tqdm(range(len(previous_keys)), desc=f"Generating {k_itemsets}-itemsets candidates", colour="yellow"):
            left = previous_keys[i]
            left_tids = previous[left]
            for j in range(i + 1, len(previous_keys)):
                right = previous_keys[j]
                candidate = left | right
                if len(candidate) != k_itemsets or candidate in seen:
                    continue
                seen.add(candidate)
                # Every (k_itemsets - 1)-subset must itself be frequent.
                if not all((candidate - {item}) in previous for item in candidate):
                    continue
                # candidate == left | right, so it occurs exactly in the
                # transactions that contain both parents.
                tids = left_tids & previous[right]
                if len(tids) >= min_support:
                    current[candidate] = tids

        if not current:
            break  # No more frequent itemsets can be generated

    # `current` holds level k if the loop completed (or k == 1); it is empty
    # if the search stopped early.
    frequent_itemsets.update((itemset, len(tids)) for itemset, tids in current.items())
    return frequent_itemsets


def main(path, name, k, min_support):
    """Load a dataset, mine its frequent itemsets of size ``k`` and print them.

    Args:
        path: Directory containing the dataset files.
        name: Dataset name passed to ``File``.
        k: Size of the itemsets to report.
        min_support: Minimum frequency threshold passed to ``apriori``.
    """
    started = time.time()

    # Locate the dataset files and read them into memory.
    loader = File(path, name)
    loader.load_data()

    # Build one transaction (list of word IDs) per distinct document.
    documents = []
    for doc_id in loader.get_unique_docs():
        documents.append(loader.get_words_by_docID(doc_id))

    print("Running Apriori algorithm...")
    frequent_itemsets = apriori(documents, min_support, k)

    # Report the itemsets, or say so explicitly when there are none.
    print("\nFrequent Itemsets of size", k, ":")
    if not frequent_itemsets:
        print(f"No frequent itemsets of size {k} found.")
    else:
        for itemset, count in tqdm(frequent_itemsets.items(), desc="Printing itemsets", colour="blue"):
            print(f"Itemset: {set(itemset)}, Support: {count}")

    # The reported time covers loading, mining and printing.
    print(f"Total Time taken: {time.time() - started:.2f} seconds")


if __name__ == "__main__":
    # Interactive entry point: ask for the dataset name, the itemset size and
    # the support threshold, then run the whole pipeline.
    dataset_name = input("Enter dataset name (e.g., enron): ")
    k = int(input("Enter k (size of itemsets): "))
    min_support = int(input("Enter min_support (minimum frequency threshold): "))
    path = r"/content/drive/MyDrive/bag+of+words"  # Adjust path if needed
    main(path=path, name=dataset_name, k=k, min_support=min_support)


Enter dataset name (e.g., enron): nips
Enter k (size of itemsets): 3
Enter min_support (minimum frequency threshold): 900
Reading vocabulary file...
Vocabulary file loaded. Time taken: 0.01 seconds.
Reading docword file (metadata)...
Docword file metadata loaded. Time taken: 0.01 seconds.
Reading docword file (data)...


Docword file data loaded. Time taken: 0.48 seconds.
Running Apriori algorithm...


Generating 1-itemsets: 100%|[32m██████████[0m| 1500/1500 [00:00<00:00, 3368.94it/s]



Generating 2-itemsets...


Generating 2-itemsets candidates: 100%|[33m██████████[0m| 50/50 [00:51<00:00,  1.02s/it]



Generating 3-itemsets...


Generating 3-itemsets candidates: 100%|[33m██████████[0m| 365/365 [00:00<00:00, 13984.59it/s]


Frequent Itemsets of size 3 :
No frequent itemsets of size 3 found.
Total Time taken: 53.79 seconds





# Code run for k = 3, f = 500 with output on nips dataset

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import combinations
import time
from pathlib import Path
import os
import errno
from tqdm import tqdm  # Import tqdm for the progress bar

class File:
    """Load a bag-of-words dataset made of a vocabulary file and a gzipped docword file.

    The files are looked up as ``vocab.NAME.txt`` and ``docword.NAME.txt.gz``
    inside ``path``. The first three lines of the docword file are read as the
    document count, the word count and the number of data rows (NNZ); every
    later line is parsed as a space-separated ``docID wordID count`` row.
    """

    def __init__(self, path, name):
        """Resolve both file paths and fail fast when one of them is missing.

        Args:
            path: Directory containing the dataset files.
            name: Dataset name used to build the two file names.

        Raises:
            FileNotFoundError: If either file does not exist or is a directory.
        """
        self.name = str(name)
        self.vocabFile = "vocab." + self.name + ".txt"
        self.vocabPath = Path(path) / self.vocabFile
        self.docWordFile = "docword." + self.name + ".txt.gz"
        self.docWordPath = Path(path) / self.docWordFile

        # The vocabulary file is checked first, then the docword file.
        for required in (self.vocabPath, self.docWordPath):
            if not required.exists() or required.is_dir():
                raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), required)

    def __read_csv_chunks(self, lines, path, **params):
        """Read ``path`` chunk by chunk, printing progress, and return one DataFrame.

        Args:
            lines: Expected number of data rows; only used for the progress figure.
            path: File handed to ``pd.read_csv``.
            **params: Keyword arguments for ``pd.read_csv``; must include ``chunksize``.

        Returns:
            The concatenation of all chunks, or an empty DataFrame (with the
            requested column names, if any) when the reader yields no chunk.
        """
        # Guard the denominator so an empty or unparsable row count cannot
        # break the progress computation.
        total = lines if lines > 0 else 1
        step = params["chunksize"]
        chunks = []
        # The context manager closes the underlying file handle even if
        # parsing fails part-way through.
        with pd.read_csv(path, **params) as reader:
            for i, chunk in enumerate(reader):
                prog = min((i + 1) * step / total * 100.0, 100)
                print(f"Data Load Progress : {prog:.2f} %.", end="\r", flush=True)
                chunks.append(chunk)
        print("\n")
        if not chunks:
            # pd.concat raises ValueError on an empty list.
            return pd.DataFrame(columns=params.get("names"))
        return pd.concat(chunks, axis=0)

    def load_data(self):
        """Read the vocabulary, the docword header and the docword rows.

        Sets ``vocab`` (1-based index), ``docCount``, ``wordCount``, ``NNZ``
        and ``docWord``, and discards any per-document caches built from a
        previous load.
        """
        start_time = time.time()
        print("Reading vocabulary file...")
        # na_filter=False keeps words such as "null" or "nan" as text instead
        # of letting pandas turn them into missing values.
        self.vocab = pd.read_csv(self.vocabPath, header=None, names=["word"], na_filter=False)
        print(f"Vocabulary file loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        # Shift the index so that row labels start at 1 instead of 0.
        self.vocab.index += 1

        start_time = time.time()
        print("Reading docword file (metadata)...")
        tmp = pd.read_csv(self.docWordPath, compression='gzip', header=None, nrows=3)
        self.docCount = tmp[0].values[0]
        self.wordCount = tmp[0].values[1]
        self.NNZ = tmp[0].values[2]
        print(f"Docword file metadata loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        start_time = time.time()
        print("Reading docword file (data)...")
        self.docWord = self.__read_csv_chunks(self.NNZ, self.docWordPath, compression='gzip', header=None, sep=' ',
                                              quotechar='"', on_bad_lines="skip", skiprows=3, chunksize=10000,
                                              names=["docID", "wordID", "count"])
        print(f"Docword file data loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        # Caches derived from an earlier load would no longer match docWord.
        for cached in ("docIDS", "docWordList"):
            self.__dict__.pop(cached, None)

    def get_unique_docs(self):
        """Return the sorted array of distinct document IDs (computed once, then cached)."""
        try:
            return self.docIDS
        except AttributeError:
            doc_ids = self.docWord["docID"].unique()
            doc_ids.sort()
            self.docIDS = doc_ids
            return self.docIDS

    def get_words_by_docID(self, _id):
        """Return the word IDs of document ``_id`` in file order.

        The docID -> word-ID index is built in a single pass on first use and
        stored in ``docWordList``, so later calls are dictionary lookups
        instead of a scan over the whole table.

        Args:
            _id: Document ID to look up.

        Returns:
            A new list of word IDs; empty when the document ID is unknown.
        """
        try:
            index = self.docWordList
        except AttributeError:
            grouped = defaultdict(list)
            pairs = zip(self.docWord["docID"].tolist(), self.docWord["wordID"].tolist())
            for doc_id, word_id in pairs:
                grouped[doc_id].append(word_id)
            index = self.docWordList = dict(grouped)
        try:
            # Copy so callers cannot mutate the cached index.
            return list(index[_id])
        except KeyError:
            return []


# Optimized Apriori algorithm to find frequent itemsets of size k
def apriori(dataset, min_support, k):
    """Return the frequent itemsets of exactly size ``k`` with their support.

    Support is the number of transactions that contain every item of the
    itemset. For each frequent itemset the set of transaction indices
    containing it is kept, so the support of a joined candidate is the size of
    the intersection of its two parents' index sets and no candidate needs a
    rescan of the dataset.

    Args:
        dataset: Sequence of transactions, each an iterable of hashable items.
            An item repeated inside one transaction counts once.
        min_support: Minimum number of transactions an itemset must occur in.
        k: Size of the itemsets to return.

    Returns:
        A ``defaultdict(int)`` mapping each frequent ``frozenset`` of size
        ``k`` to its support. Empty when ``k`` is less than 1 or when some
        level up to ``k`` has no frequent itemset.
    """
    frequent_itemsets = defaultdict(int)
    if k < 1:
        return frequent_itemsets

    # Level 1: item -> indices of the transactions that contain it.
    tidsets = defaultdict(set)
    for tid, doc in enumerate(tqdm(dataset, desc="Generating 1-itemsets", colour="green")):
        for word in doc:
            tidsets[frozenset([word])].add(tid)

    # Prune infrequent 1-itemsets.
    current = {itemset: tids for itemset, tids in tidsets.items() if len(tids) >= min_support}

    for k_itemsets in range(2, k + 1):
        print(f"\nGenerating {k_itemsets}-itemsets...")

        # Frequent itemsets of the previous level; the pruning test below must
        # look at this level, not at the 1-itemsets.
        previous = current
        previous_keys = list(previous)
        current = {}
        # Each candidate of size >= 3 is produced by several joins; evaluate it once.
        seen = set()

        for i in tqdm(range(len(previous_keys)), desc=f"Generating {k_itemsets}-itemsets candidates", colour="yellow"):
            left = previous_keys[i]
            left_tids = previous[left]
            for j in range(i + 1, len(previous_keys)):
                right = previous_keys[j]
                candidate = left | right
                if len(candidate) != k_itemsets or candidate in seen:
                    continue
                seen.add(candidate)
                # Every (k_itemsets - 1)-subset must itself be frequent.
                if not all((candidate - {item}) in previous for item in candidate):
                    continue
                # candidate == left | right, so it occurs exactly in the
                # transactions that contain both parents.
                tids = left_tids & previous[right]
                if len(tids) >= min_support:
                    current[candidate] = tids

        if not current:
            break  # No more frequent itemsets can be generated

    # `current` holds level k if the loop completed (or k == 1); it is empty
    # if the search stopped early.
    frequent_itemsets.update((itemset, len(tids)) for itemset, tids in current.items())
    return frequent_itemsets


def main(path, name, k, min_support):
    """Load a dataset, mine its frequent itemsets of size ``k`` and print them.

    Args:
        path: Directory containing the dataset files.
        name: Dataset name passed to ``File``.
        k: Size of the itemsets to report.
        min_support: Minimum frequency threshold passed to ``apriori``.
    """
    started = time.time()

    # Locate the dataset files and read them into memory.
    loader = File(path, name)
    loader.load_data()

    # Build one transaction (list of word IDs) per distinct document.
    documents = []
    for doc_id in loader.get_unique_docs():
        documents.append(loader.get_words_by_docID(doc_id))

    print("Running Apriori algorithm...")
    frequent_itemsets = apriori(documents, min_support, k)

    # Report the itemsets, or say so explicitly when there are none.
    print("\nFrequent Itemsets of size", k, ":")
    if not frequent_itemsets:
        print(f"No frequent itemsets of size {k} found.")
    else:
        for itemset, count in tqdm(frequent_itemsets.items(), desc="Printing itemsets", colour="blue"):
            print(f"Itemset: {set(itemset)}, Support: {count}")

    # The reported time covers loading, mining and printing.
    print(f"Total Time taken: {time.time() - started:.2f} seconds")


if __name__ == "__main__":
    # Interactive entry point: ask for the dataset name, the itemset size and
    # the support threshold, then run the whole pipeline.
    dataset_name = input("Enter dataset name (e.g., enron): ")
    k = int(input("Enter k (size of itemsets): "))
    min_support = int(input("Enter min_support (minimum frequency threshold): "))
    path = r"/content/drive/MyDrive/bag+of+words"  # Adjust path if needed
    main(path=path, name=dataset_name, k=k, min_support=min_support)


Enter dataset name (e.g., enron): nips
Enter k (size of itemsets): 3
Enter min_support (minimum frequency threshold): 500
Reading vocabulary file...
Vocabulary file loaded. Time taken: 0.01 seconds.
Reading docword file (metadata)...
Docword file metadata loaded. Time taken: 0.01 seconds.
Reading docword file (data)...


Docword file data loaded. Time taken: 0.44 seconds.
Running Apriori algorithm...


Generating 1-itemsets: 100%|[32m██████████[0m| 1500/1500 [00:00<00:00, 3278.62it/s]



Generating 2-itemsets...


Generating 2-itemsets candidates: 100%|[33m██████████[0m| 252/252 [21:34<00:00,  5.14s/it]



Generating 3-itemsets...


Generating 3-itemsets candidates: 100%|[33m██████████[0m| 5521/5521 [00:04<00:00, 1330.70it/s]


Frequent Itemsets of size 3 :
No frequent itemsets of size 3 found.
Total Time taken: 1301.73 seconds





# Code run for k = 2, f = 1000 with output on enron dataset

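(Not completed: the run was interrupted manually because the 2-itemset pass was progressing at roughly 173 seconds per iteration, with more than 37 hours estimated to remain.)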

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import combinations
import time
from pathlib import Path
import os
import errno
from tqdm import tqdm  # Import tqdm for the progress bar

class File:
    """Load a bag-of-words dataset made of a vocabulary file and a gzipped docword file.

    The files are looked up as ``vocab.NAME.txt`` and ``docword.NAME.txt.gz``
    inside ``path``. The first three lines of the docword file are read as the
    document count, the word count and the number of data rows (NNZ); every
    later line is parsed as a space-separated ``docID wordID count`` row.
    """

    def __init__(self, path, name):
        """Resolve both file paths and fail fast when one of them is missing.

        Args:
            path: Directory containing the dataset files.
            name: Dataset name used to build the two file names.

        Raises:
            FileNotFoundError: If either file does not exist or is a directory.
        """
        self.name = str(name)
        self.vocabFile = "vocab." + self.name + ".txt"
        self.vocabPath = Path(path) / self.vocabFile
        self.docWordFile = "docword." + self.name + ".txt.gz"
        self.docWordPath = Path(path) / self.docWordFile

        # The vocabulary file is checked first, then the docword file.
        for required in (self.vocabPath, self.docWordPath):
            if not required.exists() or required.is_dir():
                raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), required)

    def __read_csv_chunks(self, lines, path, **params):
        """Read ``path`` chunk by chunk, printing progress, and return one DataFrame.

        Args:
            lines: Expected number of data rows; only used for the progress figure.
            path: File handed to ``pd.read_csv``.
            **params: Keyword arguments for ``pd.read_csv``; must include ``chunksize``.

        Returns:
            The concatenation of all chunks, or an empty DataFrame (with the
            requested column names, if any) when the reader yields no chunk.
        """
        # Guard the denominator so an empty or unparsable row count cannot
        # break the progress computation.
        total = lines if lines > 0 else 1
        step = params["chunksize"]
        chunks = []
        # The context manager closes the underlying file handle even if
        # parsing fails part-way through.
        with pd.read_csv(path, **params) as reader:
            for i, chunk in enumerate(reader):
                prog = min((i + 1) * step / total * 100.0, 100)
                print(f"Data Load Progress : {prog:.2f} %.", end="\r", flush=True)
                chunks.append(chunk)
        print("\n")
        if not chunks:
            # pd.concat raises ValueError on an empty list.
            return pd.DataFrame(columns=params.get("names"))
        return pd.concat(chunks, axis=0)

    def load_data(self):
        """Read the vocabulary, the docword header and the docword rows.

        Sets ``vocab`` (1-based index), ``docCount``, ``wordCount``, ``NNZ``
        and ``docWord``, and discards any per-document caches built from a
        previous load.
        """
        start_time = time.time()
        print("Reading vocabulary file...")
        # na_filter=False keeps words such as "null" or "nan" as text instead
        # of letting pandas turn them into missing values.
        self.vocab = pd.read_csv(self.vocabPath, header=None, names=["word"], na_filter=False)
        print(f"Vocabulary file loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        # Shift the index so that row labels start at 1 instead of 0.
        self.vocab.index += 1

        start_time = time.time()
        print("Reading docword file (metadata)...")
        tmp = pd.read_csv(self.docWordPath, compression='gzip', header=None, nrows=3)
        self.docCount = tmp[0].values[0]
        self.wordCount = tmp[0].values[1]
        self.NNZ = tmp[0].values[2]
        print(f"Docword file metadata loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        start_time = time.time()
        print("Reading docword file (data)...")
        self.docWord = self.__read_csv_chunks(self.NNZ, self.docWordPath, compression='gzip', header=None, sep=' ',
                                              quotechar='"', on_bad_lines="skip", skiprows=3, chunksize=10000,
                                              names=["docID", "wordID", "count"])
        print(f"Docword file data loaded. Time taken: {time.time() - start_time:.2f} seconds.")

        # Caches derived from an earlier load would no longer match docWord.
        for cached in ("docIDS", "docWordList"):
            self.__dict__.pop(cached, None)

    def get_unique_docs(self):
        """Return the sorted array of distinct document IDs (computed once, then cached)."""
        try:
            return self.docIDS
        except AttributeError:
            doc_ids = self.docWord["docID"].unique()
            doc_ids.sort()
            self.docIDS = doc_ids
            return self.docIDS

    def get_words_by_docID(self, _id):
        """Return the word IDs of document ``_id`` in file order.

        The docID -> word-ID index is built in a single pass on first use and
        stored in ``docWordList``, so later calls are dictionary lookups
        instead of a scan over the whole table.

        Args:
            _id: Document ID to look up.

        Returns:
            A new list of word IDs; empty when the document ID is unknown.
        """
        try:
            index = self.docWordList
        except AttributeError:
            grouped = defaultdict(list)
            pairs = zip(self.docWord["docID"].tolist(), self.docWord["wordID"].tolist())
            for doc_id, word_id in pairs:
                grouped[doc_id].append(word_id)
            index = self.docWordList = dict(grouped)
        try:
            # Copy so callers cannot mutate the cached index.
            return list(index[_id])
        except KeyError:
            return []


# Optimized Apriori algorithm to find frequent itemsets of size k
def apriori(dataset, min_support, k):
    """Return the frequent itemsets of exactly size ``k`` with their support.

    Support is the number of transactions that contain every item of the
    itemset. For each frequent itemset the set of transaction indices
    containing it is kept, so the support of a joined candidate is the size of
    the intersection of its two parents' index sets and no candidate needs a
    rescan of the dataset.

    Args:
        dataset: Sequence of transactions, each an iterable of hashable items.
            An item repeated inside one transaction counts once.
        min_support: Minimum number of transactions an itemset must occur in.
        k: Size of the itemsets to return.

    Returns:
        A ``defaultdict(int)`` mapping each frequent ``frozenset`` of size
        ``k`` to its support. Empty when ``k`` is less than 1 or when some
        level up to ``k`` has no frequent itemset.
    """
    frequent_itemsets = defaultdict(int)
    if k < 1:
        return frequent_itemsets

    # Level 1: item -> indices of the transactions that contain it.
    tidsets = defaultdict(set)
    for tid, doc in enumerate(tqdm(dataset, desc="Generating 1-itemsets", colour="green")):
        for word in doc:
            tidsets[frozenset([word])].add(tid)

    # Prune infrequent 1-itemsets.
    current = {itemset: tids for itemset, tids in tidsets.items() if len(tids) >= min_support}

    for k_itemsets in range(2, k + 1):
        print(f"\nGenerating {k_itemsets}-itemsets...")

        # Frequent itemsets of the previous level; the pruning test below must
        # look at this level, not at the 1-itemsets.
        previous = current
        previous_keys = list(previous)
        current = {}
        # Each candidate of size >= 3 is produced by several joins; evaluate it once.
        seen = set()

        for i in tqdm(range(len(previous_keys)), desc=f"Generating {k_itemsets}-itemsets candidates", colour="yellow"):
            left = previous_keys[i]
            left_tids = previous[left]
            for j in range(i + 1, len(previous_keys)):
                right = previous_keys[j]
                candidate = left | right
                if len(candidate) != k_itemsets or candidate in seen:
                    continue
                seen.add(candidate)
                # Every (k_itemsets - 1)-subset must itself be frequent.
                if not all((candidate - {item}) in previous for item in candidate):
                    continue
                # candidate == left | right, so it occurs exactly in the
                # transactions that contain both parents.
                tids = left_tids & previous[right]
                if len(tids) >= min_support:
                    current[candidate] = tids

        if not current:
            break  # No more frequent itemsets can be generated

    # `current` holds level k if the loop completed (or k == 1); it is empty
    # if the search stopped early.
    frequent_itemsets.update((itemset, len(tids)) for itemset, tids in current.items())
    return frequent_itemsets


def main(path, name, k, min_support):
    """Load a dataset, mine its frequent itemsets of size ``k`` and print them.

    Args:
        path: Directory containing the dataset files.
        name: Dataset name passed to ``File``.
        k: Size of the itemsets to report.
        min_support: Minimum frequency threshold passed to ``apriori``.
    """
    started = time.time()

    # Locate the dataset files and read them into memory.
    loader = File(path, name)
    loader.load_data()

    # Build one transaction (list of word IDs) per distinct document.
    documents = []
    for doc_id in loader.get_unique_docs():
        documents.append(loader.get_words_by_docID(doc_id))

    print("Running Apriori algorithm...")
    frequent_itemsets = apriori(documents, min_support, k)

    # Report the itemsets, or say so explicitly when there are none.
    print("\nFrequent Itemsets of size", k, ":")
    if not frequent_itemsets:
        print(f"No frequent itemsets of size {k} found.")
    else:
        for itemset, count in tqdm(frequent_itemsets.items(), desc="Printing itemsets", colour="blue"):
            print(f"Itemset: {set(itemset)}, Support: {count}")

    # The reported time covers loading, mining and printing.
    print(f"Total Time taken: {time.time() - started:.2f} seconds")


if __name__ == "__main__":
    # Interactive entry point: ask for the dataset name, the itemset size and
    # the support threshold, then run the whole pipeline.
    dataset_name = input("Enter dataset name (e.g., enron): ")
    k = int(input("Enter k (size of itemsets): "))
    min_support = int(input("Enter min_support (minimum frequency threshold): "))
    path = r"/content/drive/MyDrive/bag+of+words"  # Adjust path if needed
    main(path=path, name=dataset_name, k=k, min_support=min_support)


Enter dataset name (e.g., enron): enron
Enter k (size of itemsets): 2
Enter min_support (minimum frequency threshold): 1000
Reading vocabulary file...
Vocabulary file loaded. Time taken: 0.69 seconds.
Reading docword file (metadata)...
Docword file metadata loaded. Time taken: 0.67 seconds.
Reading docword file (data)...


Docword file data loaded. Time taken: 3.51 seconds.
Running Apriori algorithm...


Generating 1-itemsets: 100%|[32m██████████[0m| 39861/39861 [00:02<00:00, 15316.81it/s]



Generating 2-itemsets...


Generating 2-itemsets candidates:   1%|[33m▏         [0m| 10/789 [28:46<37:21:17, 172.63s/it]


KeyboardInterrupt: 