# Homework 2
## Imports

In [1]:
import time
from itertools import combinations

## A-priori class
Implements the A-priori frequent item-set algorithm from the lecture.

In [7]:
class Apriori:
    """
    Level-wise A-priori search for frequent item-sets.
    All methods are static; the class only serves as a namespace.
    """

    @staticmethod
    def get_combinations_l_1(data):
        """
        Collects every distinct item of the dataset as a singleton frozen set.
        :param data: Array of sets - each set containing the row in the dataset
        :return: Returns a set of frozen sets of size 1. Contains all items from data
        """
        return {frozenset({item}) for row in data for item in row}

    @staticmethod
    def get_combinations(l1_sets, lk_sets):
        """
        Builds candidate sets of size k by extending every (k - 1)-set with one item it
        does not already contain.
        :param l1_sets: frozen sets of size 1, as returned from get_combinations_l_1 function.
        :param lk_sets: frozen sets of k - 1 size
        :return: set of frozen sets of length k created by combining elements from l1_sets and lk_sets
         where the intersection is an empty set.
        """
        # A singleton is either fully contained in `item` or disjoint from it, so one
        # subset test is enough to guarantee the union grows by exactly one element.
        return {
            val.union(item)
            for item in lk_sets
            for val in l1_sets
            if not val.issubset(item)
        }

    @staticmethod
    def check(candidate_subsets, s, k, data):
        """
        This function counts all occurrences of candidate_subsets in data and checks if their
        count is above the given threshold s
        :param candidate_subsets: iterable of frozen sets to filter out the non-frequent items
        :param s: support level, as a fraction of the number of rows in data
        :param k: size of sets in the subsets
        :param data: the dataset
        :return: list of the candidate subsets whose count is strictly greater than len(data) * s
        """
        subset_cnt = dict.fromkeys(candidate_subsets, 0)
        if not subset_cnt:
            # Nothing to count: skip enumerating the k-combinations of every row, which
            # grows combinatorially with the row size and would only produce misses.
            return []

        for item in data:
            for comb in combinations(item, k):
                # build the lookup key once per combination
                key = frozenset(comb)
                # if comb is in candidate subsets, increase its count by 1
                if key in subset_cnt:
                    subset_cnt[key] += 1

        # only return subsets that are frequent enough
        threshold_cnt = len(data) * s
        return [subset for subset, val in subset_cnt.items() if val > threshold_cnt]

    @staticmethod
    def compute(s, k, data):
        """
        This function uses the A-priori pruning principle - if there is any item-set that is infrequent,
        its superset is also not frequent. The function starts with generating subsets of size 1 and then increases
        their size until they are of size k. The execution time of every iteration is printed to stdout.
        :param s: support level, must be in range [0, 1]
        :param k: find frequent sets of size k, must be an integer greater than 0
        :param data: array of sets representing the dataset
        :return: list of frozen sets of size k with frequency above the support level
        :raises AssertionError: if k or s is outside its allowed range
        """
        assert k > 0, "k must be greater than 0"
        assert 0 <= s <= 1, "s must be in range [0, 1]"
        t = time.time()

        # generate subsets of size 1 and filter out the infrequent ones
        l_1 = Apriori.check(Apriori.get_combinations_l_1(data), s, 1, data)
        print(f"Iteration 1 execution time: {time.time() - t}")

        l_prev = l_1
        for i in range(2, k + 1):
            t = time.time()

            # generate subsets of size i from the frequent sets of size i - 1
            l_curr = Apriori.get_combinations(l_1, l_prev)

            # filter out the infrequent subsets
            l_curr = Apriori.check(l_curr, s, i, data)

            print(f"Iteration {i} execution time: {time.time() - t}")
            l_prev = l_curr
        return l_prev

## Data loader class
Loads space-separated numbers from the specified file.


In [3]:
class DataLoader:
    """Reads a dataset file with one row of whitespace-separated integers per line."""

    # Not used by load(), which builds its own local list; kept so that existing
    # references to DataLoader.res keep resolving.
    res = []

    @staticmethod
    def load(path):
        """
        Loads space-separated numbers from the specified file. Every line of the file
        becomes one frozen set; tokens that are not made of digits are ignored.
        :param path: File path
        :return: Array of frozen sets, one per line of the file (empty lines give empty sets)
        """
        res = []
        with open(path) as file:
            for line in file:
                # split() without a separator also strips the trailing newline, so the
                # last number on a line is kept even when no space follows it.
                res.append(frozenset(int(x) for x in line.split() if x.isdigit()))
        return res


In [8]:
# Mining parameters: support level (fraction of rows) and size of the item-sets to report.
support_s = 0.01
target_size_of_subsets = 3

# Wall-clock start of the whole run (loading + mining).
t = time.time()

data = DataLoader.load("data/T10I4D100K.dat")
frequent_sets = Apriori.compute(support_s, target_size_of_subsets, data)
print(frequent_sets)

elapsed = time.time() - t
print(f"Execution time: {elapsed}")

Iteration 1 execution time: 1.1999168395996094
Iteration 2 execution time: 4.132015705108643
Iteration 3 execution time: 4.891966104507446
[frozenset({704, 825, 39})]
Execution time: 11.195912837982178
