# Homework 2
## Imports

In [2]:
import time
from itertools import combinations

## A-priori class
Implements the A-priori algorithm from the lecture: it finds frequent item-sets and derives association rules from them.

In [3]:
class Apriori:
    """
    A-priori frequent item-set mining and association-rule extraction.
    All functionality is exposed through static methods; the class holds no state.
    """

    @staticmethod
    def get_combinations_l_0(data):
        """
        Collects every distinct item of the dataset as a singleton item-set.
        :param data: Array of sets - each set containing the row in the dataset
        :return: Returns a set of frozen sets of size 1. Contains all items from data
        """
        return {frozenset({item}) for row in data for item in row}

    @staticmethod
    def get_combinations(l1_sets, lk_sets):
        """
        Builds candidate item-sets by extending every set of lk_sets with one more item.
        :param l1_sets: iterable of frozen sets of size 1 (e.g. the frequent sets derived
         from get_combinations_l_0).
        :param lk_sets: iterable of frozen sets of size k - 1
        :return: set of frozen sets created by uniting each element of lk_sets with each element
         of l1_sets that is not already contained in it.
        """
        # val can only be a subset of item, never the other way round, because
        # item is always at least as large as val.
        return {
            frozenset.union(val, item)
            for item in lk_sets
            for val in l1_sets
            if not val.issubset(item)
        }

    @staticmethod
    def check(candidate_subsets, support, k, data):
        """
        This function counts all occurrences of candidate_subsets in data and keeps the ones whose
        count is strictly greater than len(data) * support.
        :param candidate_subsets: iterable of frozen sets to filter out the non-frequent items
        :param support: minimal relative support (fraction of the rows of data)
        :param k: size of sets in the subsets
        :param data: the dataset
        :return: filtered dictionary with candidate sets as keys and their counts as values
        """
        subset_cnt = {subset: 0 for subset in candidate_subsets}

        # Items that take part in at least one candidate. A combination containing
        # any other item can never be a candidate, so such items are dropped from a
        # row before its k-combinations are enumerated. The counts stay the same,
        # but far fewer combinations are generated.
        candidate_items = {item for subset in subset_cnt for item in subset}

        for row in data:
            relevant = [item for item in row if item in candidate_items]
            for comb in combinations(relevant, k):
                key = frozenset(comb)
                # if comb is one of the candidate subsets, increase its count by 1
                if key in subset_cnt:
                    subset_cnt[key] += 1

        # only return subsets that are frequent enough
        threshold_cnt = len(data) * support
        return {subset: val for subset, val in subset_cnt.items() if val > threshold_cnt}

    @staticmethod
    def find_frequent(support, data):
        """
        This function uses the A-priori principle - if an item-set is infrequent, its supersets are
        not frequent either. It starts with the frequent sets of size 1 and keeps extending the
        frequent sets of the previous size by one item until no frequent set of the next size exists.
        The time spent on each set size is printed as a side effect.
        :param support: support, must be in range [0,1]
        :param data: array of sets representing the dataset
        :return: array of dictionaries where key is the set above support and value is number of
         occurrences in data; the dictionary at index i holds the sets of size i + 1
        :raises AssertionError: if support is outside of [0,1]
        """
        assert 0 <= support <= 1, "support must be in range [0, 1]"

        frequent_levels = []

        # generate subsets of size 1 and filter out the infrequent ones
        # (perf_counter is monotonic, so the measured interval cannot be distorted
        # by system clock adjustments)
        started = time.perf_counter()
        l_1 = Apriori.get_combinations_l_0(data)
        l_1_dict = Apriori.check(l_1, support, 1, data)
        print(time.perf_counter() - started)
        if not l_1_dict:
            return frequent_levels

        frequent_levels.append(l_1_dict)
        k = 2
        while True:
            started = time.perf_counter()

            # generate candidate subsets of size k from the frequent sets of size k - 1
            l_k = Apriori.get_combinations(l_1_dict.keys(), frequent_levels[-1])

            # filter out the infrequent subsets
            l_k_dict = Apriori.check(l_k, support, k, data)

            print(time.perf_counter() - started)
            if not l_k_dict:
                # no frequent set of size k, so no larger frequent set can exist
                return frequent_levels
            frequent_levels.append(l_k_dict)
            k += 1

    @staticmethod
    def find_associated(support, confidence, data):
        """
        Finds association rules
        :param support: support level the set has to have to be considered for association rules
        :param confidence: a rule is kept when count(whole set) / count(left side) is strictly
         greater than this value
        :param data: the dataset to look for the rules in
        :return: set of tuples where the association is as follows: tuple[0] -> tuple[1]
        """
        frequent_array = Apriori.find_frequent(support, data)
        rules = set()
        for frequent in frequent_array:
            for frequent_set, frequent_count in frequent.items():
                # every non-empty proper subset is tried as the left side of a rule
                for size in range(1, len(frequent_set)):
                    # frequent sets of the given size live at index size - 1
                    counts_for_size = frequent_array[size - 1]
                    for frequent_subset in combinations(frequent_set, size):
                        frequent_subset = frozenset(frequent_subset)
                        c = frequent_count / counts_for_size[frequent_subset]
                        if c > confidence:
                            rules.add((frequent_subset, frequent_set - frequent_subset))
        return rules

## Data loader class
Loads space-separated numbers from the specified file.


In [4]:
class DataLoader:
    """Reads datasets stored as one row of whitespace-separated integers per line."""

    # Kept for backward compatibility only: load() builds and returns its own
    # list and never reads or writes this attribute.
    res = []

    @staticmethod
    def load(path):
        """
        Loads whitespace-separated numbers from the specified file.
        Every line of the file becomes one row; tokens that are not made up purely
        of decimal digits are ignored.
        :param path: File path
        :return: Array of frozen sets, one per line of the file
        """
        rows = []
        with open(path) as file:
            for line in file:
                # split() without an argument splits on any run of whitespace and
                # discards the trailing newline. With split(" ") the last token kept
                # its "\n" and was dropped whenever the line did not end with a space.
                # isdecimal() accepts exactly the digit characters int() can parse.
                rows.append(frozenset(int(token) for token in line.split() if token.isdecimal()))
        return rows


In [5]:
# Mining parameters shared by the cells below: the minimal relative support of
# an item-set and the minimal confidence of an association rule.
support, confidence = 0.01, 0.5

Finding all frequent sets

In [6]:
# Load the dataset: one frozenset of integers per line of the file.
data = DataLoader.load("data/T10I4D100K.dat")
t = time.time()
# Mine and print all frequent item-sets; find_frequent additionally prints the
# time it spent on each item-set size.
print(Apriori().find_frequent(support, data))
# Total elapsed time of the search, including printing the result above.
print(f"Execution time: {time.time() - t}")

0.7151932716369629
2.727889060974121
3.2778327465057373
9.168785095214844
[{frozenset({675}): 2976, frozenset({33}): 1460, frozenset({701}): 1283, frozenset({274}): 2628, frozenset({617}): 2614, frozenset({561}): 2783, frozenset({97}): 1466, frozenset({185}): 1529, frozenset({862}): 3649, frozenset({104}): 1158, frozenset({210}): 2009, frozenset({823}): 1031, frozenset({334}): 2146, frozenset({618}): 1337, frozenset({285}): 2600, frozenset({25}): 1395, frozenset({688}): 1132, frozenset({111}): 1171, frozenset({814}): 1672, frozenset({874}): 2237, frozenset({73}): 2179, frozenset({496}): 1428, frozenset({85}): 1555, frozenset({541}): 3735, frozenset({707}): 1354, frozenset({336}): 1071, frozenset({765}): 1705, frozenset({381}): 2959, frozenset({276}): 2479, frozenset({51}): 1612, frozenset({75}): 3151, frozenset({913}): 1939, frozenset({653}): 2634, frozenset({275}): 1692, frozenset({296}): 2210, frozenset({335}): 1345, frozenset({631}): 2793, frozenset({918}): 3012, frozenset({515}): 1

Finding all association rules

In [7]:
# Reload the dataset so this cell can be run independently of the previous one.
data = DataLoader.load("data/T10I4D100K.dat")
t = time.time()
# Print the association rules as (left side, right side) tuples; the frequent
# item-sets are mined again inside find_associated, which also prints the
# per-size timings.
print(Apriori().find_associated(support, confidence, data))
# Total elapsed time of the rule search, including printing the result above.
print(f"Execution time: {time.time() - t}")

0.6796071529388428
2.6224918365478516
3.252183675765991
9.199936866760254
{(frozenset({704}), frozenset({825, 39})), (frozenset({825, 39}), frozenset({704})), (frozenset({704}), frozenset({825})), (frozenset({704, 825}), frozenset({39})), (frozenset({227}), frozenset({390})), (frozenset({704}), frozenset({39})), (frozenset({704, 39}), frozenset({825}))}
Execution time: 15.75421953201294
