# Homework 2: Discovery of Frequent Itemsets and Association Rules

### Group 28
Junjie Shan & Yuxin Meng

### Read transactions from dataset

Read data from dataset line by line, and turn the string into intergers.

In [1]:
# Discovery of Frequent Itemsets and Association Rules
import time
# read dataset from T10I4D100K.dat
# read intergers sets from dataset line by line
t0 = time.time()
def read_dataset(filename):
    dataset = []
    with open(filename, 'r') as f:
        for line in f:
            lines = list(map(int, line.strip().split()))
            lines.sort()
            dataset.append(lines)
    dataset.sort()
    return dataset
t = time.time() - t0
dataset = read_dataset('./T10I4D100K.dat')
# length of dataset
print('length of dataset:', len(dataset))
print('running time for data reading:', t, 'seconds')

length of dataset: 100000
running time for data reading: 0.0010008811950683594 seconds


## Task-1
### Implement the A-Priori algorithm
The aim of A-Priori algorithm is to find frequent itemsets with support at least s in a dataset of sales transactions.  
And the support of an itemset is the number of transactions containing the itemset. 

### Get the support threshold
Usually, we set the support threshold as the length of dataset / 100

In [2]:
# get the apporopriate support threshold

def get_support_threshold(dataset):
    return len(dataset) * 0.01

print('support threshold:', int(get_support_threshold(dataset)))

s = get_support_threshold(dataset)

support threshold: 1000


### A-priori Algorithm
1. support function & filter_itemset function. we need to calculate the support of an itemset, and then filter out itemset that do not meet the support value greater than or equal to threshold.  
2. get_1_item_set function & update_dataset function. In this algorithm, we first need to find individual frequent items and filter out transactions that do not include any frequent items.  
3. Then we generated k-itemset from k-1_itemsets, which is called possible itemsets. New added element comes from all single elements contain in k-1_itemsets.  
Then calculate the support of each possible itemset, here we don't use support function. The get_k_size_subsets function is used to do this. The elements in the transaction are combined by k and if they are the subset of possible_k_itemsets, it means the transaction contains these frequent itemsets.
4. Then update the dataset by removing transactions that don't contain any k-itemsets.
5. Repeat 3~4 until the new k_frequent_itemsets is empty.

In [3]:
# finding frequent itemsets of integers with support at least s in the dataset
import numpy as np
import pandas as pd
import itertools
class Apriori:
    def __init__(self, dataset, support_threshold):
        self.dataset = dataset
        self.support_threshold = support_threshold
        self.frequent_itemsets = []
        self.frequent_item = []
        self.all_freq_itemsets = []
        self.item_frequency = {}
        self.frequent_item_sets = []
        
   # compute the support of given itemset
    def support(self, itemset):
        support = 0
        for transaction in self.dataset:
            if set(itemset).issubset(set(transaction)):
                support += 1
        return support
    
    # filter generated possible itemset
    def filter_itemset(self, itemset):
        return (self.support(itemset) >= self.support_threshold)

    # get 1-item_set
    def get_1_item_set(self):
        for transaction in self.dataset:
            for item in transaction:
                if item not in self.item_frequency:
                    self.item_frequency[item] = 1
                else:
                    self.item_frequency[item] += 1
        for item in self.item_frequency:
            if self.item_frequency[item] >= self.support_threshold:
                self.frequent_item.append(item)
        # self.frequent_item.sort()
        return self.frequent_item

    # update the dataset
    def update_dataset(self, itemset):
        new_dataset = []
        for transaction in self.dataset:
            for item in transaction:
                if item in itemset:
                    new_dataset.append(transaction)
                    break
        # new_dataset.sort()
        self.dataset = new_dataset
        return self.dataset
    
    # get all k size subsets contained given itemset of given transactions
    def get_k_size_subsets(self, itemset, transactions, k):
        return [sets for sets in itertools.combinations(transactions, k) if sets in itemset]
    
    # get all frequent itemsets
    def get_k_item_set(self, _k_1_itemsets, k):
        k_itemsets = []
        # get all elements in the _k_1_itemsets
        all_elements = np.hstack(np.array(_k_1_itemsets))
        # get all unique elements in the _k_1_itemsets
        unique_elements = np.unique(all_elements)
        # get all combinations of unique elements
        possible_itemsets = list(itertools.combinations(unique_elements, k))
        # convert list to dict and set all values to 0
        possible_itemsets = {itemset: 0 for itemset in possible_itemsets}
        # get all transactions in the dataset to get the support of each itemset
        for transaction in self.dataset:
            subsets = self.get_k_size_subsets(possible_itemsets, transaction, k)
            for s in subsets:
                possible_itemsets[s] += 1
        k_itemsets = [item for item in possible_itemsets if possible_itemsets[item] >= self.support_threshold]
        # print all frequent itemsets with support
        if len(k_itemsets) != 0:
            print('frequent itemsets of size', k, ':')
            for item in k_itemsets:
                print(item, 'support:', possible_itemsets[item])
        return k_itemsets

    # get frequent itemset
    def get_frequent_itemset(self):
        _1_itemsets = self.get_1_item_set()
        self.update_dataset(_1_itemsets)
        # print(len(_1_itemsets))
        self.frequent_itemsets.append(_1_itemsets)
        k = 2
        while True:
            k_itemsets = self.get_k_item_set(_1_itemsets, k)
            if len(k_itemsets) == 0:
                break
            self.frequent_itemsets.append(k_itemsets)
            _1_itemsets = k_itemsets
            self.update_dataset_by_deleting_infrequent_items(k_itemsets)
            k += 1
        return self.frequent_itemsets

    # update the dataset by deleting the infrequent items
    def update_dataset_by_deleting_infrequent_items(self, itemsets):
        new_dataset = []
        for transaction in self.dataset:
            for itemset in itemsets:
                if set(itemset).issubset(set(transaction)):
                    new_dataset.append(transaction)
                    break
        self.dataset = new_dataset
        return self.dataset


### Test the Apriori Algorithm

In [4]:
apriori = Apriori(dataset, s)
# print all 1-item_set with support
t = time.time()
apriori.get_1_item_set()
t = time.time() - t
print('running time for 1-itemsets:', t, 'seconds')
print(apriori.item_frequency)


running time for 1-itemsets: 0.23600077629089355 seconds
{0: 594, 1: 1535, 21: 2666, 48: 2472, 173: 1080, 427: 1856, 529: 7057, 538: 3982, 660: 610, 710: 1044, 731: 560, 829: 6810, 911: 586, 956: 3626, 130: 1711, 698: 236, 839: 854, 53: 535, 168: 1538, 177: 4629, 194: 517, 310: 1390, 350: 3069, 362: 4388, 368: 7828, 442: 326, 600: 1192, 709: 672, 928: 1034, 970: 2086, 54: 2595, 183: 3883, 283: 4082, 944: 2794, 998: 2713, 128: 525, 369: 592, 398: 403, 469: 1502, 496: 1428, 561: 2783, 626: 874, 919: 3710, 471: 2894, 663: 2354, 804: 1315, 871: 2810, 983: 453, 3: 531, 27: 2165, 240: 1399, 274: 2628, 406: 785, 432: 985, 521: 1582, 639: 1572, 692: 4993, 746: 1982, 790: 1094, 855: 939, 879: 865, 73: 2179, 114: 816, 115: 1775, 145: 4559, 166: 346, 279: 3014, 308: 1402, 309: 1262, 508: 95, 618: 1337, 658: 1881, 661: 2693, 665: 1297, 682: 4132, 984: 1756, 4: 1394, 28: 1454, 268: 885, 401: 3667, 430: 580, 530: 1263, 578: 1290, 580: 1667, 755: 392, 913: 1939, 35: 1984, 242: 2325, 317: 266, 392: 24

In [5]:
# Run Aprori algorithm
apriori = Apriori(dataset, s)
t1 = time.time()
fre_items = apriori.get_frequent_itemset()
t2 = time.time()
print('running time for apriori algorithm for finding larger itemsets:', t2 - t1, 'seconds')
# print(fre_items[1:])
# print(fre_items[0], fre_items[1], fre_items[2])

frequent itemsets of size 2 :
(39, 704) support: 1107
(39, 825) support: 1187
(217, 346) support: 1336
(227, 390) support: 1049
(368, 682) support: 1193
(368, 829) support: 1194
(390, 722) support: 1042
(704, 825) support: 1102
(789, 829) support: 1194
frequent itemsets of size 3 :
(39, 704, 825) support: 1035
running time for apriori algorithm for finding larger itemsets: 4.662030458450317 seconds


## Task-2
### Get association rules
Develop and implement an algorithm for generating association rules between frequent itemsets discovered by using the A-Priori algorithm in a dataset of sales transactions. The rules must have support at least s and confidence at least c, where s and c are given as input parameters.  

confidence((A,B)->C) = support(A,B,C) / support(A,B), if the result larger than or equal to the given confidence, then we keep this association rule.

In [6]:
# frequent_items = [(39, 704), (39, 825), (217, 346), (227, 390), (368, 682), (368, 829), (390, 722), (704, 825), (789, 829), (39, 704, 825)]
frequent_items = fre_items[1:]
apriori_1 = Apriori(dataset, s)
apriori_1.update_dataset(apriori.get_1_item_set())
apriori_1.update_dataset_by_deleting_infrequent_items(frequent_items[0])
# get the association rules
def get_association_rules(frequent_itemsets, confidence):
    rules = []
    for itemset in frequent_itemsets:
        for item in itemset:
            for item_subset in list(itertools.combinations(itemset, len(itemset) - 1)):
                cal_confidence = apriori_1.support(itemset) / apriori_1.support(item_subset)
                if item not in item_subset:
                    rules.append((item_subset, item, cal_confidence))
    rules = [rule for rule in rules if rule[2] >= confidence]
    
    return rules



In [8]:
# Run Association Rule

rules = []
t1 = time.time()
for itemset in frequent_items:
    rules.extend(get_association_rules(itemset, 0.8))
t2 = time.time()
print('running time for finding association rule:', t2 - t1, 'seconds')
# print the number of rules
print('number of rules found:', len(rules))
# print rules in the form of: item => item confidence
for rule in rules:
    print(set(rule[0]), '=>', rule[1], 'confidence:', rule[2])

running time for finding association rule: 0.5709977149963379 seconds
number of rules found: 16
{704} => 39 confidence: 0.917910447761194
{825} => 39 confidence: 0.8831845238095238
{39} => 825 confidence: 0.8376852505292872
{346} => 217 confidence: 0.9258489258489259
{217} => 346 confidence: 0.8703583061889251
{390} => 227 confidence: 0.824685534591195
{227} => 390 confidence: 0.9536363636363636
{682} => 368 confidence: 0.9120795107033639
{722} => 390 confidence: 0.8282988871224165
{390} => 722 confidence: 0.8191823899371069
{825} => 704 confidence: 0.8199404761904762
{704} => 825 confidence: 0.9137645107794361
{789} => 829 confidence: 0.9100609756097561
{704, 825} => 39 confidence: 0.9392014519056261
{825, 39} => 704 confidence: 0.8719460825610783
{704, 39} => 825 confidence: 0.9349593495934959
