In [1]:
import numpy as np
import pandas as pd
from template import *

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
### HINT ON UNIT TESTS ###
### All unit tests assume that itemsets are ordered lexicographically.
### An itemset [B, A] is NOT valid! It must be [A, B]! This convention allows for a significant speed-up.

In [3]:
from itertools import combinations
from template import *
import pandas as pd
from tqdm import tqdm
import time
from collections import OrderedDict

# Load the example transaction database used by the unit tests below.
# NOTE(review): read_database is not defined in this notebook; presumably it
# comes from the star-import of `template` -- confirm.
dbExample = read_database("example.dat")
def eclat(db, minsup):
    """Mine all frequent itemsets of ``db`` with the ECLAT algorithm.

    The database is converted to a vertical layout (itemset -> set of ids of
    the transactions containing it); larger itemsets are then grown depth
    first by intersecting the transaction-id sets of two itemsets that share
    a common prefix.

    Parameters
    ----------
    db : iterable of iterables
        The transactions. Items must be hashable and mutually orderable
        (they are sorted and used as dictionary keys).
    minsup : int
        Minimum absolute support (number of transactions) an itemset needs
        in order to be reported.

    Returns
    -------
    list of (list, int)
        One ``(itemset, support)`` tuple per frequent itemset. Every itemset
        is a lexicographically ordered list and the collection itself is
        sorted by itemset.
    """
    # Vertical representation. Itemsets are stored as tuples from the very
    # beginning so that items of any type (multi-character strings, ints, ...)
    # are combined by tuple concatenation and never by `+` on the raw items.
    tidsets = {}
    for tid, transaction in enumerate(db):
        for item in transaction:
            tidsets.setdefault((item,), set()).add(tid)

    # Single items are subject to the support threshold as well; visiting
    # them in sorted order keeps every generated itemset ordered.
    frequent_items = {
        itemset: tidsets[itemset]
        for itemset in sorted(tidsets)
        if len(tidsets[itemset]) >= minsup
    }

    F = []

    def _extend(equivalence_class):
        # All members share the same prefix and are stored in ascending
        # order, so only the members *after* Xa need to be combined with it.
        members = list(equivalence_class.items())
        for position, (Xa, t_Xa) in enumerate(members):
            F.append((list(Xa), len(t_Xa)))
            children = {}
            for Xb, t_Xb in members[position + 1:]:
                t_Xab = t_Xa & t_Xb  # transactions containing both itemsets
                if len(t_Xab) >= minsup:
                    # Xa and Xb differ only in their last item.
                    children[Xa + Xb[-1:]] = t_Xab
            if children:
                _extend(children)

    _extend(frequent_items)

    F.sort(key=lambda entry: entry[0])
    return F





In [4]:
## Unit test
import pandas as pd
# Reload the example database so this test cell does not depend on the
# `dbExample` binding created in the cell that defines eclat().
dbExample = read_database("example.dat")

def testSyntax(F):
    found_problem = False
    for t in F:
        if type(t) != tuple:
            print("Entry " + str(t) + " in frequent itemset collection F is not a tuple.")
            found_problem = True
        elif len(t) != 2:
            print("Entry " + str(t) + " in frequent itemset collection F is a tuple but of length " + str(len(t)) + " instead of 2.")
            found_problem = True
        elif type(t[0]) != list:
            print("Itemset type in entry " + str(t) + " is " + str(type(t[0])) + " but should be list!")
            found_problem = True
        elif type(t[1]) != int:
            print("Support in entry " + str(t) + " is of type " + str(type(t[1])) + " but should be int!")
            found_problem = True
    print("Syntax check on frequent itemset collection: " + ("FAILED" if found_problem else "OK"))

def testFrequentItemsetCollection(F_exp, F_act):
    """Compare an obtained frequent itemset collection with the expected one.

    Runs the syntax check on ``F_act``, then prints whether both collections
    have the same length, which obtained entries are not expected and which
    expected entries were not obtained. Entries are compared by equality, so
    itemsets must use the same item order in both collections.
    """
    testSyntax(F_act)

    expected_size, actual_size = len(F_exp), len(F_act)
    if expected_size == actual_size:
        length_verdict = "OK"
    else:
        length_verdict = "FAILED, expected length " + str(expected_size) + " but saw " + str(actual_size)
    print("Length of collection: " + str(length_verdict))

    unexpected = [entry for entry in F_act if entry not in F_exp]
    not_found = [entry for entry in F_exp if entry not in F_act]

    if unexpected:
        print("Found unexpected entries in F: " + str(unexpected))
    if not_found:
        print("Missing entries in F: " + str(not_found))
    if not (not_found or unexpected):
        print("F seems to be correct.")

# Expected result for the example database with an absolute minimum support of
# 3: (itemset, support) pairs, itemsets ordered lexicographically.
F_exp = [(['A'], 4), (['A', 'B'], 4), (['A', 'B', 'D'], 3), (['A', 'B', 'D', 'E'], 3), (['A', 'B', 'E'], 4), (['A', 'D'], 3), (['A', 'D', 'E'], 3), (['A', 'E'], 4), (['B'], 6), (['B', 'C'], 4), (['B', 'C', 'E'], 3), (['B', 'D'], 4), (['B', 'D', 'E'], 3), (['B', 'E'], 5), (['C'], 4), (['C', 'E'], 3), (['D'], 4), (['D', 'E'], 3), (['E'], 5)]

# Test ECLAT: mine the example database and compare against the expectation.
F_act = eclat(dbExample, 3)
testFrequentItemsetCollection(F_exp, F_act)

Syntax check on frequent itemset collection: OK
Length of collection: OK
F seems to be correct.


In [5]:
def getStrongRulesFromFrequentSets(fsets, minconf):
    """Derive all strong association rules from a frequent itemset collection.

    For every frequent itemset Z with at least two items, each non-empty
    proper subset X is considered as a premise of the rule X -> Z \\ X. The
    rule is kept when its confidence ``sup(Z) / sup(X)`` reaches ``minconf``.
    When a premise X fails, all subsets of X are dropped from the remaining
    candidates, because their support is at least as large and their
    confidence can therefore not be higher.

    Parameters
    ----------
    fsets : list of (list, int)
        Frequent itemsets with their absolute support. The collection must
        contain every non-empty subset of each of its itemsets, with the items
        in the same order as in the itemset. Items must be hashable.
    minconf : float
        Minimum confidence of a rule.

    Returns
    -------
    list of (list, list, int, float)
        Tuples ``(premise, conclusion, support, confidence)``.

    Raises
    ------
    ValueError
        If a subset of a frequent itemset is missing from ``fsets``.
    """
    # Build the support lookup once instead of rebuilding and scanning the
    # list of itemsets for every candidate premise. setdefault keeps the first
    # occurrence, which is what a linear search would have found.
    support_of = {}
    for entry in fsets:
        support_of.setdefault(tuple(entry[0]), entry[1])

    strong_rules = []
    for entry in fsets:
        frequentSet = entry[0]
        fsets_sup = entry[1]
        if len(frequentSet) < 2:
            continue

        # Candidates are consumed from the end of the list, so a set is always
        # examined before any of its subsets (see the ordering of getSubsets).
        A = getSubsets(frequentSet)
        while A:
            X = A.pop()
            premise_key = tuple(X)
            if premise_key not in support_of:
                raise ValueError(str(X) + " is not in the frequent itemset collection")
            c = fsets_sup / support_of[premise_key]
            if c >= minconf:
                # Conclusion: the items of the frequent set not used by X.
                Y = list(frequentSet)
                for member in X:
                    Y.remove(member)
                strong_rules.append((X, Y, fsets_sup, c))
            elif len(X) >= 2:
                # No subset of a failing premise can yield a strong rule.
                W_sets = getSubsets(X)
                A = [a for a in A if a not in W_sets]
    return strong_rules


def getSubsets(set):
    """Return all non-empty proper subsets of a sequence of items.

    Subsets are enumerated by bit mask: bit ``j`` of the mask decides whether
    ``set[j]`` is part of the subset. The masks ``0`` (empty subset) and
    ``2**len(set) - 1`` (the full set) are skipped. As a consequence every
    subset appears in the result *before* each of its supersets, and the items
    inside a subset keep the order they have in ``set``.

    Parameters
    ----------
    set : sequence
        Indexable collection of items. The parameter name shadows the builtin
        ``set``; it is kept so that existing keyword calls keep working.

    Returns
    -------
    list of list
        The ``2**len(set) - 2`` proper non-empty subsets; an empty list when
        ``set`` has fewer than two items (including the empty sequence, which
        previously raised IndexError).
    """
    x = len(set)
    full_mask = (1 << x) - 1
    # Masks strictly between 0 and full_mask; the range is empty for x < 2.
    return [
        [set[j] for j in range(x) if mask & (1 << j)]
        for mask in range(1, full_mask)
    ]

def getStrongRulesForDatabase(db, minsup, minconf):
    """Mine the strong association rules of a transaction database.

    The frequent itemsets of ``db`` are computed with ``eclat`` using the
    minimum support ``minsup``; the rules reaching the confidence ``minconf``
    are then derived from them with ``getStrongRulesFromFrequentSets``, whose
    result is returned unchanged.
    """
    frequent_itemsets = eclat(db, minsup)
    return getStrongRulesFromFrequentSets(frequent_itemsets, minconf)


In [6]:
def testRuleGeneration(R_exp, R_act):

    # test syntax
    found_problem = False
    for r in R_act:
        if type(r) != tuple:
            print("Rule " + str(r) + " is not represented as a tuple!")
            found_problem = True
        if len(r) != 4:
            print("Rule " + str(r) + " does not consist of 4 entries. Should be (premise, conclusion, (absolute) support, confidence)")
            found_problem = True
        if type(r[0]) != list:
            print("Premise of rule " + str(r) + " is of type " + str(type(r[0])) + " but should be list!")
            found_problem = True
        if type(r[1]) != list:
            print("Conclusion of rule " + str(r) + " is of type " + str(type(r[0])) + " but should be list!")
            found_problem = True
    print("Syntax check " + ("FAILED. Skipping rest of the test." if found_problem else "OK"))
    if found_problem:
        return
    
    # semantics test
    length_exp = len(R_exp)
    length_act = len(R_act)
    print("Length of rule set: " + str("OK" if length_exp == length_act else "FAILED, expected length " + str(length_exp) + " but saw " + str(length_act)))
    obsolete = [f for f in R_act if not f in R_exp]
    missing = [f for f in R_exp if not f in R_act]
    if obsolete:
        print("Found unexpected entries in R: " + str(obsolete))
        found_problem = True
    if missing:
        print("Missing entries in R: " + str(missing))
        found_problem = True
    print("Rule set test: " + ("FAILED" if found_problem else "OK"))

# Expected strong rules for the example database with minimum support 3 and
# minimum confidence 1.0: (premise, conclusion, absolute support, confidence).
R_exp = [(['A'], ['B'], 4, 1.0), (['A', 'D'], ['B'], 3, 1.0), (['B', 'D', 'E'], ['A'], 3, 1.0), (['A', 'D', 'E'], ['B'], 3, 1.0), (['A', 'B', 'D'], ['E'], 3, 1.0), (['D', 'E'], ['A', 'B'], 3, 1.0), (['A', 'D'], ['B', 'E'], 3, 1.0), (['A', 'E'], ['B'], 4, 1.0), (['A', 'B'], ['E'], 4, 1.0), (['A'], ['B', 'E'], 4, 1.0), (['D', 'E'], ['A'], 3, 1.0), (['A', 'D'], ['E'], 3, 1.0), (['A'], ['E'], 4, 1.0), (['C'], ['B'], 4, 1.0), (['C', 'E'], ['B'], 3, 1.0), (['D'], ['B'], 4, 1.0), (['D', 'E'], ['B'], 3, 1.0), (['E'], ['B'], 5, 1.0)]
# Generate the rules from the database and compare them with the expectation.
R_act = getStrongRulesForDatabase(dbExample, 3, 1.0)
testRuleGeneration(R_exp, R_act)

Syntax check OK
Length of rule set: OK
Rule set test: OK
