In [1]:
import pandas as pd

def read_database(file):
    db = []
    with open(file) as f:
        lineList = f.readlines()
        for line in lineList:
            line = line.strip()
            li = line.split(" ")
            row = []
            for v in li:
                row.append(v)
            db.append(row)
    return db

In [72]:
def eclat(db, minsup):
    def generate_frequent_itemsets(P, minsup, F):
        for i, p_i in enumerate(P):
            Xa, t_Xa = p_i
            if not isinstance(Xa, list):
                Xa = [Xa]
            F.append((Xa, len(t_Xa)))
            Pa = []
            for j in range(i + 1, len(P)):
                Xb, t_Xb = P[j]
                if not isinstance(Xb, list):
                    Xb = [Xb]
                if j > i:
                    Xab = list(set(Xa).union(set(Xb)))
                    Xab.sort()
                    t_Xab = t_Xa.intersection(t_Xb)
                    if len(t_Xab) >= minsup:
                        Pa.append((Xab, t_Xab))
            if len(Pa) != 0:
                generate_frequent_itemsets(Pa, minsup, F)

    P = {}
    for i in range(len(db)):
        for item in db[i]:
            if item in P:
                P[item].add(i)
            else:
                P[item] = {i}
    P = list(P.items())
    
    condition = lambda x: len(x[1]) >= minsup
    P = [item for item in P if condition(item)]

    P = sorted(P, key=lambda x: x[0])
    F = []
    
    generate_frequent_itemsets(P, minsup, F)

    return [(F[i][0], F[i][1]) for i in range(len(F))]

In [76]:
import time

data = read_database('shop.dat')
minsup = 500

start_time = time.time()
F_shop = eclat(data, minsup)
end_time = time.time()
runtime = end_time - start_time

print(f"Runtime: {runtime:.6f} seconds")

df = pd.DataFrame(F_shop, columns=['Frequent Itemset', 'Support'])
df

Runtime: 4.282974 seconds


Unnamed: 0,Frequent Itemset,Support
0,[0],594
1,[1],1535
2,[10],1351
3,[100],1749
4,"[100, 362]",595
...,...,...
1068,[989],1289
1069,[991],1268
1070,[992],1116
1071,[995],1521


In [None]:
def getStrongRulesFromFrequentSets(fsets, minconf):
    return

In [None]:
def getStrongRulesForDatabase(db, minsup, minconf):
    

    return

In [74]:
## Unit test
import pandas as pd
dbExample = read_database("example.dat")

def testSyntax(F):
    found_problem = False
    for t in F:
        if type(t) != tuple:
            print("Entry " + str(t) + " in frequent itemset collection F is not a tuple.")
            found_problem = True
        elif len(t) != 2:
            print("Entry " + str(t) + " in frequent itemset collection F is a tuple but of length " + str(len(t)) + " instead of 2.")
            found_problem = True
        elif type(t[0]) != list:
            print("Itemset type in entry " + str(t) + " is " + str(type(t[0])) + " but should be list!")
            found_problem = True
        elif type(t[1]) != int:
            print("Support in entry " + str(t) + " is of type " + str(type(t[1])) + " but should be int!")
            found_problem = True
    print("Syntax check on frequent itemset collection: " + ("FAILED" if found_problem else "OK"))

def testFrequentItemsetCollection(F_exp, F_act):
    testSyntax(F_act)
    length_exp = len(F_exp)
    length_act = len(F_act)
    print("Length of collection: " + str("OK" if length_exp == length_act else "FAILED, expected length " + str(length_exp) + " but saw " + str(length_act)))
    obsolete = [f for f in F_act if not f in F_exp]
    missing = [f for f in F_exp if not f in F_act]
    if obsolete:
        print("Found unexpected entries in F: " + str(obsolete))
    if missing:
        print("Missing entries in F: " + str(missing))
    if not missing and not obsolete:
        print("F seems to be correct.")

F_exp = [(['A'], 4), (['A', 'B'], 4), (['A', 'B', 'D'], 3), (['A', 'B', 'D', 'E'], 3), (['A', 'B', 'E'], 4), (['A', 'D'], 3), (['A', 'D', 'E'], 3), (['A', 'E'], 4), (['B'], 6), (['B', 'C'], 4), (['B', 'C', 'E'], 3), (['B', 'D'], 4), (['B', 'D', 'E'], 3), (['B', 'E'], 5), (['C'], 4), (['C', 'E'], 3), (['D'], 4), (['D', 'E'], 3), (['E'], 5)]

# Test ECLAT
F_act = eclat(dbExample, 3)
testFrequentItemsetCollection(F_exp, F_act)

Syntax check on frequent itemset collection: OK
Length of collection: OK
F seems to be correct.


In [66]:
def testRuleGeneration(R_exp, R_act):

    # test syntax
    found_problem = False
    for r in R_act:
        if type(r) != tuple:
            print("Rule " + str(r) + " is not represented as a tuple!")
            found_problem = True
        if len(r) != 4:
            print("Rule " + str(r) + " does not consist of 4 entries. Should be (premise, conclusion, (absolute) support, confidence)")
            found_problem = True
        if type(r[0]) != list:
            print("Premise of rule " + str(r) + " is of type " + str(type(r[0])) + " but should be list!")
            found_problem = True
        if type(r[1]) != list:
            print("Conclusion of rule " + str(r) + " is of type " + str(type(r[0])) + " but should be list!")
            found_problem = True
    print("Syntax check " + ("FAILED. Skipping rest of the test." if found_problem else "OK"))
    if found_problem:
        return
    
    # semantics test
    length_exp = len(R_exp)
    length_act = len(R_act)
    print("Length of rule set: " + str("OK" if length_exp == length_act else "FAILED, expected length " + str(length_exp) + " but saw " + str(length_act)))
    obsolete = [f for f in R_act if not f in R_exp]
    missing = [f for f in R_exp if not f in R_act]
    if obsolete:
        print("Found unexpected entries in R: " + str(obsolete))
        found_problem = True
    if missing:
        print("Missing entries in R: " + str(missing))
        found_problem = True
    print("Rule set test: " + ("FAILED" if found_problem else "OK"))

R_exp = [(['A'], ['B'], 4, 1.0), (['A', 'D'], ['B'], 3, 1.0), (['B', 'D', 'E'], ['A'], 3, 1.0), (['A', 'D', 'E'], ['B'], 3, 1.0), (['A', 'B', 'D'], ['E'], 3, 1.0), (['D', 'E'], ['A', 'B'], 3, 1.0), (['A', 'D'], ['B', 'E'], 3, 1.0), (['A', 'E'], ['B'], 4, 1.0), (['A', 'B'], ['E'], 4, 1.0), (['A'], ['B', 'E'], 4, 1.0), (['D', 'E'], ['A'], 3, 1.0), (['A', 'D'], ['E'], 3, 1.0), (['A'], ['E'], 4, 1.0), (['C'], ['B'], 4, 1.0), (['C', 'E'], ['B'], 3, 1.0), (['D'], ['B'], 4, 1.0), (['D', 'E'], ['B'], 3, 1.0), (['E'], ['B'], 5, 1.0)]
R_act = getStrongRulesForDatabase(dbExample, 3, 1.0)
testRuleGeneration(R_exp, R_act)

NameError: name 'getStrongRulesForDatabase' is not defined