# import and define functions

In [25]:
import numpy as np
import pandas as pd
import csv
import math
import pickle
import pattern_count

In [26]:
def P1DominatedByP2(P1, P2):
    """Return True when pattern P1 is dominated by pattern P2.

    Both arguments are indexable sequences; P2 is read at every index of P1.
    The value -1 marks an unspecified position. P1 is dominated when every
    position that P2 specifies (value other than -1) holds the same value
    in P1.
    """
    for idx, own in enumerate(P1):
        other = P2[idx]
        # A value fixed by P2 must be matched exactly by P1; this also
        # rejects the case where P1 leaves that position open (-1).
        if other != -1 and other != own:
            return False
    return True

def PDominatedByM(P, M):
    """Return True if at least one pattern in M dominates pattern P."""
    return any(P1DominatedByP2(P, mup) for mup in M)

def PDominatesM(P, M):
    """Return True if pattern P dominates at least one pattern in M."""
    return any(P1DominatedByP2(mup, P) for mup in M)

def cov(P, D):
    """Return the coverage of pattern P: the number of rows of D it dominates."""
    return sum(1 for row in D if P1DominatedByP2(row, P))


def GenerateParents(P):
    """Return the parents of pattern P.

    A parent is a copy of P in which exactly one specified position (value
    other than -1) has been reset to -1. Parents are returned in order of
    the position that was reset; P itself is left unchanged.
    """
    def _generalise(position):
        parent = P.copy()
        parent[position] = -1
        return parent

    return [_generalise(pos) for pos, value in enumerate(P) if value != -1]

def GenerateByRule1(P, mcdes, attributes):
    """Generate the children of pattern P allowed by Rule 1.

    Only positions strictly to the right of the right-most specified value
    (value other than -1) of P are specialised. For each such position one
    child is produced per integer in the attribute's [min, max] range.

    Args:
        P: pattern in which -1 marks an unspecified position. It is copied
            with ``.copy()`` and never modified.
        mcdes: object indexed as ``mcdes[name]['min']`` and
            ``mcdes[name]['max']`` (e.g. the result of ``DataFrame.describe()``).
        attributes: attribute names, aligned by position with P.

    Returns:
        List of child patterns ordered by position, then by value. The list
        is empty when P has no positions or its last position is already
        specified.
    """
    # Right-most specified position, or -1 when nothing is specified yet
    # (this also covers an empty pattern, which used to raise IndexError).
    last_set = max(
        (idx for idx, value in enumerate(P) if value != -1),
        default=-1,
    )
    children = []
    for pos in range(last_set + 1, len(P)):
        stats = mcdes[attributes[pos]]
        # NOTE(review): every integer between min and max is enumerated, which
        # presumably assumes the attribute values are contiguous integer
        # codes - confirm against the data encoding.
        low, high = int(stats['min']), int(stats['max'])
        for value in range(low, high + 1):
            child = P.copy()
            child[pos] = value
            children.append(child)
    return children

def Deepdiver(D, tao, mcdes, attributes):
    """Find the maximal uncovered patterns (MUPs) of D by depth-first search.

    A pattern is uncovered when ``cov(pattern, D) < tao``. A MUP is an
    uncovered pattern all of whose parents (see GenerateParents) are covered.

    Args:
        D: iterable of data rows, each comparable with a pattern by
            P1DominatedByP2.
        tao: coverage threshold.
        mcdes: per-attribute statistics passed through to GenerateByRule1.
        attributes: attribute names; their number fixes the pattern length.

    Returns:
        List of the MUPs found, in discovery order.
    """
    root = [-1] * len(attributes)
    S = [root]  # DFS stack of patterns still to examine
    M = []  # maximal uncovered patterns found so far
    while S:
        P = S.pop()
        if PDominatedByM(P, M):
            # P is equal to, or more specific than, a known MUP: it is
            # uncovered but cannot be maximal, so its subtree is pruned.
            continue
        if PDominatesM(P, M):
            # P is a strict ancestor of a known MUP (equality was handled
            # above). Every parent of a MUP is covered and coverage can only
            # grow when a pattern is generalised, so P is covered; no need to
            # count it.
            uncoveredFlag = False
        else:
            uncoveredFlag = cov(P, D) < tao
        if not uncoveredFlag:
            S.extend(GenerateByRule1(P, mcdes, attributes))
            continue
        # P is uncovered but, since children are generated by Rule 1, only
        # one of its parents is known to be covered. Climb through uncovered
        # parents until reaching a pattern whose parents are all covered.
        P1 = P
        while True:
            for parent in GenerateParents(P1):
                if cov(parent, D) < tao:
                    P1 = parent
                    break
            else:
                # No uncovered parent: P1 (not the starting P) is the MUP,
                # and it is recorded exactly once.
                M.append(P1)
                break
    return M


def Prepatation(filename):
    """Load a CSV file and return it together with its summary statistics.

    Returns a 3-tuple: the DataFrame read from ``filename``, the result of
    calling ``describe()`` on it, and the column labels of that summary as
    an array.
    """
    frame = pd.read_csv(filename)
    summary = frame.describe()
    return frame, summary, summary.columns.values


# load data

In [27]:

# Load 'miss_class3.csv' through Prepatation, then keep its rows both as a
# NumPy array and as nested Python lists (mclist is what cov() is given later).
mc, mcdes, attributes = Prepatation('miss_class3.csv')
mcarray = np.array(mc)
mclist = mcarray.tolist()
# Number of columns present in the describe() summary.
print(len(attributes))



13


In [28]:
# Load the candidate patterns. The `with` block closes the file on exit, so
# no explicit close() is needed afterwards.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# read 'M30.pkl' if it comes from a trusted source.
with open('M30.pkl', 'rb') as filehandle:
    # read the data as binary data stream
    cands = pickle.load(filehandle)

In [29]:
# Full dataset; later cells count pattern coverage against it.
data = pd.read_csv("CleanAdult3.csv")


In [30]:
# Build one PatternCounter per CSV file, each given that file's column names.
# NOTE(review): PatternCounter comes from the project module pattern_count;
# presumably parse_data() must run before pattern_count() is queried - confirm.
column_list_mc = np.array(mc.columns).tolist()
pc_mc = pattern_count.PatternCounter('miss_class3.csv', column_list_mc, encoded=False)
pc_mc.parse_data()


column_list_adult = np.array(data.columns).tolist()
pc_adult = pattern_count.PatternCounter('CleanAdult3.csv', column_list_adult, encoded=False)
pc_adult.parse_data()


In [31]:
# Keep the full dataset as nested Python lists so cov() can iterate its rows.
dataarray = np.array(data)
datalist = dataarray.tolist()
# Number of rows in the full dataset.
print(len(datalist))



45222


# check each pattern, find legal patterns and pickle them

In [55]:

def num2string(pattern):
    """Encode a pattern as a '|'-separated string.

    Each position contributes ``str(value)``, except that -1 contributes an
    empty field. An empty pattern yields the empty string.
    """
    fields = (str(value) if value != -1 else '' for value in pattern)
    return '|'.join(fields)


def CheckLowAcc(P, Tha, Thc):
    """Return True when pattern P has enough coverage and low accuracy.

    P is encoded with num2string and counted by the module-level counters
    pc_adult and pc_mc. The pattern qualifies when its pc_adult count is at
    least Thc and ``1 - pc_mc count / pc_adult count`` is below Tha.
    """
    key = num2string(P)
    total = pc_adult.pattern_count(key)
    missed = pc_mc.pattern_count(key)
    has_support = total >= Thc
    # The ratio is only evaluated when the coverage threshold is met.
    return bool(has_support and ((1 - missed / total) < Tha))


In [56]:
# Keep the candidates that CheckLowAcc accepts with Tha = 0.6 and Thc = 50.
P = [candidate for candidate in cands if CheckLowAcc(candidate, 0.6, 50)]


In [57]:
# Show the patterns that passed the filter.
print(P)

[]


In [58]:
# Number of candidate patterns versus number that passed CheckLowAcc.
print(len(cands), len(P)) # 50 * 0.6 (presumably Thc * Tha = 30, matching the '30' in the pickle file names - confirm)
# 13 attributes
# data set size = 45222


7550 0


In [48]:
# Persist the selected patterns. The `with` block closes the file on exit,
# so no explicit close() is needed afterwards.
# NOTE(review): this overwrites any existing 'P30.pkl' with the current value
# of P, including an empty list - check P before re-running this cell.
with open('P30.pkl', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(P, filehandle)

# load the pickled patterns and check their correctness

## load the pickled patterns

In [49]:
# Reload the stored patterns. The `with` block closes the file on exit, so no
# explicit close() is needed afterwards.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# read 'P30.pkl' if it comes from a trusted source.
with open('P30.pkl', 'rb') as filehandle:
    # read the data as binary data stream
    patterns = pickle.load(filehandle)

In [50]:
# Show the reloaded patterns.
print(patterns)

[[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 35], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 34], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 32], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 31], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 30], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 23], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 21], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 18], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 10], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 7], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4], [-1, -1, -1, -1, -1, -1, -1, -1

In [51]:
# Number of reloaded patterns; used as the loop bound in the domination check.
numPatterns = len(patterns)
print(numPatterns)

6640


## check that no pattern is dominated by another

In [52]:
# Report every pair of loaded patterns in which one dominates the other.
# Domination is directional, so each unordered pair (i, j) is tested both
# ways; testing only "i dominated by j" for j > i would miss a later pattern
# that is dominated by an earlier one.
for i in range(numPatterns):
    for j in range(i + 1, numPatterns):
        if P1DominatedByP2(patterns[i], patterns[j]):
            print("%d is dominated by %d" %(i, j))
        if P1DominatedByP2(patterns[j], patterns[i]):
            print("%d is dominated by %d" %(j, i))
print("end checking domination")

end checking domination


## check accuracy of each pattern


In [53]:
# Re-check every loaded pattern against the thresholds, using brute-force
# cov() counts over the full dataset and the misclassified set.
Tha = 0.6
Thc = 50
for p in patterns:
    numdata = cov(p, datalist) # number of this pattern in the whole dataset
    # NOTE(review): computed but never compared with numdata - presumably
    # meant as a cross-check of the two counting methods; confirm.
    numdata_pattern_count = pc_adult.pattern_count(num2string(p))
    nummc = cov(p, mclist) # number of this pattern in the miss_classified set
    if numdata < Thc:
        print(p, numdata, "< Thc")
        # NOTE(review): this break ends the whole check at the first pattern
        # below Thc, so the remaining patterns are not examined.
        break
    # NOTE(review): the value compared with Tha here is nummc / numdata, while
    # CheckLowAcc compares 1 - covmc / covdata with Tha - confirm which
    # definition is intended.
    acc = nummc / numdata
    if acc >= Tha:
        print(">= tha")
print('end checking accuracy')

end checking accuracy
