In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
from concepts import Context
from tqdm import tqdm
from openpyxl import Workbook

In [None]:
class Dataset:
    """A formal context read from a CSV file together with data derived from it.

    Attributes set by ``__init__``:
        c: context returned by ``Context.fromfile(path, frmat="csv")``.
        l: ``c.lattice``.
        d: ``c.definition()``.
        total_ones: number of true cells summed over the rows of ``c.bools``.
        name: file name of ``path`` without its directory and extension.
    """

    def __init__(self, path):
        """Load the context stored at ``path`` and pre-compute the derived data."""
        # context
        self.c = Context.fromfile(path, frmat="csv")
        # lattice
        self.l = self.c.lattice
        # context definition
        self.d = self.c.definition()
        # total ones: every row of the boolean table contributes its true cells
        self.total_ones = sum(sum(row) for row in self.c.bools)
        # dataset name
        self.name = self._name_from_path(path)

    @staticmethod
    def _name_from_path(path):
        """Return the file name of ``path`` stripped of directory and extension."""
        # os.path copes with extensions of any length and with paths that
        # contain no directory part, unlike slicing a fixed four characters.
        return os.path.splitext(os.path.basename(path))[0]

In [None]:
#@title greedy coverage
def GreedyCoverage(dataset, figsonly=False, limit=100):
    """Greedily select concepts by newly covered area and report cumulative coverage.

    Every concept is treated as the rectangle ``extent x intent``. The concept
    whose rectangle contains the most not-yet-covered cells is selected
    repeatedly until ``dataset.total_ones`` cells are covered, ``limit``
    concepts have been selected, or no candidate is left.

    Args:
        dataset: object exposing ``l`` (iterable of concepts with ``extent``
            and ``intent``), ``d`` (copyable table indexed by
            ``[object, property]``) and ``total_ones``.
        figsonly: when true, return only the cumulative covered-ones figures.
        limit: maximum number of concepts to report; the first selected
            concept is always reported.

    Returns:
        A list of ``[number_of_concepts, covered_ones]`` pairs, or the list of
        ``covered_ones`` values when ``figsonly`` is true.
    """
    # dataset mapping
    l = dataset.l
    d = dataset.d
    total_ones = dataset.total_ones

    # One candidate per concept: [concept, uncovered cells, number of uncovered cells].
    candidates = []
    for concept in l:
        cells = [[obj, prop] for obj in concept.extent for prop in concept.intent]
        candidates.append([concept, cells, len(cells)])

    # Table tracking which cells are still uncovered.
    uncovered = d.copy()
    selected = []

    # Stable ascending sort + pop from the end: among equally large candidates
    # the one that comes last in the current list order wins.
    candidates.sort(key=lambda cand: cand[2])
    selected.append(candidates.pop())
    covered_ones = selected[-1][2]

    # Selecting beyond `limit` concepts is wasted work because only the first
    # `limit` selections are reported below.
    while covered_ones < total_ones and len(selected) < limit and candidates:
        # Mark the cells of the latest pick as covered.
        for obj, prop in selected[-1][1]:
            uncovered[obj, prop] = False
        # Shrink every remaining candidate to the cells it would newly cover.
        for cand in candidates:
            cand[1] = [cell for cell in cand[1] if uncovered[cell[0], cell[1]]]
            cand[2] = len(cand[1])
        candidates.sort(key=lambda cand: cand[2])
        selected.append(candidates.pop())
        covered_ones += selected[-1][2]

    # Re-count coverage against a fresh copy of the table, one selection at a time.
    remaining = d.copy()
    num_ones = 0
    values = []
    for num_cons, entry in enumerate(selected, start=1):
        concept = entry[0]
        for obj in concept.extent:
            for prop in concept.intent:
                if remaining[obj, prop]:
                    num_ones += 1
                    remaining[obj, prop] = False
        values.append([num_cons, num_ones])
        if num_cons >= limit:
            break

    if figsonly:
        return [ones for _, ones in values]
    return values

In [None]:
#@title QualityCover
def QualityCover(dataset, figsonly=False, limit=100):
    """Build a concept cover from pseudo concepts and report cumulative coverage.

    For every (object, property) cell that is a one, a pseudo concept is
    formed from all properties of the object and all objects having the
    property. Pseudo concepts are processed in descending order of
    (density, number of ones). For the leading pseudo concept the lattice of
    the context restricted to it is built, the concept with the highest gain
    is selected, and every pseudo concept whose seed cell lies inside that
    concept is dropped. Gain is ``len(extent)`` divided by the number of
    objects that have at least one of the intent's properties in the full
    context.

    Args:
        dataset: object exposing ``d`` (context definition) and ``c`` (context).
        figsonly: when true, return only the cumulative covered-ones figures.
        limit: maximum number of concepts to report; if any concept is
            selected at all, the first one is always reported.

    Returns:
        A list of ``[number_of_concepts, covered_ones]`` pairs, or the list of
        ``covered_ones`` values when ``figsonly`` is true.
    """
    # dataset mapping
    d = dataset.d
    c = dataset.c

    # Pseudo concepts: [object, property, properties of the object,
    # objects having the property, density, number of ones].
    index_list = []
    for ob in c.objects:
        obj_i = c.intension((ob,))
        obj_props = set(obj_i)
        for pr in c.properties:
            if pr not in obj_props:
                continue
            pro_i = c.extension((pr,))
            scr = 0
            for row in pro_i:
                scr += len(set(c.intension((row,))) & obj_props)
            index_list.append([ob, pr, obj_i, pro_i, scr / (len(obj_i) * len(pro_i)), scr])
    i_list = sorted(index_list, key=lambda x: (x[4], x[5]), reverse=True)

    # Only the first `limit` selections are reported (at least one), so there
    # is no point in building further sub-lattices once that many are chosen.
    needed = max(limit, 1)
    calculation_results = []
    while i_list and len(calculation_results) < needed:
        seed = i_list[0]
        keep_objects = set(seed[3])
        keep_properties = set(seed[2])

        # Restrict a copy of the definition to the leading pseudo concept.
        d2 = d.copy()
        for ob in d2[0]:
            if ob not in keep_objects:
                d2.remove_object(ob)
        for pr in d2[1]:
            if pr not in keep_properties:
                d2.remove_property(pr)
        c2 = Context(d2[0], d2[1], d2[2])
        l2 = c2.lattice

        # GAIN
        concept_list = []
        if len(l2) == 1:
            # The pseudo concept is itself a concept; its gain value is not used.
            concept_list.append([l2[0], 1.0])
        else:
            for concept in l2:
                disj = set()
                for prop in concept.intent:
                    # Ask for the property's extension explicitly, matching the
                    # lookup used when the pseudo concepts are built above.
                    disj.update(c.extension((prop,)))
                concept_list.append([concept, len(concept.extent) / len(disj)])

        # Stable ascending sort + pop: on ties the concept latest in lattice order wins.
        concept_list.sort(key=lambda x: x[1])
        cn = concept_list.pop()
        calculation_results.append(cn)

        # Drop every pseudo concept whose seed cell is covered by the chosen concept.
        chosen_extent = set(cn[0].extent)
        chosen_intent = set(cn[0].intent)
        i_list = [
            pseudo for pseudo in i_list
            if not (pseudo[0] in chosen_extent and pseudo[1] in chosen_intent)
        ]

    # Re-count coverage against a fresh copy of the table, one selection at a time.
    remaining = d.copy()
    num_ones = 0
    values = []
    for num_cons, entry in enumerate(calculation_results, start=1):
        concept = entry[0]
        for obj in concept.extent:
            for prop in concept.intent:
                if remaining[obj, prop]:
                    num_ones += 1
                    remaining[obj, prop] = False
        values.append([num_cons, num_ones])
        if num_cons >= limit:
            break

    if figsonly:
        return [ones for _, ones in values]
    return values

In [None]:
#@title QualityCover with Greedy Coverage as GAIN
def QualityCoverGreedy(dataset, figsonly=False, limit=100):
    """Variant of QualityCover whose gain is the number of still-uncovered ones.

    Pseudo concepts are built and ordered as in QualityCover: one per
    (object, property) one, sorted by (density, number of ones) in descending
    order. For the leading pseudo concept the lattice of the restricted
    context is built and the concept covering the most cells that are still
    marked as uncovered is selected.

    Args:
        dataset: object exposing ``d`` (context definition) and ``c`` (context).
        figsonly: when true, return only the cumulative covered-ones figures.
        limit: maximum number of concepts to report; if any concept is
            selected at all, the first one is always reported.

    Returns:
        A list of ``[number_of_concepts, covered_ones]`` pairs, or the list of
        ``covered_ones`` values when ``figsonly`` is true.
    """
    # dataset mapping
    d = dataset.d
    c = dataset.c

    # Pseudo concepts: [object, property, properties of the object,
    # objects having the property, density, number of ones].
    index_list = []
    for ob in c.objects:
        obj_i = c.intension((ob,))
        obj_props = set(obj_i)
        for pr in c.properties:
            if pr not in obj_props:
                continue
            pro_i = c.extension((pr,))
            scr = 0
            for row in pro_i:
                scr += len(set(c.intension((row,))) & obj_props)
            index_list.append([ob, pr, obj_i, pro_i, scr / (len(obj_i) * len(pro_i)), scr])
    i_list = sorted(index_list, key=lambda x: (x[4], x[5]), reverse=True)

    # Table of cells not yet covered by any selected concept.
    d4 = d.copy()

    # Only the first `limit` selections are reported (at least one), so there
    # is no point in building further sub-lattices once that many are chosen.
    needed = max(limit, 1)
    calculation_results = []
    while i_list and len(calculation_results) < needed:
        seed = i_list[0]
        keep_objects = set(seed[3])
        keep_properties = set(seed[2])

        # Restrict a copy of the definition to the leading pseudo concept.
        d2 = d.copy()
        for ob in d2[0]:
            if ob not in keep_objects:
                d2.remove_object(ob)
        for pr in d2[1]:
            if pr not in keep_properties:
                d2.remove_property(pr)
        c2 = Context(d2[0], d2[1], d2[2])
        l2 = c2.lattice

        # GAIN
        concept_list = []
        if len(l2) == 1:
            # The pseudo concept is itself a concept; its gain value is not used.
            concept_list.append([l2[0], 1.0])
        else:
            # Cells of the restricted context that are still uncovered.
            d5 = d2.intersection(d4, True)
            for concept in l2:
                cov = 0
                for prop in concept.intent:
                    for obj in concept.extent:
                        cov += d5[obj, prop]
                concept_list.append([concept, cov])

        # Stable ascending sort + pop: on ties the concept latest in lattice order wins.
        concept_list.sort(key=lambda x: x[1])
        cn = concept_list.pop()
        calculation_results.append(cn)

        # Drop the pseudo concepts whose seed cell is covered by the chosen
        # concept and mark those cells as covered.
        chosen_extent = set(cn[0].extent)
        chosen_intent = set(cn[0].intent)
        kept = []
        for pseudo in i_list:
            if pseudo[0] in chosen_extent and pseudo[1] in chosen_intent:
                d4[pseudo[0], pseudo[1]] = False
            else:
                kept.append(pseudo)
        i_list = kept

    # Re-count coverage against a fresh copy of the table, one selection at a time.
    remaining = d.copy()
    num_ones = 0
    values = []
    for num_cons, entry in enumerate(calculation_results, start=1):
        concept = entry[0]
        for obj in concept.extent:
            for prop in concept.intent:
                if remaining[obj, prop]:
                    num_ones += 1
                    remaining[obj, prop] = False
        values.append([num_cons, num_ones])
        if num_cons >= limit:
            break

    if figsonly:
        return [ones for _, ones in values]
    return values

In [None]:
#@title Stability
def Stability(dataset, figsonly=False, limit=100):
    """Rank concepts by stability and report the cumulative coverage of the ranking.

    For each concept, ``2 ** len(extent)`` subsets are counted and the counts
    already attributed to the other concepts of its downset are subtracted;
    stability is the remaining count divided by ``2 ** len(extent)``. The
    lattice's infimum and supremum are excluded from the ranking.

    Args:
        dataset: object exposing ``l`` (lattice), ``d`` (copyable table indexed
            by ``[object, property]``) and ``total_ones``.
        figsonly: when true, return only the cumulative covered-ones figures.
        limit: maximum number of ranked concepts to report.

    Returns:
        A list of ``[number_of_concepts, covered_ones]`` pairs, or the list of
        ``covered_ones`` values when ``figsonly`` is true. Reporting stops
        early once all ones are covered.
    """
    # dataset mapping
    l = dataset.l
    d = dataset.d
    total_ones = dataset.total_ones

    stability_amounts = {}
    counted_subsets = {}
    # The lookup below needs every other member of a concept's downset to have
    # been visited before the concept itself; it raises KeyError otherwise.
    for concept in l:
        total_subsets = 2 ** len(concept.extent)
        deductible_subsets = 0
        for lower in l.downset_union([concept]):
            if lower != concept:
                deductible_subsets += counted_subsets[lower]
        own_subsets = total_subsets - deductible_subsets
        counted_subsets[concept] = own_subsets
        stability_amounts[concept] = own_subsets / total_subsets

    ranked = sorted(stability_amounts.items(), key=lambda item: item[1], reverse=True)

    # Remove infimum and supremum. Filtering (instead of calling remove() once
    # per bound) also works when both bounds are the same concept.
    bottom = l.infimum
    top = l.supremum
    ranked = [item for item in ranked if not (item[0] == bottom or item[0] == top)]

    # Count the ones newly covered by each ranked concept, in ranking order.
    remaining = d.copy()
    num_ones = 0
    values = []
    for num_cons, (concept, _score) in enumerate(ranked, start=1):
        for obj in concept.extent:
            for prop in concept.intent:
                if remaining[obj, prop]:
                    num_ones += 1
                    remaining[obj, prop] = False
        values.append([num_cons, num_ones])
        if num_ones == total_ones or num_cons >= limit:
            break

    if figsonly:
        return [ones for _, ones in values]
    return values

In [None]:
#@title Iceberg

def Iceberg(dataset, figsonly=False, limit=100):
    """Rank concepts by relative extent size and report the cumulative coverage.

    The score of a concept is ``len(extent) / len(c.objects)``. The lattice's
    infimum and supremum are excluded from the ranking.

    Args:
        dataset: object exposing ``l`` (lattice), ``d`` (copyable table indexed
            by ``[object, property]``), ``c`` (context) and ``total_ones``.
        figsonly: when true, return only the cumulative covered-ones figures.
        limit: maximum number of ranked concepts to report.

    Returns:
        A list of ``[number_of_concepts, covered_ones]`` pairs, or the list of
        ``covered_ones`` values when ``figsonly`` is true. Reporting stops
        early once all ones are covered.
    """
    # dataset mapping
    l = dataset.l
    d = dataset.d
    c = dataset.c
    total_ones = dataset.total_ones

    # iceberg calculations
    object_count = len(c.objects)
    iceberg_amounts = {}
    for concept in l:
        iceberg_amounts[concept] = len(concept.extent) / object_count

    ranked = sorted(iceberg_amounts.items(), key=lambda item: item[1], reverse=True)

    # Remove infimum and supremum. Filtering (instead of calling remove() once
    # per bound) also works when both bounds are the same concept.
    bottom = l.infimum
    top = l.supremum
    ranked = [item for item in ranked if not (item[0] == bottom or item[0] == top)]

    # Count the ones newly covered by each ranked concept, in ranking order.
    remaining = d.copy()
    num_ones = 0
    values = []
    for num_cons, (concept, _score) in enumerate(ranked, start=1):
        for obj in concept.extent:
            for prop in concept.intent:
                if remaining[obj, prop]:
                    num_ones += 1
                    remaining[obj, prop] = False
        values.append([num_cons, num_ones])
        if num_ones == total_ones or num_cons >= limit:
            break

    if figsonly:
        return [ones for _, ones in values]
    return values

In [None]:
#@title Monotone Systems Method

def MonotoneSystemsMethod(dataset, figsonly=False, limit=100):
    """Rank concepts by a monotone-systems weight and report the cumulative coverage.

    Let ``N`` be the number of concepts whose extent and intent are both
    non-empty. Each property is weighted ``N - k + 1`` where ``k`` counts the
    concepts with a non-empty extent whose intent contains it; each object is
    weighted likewise using the concepts with a non-empty intent whose extent
    contains it. A concept's score is the sum of its intent's property weights
    multiplied by the sum of its extent's object weights. The lattice's
    infimum and supremum are excluded from the ranking.

    Args:
        dataset: object exposing ``l`` (lattice), ``d`` (copyable table indexed
            by ``[object, property]``), ``c`` (context) and ``total_ones``.
        figsonly: when true, return only the cumulative covered-ones figures.
        limit: maximum number of ranked concepts to report.

    Returns:
        A list of ``[number_of_concepts, covered_ones]`` pairs, or the list of
        ``covered_ones`` values when ``figsonly`` is true. Reporting stops
        early once all ones are covered.
    """
    # dataset mapping
    l = dataset.l
    d = dataset.d
    c = dataset.c
    total_ones = dataset.total_ones

    # Concepts with an empty extent or intent are left out of the count.
    concepts_amount = sum(
        1 for concept in l if len(concept.extent) > 0 and len(concept.intent) > 0
    )

    # How many concepts each property / object takes part in; gathered in one
    # pass over the lattice instead of one pass per property and per object.
    attribute_counts = {prop: 0 for prop in c.properties}
    object_counts = {obj: 0 for obj in c.objects}
    for concept in l:
        if len(concept.extent) > 0:
            for prop in concept.intent:
                attribute_counts[prop] += 1
        if len(concept.intent) > 0:
            for obj in concept.extent:
                object_counts[obj] += 1

    msm_amounts = []
    for concept in l:
        intent_weight = sum(
            (concepts_amount - attribute_counts[prop]) + 1 for prop in concept.intent
        )
        extent_weight = sum(
            (concepts_amount - object_counts[obj]) + 1 for obj in concept.extent
        )
        msm_amounts.append([concept, intent_weight * extent_weight])

    ranked = sorted(msm_amounts, key=lambda item: item[1], reverse=True)

    # Remove infimum and supremum. Filtering (instead of calling remove() once
    # per bound) also works when both bounds are the same concept.
    bottom = l.infimum
    top = l.supremum
    ranked = [item for item in ranked if not (item[0] == bottom or item[0] == top)]

    # Count the ones newly covered by each ranked concept, in ranking order.
    remaining = d.copy()
    num_ones = 0
    values = []
    for num_cons, (concept, _score) in enumerate(ranked, start=1):
        for obj in concept.extent:
            for prop in concept.intent:
                if remaining[obj, prop]:
                    num_ones += 1
                    remaining[obj, prop] = False
        values.append([num_cons, num_ones])
        if num_ones == total_ones or num_cons >= limit:
            break

    if figsonly:
        return [ones for _, ones in values]
    return values

In [None]:
#@title Separation Index

def SeparationIndex(dataset, figsonly=False, limit=100):
    """Rank concepts by separation index and report the cumulative coverage.

    The score of a concept is its area ``len(extent) * len(intent)`` divided
    by the number of ones in the rows of its extent plus the number of ones in
    the columns of its intent minus the area. A concept whose denominator is
    zero scores 0. The lattice's infimum and supremum are excluded from the
    ranking.

    Args:
        dataset: object exposing ``l`` (lattice), ``d`` (copyable table indexed
            by ``[object, property]``), ``c`` (context) and ``total_ones``.
        figsonly: when true, return only the cumulative covered-ones figures.
        limit: maximum number of ranked concepts to report.

    Returns:
        A list of ``[number_of_concepts, covered_ones]`` pairs, or the list of
        ``covered_ones`` values when ``figsonly`` is true. Reporting stops
        early once all ones are covered.
    """
    # dataset mapping
    l = dataset.l
    d = dataset.d
    c = dataset.c
    total_ones = dataset.total_ones

    # Row and column totals of the boolean table, computed once up front
    # rather than once per concept.
    bools = c.bools
    row_ones = {obj: sum(row) for obj, row in zip(c.objects, bools)}
    column_ones = {
        prop: sum(row[index] for row in bools)
        for index, prop in enumerate(c.properties)
    }

    # separation index calculations
    separation_amounts = {}
    for concept in l:
        objects_sum = sum(row_ones[obj] for obj in concept.extent)
        attribute_sum = sum(column_ones[prop] for prop in concept.intent)
        concept_area = len(concept.extent) * len(concept.intent)
        denominator = objects_sum + attribute_sum - concept_area
        # The denominator is zero when the rows and columns involved hold no
        # ones at all; score such a concept 0 instead of dividing by zero.
        separation_amounts[concept] = concept_area / denominator if denominator else 0.0

    ranked = sorted(separation_amounts.items(), key=lambda item: item[1], reverse=True)

    # Remove infimum and supremum. Filtering (instead of calling remove() once
    # per bound) also works when both bounds are the same concept.
    bottom = l.infimum
    top = l.supremum
    ranked = [item for item in ranked if not (item[0] == bottom or item[0] == top)]

    # Count the ones newly covered by each ranked concept, in ranking order.
    remaining = d.copy()
    num_ones = 0
    values = []
    for num_cons, (concept, _score) in enumerate(ranked, start=1):
        for obj in concept.extent:
            for prop in concept.intent:
                if remaining[obj, prop]:
                    num_ones += 1
                    remaining[obj, prop] = False
        values.append([num_cons, num_ones])
        if num_ones == total_ones or num_cons >= limit:
            break

    if figsonly:
        return [ones for _, ones in values]
    return values

In [None]:
#@title Cue Validity

def CueValidity(dataset, figsonly=False, limit=100):
    """Rank concepts by cue validity and report the cumulative coverage.

    For a concept with non-empty extent and intent the score is the sum, over
    the properties of its intent, of ``len(extent)`` divided by the number of
    objects having that property. Concepts with an empty extent or intent are
    not scored, and the lattice's infimum and supremum are excluded from the
    ranking.

    Args:
        dataset: object exposing ``l`` (lattice), ``d`` (copyable table indexed
            by ``[object, property]``), ``c`` (context) and ``total_ones``.
        figsonly: when true, return only the cumulative covered-ones figures.
        limit: maximum number of ranked concepts to report.

    Returns:
        A list of ``[number_of_concepts, covered_ones]`` pairs, or the list of
        ``covered_ones`` values when ``figsonly`` is true. Reporting stops
        early once all ones are covered.
    """
    # dataset mapping
    l = dataset.l
    d = dataset.d
    c = dataset.c
    total_ones = dataset.total_ones

    # cue validity calculations
    cue_validity_amounts = {}
    for concept in l:
        if not concept.extent or not concept.intent:
            continue
        extent_size = len(concept.extent)
        cue_validity_amounts[concept] = sum(
            extent_size / len(c.extension((prop,))) for prop in concept.intent
        )

    ranked = sorted(cue_validity_amounts.items(), key=lambda item: item[1], reverse=True)

    # Remove infimum and supremum. Filtering (instead of calling remove() once
    # per bound) also works when both bounds are the same concept.
    bottom = l.infimum
    top = l.supremum
    ranked = [item for item in ranked if not (item[0] == bottom or item[0] == top)]

    # Count the ones newly covered by each ranked concept, in ranking order.
    remaining = d.copy()
    num_ones = 0
    values = []
    for num_cons, (concept, _score) in enumerate(ranked, start=1):
        for obj in concept.extent:
            for prop in concept.intent:
                if remaining[obj, prop]:
                    num_ones += 1
                    remaining[obj, prop] = False
        values.append([num_cons, num_ones])
        if num_ones == total_ones or num_cons >= limit:
            break

    if figsonly:
        return [ones for _, ones in values]
    return values

In [None]:
#@title Category Feature Collocation

def CategoryFeatColl(dataset, figsonly=False, limit=100):
    """Rank concepts by category feature collocation and report the cumulative coverage.

    For a concept with non-empty extent and intent, let ``extent_sum`` be the
    total number of properties held by the objects of its extent. The score is
    the sum, over the properties of its intent, of
    ``len(extent) / (objects having the property) * len(extent) / extent_sum``.
    Concepts with an empty extent or intent are not scored, and the lattice's
    infimum and supremum are excluded from the ranking.

    Args:
        dataset: object exposing ``l`` (lattice), ``d`` (copyable table indexed
            by ``[object, property]``), ``c`` (context) and ``total_ones``.
        figsonly: when true, return only the cumulative covered-ones figures.
        limit: maximum number of ranked concepts to report.

    Returns:
        A list of ``[number_of_concepts, covered_ones]`` pairs, or the list of
        ``covered_ones`` values when ``figsonly`` is true. Reporting stops
        early once all ones are covered.
    """
    # dataset mapping
    l = dataset.l
    d = dataset.d
    c = dataset.c
    total_ones = dataset.total_ones

    # category feature collocation calculations
    cfc_amounts = {}
    for concept in l:
        if not concept.extent or not concept.intent:
            continue
        extent_size = len(concept.extent)
        extent_sum = sum(len(c.intension((obj,))) for obj in concept.extent)
        cfc_amounts[concept] = sum(
            extent_size / len(c.extension((prop,))) * extent_size / extent_sum
            for prop in concept.intent
        )

    ranked = sorted(cfc_amounts.items(), key=lambda item: item[1], reverse=True)

    # Remove infimum and supremum. Filtering (instead of calling remove() once
    # per bound) also works when both bounds are the same concept.
    bottom = l.infimum
    top = l.supremum
    ranked = [item for item in ranked if not (item[0] == bottom or item[0] == top)]

    # Count the ones newly covered by each ranked concept, in ranking order.
    remaining = d.copy()
    num_ones = 0
    values = []
    for num_cons, (concept, _score) in enumerate(ranked, start=1):
        for obj in concept.extent:
            for prop in concept.intent:
                if remaining[obj, prop]:
                    num_ones += 1
                    remaining[obj, prop] = False
        values.append([num_cons, num_ones])
        if num_ones == total_ones or num_cons >= limit:
            break

    if figsonly:
        return [ones for _, ones in values]
    return values

In [None]:
#@title Category Utility

def CategoryUtility(dataset, figsonly=False, limit=100):
    """Rank concepts by a category-utility score and report the cumulative coverage.

    For a concept with non-empty extent and intent, every property of the
    context that is held by at least one object contributes
    ``(shared / holders) ** 2 - (holders / total_ones) ** 2``, where
    ``holders`` is the number of objects having the property and ``shared`` is
    how many of them belong to the concept's extent. The sum is multiplied by
    ``extent_sum / total_ones``, where ``extent_sum`` is the total number of
    properties held by the extent's objects. Concepts with an empty extent or
    intent are not scored, and the lattice's infimum and supremum are excluded
    from the ranking.

    Args:
        dataset: object exposing ``l`` (lattice), ``d`` (copyable table indexed
            by ``[object, property]``), ``c`` (context) and ``total_ones``.
        figsonly: when true, return only the cumulative covered-ones figures.
        limit: maximum number of ranked concepts to report.

    Returns:
        A list of ``[number_of_concepts, covered_ones]`` pairs, or the list of
        ``covered_ones`` values when ``figsonly`` is true. Reporting stops
        early once all ones are covered.
    """
    # dataset mapping
    l = dataset.l
    d = dataset.d
    c = dataset.c
    total_ones = dataset.total_ones

    # category utility calculations
    # NOTE(review): both probabilities are normalised by total_ones rather than
    # by the number of objects -- confirm this is the intended variant.
    category_utility_amounts = {}
    for concept in l:
        if not concept.extent or not concept.intent:
            continue
        extent_sum = sum(len(c.intension((obj,))) for obj in concept.extent)
        extents = set(concept.extent)
        cu_amt = 0
        for prop in c.properties:
            prims = set(c.extension((prop,)))
            if len(prims) == 0:
                continue
            intersect = prims.intersection(extents)
            cu_amt += ((len(intersect) / len(prims)) ** 2 - (len(prims) / total_ones) ** 2)
        category_utility_amounts[concept] = cu_amt * extent_sum / total_ones

    ranked = sorted(category_utility_amounts.items(), key=lambda item: item[1], reverse=True)

    # Remove infimum and supremum. Filtering (instead of calling remove() once
    # per bound) also works when both bounds are the same concept.
    bottom = l.infimum
    top = l.supremum
    ranked = [item for item in ranked if not (item[0] == bottom or item[0] == top)]

    # Count the ones newly covered by each ranked concept, in ranking order.
    remaining = d.copy()
    num_ones = 0
    values = []
    for num_cons, (concept, _score) in enumerate(ranked, start=1):
        for obj in concept.extent:
            for prop in concept.intent:
                if remaining[obj, prop]:
                    num_ones += 1
                    remaining[obj, prop] = False
        values.append([num_cons, num_ones])
        if num_ones == total_ones or num_cons >= limit:
            break

    if figsonly:
        return [ones for _, ones in values]
    return values

In [None]:
def SheetSave(folder=False, file=False):
    """Run every ranking method on the given dataset(s) and save the results as .xlsx.

    The workbook name is read from standard input and ``.xlsx`` is appended.
    The first sheet, "RESULTS", holds one summary row per processed dataset;
    each processed dataset also gets its own sheet with one column of
    cumulative covered-ones figures per method. Datasets whose lattice has
    fewer than three concepts are skipped.

    Args:
        folder: directory whose regular files are all processed, in sorted
            order. Takes precedence over ``file``.
        file: path of a single dataset file.

    Returns:
        None. Nothing is written when neither ``folder`` nor ``file`` is given.
    """
    if folder:
        # os.path.join works with or without a trailing separator, and the
        # isfile filter keeps sub-directories out of the dataset list.
        files = sorted(
            path
            for path in (os.path.join(folder, entry) for entry in os.listdir(folder))
            if os.path.isfile(path)
        )
    elif file:
        files = [file]
    else:
        return

    title = input() + '.xlsx'
    wb = Workbook()
    wsmeta = wb.active
    wsmeta.title = "RESULTS"
    meta_headers = [
        "Dataset", "Attributes", "Objects", "Ones",
        "Concepts", "Ones %", "Concepts-to-Ones",
    ]
    for col, header in enumerate(meta_headers, start=1):
        wsmeta.cell(1, col, header)

    # Column header and ranking function, in output column order.
    methods = [
        ("GC", GreedyCoverage),
        ("QC", QualityCover),
        ("QCG", QualityCoverGreedy),
        ("ST", Stability),
        ("IB", Iceberg),
        ("MSM", MonotoneSystemsMethod),
        ("SI", SeparationIndex),
        ("CV", CueValidity),
        ("CFC", CategoryFeatColl),
        ("CU", CategoryUtility),
    ]

    # Next free row of the summary sheet; advanced only for datasets that are
    # actually written, so skipped datasets leave no blank rows.
    meta_row = 2
    for path in tqdm(files):
        dt = Dataset(path)
        print("Working on " + dt.name)
        if len(dt.l) < 3:
            continue

        curves = [method(dt, figsonly=True) for _, method in methods]
        worksheet = wb.create_sheet(title=dt.name)

        # Row numbers in the first column, down to the longest curve.
        longest = max(len(curve) for curve in curves)
        for row in range(2, longest + 2):
            worksheet.cell(row, 1).value = row - 1

        # One column per method: header in row 1, figures from row 2 on.
        for col, ((header, _), curve) in enumerate(zip(methods, curves), start=2):
            worksheet.cell(1, col).value = header
            for row, ones in enumerate(curve, start=2):
                worksheet.cell(row, col).value = ones

        property_count = len(dt.c.properties)
        object_count = len(dt.c.objects)
        summary = [
            dt.name,
            property_count,
            object_count,
            dt.total_ones,
            len(dt.l),
            round(dt.total_ones / (property_count * object_count), 2),
            round(len(dt.l) / dt.total_ones, 2),
        ]
        for col, value in enumerate(summary, start=1):
            wsmeta.cell(meta_row, col, value)
        meta_row += 1

    wb.save(title)

In [None]:
# use either a folder of datasets or a single dataset file

#SheetSave(folder="Datasets/real_datasets/")
#SheetSave(file="Datasets/test.csv")