In [3]:
from itertools import combinations

def contiguous_subpaths(path : str):
    tags = path.strip("/").split("/")
    indices = list(range(len(tags)+1))
 
    for i,j in combinations(indices,2):
        yield '/'.join(tags[i:j])

In [9]:
docsFolder = 'docs/'
k = 3.0
minsup = 1/k

N = len(os.listdir(docsFolder))
print("Number of docs: ", N)

Number of docs:  9


In [4]:
from lxml import etree
import re
import os
from collections import defaultdict


docpathsAll = []

# Paths in each doc and their subpaths
for docName in os.listdir(docsFolder):
    doc = etree.parse(docsFolder + docName)
    docpathsFreq = defaultdict(int)
    for node in doc.iter():
        full_path = doc.getpath(node)
        full_path = re.sub(r'\[\d*\]', '', full_path) # remove [num] markers inserted by lxml
        for f in contiguous_subpaths(full_path): 
            docpathsFreq[f] += 1
    docpathsAll.append(docpathsFreq)

# All paths
print("Paths per doc: ", *(len(docpathsFreq) for docpathsFreq in docpathsAll))
paths = list({docpath for docpathsFreq in docpathsAll for docpath in docpathsFreq})
print("Total paths: ", len(paths))
print(paths[0:25])



Paths per doc:  755 7122 813 813 806 814 7173 385 417
Total paths:  7357
['custom-block/block/list', 'scenes/scene/blocks/block-definition/script/block/script/block/script/block/script/block/block/list/block/list/block/block/block/block', 'project/scenes/scene/stage/sprites/sprite/blocks', 'script/block/block/block/block/script/block/block/list/block/block/block/l', 'scene/blocks/block-definition/script/block/script/block/custom-block/block/block', 'scene/blocks/block-definition/script/block/script/block/script/block/l', 'scene/blocks/block-definition/script/block/block/block/autolambda/block/custom-block', 'project/scenes/scene/blocks/block-definition/script/block/script/custom-block/custom-block', 'block-definition/script/block/script/block/script/block/block/block/block/block/block/list', 'sprites/watcher', 'block-definition/script/block/block/block/list/block/block/l/option', 'block/script/block/block/block/block/block/block/autolambda', 'sounds', 'scenes/scene/blocks/block-definit

In [5]:
# Patterns = Maximum frequency paths: occur in at least minsup% of docs
maxfreqpaths = []
for path in paths:
    freq = sum([1 for docpaths in docpathsAll if path in docpaths])
    # TODO: No shared subpath criteria might be biased
    if (freq/N >= minsup) and not any((path in maxfreqpath or maxfreqpath in path) for maxfreqpath in maxfreqpaths): # TODO: Optimize
        maxfreqpaths.append(path)

print("Patterns:", len(maxfreqpaths))
print(maxfreqpaths[0:25])

Patterns: 81
['custom-block/block/list', 'project/scenes/scene/stage/sprites/sprite/blocks', 'scene/blocks/block-definition/script/block/block/block/autolambda/block/custom-block', 'sprites/watcher', 'sounds', 'sprite/scripts', 'scene/stage/costumes', 'project/scenes/scene/blocks/block-definition/script/block/block/block/block', 'block-definition/script/block/block/block/autolambda/block/block', 'project/scenes/scene/headers', 'project/scenes/scene/blocks/block-definition/script/block/script/block/block', 'blocks/block-definition/script/block/block/custom-block', 'scene/blocks/block-definition/script/block/block/list/block/block/list', 'scenes/scene/variables/variable/list', 'blocks/block-definition/script/block/block/block/block/l', 'autolambda/block/custom-block/l', 'scenes/scene/blocks/block-definition/script/block/block/list/l', 'block/autolambda/custom-block', 'script/block/script/custom-block', 'script/block/script/block/block/l/option', 'block/script/custom-block/block/l', 'spri

In [6]:
# Frequency of a path in a doc
def m(p, di):
    docpathsFreq = docpathsAll[di]
    return docpathsFreq[p]

# Similarity between two path patterns, using AHC Complete Link Algorithm
def sim(p1, p2):
    n = 0
    d = 0
    for docindex in range(N):
        m1 = m(p1, docindex)
        m2 = m(p2, docindex)
        mmin = min(m1, m2)
        n += mmin
        d += (m1 + m2 - mmin)
    return n/d

In [7]:
from cluster import HierarchicalClustering

cl = HierarchicalClustering(maxfreqpaths, sim)
cl.set_linkage_method('complete')

In [10]:
profiles = cl.getlevel(minsup)
print("Number of profiles: ", len(profiles))

Number of profiles:  2


In [13]:
profiles[0]

['block/color',
 'project/scenes/scene/blocks/block-definition/script/block/script/block/block',
 'project/scenes/scene/headers',
 'scenes/scene/blocks/block-definition/script/custom-block',
 'project/scenes/scene/hidden',
 'scene/blocks/block-definition/script/block/script/block/block/block',
 'scene/notes',
 'script/block/script/custom-block',
 'autolambda/block/custom-block/l',
 'scenes/scene/blocks/block-definition/script/block/script/block/block/l',
 'autolambda/custom-block/l',
 'block-definition/code',
 'script/block/block/list/block/block/block/autolambda',
 'translations',
 'block/block/block/autolambda/block/block/block/l',
 'script/block/block/block/block/list']

In [18]:
#cl.display()

In [14]:
def connection_strength(docpathsFreq, profile):
    n = 0
    d = 0
    for pattern in profile:
        if docpathsFreq[pattern] > 0:
            n += 1
    return n/len(profile)

In [15]:
import numpy as np

def assign_docs(docpathsAll, profiles):
    for docindex in range(N):
        print("Doc ", docindex)
        docpathsFreq = docpathsAll[docindex]
        strengths = [connection_strength(docpathsFreq, profile) for profile in profiles]
        print("Connection strengths per profile: ", strengths)
        profileindex = np.argmax(strengths)
        print("Assigned profile: ", profileindex)

In [16]:
assign_docs(docpathsAll, profiles)

Doc  0
Connection strengths per profile:  [0.9375, 0.9230769230769231]
Assigned profile:  0
Doc  1
Connection strengths per profile:  [0.9375, 1.0]
Assigned profile:  1
Doc  2
Connection strengths per profile:  [0.875, 0.9692307692307692]
Assigned profile:  1
Doc  3
Connection strengths per profile:  [0.9375, 0.9384615384615385]
Assigned profile:  1
Doc  4
Connection strengths per profile:  [1.0, 0.9846153846153847]
Assigned profile:  0
Doc  5
Connection strengths per profile:  [0.9375, 0.9692307692307692]
Assigned profile:  1
Doc  6
Connection strengths per profile:  [1.0, 1.0]
Assigned profile:  0
Doc  7
Connection strengths per profile:  [0.625, 0.4461538461538462]
Assigned profile:  0
Doc  8
Connection strengths per profile:  [0.4375, 0.4]
Assigned profile:  0
