In [131]:
from itertools import combinations

def contiguous_subpaths(path : str):
    tags = path.strip("/").split("/")
    indices = list(range(len(tags)+1))

    for i,j in combinations(indices,2):
        yield '/'.join(tags[i:j])

In [132]:
docsFolder = 'docs/'
k = 2.0
minsup = 1/k

N = len(os.listdir(docsFolder))
print("Number of docs: ", N)

Number of docs:  3


In [106]:
from lxml import etree
import re
import os
from collections import defaultdict


docpathsAll = []

# Paths in each doc # TODO: Need to incorporate subpaths!
for docName in os.listdir(docsFolder):
    doc = etree.parse(docsFolder + docName)
    docpathsFreq = defaultdict(int)
    for node in doc.iter():
        full_path = doc.getpath(node)
        full_path = re.sub(r'\[\d*\]', '', full_path) # remove [num] markers inserted by lxml
        for f in contiguous_subpaths(full_path): 
            docpathsFreq[f] += 1
    docpathsAll.append(docpathsFreq)

# All paths
print("Paths per doc: ", *(len(docpathsFreq) for docpathsFreq in docpathsAll))
paths = list({docpath for docpathsFreq in docpathsAll for docpath in docpathsFreq})
print("Total paths: ", len(paths))
print(paths[0:25])



Number of docs:  3
Paths per doc:  802 813 417
Total paths:  994
['block/list/block/block/block/autolambda/custom-block/l', 'scene/blocks/block-definition/code', 'block/script/block/l/option', 'scene/blocks/block-definition/script/block/script/block/script/block/l', 'scenes/scene/stage/sprites/sprite/blocks', 'project/scenes/scene/blocks/block-definition/script/block/block/block/block/list/block', 'blocks/block-definition/script/block/script/block/script', 'blocks/block-definition/script/block/script/block', 'project/scenes/scene/stage/sprites/sprite/variables', 'project/scenes/scene/blocks/block-definition/script/block/l/option', 'scene/stage/scripts/script', 'sprites/watcher', 'blocks/block-definition/script/block/l/option', 'scene/stage/sprites/sprite/sounds', 'script/block/block/custom-block/block', 'block/custom-block/block/list', 'project/scenes/scene/blocks/block-definition/script/block/script/block/block', 'block/block/autolambda/block/block/block', 'scripts/script/block/block'

In [107]:
# Patterns = Maximum frequency paths: occur in at least minsup% of docs
maxfreqpaths = []
for path in paths:
    freq = sum([1 for docpaths in docpathsAll if path in docpaths])
    if (freq/N >= minsup) and not any(path in maxfreqpath for maxfreqpath in maxfreqpaths): # TODO: Optimize
        maxfreqpaths.append(path)

print("Patterns:", len(maxfreqpaths))
print(maxfreqpaths[0:25])

Patterns: 142
['block/list/block/block/block/autolambda/custom-block/l', 'scene/blocks/block-definition/code', 'block/script/block/l/option', 'scene/blocks/block-definition/script/block/script/block/script/block/l', 'scenes/scene/stage/sprites/sprite/blocks', 'project/scenes/scene/blocks/block-definition/script/block/block/block/block/list/block', 'project/scenes/scene/stage/sprites/sprite/variables', 'project/scenes/scene/blocks/block-definition/script/block/l/option', 'scene/stage/scripts/script', 'sprites/watcher', 'scene/stage/sprites/sprite/sounds', 'script/block/block/custom-block/block', 'project/scenes/scene/blocks/block-definition/script/block/script/block/block', 'block/block/autolambda/block/block/block', 'block-definition/header', 'block/block/autolambda/block/custom-block/l', 'scenes/scene/stage/sprites/sprite/scripts/script/custom-block', 'script/block/block/block/block/autolambda', 'script/block/block/block/autolambda/block/block', 'scenes/scene/blocks/block-definition/s

In [109]:
# Frequency of a path in a doc
def m(p, di):
    docpathsFreq = docpathsAll[di]
    return docpathsFreq[p]

# Similarity between two path patterns, using AHC Complete Link Algorithm
def sim(p1, p2):
    n = 0
    d = 0
    for docindex in range(N):
        m1 = m(p1, docindex)
        m2 = m(p2, docindex)
        mmin = min(m1, m2)
        n += mmin
        d += (m1 + m2 - mmin)
    return n/d

In [110]:
from cluster import HierarchicalClustering

cl = HierarchicalClustering(maxfreqpaths, sim)
cl.set_linkage_method('complete')

In [122]:
profiles = cl.getlevel(minsup)
print("Number of profiles: ", len(profiles))

Number of profiles:  3


In [90]:
cl.display()

[level 1.0]
    [level 0.5]
    sounds/list
        [level 0.5]
        scenes/scene/stage/costumes
            [level 0.5]
            scene/stage/costumes
                [level 0.3333333333333333]
                scenes/scene/stage/scripts
                    [level 0.3]
                        [level 0.2727272727272727]
                        custom-block/l
                            [level 0.25]
                                [level 0.12]
                                    [level 0.1]
                                        [level 0.1]
                                            [level 0.0967741935483871]
                                                [level 0.06818181818181818]
                                                    [level 0.057692307692307696]
                                                        [level 0.057692307692307696]
                                                            [level 0.05555555555555555]
                                                

In [128]:
def connection_strength(docpathsFreq, profile):
    n = 0
    d = 0
    for pattern in profile:
        if docpathsFreq[pattern] > 0:
            n += 1
        d += 1
    return n/d

In [129]:
import numpy as np

def assign_docs(docpathsAll, profiles):
    for docindex in range(N):
        docpathsFreq = docpathsAll[docindex]
        strengths = [connection_strength(docpathsFreq, profile) for profile in profiles]
        print(strengths)
        profileindex = np.argmax(strengths)
        print(docindex, profileindex)

In [130]:
assign_docs(docpathsAll, profiles)

[1.0, 0.9629629629629629, 1.0]
0 0
[1.0, 0.9259259259259259, 1.0]
1 0
[0.0, 0.5333333333333333, 0.5]
2 1
