In [1]:
import random
import os
from shutil import copytree
from collections import Counter
import re
import numpy as np
import json

from sklearn.cluster import KMeans

In [2]:
from tqdm import tqdm_notebook as tqdm

In [3]:
random.seed(42)  # for reproducibility

In [4]:
datasets = ["authors", "periods", "recipes"]

In [5]:
def get_directories(path):
    """Return the directories inside the first level of a given path."""
    return next(os.walk(path))[1]

def get_files(path):
    """Return the files inside the first level of a given path."""
    return next(os.walk(path))[2]

def split_dataset(path, training_percentage=0.6):
    """Split the dataset at given path into learning and testing."""
    for document_class in get_directories(path):
        documents = get_directories("{path}/{folder}".format({
            "path": path,
            "folder": document_class
        }))
        
        random.shuffle(documents)
        
        n = int(len(documents) * training_percentage)
        split = [(documents[n:], "training"), (documents[:n], "testing")]
            
        for documents_set, target in split:
            if not os.path.isdir("%s-%s" % (path, document_class)):
                for d in documents_set:
                    copytree("%s/%s/%s" % (path, document_class, d),
                     "%s-%s/%s/%s" % (path, target, document_class, d))

In [6]:
def directory_counter(path):
    """Return a counter representing the files in the given directory."""
    regex = re.compile(r"\W+")
    counter = Counter()
    for document in get_files(path):
        with open("{path}/{document}".format(path=path, document=document), "r") as f:
            counter.update(word for word in re.split(regex, f.read()) if word)
    return counter

In [7]:
def build_keymap(counters):
    """Return an enumeration of the given counter keys as dictionary."""
    keyset = set()
    for counter in counters:
        keyset |= set(counter)
    return {k:i for i, k in enumerate(keyset)}

In [8]:
def counters_to_frequencies(counters, keys):
    """Return a numpy array representing sorted counters as frequencies."""
    frequencies = np.zeros((len(counters), len(keys)))
    for j, counter in enumerate(counters):
        indices, values = np.array([(keys[k], v) for k, v in counter.items() if k in keys]).T
        frequencies[j][indices] = values
        frequencies[j] /= np.sum(frequencies[j])
    return frequencies

In [9]:
def build_dataset(dataset):
    """Return a numpy vector representation of the dataset."""
    counters_classes = {}

    for document_class in get_directories(dataset):
        class_path = "{path}/{document_class}".format(
            path=dataset, document_class=document_class)
        counters_classes[document_class] = [
            directory_counter("{class_path}/{document}".format(
                class_path=class_path, document=document))
            for document in get_directories(class_path)
        ]

    keymap = build_keymap([
        counter for counters in counters_classes.values()
        for counter in counters
    ])

    with open("{path}-keymap.json".format(path=dataset), "w") as f:
        json.dump(keymap, f)

    return {
        key: counters_to_frequencies(counters, keymap)
        for key, counters in counters_classes.items()
    }

In [19]:
def kmeans(k, points):
    """Return a tuple containing centroids and predictions for given data with k centroids."""
    kmeans = KMeans(n_clusters=k, random_state=42, max_iter=10, verbose=True)
    kmeans.fit(points)
    return kmeans.cluster_centers_, kmeans.predict(points)


def representative_points(points, p=0.1, a=0.2, d=10000):
    """Return representative points for given set, using given percentage `p` and moving points of `a`."""
    k = np.ceil(points.shape[0] * p ** 2)
    n = np.floor(points.shape[0] * p)

    
    centroids, predictions = kmeans(int(k), points)

    representatives = centroids

    if n > 0:
        distances = np.sum(
            np.power(reduced - centroids[predictions], 2), axis=1)
        for i in tqdm(range(k)):
            cluster = distances[predictions == i]
            representatives.vstack(cluster[np.argpartition(cluster, -n)[-n:]] *
                                   (1 - a) + centroids[i] * a)
    return representatives

In [20]:
def build_classifier(dataset):
    """Build classifier for given dataset."""
    return {
        k:representative_points(data) for k, data in tqdm(dataset.items())
    }

In [21]:
built_dataset = build_dataset("recipes-training")

In [22]:
classifier = build_classifier(built_dataset)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 37.27129402188239
start iteration
done sorting
end inner loop
Iteration 1, inertia 36.069952293533554
start iteration
done sorting
end inner loop
Iteration 2, inertia 35.81982641160563
start iteration
done sorting
end inner loop
Iteration 3, inertia 35.6380494500003
start iteration
done sorting
end inner loop
Iteration 4, inertia 35.44847929700592
start iteration
done sorting
end inner loop
Iteration 5, inertia 35.31400708554676
start iteration
done sorting
end inner loop
Iteration 6, inertia 35.3019808353654
start iteration
done sorting
end inner loop
Iteration 7, inertia 35.29641421735384
start iteration
done sorting
end inner loop
Iteration 8, inertia 35.29302841155568
start iteration
done sorting
end inner loop
Iteration 9, inertia 35.27629473109795
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 36.149056382595475
start iteration
done sorting
end inner 

Iteration 6, inertia 33.9087861359512
start iteration
done sorting
end inner loop
Iteration 7, inertia 33.906700253168424
start iteration
done sorting
end inner loop
Iteration 8, inertia 33.90468234919041
start iteration
done sorting
end inner loop
Iteration 9, inertia 33.90271346101602


NameError: name 'reduced' is not defined