# Benchmarks

In [1]:
%load_ext autoreload
%autoreload 2

In [23]:
from sklearn import datasets

X, y = datasets.load_digits(n_class=5, return_X_y=True)
y.shape

In [25]:
import myriade
from sklearn import linear_model
from sklearn import model_selection
from sklearn import pipeline
from sklearn import preprocessing

# Hold out 40% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Standardise the features, then fit the hierarchy classifier on top of a
# logistic-regression base learner.
# NOTE(review): this cell imports `myriade` whereas later cells import `myriad` —
# confirm which package name is current.
model = pipeline.make_pipeline(
    preprocessing.StandardScaler(),
    myriade.multiclass.OptimalHierarchyClassifier(
        classifier=linear_model.LogisticRegression()
    )
)

model = model.fit(X_train, y_train)
# Report the pipeline's score on the held-out split, formatted as a percentage.
print(f"{model.score(X_test, y_test):.2%}")

95.84%


In [4]:
import functools
from sklearn import linear_model
from sklearn import pipeline
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection
from sklearn import multiclass
from sklearn import neighbors

# Binary base learner handed to the one-vs-rest and one-vs-one wrappers below.
binary_clf = pipeline.make_pipeline(
    preprocessing.StandardScaler(),
    linear_model.LogisticRegression()
)

# Baseline multi-class strategies, keyed by the name printed in the results.
models = {
    'Multinomial': pipeline.make_pipeline(
        preprocessing.StandardScaler(),
        linear_model.LogisticRegression(multi_class='multinomial', solver='sag')
    ),
    # KNN with the number of neighbours picked by an inner grid search.
    'KNN': pipeline.make_pipeline(
        preprocessing.StandardScaler(),
        model_selection.GridSearchCV(
            neighbors.KNeighborsClassifier(),
            param_grid={'n_neighbors': [3, 6, 9, 12]}
        )
    ),
    'OvR': multiclass.OneVsRestClassifier(binary_clf),
    'OvO': multiclass.OneVsOneClassifier(binary_clf),
    # The error-correcting output codes baseline below is currently disabled.
    #'ECOC': model_selection.GridSearchCV(
    #    multiclass.OutputCodeClassifier(binary_clf, random_state=42),
    #    param_grid={'code_size': [i / 10 for i in range(1, 16, 2)]}  # 10% to 150%
    #)
}

# Shared evaluation protocol: accuracy over a shuffled, seeded 5-fold split.
# X and y are bound when the partial is created; rebinding those names in a
# later cell does not change what `cross_val` evaluates on.
cross_val = functools.partial(
    model_selection.cross_val_score,
    X=X,
    y=y,
    scoring='accuracy',
    cv=model_selection.KFold(5, shuffle=True, random_state=42)
)

# Print the mean cross-validated accuracy of every baseline.
for name, model in models.items():
    print(f'{name}: {cross_val(model).mean():.5f}')

Multinomial: 0.95333
KNN: 0.95333
OvR: 0.92667
OvO: 0.96667


One-vs-rest.

In [5]:
import myriad

# Binary base learner: standardise the features, then liblinear logistic regression.
binary_clf = pipeline.make_pipeline(
    preprocessing.StandardScaler(),
    linear_model.LogisticRegression(solver='liblinear')
)

# Label-tree classifier created without a `prior_tree`, scored with the same
# `cross_val` helper as the baselines.
lt = myriad.LabelTreeClassifier(binary_clf)
cross_val(lt).mean()

0.9533333333333334

In [6]:
import math
import myriad
import tqdm

# Cross-validate one classifier per tree yielded by `myriad.iter_trees` and
# remember the tree with the highest mean score.
best_tree = None
best_score = -math.inf

for tree in tqdm.tqdm(myriad.iter_trees(labels=set(y))):
    hc = myriad.LabelTreeClassifier(binary_clf, prior_tree=tree)
    score = cross_val(hc).mean()
    # Strict comparison: on ties the first tree encountered is kept.
    if score > best_score:
        best_score = score
        best_tree = tree

print(best_score)

3it [00:00, 49.51it/s]

0.9666666666666668





In [15]:
# NOTE(review): `tree` is whatever that name was last bound to in the kernel
# (e.g. the final loop variable of the tree search), not `best_tree` — confirm
# which tree is intended here.
hc = myriad.LabelTreeClassifier(binary_clf, prior_tree=tree)
hc.fit(X, y)
# Row-wise sums of the predicted probabilities (sanity check; presumably each
# row is expected to sum to 1).
hc.predict_proba(X).sum(axis=1)

UnboundLocalError: local variable 'p' referenced before assignment

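Improve: build the tree greedily by repeatedly merging the two classes (or groups of classes) that are most often confused with each other.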

In [6]:
# One leaf node per label, keyed by a 1-tuple so that merged nodes can later be
# stored under longer tuples of labels.
nodes = {(i,): anytree.AnyNode(label=i) for i in range(10)}
nodes

{(0,): AnyNode(label=0),
 (1,): AnyNode(label=1),
 (2,): AnyNode(label=2),
 (3,): AnyNode(label=3),
 (4,): AnyNode(label=4),
 (5,): AnyNode(label=5),
 (6,): AnyNode(label=6),
 (7,): AnyNode(label=7),
 (8,): AnyNode(label=8),
 (9,): AnyNode(label=9)}

In [7]:
import collections


def sort_tuple(*args):
    """Return the positional arguments as a tuple sorted in ascending order."""
    return tuple(sorted(args))


class CM(collections.defaultdict):
    """Counter whose keys are order-insensitive groups of items.

    A key is an iterable of mutually comparable items. It is stored as the
    sorted tuple of those items, so ``cm[a, b]`` and ``cm[b, a]`` address the
    same entry. Missing entries default to ``0``.

    Reading, writing, deleting, membership tests, ``get`` and ``pop`` all apply
    the same key normalisation, so an entry can be reached with its items in
    any order.
    """

    def __init__(self):
        super().__init__(int)

    @staticmethod
    def _lookup_key(key):
        """Return the canonical form of ``key`` for lookups and removals.

        Keys that cannot be unpacked or sorted are returned unchanged, so that
        such lookups keep behaving like those of a plain ``dict`` (i.e. the key
        is simply not found) instead of raising ``TypeError``.
        """
        try:
            return sort_tuple(*key)
        except TypeError:
            return key

    def __setitem__(self, key, val):
        skey = sort_tuple(*key)
        return super().__setitem__(skey, val)

    def __getitem__(self, key):
        skey = sort_tuple(*key)
        return super().__getitem__(skey)

    def __delitem__(self, key):
        return super().__delitem__(self._lookup_key(key))

    def __contains__(self, key):
        return super().__contains__(self._lookup_key(key))

    def get(self, key, default=None):
        """Return the value stored for ``key`` (in any order), else ``default``."""
        return super().get(self._lookup_key(key), default)

    def pop(self, key, *default):
        """Remove ``key`` (in any order) and return its value.

        Raises ``KeyError`` when the key is absent and no default is given.
        """
        return super().pop(self._lookup_key(key), *default)

# Confusion matrix of the current predictions on the test split.
cm = metrics.confusion_matrix(y_test, y_pred)

# Tally the off-diagonal counts per unordered pair of labels: CM sorts its keys,
# so cm[i, j] and cm[j, i] accumulate into the same entry.
mistakes = CM()
for i in range(10):
    for j in range(10):
        if i != j:
            mistakes[(i,), (j,)] += cm[i, j]


In [8]:
# Pick the pair of nodes with the highest mistake count, drop its entry, and
# derive the key (sorted union of both label tuples) of the node replacing them.
left, right = max(mistakes, key=mistakes.get)
del mistakes[left, right]
new_key = sort_tuple(*left, *right)
left, right, new_key

((3,), (9,), (3, 9))

In [9]:
# Remove the two selected nodes and store the result of `make_branch` (defined
# outside this excerpt) under the merged key.
nodes[new_key] = make_branch(nodes.pop(left), nodes.pop(right))
print(anytree.RenderTree(nodes[new_key]))

AnyNode(labels=[3, 9])
├── AnyNode(label=3)
└── AnyNode(label=9)


In [10]:
nodes

{(0,): AnyNode(label=0),
 (1,): AnyNode(label=1),
 (2,): AnyNode(label=2),
 (4,): AnyNode(label=4),
 (5,): AnyNode(label=5),
 (6,): AnyNode(label=6),
 (7,): AnyNode(label=7),
 (8,): AnyNode(label=8),
 (3, 9): AnyNode(labels=[3, 9])}

In [11]:
def safe_pop(d, k, default=0):
    """Remove ``k`` from mapping ``d`` and return its value, or ``default`` if ``k`` is absent."""
    return d.pop(k, default)

# Re-express the mistake counts against the merged node: for every other node,
# add up what it had against `left` and against `right`, removing both entries.
# The `[:-1]` slice skips the last key of `nodes`, i.e. the `new_key` that was
# inserted most recently.
for node in list(nodes.keys())[:-1]:
    mistakes[sort_tuple(node, new_key)] = (
        safe_pop(mistakes, sort_tuple(node, left)) +
        safe_pop(mistakes, sort_tuple(node, right))
    )

Now let's repeat this merge step in a loop until a single root node remains.

In [12]:
def build_smart_tree(labels, cm):
    """Greedily build a label tree from a confusion matrix.

    Starting from one leaf per label, the two nodes with the largest number of
    mutual confusions are merged with ``make_branch``; the step is repeated
    until a single node is left.

    Parameters
    ----------
    labels
        Indexable sequence of class labels; ``labels[i]`` is taken to
        correspond to row and column ``i`` of ``cm``.
    cm
        Confusion matrix supporting ``cm[i, j]`` indexing.

    Returns
    -------
    The last remaining node, i.e. the root of the tree.

    """

    nodes = {(label,): anytree.AnyNode(label=label) for label in labels}
    mistakes = CM()

    # CM keys are order-insensitive, so cm[i, j] and cm[j, i] end up in one entry.
    for i in range(len(labels)):
        for j in range(len(labels)):
            if i == j:
                continue
            mistakes[(labels[i],), (labels[j],)] += cm[i, j]

    while len(nodes) > 1:

        # Merge the two nodes that are confused with each other the most.
        left, right = max(mistakes, key=mistakes.get)
        del mistakes[left, right]
        new_key = sort_tuple(*left, *right)

        nodes[new_key] = make_branch(nodes.pop(left), nodes.pop(right))

        # `new_key` was inserted last, so `[:-1]` visits every other node. Each
        # one inherits, against the merged node, the sum of its counts against
        # `left` and `right`; a missing entry counts as 0.
        for node in list(nodes)[:-1]:
            mistakes[sort_tuple(node, new_key)] = (
                mistakes.pop(sort_tuple(node, left), 0) +
                mistakes.pop(sort_tuple(node, right), 0)
            )

    return list(nodes.values())[0]

In [13]:
# Pass `labels` explicitly so that row/column i of the confusion matrix always
# lines up with the i-th label handed to `build_smart_tree`, even when a class
# is absent from both `y_test` and `y_pred`.
tree = build_smart_tree(
    np.unique(y),
    metrics.confusion_matrix(y_test, y_pred, labels=np.unique(y))
)
print(anytree.RenderTree(tree))

AnyNode(labels=[0, 1, 3, 9, 8, 6, 4, 7, 5, 2])
├── AnyNode(labels=[0, 1, 3, 9, 8, 6, 4, 7, 5])
│   ├── AnyNode(labels=[0, 1, 3, 9, 8, 6, 4, 7])
│   │   ├── AnyNode(labels=[0, 1, 3, 9, 8, 6])
│   │   │   ├── AnyNode(labels=[0, 1, 3, 9, 8])
│   │   │   │   ├── AnyNode(label=0)
│   │   │   │   └── AnyNode(labels=[1, 3, 9, 8])
│   │   │   │       ├── AnyNode(labels=[1, 3, 9])
│   │   │   │       │   ├── AnyNode(label=1)
│   │   │   │       │   └── AnyNode(labels=[3, 9])
│   │   │   │       │       ├── AnyNode(label=3)
│   │   │   │       │       └── AnyNode(label=9)
│   │   │   │       └── AnyNode(label=8)
│   │   │   └── AnyNode(label=6)
│   │   └── AnyNode(labels=[4, 7])
│   │       ├── AnyNode(label=4)
│   │       └── AnyNode(label=7)
│   └── AnyNode(label=5)
└── AnyNode(label=2)


In [14]:
# Train the greedily built tree and score it on the test split. `train` and
# `predict` are helpers defined outside this excerpt.
# NOTE(review): `binary_model` is only assigned in a later cell of this
# notebook — this cell relies on existing kernel state; confirm execution order.
train(X_train, y_train, tree, binary_model)
y_pred = predict(X_test, tree)
metrics.accuracy_score(y_test, y_pred)

0.9577777777777777

## Class API

In [15]:
from sklearn import base
from sklearn import utils

class HierarchyClassifier(base.BaseEstimator, base.ClassifierMixin):
    """Multi-class classifier that predicts through a label tree built from confusions.

    Fitting first trains a flat tree, uses its predictions on the training data
    to compute a confusion matrix, and then builds and trains a second tree
    with ``build_smart_tree``.

    Parameters
    ----------
    classifier
        Base estimator handed to ``train`` for both trees.

    Attributes
    ----------
    classes_
        Unique labels seen during ``fit``.
    tree_
        Tree returned by ``build_smart_tree`` and used by ``predict``.

    """

    def __init__(self, classifier):
        self.classifier = classifier

    def fit(self, X, y):
        """Build and train the label tree on ``X`` and ``y``; returns ``self``."""

        self.classes_ = np.unique(y)

        # Build and train a flat tree
        flat_tree = make_flat_tree(labels=self.classes_)
        train(X, y, flat_tree, self.classifier)

        # Make predictions and establish the confusion matrix. Note that the
        # predictions are made on the same data the flat tree was trained on.
        y_pred = predict(X, flat_tree)
        cm = metrics.confusion_matrix(y, y_pred, labels=self.classes_)

        # Build smarter tree
        self.tree_ = build_smart_tree(self.classes_, cm)
        train(X, y, self.tree_, self.classifier)

        return self

    def predict(self, X):
        """Predict labels for ``X`` with the fitted tree.

        Raises ``sklearn.exceptions.NotFittedError`` if ``fit`` has not been
        called yet.
        """
        # Fail with scikit-learn's standard error rather than a bare
        # AttributeError on `tree_` when the estimator is not fitted.
        utils.validation.check_is_fitted(self, 'tree_')
        return predict(X, self.tree_)

In [16]:
# Fit the hierarchy classifier on the training split and report its accuracy
# on the test split.
hc = HierarchyClassifier(binary_model)
hc.fit(X_train, y_train)
y_pred = hc.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

0.9711111111111111

In [17]:
print(anytree.RenderTree(hc.tree_))

AnyNode(labels=[0, 1, 3, 8, 9, 6, 4, 7, 5, 2], model=Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())]))
├── AnyNode(labels=[0, 1, 3, 8, 9, 6, 4, 7, 5], model=Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())]))
│   ├── AnyNode(labels=[0, 1, 3, 8, 9, 6, 4, 7], model=Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())]))
│   │   ├── AnyNode(labels=[0, 1, 3, 8, 9, 6], model=Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())]))
│   │   │   ├── AnyNode(labels=[0, 1, 3, 8, 9], model=Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())]))
│   │   │   │   ├── AnyNode(label=0)
│   │   │   │   └── AnyNode(labels=[1, 3, 8, 9], model=Pipeline(steps=[('standardscaler', StandardScaler

## Benchmarks

In [18]:
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Training subset of the 20 newsgroups corpus with headers, footers and quotes
# stripped from the posts.
newsgroups = datasets.fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
# TF-IDF features of the raw posts.
newsgroups_vectors = vectorizer.fit_transform(newsgroups.data)

# Benchmark tasks: display name -> (features, targets).
tasks = {
    'Iris': datasets.load_iris(return_X_y=True),
    'Digits': datasets.load_digits(return_X_y=True),
    'Newsgroups': (newsgroups_vectors, newsgroups.target)
}

In [20]:
import time
from sklearn import metrics
from sklearn import model_selection
from sklearn import multiclass

# Shared binary base learner. `with_mean=False` disables centring — presumably
# so that the scaler can also be applied to the sparse Newsgroups features.
binary_model = pipeline.make_pipeline(
    preprocessing.StandardScaler(with_mean=False),
    linear_model.SGDClassifier(random_state=42)
)

# Factories rather than instances, so that every task gets a new estimator object.
models = {
    'OvR': lambda: multiclass.OneVsRestClassifier(binary_model),
    'OvO': lambda: multiclass.OneVsOneClassifier(binary_model),
    'Hierarchy': lambda: HierarchyClassifier(binary_model)
}


for task_name, task in tasks.items():

    print(task_name)
    print('-' * len(task_name))

    X, y = task
    y = preprocessing.LabelEncoder().fit_transform(y)
    # Task summary: number of samples, features and classes.
    print('n', X.shape[0])
    print('p', X.shape[1])
    print('k', len(np.unique(y)))
    print()

    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=42)

    for model_name, make_model in models.items():
        model = make_model()
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which matters for fits that only take a few milliseconds.
        tic = time.perf_counter()
        model.fit(X_train, y_train)
        toc = time.perf_counter()
        y_pred = model.predict(X_test)
        # Columns: model name, test accuracy, fit time in seconds.
        print(model_name, metrics.accuracy_score(y_test, y_pred), f'{toc - tic:.2f}')

    print()

Iris
----
n 150
p 4
k 3

OvR 0.9736842105263158 0.01
OvO 0.9736842105263158 0.01
Hierarchy 0.9736842105263158 0.01

Digits
------
n 1797
p 64
k 10

OvR 0.9355555555555556 0.10
OvO 0.96 0.09
Hierarchy 0.9488888888888889 0.06

Newsgroups
----------
n 11314
p 101631
k 20

OvR 0.49240014139271826 0.90
OvO 0.48462354188759277 1.52
Hierarchy 0.4330151997172146 0.91



In [1]:
%load_ext autoreload
%autoreload 2

In [26]:
import myriad

X, y = myriad.datasets.load_wiki_large()

In [27]:
y

<2365436x325056 sparse matrix of type '<class 'numpy.bool_'>'
	with 7716184 stored elements in Compressed Sparse Column format>

In [28]:
y.sum(axis=1).mean()

3.2620557055866235

In [23]:
len(set(l for labels in y for l in labels))

325056

In [17]:
import tarfile
import tempfile
import os
import pathlib

def get_data_home() -> pathlib.Path:
    """Return the location where datasets are to be stored.

    The location is read from the ``MYRIAD_DATA`` environment variable and
    defaults to ``~/myriad_data``; a leading ``~`` is expanded to the user's
    home directory. The directory and any missing parents are created if they
    do not exist yet.

    Raises
    ------
    FileExistsError
        If the path exists but is not a directory.

    """

    data_home = os.environ.get('MYRIAD_DATA', os.path.join('~', 'myriad_data'))
    data_home = pathlib.Path(os.path.expanduser(data_home))
    # `exist_ok=True` avoids the check-then-create race of `exists()` followed
    # by `makedirs()` when several processes start at the same time.
    data_home.mkdir(parents=True, exist_ok=True)
    return data_home

data_home = get_data_home()

# List the archive members and extract `train.txt`.
# NOTE(review): the extraction target is a temporary directory that is removed
# when the `with` block exits, so the extracted file is gone afterwards — any
# reading of `train.txt` has to happen inside the block.
with tarfile.open(data_home.joinpath('wiki_small.tar')) as wiki, tempfile.TemporaryDirectory() as untar:
    print(wiki.getnames())
    wiki.extract('train.txt', path=untar)

['cat_hier.txt', 'test.txt', 'train.txt']


In [60]:
labels, features = line.rstrip().split(' ', 1)

'33692,13402,393382 1958361:1 1406434:1 1087979:1 1575596:1 1568423:1 1647082:1 1683055:2 959786:2 199557:1 1298293:1 1818509:1 1627758:1 1797547:1 1137332:1 792648:1 823758:1 1657857:1 1257203:2 1643138:1 1794479:2 1892706:1 1776443:1 1440249:1 343843:1 1875794:2 1668225:2 840280:2 1959409:2 1806640:2 1269574:1 367780:1 348667:1 1225157:1 1405316:1 1662756:1 1288684:1 1058628:2 75013:1 1274224:1 1341165:1 1124792:1 1092419:1 1370520:1 1248932:1 1055151:1 1576704:1 1669002:1 992551:1 1527338:1 742798:1 1175252:1 447960:1 1956883:1 1390251:1 14403:1 2059598:1 594957:1 818590:7 554613:1 416684:1 1520683:1 352804:1 1227548:1 405008:1 1129361:3 603084:1 1055696:11 1061488:1 358027:1 1729436:1 226053:1 326241:1 54278:1 328156:1 1313681:1 597023:1 1517117:1 1749787:3 1945075:2 1035426:1 1225319:1 1700823:1 521422:3 1900504:1 1390999:1 1959967:2 1912168:1 1219566:1 1193610:1 1117370:1 1630093:2 62619:1 1078156:2 1972926:1 1246596:1 304423:1 1910640:1 855276:2 1989445:1 1822977:1 624:3 275673:

In [69]:
import tempfile

# Extract `train.txt` from the archive in the current working directory.
# NOTE(review): the temporary directory is removed when the `with` block exits,
# so the extracted file does not outlive this cell.
with tarfile.open('wikipediaSmallv2.0.tar.gz') as wiki, tempfile.TemporaryDirectory() as untar_dir:
    wiki.extract('train.txt', path=untar_dir)

In [59]:
def split(pair):
    """Parse one ``index:value`` token into ``(int(index), value)``; the value stays a string."""
    raw_index, value = pair.split(':')
    return (int(raw_index), value)

# Drop the leading label field of `line`, parse the remaining `index:value`
# tokens and sort the resulting (index, value) pairs.
features = sorted(
    split(token)
    for token in line.rstrip().split(' ', 1)[1].split(' ')
)
print(' '.join(f'{k}:{v}' for k, v in features))

624:3 4288:1 14403:1 54278:1 62619:1 75013:1 106021:2 199557:1 226053:1 275673:1 279353:1 304423:1 326241:1 328156:1 343843:1 348667:1 352470:1 352804:1 358027:1 367780:1 405008:1 416684:1 444833:2 447960:1 521422:3 536530:1 548779:1 554613:1 592526:1 594957:1 597023:1 603084:1 742798:1 792648:1 818590:7 823758:1 840280:2 855276:2 959786:2 992551:1 1035426:1 1055151:1 1055696:11 1058628:2 1061488:1 1078156:2 1087979:1 1092419:1 1117370:1 1124792:1 1129361:3 1137332:1 1175252:1 1193610:1 1211852:1 1219566:1 1225157:1 1225319:1 1227548:1 1246596:1 1248932:1 1257203:2 1269574:1 1274224:1 1288684:1 1298293:1 1313681:1 1341165:1 1345267:1 1370520:1 1390251:1 1390999:1 1405316:1 1406434:1 1440249:1 1517117:1 1520683:1 1527338:1 1568423:1 1575596:1 1576704:1 1627758:1 1630093:2 1643138:1 1647082:1 1657857:1 1662756:1 1668225:2 1669002:1 1683055:2 1700823:1 1729436:1 1749787:3 1776443:1 1779069:1 1794479:2 1797547:1 1806640:2 1818509:1 1822977:1 1875794:2 1892706:1 1900504:1 1909745:1 1910640:

In [20]:
import collections
from scipy import sparse

# Label -> column index; an unseen label receives the current size of the
# mapping, i.e. the next free index, the first time it is looked up.
encoder = collections.defaultdict(lambda: len(encoder))

# NOTE(review): the shape is hard-coded — presumably (n_samples, n_labels) of
# the dataset loaded at the time; confirm against the data actually in `y`.
Y = sparse.dok_matrix((394_756, 36_372), dtype=bool)

# Set Y[i, j] for every label j attached to sample i.
# NOTE(review): this assumes each element of `y` is an iterable of hashable
# labels. The recorded run failed with "unhashable type: 'csr_matrix'", which
# suggests `y` was a sparse matrix in that session — confirm the expected type.
for i, labels in enumerate(y):
    for label in labels:
        j = encoder[label]
        Y[i, j] = True 

Y

TypeError: unhashable type: 'csr_matrix'

In [21]:
label

<1x36372 sparse matrix of type '<class 'numpy.bool_'>'
	with 1 stored elements in Compressed Sparse Row format>

In [15]:
Y.tocsc()

<394756x36372 sparse matrix of type '<class 'numpy.bool_'>'
	with 394756 stored elements in Compressed Sparse Column format>

In [13]:
set(len(labels) for labels in y)

{1}

In [None]:
X