In [None]:
import sys, os
import matplotlib.pyplot as plt
import numpy as np
sys.path.append('../modules')
import requests
from nsw.nsw_classifier import NSWClassifier

In [29]:
def norm(dataset):
    """Standardise the float-valued attributes of a dataset in place.

    Args:
        dataset: sequence of ``(vector, label)`` pairs where ``vector`` is a
            mutable list of attribute values. Which columns are numeric is
            decided from the first row only: a column is processed when its
            value there is a ``float``; every other column (e.g. nominal
            values kept as strings) is left untouched.

    Returns:
        The same ``dataset`` object. Each processed column is replaced by
        ``(value - mean) / (4 * std)``, where ``std`` is the sample standard
        deviation (``n - 1`` in the denominator). Columns with zero deviation,
        and datasets with fewer than two rows, are left unchanged.

    NOTE(review): the ``4 * std`` divisor looks like the numeric-attribute
    scaling used by HVDM -- confirm against ``tools.hvdm``.
    """
    if not dataset:
        return dataset

    count = len(dataset)
    numeric_columns = [
        i for i, value in enumerate(dataset[0][0]) if isinstance(value, float)
    ]

    for i in numeric_columns:
        mean = sum(row[0][i] for row in dataset) / count

        # The sample deviation is undefined for a single row; treat it as zero
        # so the column is simply skipped below.
        if count > 1:
            squared = sum((row[0][i] - mean) ** 2 for row in dataset)
            std = (squared / (count - 1)) ** .5
        else:
            std = 0.0

        # A constant column cannot be scaled; leave its values as they are.
        if std != 0.0:
            for row in dataset:
                row[0][i] = (row[0][i] - mean) / (4 * std)

    return dataset

# Isolet

## download (requires manual call)

In [2]:
# Fetch the ISOLET files into ../data/isolet. The download runs only when the
# folder does not exist yet; if a download fails half-way, delete the folder
# before re-running this cell.
folder = os.path.abspath("../data/isolet")

if not os.path.exists(folder):
    # makedirs also creates a missing ../data parent, which os.mkdir would not.
    os.makedirs(folder)
    prefix = "https://archive.ics.uci.edu/ml/machine-learning-databases/isolet/"
    for file in ["isolet.info", "isolet.names", "isolet1+2+3+4.data.Z", "isolet5.data.Z"]:
        url = prefix + file
        r = requests.get(url, allow_redirects=True)
        # Fail loudly instead of saving an HTTP error page as if it were data.
        r.raise_for_status()
        with open(folder + "/" + file, 'wb') as out:
            out.write(r.content)

    # The .Z archives still have to be unpacked by hand: the 7-Zip command is
    # only printed here, not executed. Use any other unpacker if you don't
    # have 7-Zip.
    exe7z = "C:/Program Files/7-Zip/7z.exe"
    for file in ["isolet1+2+3+4.data.Z", "isolet5.data.Z"]:
        exe = f"\"{exe7z}\" e -o\"{folder}\" \"{folder}/{file}\""
        print(exe)

## load

In [19]:
def load_data(filename):
    """Read a comma-separated data file into ``(vector, label)`` pairs.

    Args:
        filename: path to a text file with one sample per line; every field
            but the last is parsed as a float attribute, and the last field
            is the class label (parsed via ``float`` and truncated to ``int``).

    Returns:
        A list of ``(list_of_floats, int_label)`` tuples in file order.
        Blank lines are skipped.

    Raises:
        ValueError: if a non-blank line contains a field that is not numeric.
    """
    result = []
    # The context manager closes the handle even when parsing fails.
    with open(filename, 'r') as source:
        for line in source:
            if not line.strip():
                continue
            parts = line.split(',')
            cls = int(float(parts[-1]))
            vector = [float(v) for v in parts[:-1]]
            result.append((vector, cls))
    return result

In [None]:
# Parse both ISOLET splits from the download folder: the first file becomes
# the training set, the second the held-out test set.
train, test = (
    load_data(f"{folder}/{name}")
    for name in ("isolet1+2+3+4.data", "isolet5.data")
)

## test sparse graph

In [11]:
# Three classifiers over the same training set that differ only in the M
# argument given to build_navigable_graph (8, 16 and 32).
G8, G16, G32 = (NSWClassifier() for _ in range(3))
for graph, degree in ((G8, 8), (G16, 16), (G32, 32)):
    graph.build_navigable_graph(train, attempts=1, M=degree)

Classifier graph is build in 92.383s
Classifier graph is build in 189.845s
Classifier graph is build in 342.232s


In [14]:
# Count correct predictions on the test split for each sparse graph, once via
# classify_by_path and once via classify_knn.
sparse_graphs = (G8, G16, G32)
path_hits = [0, 0, 0]
knn_hits = [0, 0, 0]
for features, label in test:
    # All path queries run before the k-NN queries for a given sample.
    for pos, graph in enumerate(sparse_graphs):
        path_hits[pos] += graph.classify_by_path(features, attempts=7) == label
    for pos, graph in enumerate(sparse_graphs):
        knn_hits[pos] += graph.classify_knn(features, k=7) == label

path8, path16, path32 = path_hits
knn8, knn16, knn32 = knn_hits
hits = (path8, path16, path32, knn8, knn16, knn32)
total = len(test)

# Header, raw hit counts (followed by the test-set size), then accuracies.
print("P8\tP16\tP32\tK8\tK16\tK32")
print("\t".join(f"{value}" for value in hits + (total,)))
print("\t".join(f"{value / total:.3f}" for value in hits))

P8	P16	P32	K8	K16	K32
648	979	1293	1244	1348	1403	1559
0.416	0.628	0.829	0.798	0.865	0.900


## test medium graphs

In [15]:
# Same experiment as the sparse graphs, with M=64.
G64 = NSWClassifier()
G64.build_navigable_graph(train, attempts=1, M=64)

path64, knn64 = 0, 0
for features, label in test:
    by_path = G64.classify_by_path(features, attempts=7)
    by_knn = G64.classify_knn(features, k=7)
    path64 += by_path == label
    knn64 += by_knn == label

# Correct predictions: path-based first, k-NN second.
print(path64, knn64)

Classifier graph is build in 644.523s
1369 1409


In [16]:
# Same experiment as the other Isolet graphs, with M=128.
G128 = NSWClassifier()
# The method name was garbled to `build_navigaget_hvdmraph`; every other cell
# builds its graph through `build_navigable_graph`.
G128.build_navigable_graph(train, attempts=1, M=128)
knn128 = 0
path128 = 0
for t in test:
    path128 += G128.classify_by_path(t[0], attempts=7) == t[1]
    knn128 += G128.classify_knn(t[0], k=7) == t[1]

# Raw hit counts first, then the corresponding accuracies.
print(path128, knn128)
print(path128 / len(test), knn128 / len(test))

Classifier graph is build in 1041.164s
1380 1410
0.8851828094932649 0.9044259140474663


# Dermatology

## download

In [52]:
# Fetch the dermatology data set into ../data/dermatology. The download runs
# only when the folder does not exist yet; if it fails, delete the folder
# before re-running this cell.
folder = os.path.abspath("../data/dermatology")
if not os.path.exists(folder):
    # makedirs also creates a missing ../data parent, which os.mkdir would not.
    os.makedirs(folder)
    prefix = "https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/"
    for file in ["dermatology.data"]:
        url = prefix + file
        r = requests.get(url, allow_redirects=True)
        # Fail loudly instead of saving an HTTP error page as if it were data.
        r.raise_for_status()
        with open(folder + "/" + file, 'wb') as out:
            out.write(r.content)

## Load

In [53]:
def load_derm_data(filename):
    """Read the dermatology file into ``(vector, label)`` pairs.

    Args:
        filename: path to a comma-separated text file, one sample per line.
            The last field is the class label; the field before it is an
            integer that may be missing, written as ``'?'``.

    Returns:
        A list of ``(vector, int_label)`` tuples in file order. All attributes
        except the last stay as the strings read from the file. The last
        attribute becomes a float: the integer divided by the largest value
        seen in that column, or ``0.5`` where the value was ``'?'``.
        Blank lines are skipped.
    """
    result = []
    mx = 0
    # The context manager closes the handle even when parsing fails.
    with open(filename, 'r') as source:
        for line in source:
            if not line.strip():
                continue
            parts = line.split(',')
            cls = int(float(parts[-1]))
            vector = parts[:-1]
            if vector[-1] != '?':
                vector[-1] = int(vector[-1])
                mx = max(vector[-1], mx)
            result.append((vector, cls))

    # Second pass: the maximum is only known once every row has been read.
    for vector, _ in result:
        if vector[-1] == '?':
            vector[-1] = .5
        elif mx:
            vector[-1] /= mx
        else:
            # No positive value in the column: avoid dividing by zero but
            # still return a float, like every other row.
            vector[-1] = float(vector[-1])

    return result

## Prepare train/test and distance function

In [54]:
import random

# Load and normalise the dermatology samples, then shuffle them with a fixed
# seed so that the split below is the same on every run.
dataset = load_derm_data(f"{folder}/dermatology.data")
dataset = norm(dataset)
random.seed(13)
random.shuffle(dataset)

# The first 320 shuffled samples are used for training, the rest for testing.
train_size = 320
train_derm = dataset[:train_size]
test_derm = dataset[train_size:]

In [55]:
from tools.hvdm import get_hvdm
# Build the distance function from the training split only (the test rows are
# not passed in); it is handed to NSWClassifier through `dist=` further down.
# NOTE(review): presumably the Heterogeneous Value Difference Metric, going by
# the module name -- confirm in tools/hvdm.
dist_derm = get_hvdm(train_derm)

size=320, dim=34, classes={1, 2, 3, 4, 5, 6}


In [56]:
# Classifier over the dermatology training split, using the custom distance
# function instead of the default one.
cderm = NSWClassifier(dist=dist_derm)
cderm.build_navigable_graph(train_derm, attempts=2, M=100)

# Neighbour count for the k-NN query; the printed header is derived from it so
# the label cannot drift from the value actually used (it used to say 11-NN).
k_derm = 10

knn_derm = 0
path_derm = 0
for t in test_derm:
    path_derm += cderm.classify_by_path(t[0], attempts=10) == t[1]
    knn_derm += cderm.classify_knn(t[0], k=k_derm) == t[1]

# Header, raw hit counts, then accuracies on the test split.
print(f"Path\t{k_derm}-NN")
print(f"{path_derm}\t{knn_derm}")
print(f"{path_derm / len(test_derm):.3f}, {knn_derm / len(test_derm):.3f}")

Classifier graph is build in 1.991s
Path	11-NN
44	45
0.957, 0.978


# Image segmentation

## Download

In [16]:
# Fetch the image segmentation files into ../data/segmentation. The download
# runs only when the folder does not exist yet; if it fails half-way, delete
# the folder before re-running this cell.
folder = os.path.abspath("../data/segmentation")
if not os.path.exists(folder):
    # makedirs also creates a missing ../data parent, which os.mkdir would not.
    os.makedirs(folder)
    prefix = "https://archive.ics.uci.edu/ml/machine-learning-databases/image/"
    for file in ["segmentation.data", "segmentation.test"]:
        url = prefix + file
        r = requests.get(url, allow_redirects=True)
        # Fail loudly instead of saving an HTTP error page as if it were data.
        r.raise_for_status()
        with open(folder + "/" + file, 'wb') as out:
            out.write(r.content)

## load

In [21]:
def load_segment_data(filename, header_lines=5):
    """Read an image-segmentation file into ``(vector, label)`` pairs.

    Args:
        filename: path to a comma-separated text file. After the header, each
            line holds the class label first, followed by numeric attributes.
        header_lines: number of leading lines to ignore. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        A list of ``(list_of_floats, label_string)`` tuples in file order.
        Blank lines are skipped instead of producing an empty record.

    Raises:
        ValueError: if an attribute field of a data line is not numeric.
    """
    result = []
    # The context manager closes the handle even when parsing fails.
    with open(filename, 'r') as source:
        for i, line in enumerate(source):
            if i < header_lines or not line.strip():
                continue
            parts = line.split(',')
            cls = parts[0]
            vector = [float(v) for v in parts[1:]]
            result.append((vector, cls))
    return result

In [43]:
# The two roles can be swapped: the ".test" file is the larger one (2100
# samples versus 210 in ".data").
train_segm = load_segment_data(f"{folder}/segmentation.data")
test_segm = load_segment_data(f"{folder}/segmentation.test")
print(len(train_segm), len(test_segm))

210 2100


In [49]:
# Classifier over the segmentation training split, default distance function.
segment_clf = NSWClassifier()
segment_clf.build_navigable_graph(train_segm, attempts=2, M=50)

path_segm, knn_segm = 0, 0
for features, label in test_segm:
    by_path = segment_clf.classify_by_path(features, attempts=1)
    by_knn = segment_clf.classify_knn(features, k=1)
    path_segm += by_path == label
    knn_segm += by_knn == label

# Header, raw hit counts, then accuracies on the test split.
total_segm = len(test_segm)
print("Path\t1-NN")
print(f"{path_segm}\t{knn_segm}")
print(f"{path_segm / total_segm:.3f}, {knn_segm / total_segm:.3f}")

Classifier graph is build in 1.055s
Path	1-NN
1834	1837
0.873, 0.875
