In [1]:
import sys, os
import matplotlib.pyplot as plt
import numpy as np
sys.path.append('../modules')
import requests
from nsw.nsw_classifier import NSWClassifier

# Isolet

## download (the .Z archives must then be extracted manually)

In [2]:
# Download the UCI ISOLET files into ../data/isolet.
# The download only runs when the target folder does not exist yet.
folder = os.path.abspath("../data/isolet")

if not os.path.exists(folder):
    # makedirs (unlike mkdir) also creates the parent ../data folder if needed
    os.makedirs(folder)
    prefix = "https://archive.ics.uci.edu/ml/machine-learning-databases/isolet/"
    for file in ["isolet.info", "isolet.names", "isolet1+2+3+4.data.Z", "isolet5.data.Z"]:
        url = prefix + file
        r = requests.get(url, allow_redirects=True, timeout=60)
        # fail loudly instead of storing an HTTP error page as if it were data
        r.raise_for_status()
        # context manager guarantees the file is flushed and closed
        with open(folder + "/" + file, 'wb') as out:
            out.write(r.content)

    # The archives are NOT extracted here: the 7-Zip commands are only printed.
    # Run them yourself, or unpack the .Z files manually if you don't have 7zip.
    exe7z = "C:/Program Files/7-Zip/7z.exe"
    for file in ["isolet1+2+3+4.data.Z", "isolet5.data.Z"]:
        exe = f"\"{exe7z}\" e -o\"{folder}\" \"{folder}/{file}\""
        print(exe)

## load

In [3]:
def load_data(filename):
    """Read a comma-separated dataset file into ``(vector, cls)`` pairs.

    Every non-blank line holds comma-separated numbers. The last field is the
    class label (parsed through ``float`` first, so a value such as ``"3."``
    is accepted); all preceding fields form the feature vector.

    Args:
        filename: path of the text file to read.

    Returns:
        A list of ``(vector, cls)`` tuples in file order, where ``vector`` is
        a list of floats and ``cls`` is an int.

    Raises:
        ValueError: if a field of a non-blank line is not a number.
    """
    result = []
    # the context manager closes the file even when parsing fails
    with open(filename, 'r') as source:
        for line in source:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            parts = line.split(',')
            cls = int(float(parts[-1]))
            vector = [float(v) for v in parts[:-1]]
            result.append((vector, cls))
    return result

# Parse the two extracted Isolet files: parts 1-4 become `train`, part 5 becomes `test`.
train = load_data(f"{folder}/isolet1+2+3+4.data")
test = load_data(f"{folder}/isolet5.data")

## test sparse graph

In [11]:
# One classifier per graph setting M in {8, 16, 32}, all built on the same training data.
G8 = NSWClassifier()
G16 = NSWClassifier()
G32 = NSWClassifier()
for graph, degree in ((G8, 8), (G16, 16), (G32, 32)):
    graph.build_navigable_graph(train, attempts=1, M=degree)

Classifier graph is build in 92.383s
Classifier graph is build in 189.845s
Classifier graph is build in 342.232s


In [14]:
# Count correct predictions of both classification modes on each sparse graph.
path8 = path16 = path32 = 0
knn8 = knn16 = knn32 = 0
for features, label in test:
    # a correct prediction adds True (== 1) to the matching counter
    path8 += G8.classify_by_path(features, attempts=7) == label
    path16 += G16.classify_by_path(features, attempts=7) == label
    path32 += G32.classify_by_path(features, attempts=7) == label
    knn8 += G8.classify_knn(features, k=7) == label
    knn16 += G16.classify_knn(features, k=7) == label
    knn32 += G32.classify_knn(features, k=7) == label

n_test = len(test)
# header row, raw hit counts (plus test size), then accuracies
print(f"P8\tP16\tP32\tK8\tK16\tK32")
print(f"{path8}\t{path16}\t{path32}\t{knn8}\t{knn16}\t{knn32}\t{n_test}")
print(f"{path8 / n_test:.3f}\t{path16 / n_test:.3f}\t{path32 / n_test:.3f}\t{knn8 / n_test:.3f}\t{knn16 / n_test:.3f}\t{knn32 / n_test:.3f}")

P8	P16	P32	K8	K16	K32
648	979	1293	1244	1348	1403	1559
0.416	0.628	0.829	0.798	0.865	0.900


## test medium graphs

In [15]:
# Same experiment as for the sparse graphs, with M=64.
G64 = NSWClassifier()
G64.build_navigable_graph(train, attempts=1, M=64)

path64, knn64 = 0, 0
for features, label in test:
    path64 += G64.classify_by_path(features, attempts=7) == label
    knn64 += G64.classify_knn(features, k=7) == label

# number of correctly classified test samples: path mode, then knn mode
print(path64, knn64)

Classifier graph is build in 644.523s
1369 1409


In [None]:
# Same experiment with M=128.
G128 = NSWClassifier()
G128.build_navigable_graph(train, attempts=1, M=128)

path128, knn128 = 0, 0
for features, label in test:
    path128 += G128.classify_by_path(features, attempts=7) == label
    knn128 += G128.classify_knn(features, k=7) == label

n_test = len(test)
# raw hit counts first, then the corresponding accuracies
print(path128, knn128)
print(path128 / n_test, knn128 / n_test)

# Dermatology

## download

In [None]:
# Download the UCI dermatology data file into ../data/dermatology.
# The download only runs when the target folder does not exist yet.
# NOTE: this rebinds `folder`, which previously pointed at the Isolet data.
folder = os.path.abspath("../data/dermatology")
if not os.path.exists(folder):
    # makedirs (unlike mkdir) also creates the parent ../data folder if needed
    os.makedirs(folder)
    prefix = "https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/"
    for file in ["dermatology.data"]:
        url = prefix + file
        r = requests.get(url, allow_redirects=True, timeout=60)
        # fail loudly instead of storing an HTTP error page as if it were data
        r.raise_for_status()
        # context manager guarantees the file is flushed and closed
        with open(folder + "/" + file, 'wb') as out:
            out.write(r.content)

In [None]:
def load_derm_data(file):
    """Read the dermatology data file into ``(vector, cls)`` pairs.

    Each non-blank line is a comma-separated record whose last field is the
    class label. The last *feature* may be the missing-value marker ``'?'``;
    otherwise it is an integer. After loading, that feature is rescaled:
    known values are divided by the largest known value, and missing values
    are replaced by ``0.5``.

    All other features are kept as the string tokens read from the file.
    NOTE(review): presumably the distance function used downstream treats
    them as nominal values — confirm, or convert them here.

    Args:
        file: path of the data file to read.

    Returns:
        A list of ``(vector, cls)`` tuples in file order; ``vector`` is a list
        whose last element is a float, ``cls`` is an int.

    Raises:
        ValueError: if the class label or a non-missing last feature is not
            numeric.
    """
    result = []
    mx = 0  # largest known value of the last feature
    with open(file, 'r') as source:
        for line in source:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            parts = [token.strip() for token in line.split(',')]
            cls = int(float(parts[-1]))
            vector = parts[:-1]
            if vector[-1] != '?':
                vector[-1] = int(vector[-1])
                mx = max(vector[-1], mx)
            result.append((vector, cls))

    # avoid dividing by zero when no positive value was seen
    scale = mx if mx > 0 else 1
    for vector, _ in result:
        if vector[-1] == '?':
            # missing value: substitute the middle of the [0, 1] range
            vector[-1] = .5
        else:
            vector[-1] /= scale

    return result

In [None]:
import random

# Shuffle reproducibly (fixed seed), then cut the data into a training prefix
# and a test remainder.
dataset = load_derm_data(f"{folder}/dermatology.data")
random.seed(13)
random.shuffle(dataset)

split_at = 320
train_derm = dataset[:split_at]
test_derm = dataset[split_at:]

In [None]:
# get_hvdm receives only the training split; its result is kept in dist_derm.
# NOTE(review): presumably HVDM stands for Heterogeneous Value Difference Metric
# and the result is a distance function — confirm in tools/hvdm.
from tools.hvdm import get_hvdm
dist_derm = get_hvdm(train_derm)

In [None]:
# Build a classifier on the dermatology training split and count correct
# predictions of both classification modes on the dermatology test split.
# (The original cell used the Isolet `train`/`test` variables by mistake.)
cderm = NSWClassifier()
# NOTE(review): dist_derm is computed earlier in the notebook but never passed
# to the classifier — presumably it should be supplied here; confirm against
# the NSWClassifier API.
cderm.build_navigable_graph(train_derm, attempts=1, M=128)
knn_derm = 0
path_derm = 0
for t in test_derm:
    path_derm += cderm.classify_by_path(t[0], attempts=7) == t[1]
    knn_derm += cderm.classify_knn(t[0], k=7) == t[1]

# raw hit counts first, then the corresponding accuracies
print(path_derm, knn_derm)
print(path_derm / len(test_derm), knn_derm / len(test_derm))