# từ 1 điểm đến 1 điểm

Đầu tiên, chọn d và N là các giá trị lớn, khai báo ngẫu nhiên X và z.

In [5]:
import numpy as np
from time import time # for comparing running times

In [2]:
# Problem size and random test data for the point-to-set benchmark.
d, N = 1000, 10000 # dimension, number of training points
X = np.random.randn(N, d) # N random points, one d-dimensional point per row
z = np.random.randn(d) # a single random d-dimensional point

In [3]:
def dist_pp(z, x):
    """Return the squared Euclidean distance between two vectors.

    ``x`` is reshaped to the shape of ``z`` before subtracting, so the two
    operands always line up element by element.
    """
    diff = z - x.reshape(z.shape)
    return np.sum(np.square(diff))

def dist_ps_naive(z, X):
    """Squared distances from ``z`` to every row of ``X``, one row at a time.

    Calls ``dist_pp`` once per row of ``X`` and returns the results as an
    array of shape ``(1, N)``, where ``N = X.shape[0]``.
    """
    distances = np.zeros((1, X.shape[0]))
    for idx, point in enumerate(X):
        distances[0, idx] = dist_pp(z, point)
    return distances

def dist_ps_fast(z, X):
    """Squared distances from ``z`` to every row of ``X``, vectorized.

    Uses the expansion ``||x - z||^2 = ||x||^2 + ||z||^2 - 2 x.z`` so the
    whole computation is a single matrix-vector product.

    Parameters
    ----------
    z : array-like holding ``d`` values; any shape with ``d`` elements is
        accepted (``(d,)``, ``(d, 1)``, ``(1, d)``) and flattened.
    X : array of shape ``(N, d)``, one point per row.

    Returns
    -------
    Array of shape ``(N,)`` with the squared distance to each row of ``X``.
    """
    # Flatten z: a (d, 1) column would otherwise broadcast X2 + X.dot(z)
    # into a meaningless (N, N) matrix, and a (1, d) row would fail in dot.
    z = np.reshape(z, -1)
    X2 = np.sum(X*X, 1) # square of l2 norm of each X[i], can be precomputed
    z2 = np.sum(z*z) # square of l2 norm of z
    return X2 + z2 - 2*X.dot(z) # z2 can be ignored when only ranking points

In [4]:
# Time the loop-based point-to-set computation.
t1 = time()
D1 = dist_ps_naive(z, X)
print('naive point2set, running time:', time() - t1, 's')

# Time the vectorized computation on the same inputs.
t1 = time()
D2 = dist_ps_fast(z, X)
print('fast point2set , running time:', time() - t1, 's')

# Sanity check: the norm of the difference between both results should be
# close to zero. D1 has shape (1, N) and D2 is broadcast against it.
print('Result difference:', np.linalg.norm(D1 - D2))

naive point2set, running time: 0.14117741584777832 s
fast point2set , running time: 0.05016636848449707 s
Result difference: 2.1656671196598214e-11


# từng điểm giữa 2 tập

In [5]:
M = 100 # number of points in the second set
Z = np.random.randn(M, d) # M random d-dimensional points, one per row

In [6]:
def dist_ss_0(Z, X):
    """Squared distances between every row of ``Z`` and every row of ``X``.

    "Half-fast" variant: loops over the rows of ``Z`` in Python and lets
    ``dist_ps_fast`` handle each row against all of ``X``. The result has
    shape ``(Z.shape[0], X.shape[0])``.
    """
    out = np.zeros((Z.shape[0], X.shape[0]))
    for row_idx, point in enumerate(Z):
        out[row_idx] = dist_ps_fast(point, X)
    return out

def dist_ss_fast(Z, X):
    """Squared distances between every row of ``Z`` and every row of ``X``.

    Fully vectorized: entry ``[i, j]`` is
    ``||Z[i]||^2 + ||X[j]||^2 - 2 * Z[i].X[j]``, computed for all pairs at
    once via broadcasting and one matrix product.
    """
    z_sq = (Z * Z).sum(axis=1)[:, np.newaxis]  # column of squared row norms of Z
    x_sq = (X * X).sum(axis=1)[np.newaxis, :]  # row of squared row norms of X
    cross = np.dot(Z, X.T)                     # all pairwise inner products
    return z_sq + x_sq - 2 * cross

In [7]:
# Time the variant that loops over the rows of Z.
t1 = time()
D3 = dist_ss_0(Z, X)
print('half fast set2set running time:', time() - t1, 's')

# Time the fully vectorized variant on the same inputs.
t1 = time()
D4 = dist_ss_fast(Z, X)
print('fast set2set  running time', time() - t1, 's')

# Sanity check: both (M, N) results should agree up to rounding error.
print('Result difference:', np.linalg.norm(D3 - D4))

half fast set2set running time: 4.081785202026367 s
fast set2set  running time 0.10276627540588379 s
Result difference: 9.696121053460651e-11


### sử dụng scipy

In [8]:
from scipy.spatial.distance import cdist

In [9]:
# Time SciPy's cdist on the same data. Its result is squared so it can be
# compared with the squared distances in D4 (cdist's default metric is
# Euclidean per the SciPy documentation).
t1 = time()
D5 = cdist(Z, X)**2

t2 = time()
print('time:', t2-t1, 's')
print('difference:', np.linalg.norm(D5-D4))

time: 1.008002758026123 s
difference: 1.617336092276414e-09


In [12]:
# Larger benchmark: two sets of 10000 points each. This rebinds the globals
# d, N, M, X, Z and D1 used by earlier cells.
d = 1000
N = 10000
M = 10000
X = np.random.randn(N, d)
Z = np.random.randn(M, d) 
# Timed region includes both cdist and the element-wise squaring.
t1 = time()
D0 = cdist(Z, X)**2
print('cdist:', time() - t1, 's')

# Same pairwise squared distances via the vectorized helper.
t1 = time()
D1 = dist_ss_fast(Z, X)
print('dist_ss_fast:', time() - t1, 's')
print('difference:', np.linalg.norm(D0 - D1))

cdist: 86.10610127449036 s
dist_ss_fast: 3.138760805130005 s
difference: 1.6110843296616265e-08


In [14]:
# Same benchmark as the previous cell, on freshly drawn data, but the
# squaring of cdist's result is moved out of the timed region.
d = 1000
N = 10000
M = 10000
X = np.random.randn(N, d)
Z = np.random.randn(M, d) 
# Timed region covers cdist only.
t1 = time()
D0 = cdist(Z, X)
print('cdist:', time() - t1, 's')

t1 = time()
D1 = dist_ss_fast(Z, X)
print('dist_ss_fast:', time() - t1, 's')
# D0 is squared here, untimed, so it is comparable to D1.
print('difference:', np.linalg.norm(D0**2 - D1))

cdist: 85.63755297660828 s
dist_ss_fast: 3.0120649337768555 s
difference: 0.0


# iris

In [1]:
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split # for splitting data
from sklearn.metrics import accuracy_score

In [3]:
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
iris_X = iris.data # feature matrix
iris_Y = iris.target # class labels

20 điểm dữ liệu được tách làm tập train, 130 điểm còn lại làm tập test.

In [6]:
# Show the distinct class labels present in the dataset.
print('Labels:', np.unique(iris_Y))

# split train test
# Fix NumPy's global seed so the split is the same on every run.
np.random.seed(7)
# test_size=130 keeps 130 samples for testing; the rest are used for training.
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_Y, test_size=130)
print('Training size:', X_train.shape[0], ', test size:', X_test.shape[0])

Labels: [0 1 2]
Training size: 20 , test size: 130


`np.random.seed(7)` để đảm bảo kết quả chạy ở các lần khác nhau cho kết quả giống nhau. Có thể thay 7 bằng một số nguyên không âm 32 bit bất kỳ.

### 1NN

In [7]:
# 1-nearest-neighbour classifier; p=2 selects the l2 norm as the distance.
model = KNeighborsClassifier(n_neighbors=1, p=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Report accuracy on the held-out test set as a percentage.
print('Accuracy of 1NN: %.2f %%' %(100*accuracy_score(y_test, y_pred)))

Accuracy of 1NN: 92.31 %


In [8]:
# Same 1-nearest-neighbour experiment, but with p=1 (l1 norm) as the distance.
model = KNeighborsClassifier(n_neighbors=1, p=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Report accuracy on the held-out test set as a percentage.
print('Accuracy of 1NN: %.2f %%' %(100*accuracy_score(y_test, y_pred)))

Accuracy of 1NN: 92.31 %


p=2 chính là l2 norm để tính khoảng cách, còn p=1 là l1 norm. Trong thí nghiệm 1NN này, l2 norm và l1 norm cho kết quả giống nhau.

### 7NN

1NN dễ gây overfitting, ta có thể tăng số hàng xóm để kết quả dựa trên đa số.

In [12]:
# 7 nearest neighbours with the l2 norm (p=2); no weights argument is passed,
# so the classifier's default voting scheme is used.
model = KNeighborsClassifier(n_neighbors=7, p=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy of 7NN with major voting: %.2f %%' %(100*accuracy_score(y_test, y_pred)))

Accuracy of 7NN with major voting: 93.85 %


In [11]:
# 7 nearest neighbours with the l1 norm (p=1); no weights argument is passed,
# so the classifier's default voting scheme is used.
model = KNeighborsClassifier(n_neighbors=7, p=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy of 7NN with major voting: %.2f %%' %(100*accuracy_score(y_test, y_pred)))

Accuracy of 7NN with major voting: 95.38 %


### Đánh trọng số

In [13]:
# 7 nearest neighbours (l2 norm) using scikit-learn's built-in 'distance'
# weighting instead of the default voting.
model = KNeighborsClassifier(n_neighbors=7, p=2, weights='distance')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy of 7NN (1/distance weights): %.2f %%' %(100*accuracy_score(y_test, y_pred)))

Accuracy of 7NN (1/distance weights): 94.62 %


In [19]:
def myweight(distances, sigma2=.4):
    """Gaussian-style neighbour weights: ``exp(-distances**2 / sigma2)``.

    Intended to be passed as the ``weights`` callable of
    ``KNeighborsClassifier``, which calls it with the array of distances
    only, so ``sigma2`` keeps its default there. Use
    ``functools.partial(myweight, sigma2=...)`` to try another bandwidth.

    Parameters
    ----------
    distances : array of non-negative distances to the neighbours.
    sigma2 : positive scale; larger values make the weights decay more
        slowly with distance. Defaults to 0.4.

    Returns
    -------
    Array of the same shape as ``distances``; a distance of 0 gets weight 1.

    Raises
    ------
    ValueError
        If ``sigma2`` is not strictly positive.
    """
    if sigma2 <= 0:
        raise ValueError('sigma2 must be positive')
    return np.exp(-distances**2/sigma2)

# 7 nearest neighbours (l2 norm) weighted by the user-defined myweight callable.
model = KNeighborsClassifier(n_neighbors=7, p=2, weights=myweight)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# NOTE(review): 'customzed' in the message is a typo for 'customized'; the
# string is left as is so it keeps matching the recorded output.
print('Accuracy of 7NN (customzed weight): %.2f %%' %(100*accuracy_score(y_test, y_pred)))

Accuracy of 7NN (customzed weight): 95.38 %


In [20]:
# 7 nearest neighbours (l1 norm, p=1) weighted by the user-defined myweight callable.
model = KNeighborsClassifier(n_neighbors=7, p=1, weights=myweight)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# NOTE(review): 'customzed' in the message is a typo for 'customized'; the
# string is left as is so it keeps matching the recorded output.
print('Accuracy of 7NN (customzed weight): %.2f %%' %(100*accuracy_score(y_test, y_pred)))

Accuracy of 7NN (customzed weight): 93.08 %


# xác thực chéo

In [31]:
from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneOut
from statistics import mean

In [28]:
# Stratified 2-fold splitter; shuffling with a fixed random_state keeps the
# folds the same on every run.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)

In [30]:
# Cross-validate the custom-weighted 7NN model on the full iris data.
accs = [] # accuracy of each fold
# Rebinds the global X (previously the random benchmark matrix) to the iris features.
X, y = iris_X, iris_Y
model = KNeighborsClassifier(n_neighbors=7, p=2, weights=myweight)
for train_index, test_index in skf.split(X, y):
    # Refit on this fold's training part, then score on its held-out part.
    model.fit(X[train_index], y[train_index])
    y_pred = model.predict(X[test_index])
    accs.append(accuracy_score(y[test_index], y_pred))

# Report the mean accuracy over all folds as a percentage.
print('Accuracy of 7NN (customized weight) with skf: %.2f %%' %(100*mean(accs)))

Accuracy of 7NN (customized weight) with skf: 96.67 %


In [32]:
# Plain (non-stratified) 2-fold splitter with a fixed shuffle seed.
kf = KFold(n_splits=2, shuffle=True, random_state=1)
# Leave-one-out splitter.
loo = LeaveOneOut()

In [34]:
# Cross-validate the custom-weighted 7NN model with the KFold splitter.
accs = [] # accuracy of each fold
X, y = iris_X, iris_Y
model = KNeighborsClassifier(n_neighbors=7, p=2, weights=myweight)
for train_index, test_index in kf.split(X):
    # Refit on this fold's training part, then score on its held-out part.
    model.fit(X[train_index], y[train_index])
    y_pred = model.predict(X[test_index])
    accs.append(accuracy_score(y[test_index], y_pred))

print('Accuracy of 7NN (customized weight) with kf: %.2f %%' %(100*mean(accs)))

# Repeat with the leave-one-out splitter, starting from a fresh accuracy
# list and a fresh model.
accs = []
model = KNeighborsClassifier(n_neighbors=7, p=2, weights=myweight)
for train_index, test_index in loo.split(X):
    model.fit(X[train_index], y[train_index])
    y_pred = model.predict(X[test_index])
    accs.append(accuracy_score(y[test_index], y_pred))

print('Accuracy of 7NN (customized weight) with loo: %.2f %%' %(100*mean(accs)))

Accuracy of 7NN (customized weight) with kf: 94.67 %
Accuracy of 7NN (customized weight) with loo: 96.00 %
