In [5]:
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors #, datasets
import pandas as pd

In [7]:
from time import time # for comparing runing time
d, N = 1000, 10000 # dimension, number of training points
X = np.random.randn(N, d) # N d-dimensional points
z = np.random.randn(d)

In [8]:
# naively compute square distance between two vector
def dist_pp(z, x):
 d = z - x.reshape(z.shape) # force x and z to have the same dims
 return np.sum(d*d)
# from one point to each point in a set, naive
def dist_ps_naive(z, X):
 N = X.shape[0]
 res = np.zeros((1, N))
 for i in range(N):
  res[0][i] = dist_pp(z, X[i])
 return res

In [9]:
# from one point to each point in a set, fast
def dist_ps_fast(z, X):
 X2 = np.sum(X*X, 1) # square of l2 norm of each X[i], can be precomputed
 z2 = np.sum(z*z) # square of l2 norm of z
 return X2 + z2 - 2*X.dot(z) # z2 can be ignored
t1 = time()
D1 = dist_ps_naive(z, X)
print('naive point2set, running time:', time() - t1, 's')
t1 = time()
D2 = dist_ps_fast(z, X)
print('fast point2set , running time:', time() - t1, 's')
print('Result difference:', np.linalg.norm(D1 - D2))

naive point2set, running time: 0.09959936141967773 s
fast point2set , running time: 0.049826860427856445 s
Result difference: 2.1411792416972374e-11


In [10]:
res = np.zeros((1, N))
res

array([[0., 0., 0., ..., 0., 0., 0.]])

In [11]:
df = pd.read_csv('/content/drive/My Drive/Social_Network_Ads.csv', sep=",")
print (df.shape)
df.head(8)

(400, 5)


Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
5,15728773,Male,27,58000,0
6,15598044,Female,27,84000,0
7,15694829,Female,32,150000,1


In [12]:
# Replace class
df=df.replace({'Gender': {'Male':0, 'Female':1}})
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,0,19,19000,0
1,15810944,0,35,20000,0
2,15668575,1,26,43000,0
3,15603246,1,27,57000,0
4,15804002,0,19,76000,0
...,...,...,...,...,...
395,15691863,1,46,41000,1
396,15706071,0,51,23000,1
397,15654296,1,50,20000,1
398,15755018,0,36,33000,0


In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:4], df.iloc[:,4], test_size=0.33)
print ("Training size: %d" %len(y_train))
print ("Test size : %d" %len(y_test))

Training size: 268
Test size : 132


In [22]:
clf = neighbors.KNeighborsClassifier(n_neighbors = 3, p = 2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print ("Print results for first 20 test data points:")
print ("Predicted labels: ", y_pred[0:20].tolist())
print ("Ground truth : ", y_test[0:20].tolist())

Print results for first 20 test data points:
Predicted labels:  [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]
Ground truth :  [1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0]


In [24]:
from sklearn.metrics import accuracy_score
print ("Accuracy of 3NN: %.2f %%" %(100*accuracy_score(y_test, y_pred)))

Accuracy of 3NN: 67.42 %


In [28]:
clf7 = neighbors.KNeighborsClassifier(n_neighbors = 7, p = 2)
clf7.fit(X_train, y_train)
y_pred7 = clf7.predict(X_test)
print ("Accuracy of 7NN: %.2f %%" %(100*accuracy_score(y_test, y_pred7)))

Accuracy of 7NN: 67.42 %


In [29]:
print ("Predicted labels: ", y_pred[0:20].tolist())
print ("Predicted labels: ", y_pred7[0:20].tolist())

Predicted labels:  [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]
Predicted labels:  [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]


In [30]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred7)

array([[68, 12],
       [31, 21]])