# Introduction to K-Nearest Neighbours 

# Representing Points

## Euclidean Distance

In [2]:
#example of euclidean distance
def euclidean_distance(pt1, pt2):
  """Return the straight-line (Euclidean) distance between two points.

  Both points are indexable sequences of numbers. Every index of pt1 is
  paired with the same index of pt2.
  """
  sum_of_squares = 0
  for axis, coordinate in enumerate(pt1):
    sum_of_squares += (coordinate - pt2[axis]) ** 2
  # The distance is the square root of the summed squared differences.
  return sum_of_squares ** 0.5

# Worked examples: one pair of 2-D points and one pair of 3-D points.
print(
  euclidean_distance([1, 2], [4, 0]),
  euclidean_distance([5, 4, 3], [1, 7, 9]),
  sep="\n",
)

3.605551275463989
7.810249675906654


## Manhattan Distance

In [3]:
#example of manhattan distance
def manhattan_distance(pt1, pt2):
  """Return the Manhattan (city-block) distance between two points.

  This is the sum of the absolute differences between coordinates at the
  same index. Every index of pt1 is paired with the same index of pt2.
  """
  total = 0
  for axis, coordinate in enumerate(pt1):
    total += abs(coordinate - pt2[axis])
  return total

# Same example points as for the Euclidean distance, measured along the axes instead.
print(
  manhattan_distance([1, 2], [4, 0]),
  manhattan_distance([5, 4, 3], [1, 7, 9]),
  sep="\n",
)

5
13


## Hamming Distance

In [5]:
#example of hamming distance
def hamming_distance(pt1, pt2):
  """Return the Hamming distance: how many positions hold different values.

  Every index of pt1 is compared with the same index of pt2. The size of
  the difference is irrelevant, only whether the two values differ.
  """
  return sum(1 for i in range(len(pt1)) if pt1[i] != pt2[i])

# One differing position in the first pair, two in the second.
print(
  hamming_distance([1, 2], [1, 100]),
  hamming_distance([5, 4, 9], [1, 7, 9]),
  sep="\n",
)

1
2


## SciPy library: distance calculation functions

In [6]:
#example of both ours and scipy lib
from scipy.spatial import distance

def euclidean_distance(pt1, pt2):
  """Euclidean distance: square root of the summed squared coordinate gaps.

  Walks the indices of pt1 and reads the same index from pt2.
  """
  squared_gaps = [(pt1[i] - pt2[i]) ** 2 for i in range(len(pt1))]
  running_total = 0
  # Accumulate left to right so the result matches a plain running sum.
  for squared_gap in squared_gaps:
    running_total += squared_gap
  return running_total ** 0.5

def manhattan_distance(pt1, pt2):
  """Manhattan distance: the sum of absolute per-coordinate differences.

  Walks the indices of pt1 and reads the same index from pt2.
  """
  total = 0
  axis = 0
  dimensions = len(pt1)
  while axis < dimensions:
    total += abs(pt1[axis] - pt2[axis])
    axis += 1
  return total

def hamming_distance(pt1, pt2):
  """Hamming distance: the count of positions whose values differ.

  Walks the indices of pt1 and compares with the same index of pt2.
  """
  mismatches = 0
  for position, value in enumerate(pt1):
    mismatches += 1 if value != pt2[position] else 0
  return mismatches

# Results from the hand-written implementations.
print(
  euclidean_distance([1, 2], [4, 0]),
  manhattan_distance([1, 2], [4, 0]),
  hamming_distance([5, 4, 9], [1, 7, 9]),
  sep="\n",
)

# The same inputs through scipy.spatial.distance.
print(
  distance.euclidean([1, 2], [4, 0]),
  distance.cityblock([1, 2], [4, 0]),
  # SciPy's answer differs from ours here: it reports the fraction of
  # positions that differ (2 of 3), whereas our function returns the count (2).
  distance.hamming([5, 4, 9], [1, 7, 9]),
  sep="\n",
)

3.605551275463989
5
2
3.605551275463989
5
0.6666666666666666


# Normalization

## Training set vs validating set vs test set

# K-Nearest Neighbours Classifier

## Introduction

## Distance between 2 points in 2D space

In [7]:
#example of classification in 2d
# Each movie is a two-feature point: [length, release year].
star_wars, raiders, mean_girls = [125, 1977], [115, 1981], [97, 2004]

def distance(movie1, movie2):  # plain Euclidean distance in two dimensions
  """Return the Euclidean distance between two movies using two features.

  Only index 0 (length) and index 1 (release year) are read. Any further
  entries in either list are ignored.
  """
  length_gap = movie1[0] - movie2[0]
  year_gap = movie1[1] - movie2[1]
  return (length_gap ** 2 + year_gap ** 2) ** 0.5

# The smaller the distance, the more similar a movie is to Star Wars.
print(
  distance(star_wars, raiders),
  distance(star_wars, mean_girls),
  sep="\n",
)

10.770329614269007
38.897300677553446


## Distance between 2 points in 3D space

In [8]:
#example of classification in 3d
# Each movie now has three features: [length, release year, third value].
# NOTE(review): the third value looks like the budget -- confirm against the data source.
star_wars, raiders, mean_girls = (
  [125, 1977, 11000000],
  [115, 1981, 18000000],
  [97, 2004, 17000000],
)

def distance(movie1, movie2):  # works for any number of dimensions
  """Return the Euclidean distance between two feature lists.

  Every index of movie1 is paired with the same index of movie2, so the
  number of features is not fixed.
  """
  total = 0
  for axis, value in enumerate(movie1):
    total += (value - movie2[axis]) ** 2
  return total ** 0.5

# The same formula handles any number of dimensions. Note that the results are
# almost exactly the gap in the third feature, because its values are so much
# larger than the other two.
print(
  distance(star_wars, raiders),
  distance(star_wars, mean_girls),
  sep="\n",
)

7000000.000008286
6000000.000126083


## Step-1-Normalize the data

In [9]:
#example of using normalizing of data using min_max_nomalize
# Sample release years. Every value is stored as a float.
release_dates = [
  1897.0, 1998.0, 2000.0, 1948.0, 1962.0,
  1950.0, 1975.0, 1960.0, 2017.0, 1937.0,
  1968.0, 1996.0, 1944.0, 1891.0, 1995.0,
  1948.0, 2011.0, 1965.0, 1891.0, 1978.0,
]
def min_max_normalize(lst):
  """Rescale the numbers in lst to the range [0, 1] with min-max scaling.

  Each value becomes (value - minimum) / (maximum - minimum), so the
  smallest input maps to 0.0 and the largest to 1.0.

  Edge cases:
    * an empty list returns an empty list (previously min() raised
      ValueError);
    * a list whose values are all equal returns 0.0 for every entry
      (previously the zero range raised ZeroDivisionError).

  Returns a new list. The input list is not modified.
  """
  if len(lst) == 0:
    return []

  minimum = min(lst)
  maximum = max(lst)
  value_range = maximum - minimum

  # With no spread there is nothing to scale by, so every value sits at the
  # bottom of the range.
  if value_range == 0:
    return [0.0 for _ in lst]

  return [(value - minimum) / value_range for value in lst]

# Show the release years rescaled by min_max_normalize.
print(min_max_normalize(release_dates))

[0.047619047619047616, 0.8492063492063492, 0.8650793650793651, 0.4523809523809524, 0.5634920634920635, 0.46825396825396826, 0.6666666666666666, 0.5476190476190477, 1.0, 0.36507936507936506, 0.6111111111111112, 0.8333333333333334, 0.42063492063492064, 0.0, 0.8253968253968254, 0.4523809523809524, 0.9523809523809523, 0.5873015873015873, 0.0, 0.6904761904761905]


## Step-2-Finding the Nearest Neighbour

In [None]:
from movies import movie_dataset, movie_labels

#print(movie_dataset['Bruce Almighty'])
#print(movie_labels['Bruce Almighty'])

def distance(movie1, movie2):
  """Return the Euclidean distance between two feature lists of any length.

  Walks the indices of movie1 and reads the same index from movie2.
  """
  squared_gaps = [(movie1[i] - movie2[i]) ** 2 for i in range(len(movie1))]
  total = 0
  # Accumulate left to right so the result matches a plain running sum.
  for squared_gap in squared_gaps:
    total += squared_gap
  return total ** 0.5

def classify(unknown, dataset, k):  # k = how many nearest neighbours to keep
  """Return the k entries of dataset that lie closest to unknown.

  dataset is indexed by title and each title maps to a feature list. The
  result is a list of [distance, title] pairs in ascending order. Equal
  distances are ordered by title because whole pairs are compared.
  """
  # Pair every title with its distance to the unknown point, closest first.
  ranked = sorted([distance(dataset[title], unknown), title] for title in dataset)
  return ranked[:k]
  
# Show the five entries of movie_dataset closest to the point [.4, .2, .9].
print(classify([.4, .2, .9], movie_dataset, 5))
# How it works: we pass the unknown point, the dataset and k to classify. It measures the
# distance from the unknown point to every movie and stores each [distance, title] pair in
# a list, sorts that list, and returns only the first k pairs. So for any point we pass in,
# we get back the k movies closest to it.

In [None]:
# Expected output of the previous cell:
# [[0.08273614694606074, 'Lady Vengeance'],
#  [0.22989623153818367, 'Steamboy'],
#  [0.23641372358159884, 'Fateless'],
#  [0.26735445689589943, 'Princess Mononoke'],
#  [0.3311022951533416, 'Godzilla 2000']]

## Step-3-Classifying by counting the nearest neighbors

In [None]:
#complete code for classification
from movies import movie_dataset, movie_labels

def distance(movie1, movie2):
  """Return the Euclidean distance between two feature lists of any length.

  Walks the indices of movie1 and reads the same index from movie2.
  """
  total = 0
  axis = 0
  dimensions = len(movie1)
  while axis < dimensions:
    total += (movie1[axis] - movie2[axis]) ** 2
    axis += 1
  return total ** 0.5

def classify(unknown, dataset, labels, k):
  """Classify unknown by majority vote among its k nearest neighbours.

  Parameters:
    unknown: feature list of the point to classify.
    dataset: indexed by title; each title maps to a feature list.
    labels:  indexed by title; a label of 0 counts as "bad" and any other
             label counts as "good".
    k:       number of closest neighbours that take part in the vote.

  Returns 1 when the good neighbours outnumber the bad ones, otherwise 0.
  A tie, or an empty neighbour list, therefore yields 0.
  """
  distances = []
  # Measure the distance from the unknown point to every point in the dataset.
  for title in dataset:
    distances.append([distance(dataset[title], unknown), title])
  distances.sort()

  # Keep only the k closest points.
  neighbors = distances[0:k]

  num_good = 0
  num_bad = 0
  for neighbor in neighbors:
    title = neighbor[1]
    if labels[title] == 0:
      num_bad += 1
    else:
      num_good += 1

  # Decide only after every neighbour has been counted. Comparing the counts
  # inside the loop would return after the first neighbour and ignore the
  # remaining k - 1 votes.
  if num_good > num_bad:
    return 1
  return 0
print(classify([.4, .2, .9],movie_dataset,movie_labels,5))#output is 1 (good)
# Intended approach: once the k nearest neighbours are known, look up each neighbour's title
# in the labels, count how many are good and how many are bad, and classify by the larger
# count. On a tie, one option is to fall back on the label of the single nearest neighbour
# (the first entry in the sorted list).

In [None]:
from movies import movie_dataset, movie_labels, normalize_point

def distance(movie1, movie2):
  """Return the Euclidean distance between two feature lists of any length.

  Every index of movie1 is paired with the same index of movie2.
  """
  sum_of_squares = 0
  for axis, value in enumerate(movie1):
    gap = value - movie2[axis]
    sum_of_squares += gap ** 2
  return sum_of_squares ** 0.5


def classify(unknown, dataset, labels, k):
  """Classify unknown by majority vote among its k nearest neighbours.

  dataset and labels are both indexed by title. A label of 1 is a "good"
  vote and a label of 0 a "bad" vote. Any other label is not counted.
  Returns 1 when good votes outnumber bad votes, otherwise 0 (ties give 0).
  """
  # Rank every title by its distance to the unknown point, closest first.
  ranked = sorted([distance(dataset[title], unknown), title] for title in dataset)
  # Look up the labels of the k closest titles.
  nearest_labels = [labels[entry[1]] for entry in ranked[:k]]
  good_votes = sum(1 for label in nearest_labels if label == 1)
  bad_votes = sum(1 for label in nearest_labels if label == 0)
  return 1 if good_votes > bad_votes else 0

# Optional check that the movie is not already part of the dataset:
#print("Call Me By Your Name" in movie_dataset)
# Raw feature values for our own movie.
# NOTE(review): presumably [budget, length, release year] -- confirm against normalize_point.
my_movie=[3500000, 132, 2017]
# Put the raw values through normalize_point before comparing them with the dataset.
normalized_my_movie=normalize_point(my_movie)
print(classify(normalized_my_movie, movie_dataset, movie_labels, 5))#output is 1 (good)
# To classify a movie of our own: first check that it is not already in the dataset
# (otherwise it would be its own nearest neighbour), then build its feature list,
# normalize it with normalize_point, and pass the result to classify.

## Training and Validation

In [None]:
#doing it for only one movie from validation set and checking
from movies import training_set, training_labels, validation_set, validation_labels

def distance(movie1, movie2):
  """Return the Euclidean distance between two feature lists of any length.

  Walks the indices of movie1 and reads the same index from movie2.
  """
  accumulated = 0
  for axis in range(len(movie1)):
    gap = movie1[axis] - movie2[axis]
    accumulated = accumulated + gap ** 2
  return accumulated ** 0.5

def classify(unknown, dataset, labels, k):
  """Classify unknown by majority vote among its k nearest neighbours.

  dataset and labels are both indexed by title. A label of 1 is a "good"
  vote and a label of 0 a "bad" vote. Any other label is not counted.
  Returns 1 when good votes outnumber bad votes, otherwise 0 (ties give 0).
  """
  # Build [distance, title] pairs and order them closest first.
  pairs = [[distance(dataset[title], unknown), title] for title in dataset]
  pairs.sort()

  # Net score: +1 for every good neighbour, -1 for every bad one.
  balance = 0
  for _, title in pairs[:k]:
    label = labels[title]
    if label == 0:
      balance -= 1
    elif label == 1:
      balance += 1

  return 1 if balance > 0 else 0
#print(validation_set["Bee Movie"])
#print(validation_labels["Bee Movie"])
# Classify one validation movie from the training data, then compare with its true label.
guess = classify(validation_set["Bee Movie"], training_set, training_labels, 5)
print("Correct!" if guess == validation_labels["Bee Movie"] else "Wrong")  # the original run printed "Correct!" (prediction and actual label matched)
# The same check has to be repeated for every movie in the validation set.

## Choosing k

In [None]:
#a function to validating accuracy
from movies import training_set, training_labels, validation_set, validation_labels

def distance(movie1, movie2):
  """Return the Euclidean distance between two feature lists of any length.

  Walks the indices of movie1 and reads the same index from movie2.
  """
  def squared_gap(axis):
    # Squared difference of the two movies along one feature.
    return (movie1[axis] - movie2[axis]) ** 2

  total = 0
  for term in map(squared_gap, range(len(movie1))):
    total += term
  return total ** 0.5

def classify(unknown, dataset, labels, k):
  """Classify unknown by majority vote among its k nearest neighbours.

  dataset and labels are both indexed by title. A label of 1 is a "good"
  vote and a label of 0 a "bad" vote. Any other label is not counted.
  Returns 1 when good votes outnumber bad votes, otherwise 0 (ties give 0).
  """
  # Order every title by its distance to the unknown point, closest first.
  by_distance = sorted([distance(dataset[title], unknown), title] for title in dataset)

  # Collect the labels of the k closest titles, then tally each class.
  votes = []
  for entry in by_distance[:k]:
    votes.append(labels[entry[1]])

  if votes.count(1) > votes.count(0):
    return 1
  return 0
  
def find_validation_accuracy(training_set, training_labels, validation_set, validation_labels, k):
  """Return the fraction of validation points that classify labels correctly.

  Every title in validation_set is classified against the training data
  with the given k, and the guess is compared with validation_labels for
  that title. The result is correct guesses divided by the number of
  validation points, as a float.
  """
  correct_titles = [
    title
    for title in validation_set
    if classify(validation_set[title], training_set, training_labels, k) == validation_labels[title]
  ]
  return float(len(correct_titles)) / len(validation_set)


# Accuracy on the validation set with k = 3.
print(find_validation_accuracy(training_set, training_labels, validation_set, validation_labels, 3))#output is 66% (0.66)
# To validate, we classify every movie in validation_set with our function, check whether the
# guess matches the movie's real label, count the correct guesses, and divide by the size of
# the validation set to get the accuracy.

# Using Sklearn for KNN

In [None]:
#example of how to do using sklearn
from movies import movie_dataset, labels
from sklearn.neighbors import KNeighborsClassifier

# Build a 5-nearest-neighbour classifier and train it in one step
# (fit returns the fitted classifier itself).
classifier = KNeighborsClassifier(n_neighbors=5).fit(movie_dataset, labels)
# predict takes a list of points, with one inner list of feature values per point.
guess = classifier.predict([[.45, .2, .5], [.25, .8, .9], [.1, .1, .9]])
print(guess)  # the original run printed [1, 1, 0]

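**TODO (important):**
- Learn how to measure the accuracy of this sklearn classifier on a validation set.
- Work out how to do that in practice.
- Check whether `accuracy_score` (from `sklearn.metrics`) and the other available metrics can be used here.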