<a href="https://colab.research.google.com/github/LNSHRIVAS/Face-recognition-using-knn-from-scratch/blob/main/Face_recognition_homework2_msml_602_uid_121334466.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Face recognition: This notebook recognizes a person's face by comparing a facial image against images of known people



In [37]:
import cv2
import os
import re
import numpy as np

# importing the dataset

In [38]:
# Directory that holds the face images; presumably a Google Drive folder mounted in Colab (confirm before running elsewhere).
img_path = '/content/drive/MyDrive/FaceData'

# Creating an image data loader class which loads the data and extracts the label

In [39]:
class ImageDataLoader:
  """Loads grayscale face images from a directory and derives labels from file names."""

  # File names have the form "<person_id>_<image_number>.png"; the first
  # number is the label. Compiled once instead of on every loop iteration.
  _FILENAME_PATTERN = re.compile(r"(\d+)_\d+\.png")

  def load_image(self, img_path):
    """Read every image in ``img_path`` whose name matches the label pattern.

    Args:
      img_path: Path of the directory containing the image files.

    Returns:
      A tuple ``(img_data, img_label)`` of two parallel lists: the flattened
      pixel arrays (each pixel divided by 255.0) and the integer person ids
      parsed from the file names. Files whose names do not match the pattern
      are ignored.

    Raises:
      ValueError: If a file with a matching name cannot be decoded as an image.
    """
    img_data = []
    img_label = []

    # sorted() makes the load order independent of the filesystem's listing order.
    for filename in sorted(os.listdir(img_path)):
      # fullmatch() rejects names with trailing junk such as "1_2.png.bak".
      check = self._FILENAME_PATTERN.fullmatch(filename)
      if not check:
        continue

      image_path = os.path.join(img_path, filename)
      img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
      if img is None:
        # cv2.imread reports an unreadable or corrupt file by returning None
        # rather than raising, so fail here with a message naming the file.
        raise ValueError(f"Could not read image file: {image_path}")

      img_data.append((img / 255.0).flatten())  # normalise, then flatten to 1-D
      img_label.append(int(check.group(1)))

    return img_data, img_label

# Creating the KNN classifier class. As specified in the homework assignment, we want a 1-NN classifier, so the class is named OneNNClassifier.

In [40]:
class OneNNClassifier():
  """1-nearest-neighbour classifier using Euclidean or Manhattan distance."""

  _SUPPORTED_DISTANCES = ('euclidean', 'manhattan')

  def __init__(self, distance):
    """Create an untrained classifier.

    Args:
      distance: Name of the metric, either ``'euclidean'`` or ``'manhattan'``.
        It is validated when ``predict`` is called.
    """
    self.train_data = None
    self.train_labels = None
    self.distance = distance

  def train(self, train_data, train_labels):
    """Store the training samples and their labels (1-NN has no fitting step)."""
    self.train_data = train_data
    self.train_labels = train_labels

  def euclidean_distance(self, x1, x2):
    """Return the Euclidean (L2) distance between two arrays."""
    return np.sqrt(np.sum((x1 - x2) ** 2))

  def manhattan_distance(self, x1, x2):
    """Return the Manhattan (L1) distance between two arrays."""
    return np.sum(np.abs(x1 - x2))

  def _distances_to_training_set(self, train_matrix, test_sample):
    """Return the distance from ``test_sample`` to every row of ``train_matrix``."""
    diff = train_matrix - test_sample
    if self.distance == 'euclidean':
      return np.sqrt(np.sum(diff ** 2, axis=1))
    return np.sum(np.abs(diff), axis=1)

  def predict(self, test_data, *, verbose=True):
    """Predict a label for each test sample from its single nearest neighbour.

    Args:
      test_data: Iterable of samples with the same number of values as the
        training samples.
      verbose: When True (the default), print each predicted label as it is
        produced.

    Returns:
      A list with one predicted label per test sample, in input order.

    Raises:
      ValueError: If the configured distance is not supported, if ``train``
        has not been called, or if the training set is empty.
    """
    if self.distance not in self._SUPPORTED_DISTANCES:
      raise ValueError(
          f"Unsupported distance {self.distance!r}; "
          f"expected one of {self._SUPPORTED_DISTANCES}")
    if self.train_data is None or self.train_labels is None:
      raise ValueError("train() must be called before predict()")

    train_matrix = np.asarray(self.train_data)
    n_train = len(train_matrix)
    if n_train == 0:
      raise ValueError("The training set is empty")
    # One row per training sample, so a single vectorised expression yields
    # all distances for a test sample instead of a Python-level loop.
    train_matrix = train_matrix.reshape(n_train, -1)

    predictions = []
    for test_sample in test_data:
      distances = self._distances_to_training_set(
          train_matrix, np.asarray(test_sample).ravel())

      # 1-NN step: the label of the closest training sample is the prediction.
      nearest_neighbor_index = np.argmin(distances)
      predicted_label = self.train_labels[nearest_neighbor_index]
      if verbose:
        print(f"Predicted label: {predicted_label}")

      predictions.append(predicted_label)
    return predictions


In [41]:
img = ImageDataLoader() # Loader instance (not an image, despite the name); its load_image() reads the dataset.

# Loading the dataset and dividing it into train and test data

In [42]:
data, label = img.load_image(img_path)  # Flattened images and their person ids
if len(data) == 0:
  # Fail here with a clear message instead of much later inside the classifier.
  raise ValueError(f"No images named '<id>_<n>.png' were found in {img_path}")
data = np.array(data)    # One row per image
label = np.array(label)  # One person id per row of data

# Shuffle samples and labels with the same permutation. The generator is seeded
# so the split, and therefore the reported accuracy, is reproducible between runs.
RANDOM_SEED = 42
randomize_data = np.random.default_rng(RANDOM_SEED).permutation(len(data))
data = data[randomize_data]
label = label[randomize_data]

# Hold out part of the data to measure accuracy: 80% for training, 20% for testing.
split_ratio = 0.8
split_index = int(len(data) * split_ratio)

train_data, test_data = data[:split_index], data[split_index:]
train_label, test_label = label[:split_index], label[split_index:]

In [43]:
# Inspect the labels of the held-out test split.
print(test_label)

[21 39 38 11  8 21 29 24 36 23 36  4 11 37 33 36 24 28 16 33 12  1 27 32
 15 35 17 31  6 33 10 18  8  5 20  5  8 26 35  7  7 37 17 40 30  1  5 20
 32 20 25 12 30 40 38 19 27  3 30  7 37  2 22 25 16 14  6  4 40 22  7 11
 35 22  6 35 30 20 26  9]


In [44]:
# Inspect the test samples (one flattened, normalised image per row).
print(test_data)

[[0.38823529 0.35294118 0.38039216 ... 0.52941176 0.44313725 0.61960784]
 [0.34509804 0.33333333 0.34901961 ... 0.45882353 0.45882353 0.45490196]
 [0.44313725 0.42745098 0.44313725 ... 0.28235294 0.25882353 0.27843137]
 ...
 [0.42352941 0.43137255 0.42352941 ... 0.14117647 0.1372549  0.13333333]
 [0.43921569 0.42745098 0.43529412 ... 0.29019608 0.29411765 0.29019608]
 [0.43137255 0.42745098 0.43529412 ... 0.36862745 0.34509804 0.35294118]]


In [45]:
# Inspect the labels of the training split.
print(train_label)

[25  9 17 28 19 34  8 11 27 20 25 33  6 15 40 35 33 14 21 13 15 22 34 23
 25 34 20 16 15 21  2 26 18  4 35 22 31 39 21 27  8 33 13 38 18 27 27 10
 39 15 23  2 16  9  2 39 38 32 10  6 26 24 10 36  4 22 19 10  3 24 19  7
  5  9  7  8  4  6  8 19 17 33 26 22  4 23 14 24  5 27 34 24 14  1  7  3
 27  6 22 25 15 24  1 26 13  3 34 12 39 32 29  1 22  1 31  9 17 20 12 14
 18 16 12 28  1 38 11 37  8 32 28  2  5 36 18 30 23 28 34 17 29 26  5  6
 14 29 34 28 36 23 31 27 37 24 16 11 36 13 16 32 38  9  5 33 11  5  3 37
 21 24 26  3 21  2 21 37  1 25 18 29 29  4 28  4 35  3 10 15 13 32 12 36
  3  7 12 17 37 17 22 26 16 39 21 14  7  8 35 31 19 34 16 24  4 32 29  5
 31 10 14 30 20 40 23 25 28 23 20  8 31 14 19 39 39 38 12 20  7 21 15 30
 13 28 23 34 40 11  1 29 19 33 10 25 18 13 29 32  9 36 19 30 36 18 13  4
  1 31  9  2 15 10 35 17 30 18 12  9 31  2 40  2 14 40 39 37 28 11 17 10
 16 25 11 38 15  2 39 23 34 38 13 27 38  6  3 31 13 37 26 12 30 40  6 33
 32 29 40  9 19  3 35 18]


In [46]:
# Sanity check: number of training samples.
print(len(train_data))

320


In [47]:
# Sanity check: number of training labels; should equal the number of training samples.
print(len(train_label))

320


# We are predicting without the use of cross validation

In [48]:
# Fit the 1-NN model on the training split, then score it on the held-out split.
model = OneNNClassifier(distance='euclidean')
model.train(train_data, train_label)
predictions = model.predict(test_data)

# Accuracy is the fraction of test samples whose predicted label matches the true label.
is_correct = np.array(predictions) == np.array(test_label)
accuracy = np.mean(is_correct)
print("Accuracy:", accuracy)

Predicted label: 21
Predicted label: 39
Predicted label: 38
Predicted label: 11
Predicted label: 8
Predicted label: 21
Predicted label: 29
Predicted label: 24
Predicted label: 36
Predicted label: 23
Predicted label: 36
Predicted label: 4
Predicted label: 11
Predicted label: 37
Predicted label: 33
Predicted label: 36
Predicted label: 24
Predicted label: 28
Predicted label: 16
Predicted label: 33
Predicted label: 12
Predicted label: 1
Predicted label: 27
Predicted label: 32
Predicted label: 15
Predicted label: 35
Predicted label: 17
Predicted label: 31
Predicted label: 6
Predicted label: 33
Predicted label: 10
Predicted label: 18
Predicted label: 8
Predicted label: 5
Predicted label: 20
Predicted label: 5
Predicted label: 8
Predicted label: 26
Predicted label: 35
Predicted label: 7
Predicted label: 7
Predicted label: 37
Predicted label: 17
Predicted label: 40
Predicted label: 30
Predicted label: 1
Predicted label: 40
Predicted label: 20
Predicted label: 32
Predicted label: 20
Predicted l

# Cross validation implementation

In [49]:
class cross_validation():
  """k-fold cross-validation of a 1-nearest-neighbour classifier."""

  def __init__(self, k_fold, distance):
    """Configure the evaluation.

    Args:
      k_fold: Number of folds (the assignment uses 5).
      distance: ``'euclidean'`` selects Euclidean distance. NOTE: any other
        value selects Manhattan distance, which is kept for backward
        compatibility.
    """
    self.k_fold = k_fold
    self.distance = distance

  def euclidean_distance(self, x1, x2):
    """Return the Euclidean (L2) distance between two arrays."""
    return np.sqrt(np.sum((x1 - x2) ** 2))

  def manhattan_distance(self, x1, x2):
    """Return the Manhattan (L1) distance between two arrays."""
    return np.sum(np.abs(x1 - x2))

  def cross_validation(self, data, label):
    """Evaluate 1-NN with k-fold cross-validation.

    The samples are cut into ``k_fold`` contiguous folds in the order given,
    so the caller is responsible for shuffling. Each fold is used once as the
    test set while the remaining folds form the training set. When the number
    of samples is not divisible by ``k_fold`` the leading folds get one extra
    sample, so every sample is tested exactly once.

    Args:
      data: Array-like with one sample per row.
      label: Array-like with one label per sample.

    Returns:
      A list with one entry per tested sample, in order: 1 if the nearest
      neighbour's label matched the true label, otherwise 0. The mean of the
      list is the overall accuracy.

    Raises:
      ValueError: If ``k_fold`` is less than 2, if ``data`` and ``label``
        differ in length, or if a fold is left with no training samples.
    """
    if self.k_fold < 2:
      raise ValueError("k_fold must be at least 2 so every fold has training data")

    data = np.asarray(data)
    label = np.asarray(label)
    n_samples = len(data)
    if n_samples != len(label):
      raise ValueError("data and label must have the same length")
    if n_samples == 0:
      return []
    data = data.reshape(n_samples, -1)  # one row per sample for vectorised distances

    accuracy_list = []  # per-sample 0/1 outcomes across all folds

    # array_split yields equal folds when the size divides evenly (e.g. 400 / 5
    # gives 0-79, 80-159, ...) and spreads any remainder over the leading folds.
    for fold_indices in np.array_split(np.arange(n_samples), self.k_fold):
      if fold_indices.size == 0:  # more folds than samples
        continue
      start_index = fold_indices[0]
      end_index = fold_indices[-1] + 1

      # The current fold is the test set ...
      test_data = data[start_index:end_index]
      test_label = label[start_index:end_index]

      # ... and everything before and after it is the training set.
      train_data = np.concatenate([data[:start_index], data[end_index:]])
      train_label = np.concatenate([label[:start_index], label[end_index:]])
      if len(train_data) == 0:
        raise ValueError("Not enough samples to leave any training data for a fold")

      for test_sample, true_label in zip(test_data, test_label):
        diff = train_data - test_sample
        if self.distance == 'euclidean':
          distances = np.sqrt(np.sum(diff ** 2, axis=1))
        else:
          distances = np.sum(np.abs(diff), axis=1)

        # 1-NN step: the label of the closest training sample is the prediction.
        predicted_label = train_label[np.argmin(distances)]
        accuracy_list.append(1 if predicted_label == true_label else 0)

    return accuracy_list


# We achieved an average accuracy of 97% with 5-fold cross-validation.

In [50]:
# Run 5-fold cross-validation (k = 5 as required by the assignment) with Euclidean distance.
cross_validation_model = cross_validation(k_fold=5, distance='euclidean')
# The class splits data and label into folds itself and returns one 0/1 outcome per tested sample.
accuracy_list = cross_validation_model.cross_validation(data, label)
# The mean of the 0/1 outcomes is the average accuracy over all folds.
accuracy = np.mean(accuracy_list)
print("Average Accuracy:", accuracy)

Average Accuracy: 0.97


In [51]:
# Inspect the per-sample outcomes (1 = correct prediction, 0 = incorrect).
print(accuracy_list)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 