Uploading the Dataset from my Drive

In [1]:
# ! gdown https://drive.google.com/drive/folders/1eJPlwS6bd_3SUlm-rh_t7KglTaR4zrIQ?usp=sharing

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Importing libraries

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from imutils import paths
import numpy as np
import argparse
import imutils
import cv2
import os
from tqdm import tqdm
import random

Building helper function to create the dataset

In [3]:
def image_to_feature_vector(image, size=(150, 150)):
    return cv2.resize(image, size).flatten()

def extract_color_histogram(image, bins=(8, 8, 8)):
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins, [0, 180, 0, 256, 0, 256])
    if imutils.is_cv2():
        hist = cv2.normalize(hist)
    else:
        cv2.normalize(hist, hist)
    return hist.flatten()

Creating the dataset, it spelt to: training/testing/validation data

In [4]:
IMG_SIZE = 150
categories = ["NORMAL", "PNEUMONIA"]

def create_data(data_path="/content/gdrive/MyDrive/chest_xray/train"):
    Created_data = []
    for category in categories:

        path = os.path.join(data_path,category)
        class_num = categories.index(category)
        count = 0
        for img in tqdm(os.listdir(path)):
            count += 1
            if class_num == 1 and count == 1352:
                break
            try:
                image = cv2.imread(os.path.join(path, img))
                if class_num == 0:
                    label = [1, 0]
                else:
                    label = [0, 1]
                pixels = image_to_feature_vector(image)
                hist = extract_color_histogram(image)

                Created_data.append([pixels, hist, label])
            except Exception as e:
                pass

    return Created_data

print("Creating training Data: ")
training_data = create_data("/content/gdrive/MyDrive/chest_xray/train")

print("Creating testing Data: ")
test_data = create_data("/content/gdrive/MyDrive/chest_xray/test")

print("Creating validation Data: ")
val_data = create_data("/content/gdrive/MyDrive/chest_xray/val")

Creating training Data: 


100%|██████████| 1352/1352 [02:06<00:00, 10.65it/s]
 35%|███▍      | 1351/3876 [01:43<03:13, 13.04it/s]


Creating testing Data: 


100%|██████████| 244/244 [00:12<00:00, 19.08it/s]
100%|██████████| 390/390 [00:10<00:00, 37.11it/s]


Creating validation Data: 


100%|██████████| 9/9 [00:01<00:00,  5.19it/s]
100%|██████████| 9/9 [00:01<00:00,  6.58it/s]


Shuffling the Data to purpose of reducing variance and making sure that models remain general and overfit less.

In [5]:
random.shuffle(training_data)
random.shuffle(test_data)
random.shuffle(val_data)

Spilting data into categories

In [6]:
x_train, x_test, x_val = [], [], []
y_train, y_test, y_val = [], [], []

x_train_pixel, x_test_pixel, x_val_pixel = [], [], []

for pixel,features,label in training_data:
    x_train.append(features)
    y_train.append(label)
    x_train_pixel.append(pixel)
    
for pixel,features,label in test_data:
    x_test.append(features)
    y_test.append(label)
    x_test_pixel.append(pixel)

for pixel,features,label in val_data:
    x_val.append(features)
    y_val.append(label)
    x_val_pixel.append(pixel)


Saving data model for using it in another algorithms

In [7]:
np.save("/content/saved_data/KNN_saved/x_train_pixel.npy", x_train_pixel)
np.save("/content/saved_data/KNN_saved/x_train.npy", x_train)
np.save("/content/saved_data/KNN_saved/y_train.npy", y_train)

np.save("/content/saved_data/KNN_saved/x_test_pixel.npy", x_test_pixel)
np.save("/content/saved_data/KNN_saved/x_test.npy", x_test)
np.save("/content/saved_data/KNN_saved/y_test.npy", y_test)

np.save("/content/saved_data/KNN_saved/x_val_pixel.npy", x_val_pixel)
np.save("/content/saved_data/KNN_saved/x_val.npy", x_val)
np.save("/content/saved_data/KNN_saved/y_val.npy", y_val)

Running KNN algorithm and checking model accuracy

In [8]:
x_train = np.load("/content/saved_data/KNN_saved/x_train.npy", allow_pickle=True)
x_test = np.load("/content/saved_data/KNN_saved/x_test.npy", allow_pickle=True)
x_train_pixel = np.load("/content/saved_data/KNN_saved/x_train_pixel.npy", allow_pickle=True)
x_test_pixel = np.load("/content/saved_data/KNN_saved/x_test_pixel.npy", allow_pickle=True)
y_train = np.load("/content/saved_data/KNN_saved/y_train.npy", allow_pickle=True)
y_test = np.load("/content/saved_data/KNN_saved/y_test.npy", allow_pickle=True)

print("evaluating accuracy...")
model = KNeighborsClassifier(n_neighbors=5,n_jobs=1)
model.fit(x_train, y_train)
acc = model.score(x_test, y_test)
print("Accuracy: {:.2f}%".format(acc * 100))

evaluating accuracy...
Accuracy: 74.92%
