## Malaria recognition
This notebook classifies cell images as coming from patients infected with malaria (parasitized) or from healthy patients (uninfected).

## Obtaining data

In [None]:
from io import BytesIO
from PIL import Image
import os
import cv2

def read_images(directory):
    """Load every readable image found directly inside *directory*.

    Each directory entry is handed to ``cv2.imread``; entries for which it
    returns ``None`` are skipped.

    Returns:
        A list of the loaded images, in ``os.listdir`` order.
    """
    candidates = (
        cv2.imread(os.path.join(directory, entry))
        for entry in os.listdir(directory)
    )
    return [image for image in candidates if image is not None]

# Load both classes into memory up front; the paths are relative to the
# current working directory.
images_healthy = read_images('cell_images/Uninfected/')
images_ill = read_images('cell_images/Parasitized/')

In [None]:
# cv2.imshow('aa', images_healthy[0])
# cv2.waitKey(0)
# cv2.destroyAllWindows()

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image
# Target size (width, height) that read_original_file passes to cv2.resize.
WIDTH = 100
HEIGHT = 100
DIM = (WIDTH, HEIGHT)

# Shared ORB detector used by get_descriptors.
# NOTE(review): the arguments are positional; per the OpenCV ORB_create
# signature they are presumably nfeatures=700, scaleFactor=1.2, nlevels=8,
# edgeThreshold=25 -- confirm against the installed OpenCV version.
orb = cv2.ORB_create(700, 1.2, 8, 25)

def get_descriptors(input_image):
    """Detect ORB keypoints in *input_image* with the module-level ``orb``.

    Returns:
        A one-element list whose only item is the result of
        ``orb.detect(input_image, None)``.
    """
    # NOTE(review): despite the name, only keypoint detection runs here; no
    # descriptor computation is performed in this function.
    return [orb.detect(input_image, None)]

def read_original_file(img):
    """Resize *img* to ``DIM`` and convert the result to grayscale.

    Args:
        img: Image array to preprocess. It is converted with
            ``cv2.COLOR_BGR2GRAY``, so it is assumed to be a BGR image as
            returned by ``cv2.imread`` -- confirm against the caller.

    Returns:
        The resized, single-channel grayscale image.
    """
    resized = cv2.resize(img, DIM, interpolation=cv2.INTER_AREA)
    # Convert the *resized* image. Previously the conversion was applied to
    # the untouched input, so the resize result was silently thrown away.
    return cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)

In [None]:
import cv2

def save_images(healthy, ill):
    """Preprocess both image collections and extract features from each image.

    Every image goes through ``read_original_file`` and then
    ``get_descriptors``; the healthy collection is processed first.

    Args:
        healthy: Iterable of images of the healthy class.
        ill: Iterable of images of the ill class.

    Returns:
        A tuple ``(features_healthy, features_ill)`` of lists, each holding
        one ``get_descriptors`` result per input image, in input order.
    """
    def extract_all(images):
        # One feature entry per image, order preserved.
        return [get_descriptors(read_original_file(image)) for image in images]

    features_healthy = extract_all(healthy)
    features_ill = extract_all(ill)
    return features_healthy, features_ill

In [None]:
# Run preprocessing and ORB keypoint detection over both image sets.
features_healthy, features_ill = save_images(images_healthy, images_ill)

In [None]:
import random

class DataManaging:
    """Split healthy / ill samples into labelled training, validation and test sets.

    The instance keeps two pools of not-yet-assigned samples. Every call to
    ``create_split_set`` moves samples out of the pools, so consecutive calls
    yield disjoint sets.
    """

    def __init__(self, healthy, ill):
        # Pools of samples that have not been assigned to any set yet.
        self.healthy = healthy
        self.ill = ill

    @staticmethod
    def _draw(pool, k):
        """Pick *k* distinct random items from *pool*.

        Returns ``(drawn, remaining)``. Items are selected by index, so no
        equality comparison between samples is needed (which would be slow,
        and ambiguous for array-like samples).
        """
        k = max(0, min(k, len(pool)))  # never ask for more than is available
        picked = random.sample(range(len(pool)), k)
        picked_set = set(picked)
        drawn = [pool[i] for i in picked]
        remaining = [item for i, item in enumerate(pool) if i not in picked_set]
        return drawn, remaining

    def create_split_set(self, ratio):
        """Move a random, class-balanced subset out of the pools.

        Args:
            ratio: Fraction of the remaining *healthy* pool to draw. The same
                count is requested from the ill pool to keep the set balanced;
                it is clamped to the number of ill samples still available.

        Returns:
            ``(X, Y)`` where ``X`` holds the drawn healthy samples followed by
            the drawn ill samples and ``Y`` the matching ``'Healthy'`` /
            ``'Ill'`` labels.
        """
        k = round(ratio * len(self.healthy))
        # Sample WITHOUT replacement: drawing with replacement produced
        # duplicated samples and left part of the pool permanently unused.
        healthy_new, self.healthy = self._draw(self.healthy, k)
        ill_new, self.ill = self._draw(self.ill, k)
        X = healthy_new + ill_new
        Y = ['Healthy'] * len(healthy_new) + ['Ill'] * len(ill_new)
        return X, Y

    def shuffle_data(self, x, y):
        """Return copies of *x* and *y* shuffled with the same permutation.

        Args:
            x: Sequence of samples.
            y: Sequence of labels, same length as *x*.

        Returns:
            ``(x_shuffled, y_shuffled)`` as two new lists; sample/label pairs
            stay aligned.

        Raises:
            ValueError: If *x* and *y* differ in length.
        """
        if len(x) != len(y):
            raise ValueError("x and y must have the same length")
        order = list(range(len(y)))
        random.shuffle(order)
        return [x[i] for i in order], [y[i] for i in order]

    def data_splitting(self, healthy, ill):
        """Split the instance pools into training, validation and test sets.

        Training takes 70% of the healthy pool, validation 70% of what is
        left, and the test set receives the rest.

        Args:
            healthy: Unused; kept for backward compatibility. The split always
                operates on ``self.healthy``.
            ill: Unused; kept for backward compatibility. The split always
                operates on ``self.ill``.

        Returns:
            ``(training_X, validation_X, test_X,
            training_Y, validation_Y, test_Y)``.
        """
        training_X, training_Y = self.create_split_set(0.7)
        validation_X, validation_Y = self.create_split_set(0.7)
        test_X, test_Y = self.create_split_set(1)  # rest belongs in test set
        return training_X, validation_X, test_X, training_Y, validation_Y, test_Y

    def data_management(self):
        """Split the pools and shuffle each resulting set.

        Returns:
            ``(training_X, validation_X, test_X,
            training_Y, validation_Y, test_Y)`` with every X/Y pair shuffled
            in unison.
        """
        (training_X, validation_X, test_X,
         training_Y, validation_Y, test_Y) = self.data_splitting(self.healthy, self.ill)
        # shuffle_data returns new lists, so its result must be kept.
        training_X, training_Y = self.shuffle_data(training_X, training_Y)
        validation_X, validation_Y = self.shuffle_data(validation_X, validation_Y)
        test_X, test_Y = self.shuffle_data(test_X, test_Y)
        return training_X, validation_X, test_X, training_Y, validation_Y, test_Y
        

In [None]:
# Split the per-image features into labelled training / validation / test sets.
data_manager = DataManaging(features_healthy, features_ill)
training_X, validation_X, test_X, training_Y, validation_Y, test_Y = data_manager.data_management()

In [None]:
# TODO:
# Clustering the keypoints into BOW

In [None]:
from sklearn import svm
import numpy as np
# Inspect the shape of the training data before fitting.
print(np.asarray(training_X).shape)
X = np.asarray(training_X)
# NOTE(review): training_X holds the raw output of get_descriptors (ORB
# keypoints) and the bag-of-words clustering step is still a TODO in this
# file -- verify that X is a numeric 2-D array before relying on this fit.
lin_clf = svm.LinearSVC()
lin_clf.fit(X, training_Y)