# Final Notebook

This notebook contains the code for our project, which aims to extend the original one by adding active learning to it and demonstrating its benefits.

More information about this work can be found in the attached Project Report.

In [6]:
#TODO: add imports
import os

import joblib
import numpy as np
import skimage
import skimage.feature
import skimage.io
import skimage.transform

### Constants & Hyperparameters

In [14]:
# Possible labels present in the dataset.
# The position of a label in this list is the integer class index that load_dataset
# stores for every image of that class ('PNEUMONIA' -> 0, 'NORMAL' -> 1).
# (NOTE: labeled datasets are expected to be folders with one subfolder per label, and inside each subfolder all the images of that type)
POSSIBLE_LABELS = ['PNEUMONIA', 'NORMAL']

# Side length, in pixels, of the square (IMG_SIZE x IMG_SIZE) that dataset images
# are resized to before feature extraction
IMG_SIZE = 128

### Data Import

In [15]:
# Define a function to extract features from a given image
# using scikit-image Histogram of Oriented Gradient features extractor
def extract_hog_features(image):
    """Resize an image to IMG_SIZE x IMG_SIZE and return its HOG feature vector.

    Parameters
    ----------
    image : array-like
        Grayscale image (assumed 2-D, as produced by reading with
        ``as_gray=True`` -- TODO confirm for other callers). Two intensity
        scales are accepted:

        * integer / float values on the 0-255 scale (used as they are);
        * floating point values on the unit [0, 1] scale, which are rescaled
          to 0-255 first.

    Returns
    -------
    numpy.ndarray
        Flattened HOG descriptor (``feature_vector=True``).
    """
    image = np.asarray(image)

    # A unit-range float image must be brought to the 0-255 scale *before* the
    # uint8 cast below: casting values in [0, 1] directly truncates almost
    # every pixel to 0 and produces a degenerate, near-constant descriptor.
    # NOTE(review): a float image that is already on the 0-255 scale but whose
    # brightest pixel is <= 1 (i.e. practically black) is rescaled as well;
    # this heuristic is accepted because such an image carries no usable signal.
    if (
        np.issubdtype(image.dtype, np.floating)
        and image.size > 0
        and image.max() <= 1.0
    ):
        image = image * 255.0

    # preserve_range=True keeps the 0-255 intensity scale through the resize,
    # so the uint8 cast only drops the fractional part of each pixel.
    resized_image = skimage.transform.resize(
        image,
        (IMG_SIZE, IMG_SIZE),
        anti_aliasing=True,
        preserve_range=True
    ).astype(np.uint8)

    features = skimage.feature.hog(
        resized_image,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        block_norm='L2-Hys',
        transform_sqrt=False,
        feature_vector=True
    )

    return features

In [16]:
# Loads images from the given directory and returns their extracted features and labels as numpy arrays
def load_dataset(data_path):
    """Extract HOG features for every image found under ``data_path``.

    ``data_path`` must contain one sub-folder per entry of POSSIBLE_LABELS;
    every file inside a sub-folder is treated as an image of that class.

    Parameters
    ----------
    data_path : str
        Root folder of a labeled dataset split.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)``: one row of HOG features per successfully
        loaded image, and the matching class index (position of the label in
        POSSIBLE_LABELS).

    Notes
    -----
    Files that cannot be read or processed are reported on stdout and skipped
    (best effort), so the returned arrays may hold fewer rows than there are
    files on disk. A missing class folder is not caught and propagates from
    ``os.listdir``.
    """
    features_list = []
    labels_list = []

    # for each folder of the dataset (each folder corresponds to a different class of labeled data)...
    for class_label, label in enumerate(POSSIBLE_LABELS):
        class_path = os.path.join(data_path, label)

        # for each image in the folder...
        # (os.listdir returns entries in arbitrary order; sorting makes the
        # sample order reproducible across runs and machines)
        for img_name in sorted(os.listdir(class_path)):
            try:
                # load the image in grayscale (it is an x-ray)
                img = skimage.io.imread(
                    os.path.join(class_path, img_name),
                    as_gray=True
                )

                # extract the hog features from the resized image using the method defined above
                hog_features = extract_hog_features(img)

                # store the extracted features in the features list and the index of the current class as their respective label
                features_list.append(hog_features)
                labels_list.append(class_label)

            except Exception as e:
                # deliberate best effort: one unreadable file must not abort the whole import
                print(f"Error loading image {img_name}: {e}")

    return np.array(features_list), np.array(labels_list)

In [17]:
# Try to load the dataset from cache, otherwise load it from the given path and cache it for future use
def get_cached_dataset(data_path, force_reload=False):
    """Return ``(features, labels)`` for ``data_path``, backed by an on-disk cache.

    The cache lives in ``.cache/<data_path>/`` as ``features.joblib`` and
    ``labels.joblib``. It is used only when both files exist and
    ``force_reload`` is false; otherwise the dataset is rebuilt with
    ``load_dataset`` and the cache files are (re)written.

    Parameters
    ----------
    data_path : str
        Root folder of a labeled dataset split.
    force_reload : bool, optional
        Ignore any existing cache and rebuild it from the images.

    Returns
    -------
    tuple
        ``(features, labels)`` as returned by ``load_dataset``.

    Notes
    -----
    The cache is keyed by ``data_path`` only. After changing IMG_SIZE,
    POSSIBLE_LABELS or the feature extraction code, call this function with
    ``force_reload=True``, otherwise stale features are returned.
    """
    # os.path.join discards everything before an absolute component, so an
    # absolute data_path would bypass ".cache/" and the cache files would be
    # written into the dataset folder itself. Strip the drive and the leading
    # separators so the cache always stays under ".cache/"; relative paths
    # map to exactly the same location as before.
    _, path_tail = os.path.splitdrive(data_path)
    relative_path = path_tail.lstrip(os.sep + (os.altsep or ""))

    cache_dir = os.path.join(".cache/", relative_path)
    features_cache = os.path.join(cache_dir, "features.joblib")
    labels_cache = os.path.join(cache_dir, "labels.joblib")

    if not force_reload and os.path.exists(features_cache) and os.path.exists(labels_cache):
        print(f"Loading cached dataset from {data_path}...")
        # NOTE: joblib.load is pickle-based; only load cache files written by this notebook
        features = joblib.load(features_cache)
        labels = joblib.load(labels_cache)
        return features, labels

    print(f"Loading dataset from {data_path}...")
    features, labels = load_dataset(data_path)

    print(f"Caching loaded dataset to {features_cache} and {labels_cache}...")
    os.makedirs(cache_dir, exist_ok=True)
    joblib.dump(features, features_cache, compress=3)
    joblib.dump(labels, labels_cache, compress=3)

    return features, labels

In [19]:
# Load the training and test splits (served from the on-disk cache when one is available)
train_features, train_labels = get_cached_dataset("data/chest_xray/train")
test_features, test_labels = get_cached_dataset("data/chest_xray/test")

# Sanity check: report the shape of every loaded array, one per line
print(
    f"Training features shape: {train_features.shape}",
    f"Training labels shape: {train_labels.shape}",
    f"Test features shape: {test_features.shape}",
    f"Test labels shape: {test_labels.shape}",
    sep="\n",
)

Loading cached dataset from data/chest_xray/train...
Loading cached dataset from data/chest_xray/test...
Training features shape: (5216, 8100)
Training labels shape: (5216,)
Test features shape: (624, 8100)
Test labels shape: (624,)


Section 1: randomly picked N elements approach

(we decide to label 10% of the unlabeled images we have, and we pick which ones to label at random)

In [None]:
#TODO: show how poorly a model trained on randomly chosen samples performs
# (same training as the old one, but with fewer labeled samples)

Section 2: active learning approach with N labeled elements

(we decide to label 10% of the unlabeled images we have, and we pick which ones to label using active learning)

In [None]:
#TODO: repeat the same steps as above, BUT with an active learning
# loop, to show off the much better performance

Section 3: original model (traditional but with all samples)

(we decide to label all the unlabeled images we have)

In [None]:
#TODO: almost a copy of the original steps,
# to show that the active learning approach achieves similar
# results but with less data

Section 4: fancy graphs to prove our thesis