# Using VGG (16, 19) model

Register forward hooks on a few shallow and deep layers to extract their feature maps. Then reduce
the dimensionality of those features with PCA and compare the images using cosine similarity.

In [1]:
import glob
import os
from typing import Callable

import numpy as np
import pandas as pd
import torch
import torchvision
from PIL import Image
from scipy.spatial import distance
from sklearn.decomposition import PCA

## Import model, register hooks, open and transform images

In [2]:
# Indices into model.features of the layers whose outputs are captured by hooks
LAYERS = [11, 30]
# Numbers of PCA components to evaluate; 0 means "use the raw features, no PCA"
PCAS = [0, 5]

# path for the folder that contains the images
path = "../data/preprocess/"

# Every sub-directory of `path` is one case. Use the directory entry's name
# instead of slicing a fixed number of characters off the full path, so this
# keeps working when `path` is changed.
cases = [f.name for f in os.scandir(path) if f.is_dir()]

# import model
model = torchvision.models.vgg16(pretrained=True)
# The model is only used for inference: evaluation mode makes layers such as
# Dropout behave deterministically.
model.eval()
model_name = "vgg16"

# Filled by the forward hooks: layer index -> output tensor of the last forward pass
features = {}

# Hook factory: builds one forward hook per monitored layer
def reg_hook(layer: int) -> Callable:
    """Return a forward hook that records the hooked module's output.

    The returned callable takes the (module, inputs, output) arguments that a
    forward hook receives and stores the detached output in the module-level
    ``features`` dictionary under the key ``layer``, overwriting any previous
    entry for that key.
    """

    def _store_output(_module, _inputs, output):
        features[layer] = output.detach()

    return _store_output
    
# Register a forward hook on every layer index listed in LAYERS, so that each
# forward pass through the model stores that layer's output in `features`
for layer in LAYERS:
    model.features[layer].register_forward_hook(reg_hook(layer))

# Preprocessing pipeline applied to every PIL image before it is fed to the model
transform = torchvision.transforms.Compose([
    # Resize, then keep the central 224x224 region
    torchvision.transforms.Resize(224),
    torchvision.transforms.CenterCrop(224),
    # Random augmentations, currently disabled:
#     torchvision.transforms.RandomHorizontalFlip(),
#     torchvision.transforms.RandomAffine(degrees=30),
    # Convert to a tensor, then normalize each channel with the given mean/std
    # NOTE(review): these look like the standard ImageNet statistics — confirm
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
    ),
])

# function to open images and return them as a list
def open_images(path: str, case: str, type: str) -> list:
    """Load every ``*.JPG`` file of one case/type folder as an RGB PIL image.

    Args:
        path: Root folder that contains one sub-folder per case.
        case: Name of the case sub-folder inside ``path``.
        type: Name of the sub-folder inside the case folder (the callers in
            this file pass ``"BEFORE"`` or ``"AFTER"``). The name shadows the
            builtin but is kept because callers pass it by keyword.

    Returns:
        The images converted to RGB, ordered by file name. The list is empty
        when no file matches.
    """
    pattern = os.path.join(path, case, type, "*.JPG")
    images = []
    # glob returns matches in arbitrary, filesystem-dependent order; sort them
    # so that the image indices used downstream are reproducible.
    for filename in sorted(glob.glob(pattern)):
        # convert() returns an independent image, so the file handle can be
        # released as soon as the conversion is done.
        with Image.open(filename) as img:
            images.append(img.convert("RGB"))
    return images


# function for the feature extraction
def extract_features(model: torch.nn.Module, transform: Callable, images: list, layers: list) -> list:
    """Run every image through the model and collect the hooked layer outputs.

    The function relies on forward hooks having been registered beforehand:
    after each forward pass it reads the outputs from the module-level
    ``features`` dictionary.

    Args:
        model: Model to run; its return value is ignored.
        transform: Callable that turns one image into a tensor without a
            batch dimension (the batch dimension is added here).
        images: Images to process.
        layers: Keys of ``features`` to collect for every image.

    Returns:
        One entry per image; each entry is a list with one numpy array per
        requested layer, scaled to unit norm over the whole array.
    """

    # inner function to normalize the vector
    def normalize(A: np.ndarray) -> np.ndarray:
        norm = np.linalg.norm(A)
        # An all-zero activation has no direction: return it unchanged rather
        # than dividing by zero and filling the array with NaN.
        return A / norm if norm > 0 else A

    embeddings = []

    # Inference only: no gradients are needed, so skip building the autograd graph.
    with torch.no_grad():
        # iterate through all images and save the features in the list
        for image in images:
            x = transform(image).unsqueeze(0).to("cpu")
            # The forward pass is run only for its side effect of firing the hooks.
            model(x)
            embeddings.append(
                [normalize(features[layer].cpu().numpy()) for layer in layers]
            )

    return embeddings


# Two functions for the PCA implementation
def pca(image: np.ndarray, dimension: int = 8) -> np.ndarray:
    """Reduce the channel dimension of one feature map with PCA.

    Every spatial position is treated as one sample whose features are the
    channel values at that position.

    Args:
        image: Feature map indexed as (batch, C, H, W). The reshape below only
            succeeds when the batch dimension is 1.
        dimension: Number of principal components to keep.

    Returns:
        Array of shape (H, W, dimension) with the projected features.
    """
    height, width = image.shape[2], image.shape[3]
    channels = image.shape[1]  # Dimensions: Kernels
    # Move the channel axis last BEFORE flattening. Reshaping the channel-first
    # array directly to (H*W, C) only reinterprets memory, so each row would
    # mix values from different positions and channels instead of holding the
    # channel vector of one pixel.
    X = np.moveaxis(image, 1, -1).reshape(height * width, channels)
    feats = PCA(n_components=dimension).fit_transform(X)
    return np.reshape(feats, [height, width, feats.shape[1]])


def reduce_array_with_pca(array: list, dimension=8) -> list:
    """Apply ``pca`` to each element of ``array`` and return the results in order."""
    reduced = []
    for element in array:
        reduced.append(pca(image=element, dimension=dimension))
    return reduced


# Similarity helpers
def cosine_similarity(A: np.ndarray, B: np.ndarray) -> float:
    """Return the dot product of ``A`` and ``B`` divided by the product of their norms."""
    dot_product = np.dot(A, B)
    magnitude = np.linalg.norm(A) * np.linalg.norm(B)
    return dot_product / magnitude

def Average(lst):
    """Return the arithmetic mean of ``lst``; an empty input raises ZeroDivisionError."""
    total = sum(lst)
    count = len(lst)
    return total / count

def distance_metric(A: np.ndarray, B: np.ndarray, C: np.ndarray) -> float:
    """Return the mean cosine similarity of ``A`` to ``B`` and of ``A`` to ``C``.

    Despite the name, the result is a similarity (larger means more alike),
    not a distance. ``B`` and ``C`` are not compared with each other.

    Args:
        A: Reference array; flattened before the comparison.
        B: First array compared against ``A``; flattened as well.
        C: Second array compared against ``A``; flattened as well.

    Returns:
        The average of the two cosine similarities.
    """
    A = A.flatten()
    B = B.flatten()
    C = C.flatten()

    c1 = cosine_similarity(A, B)
    c2 = cosine_similarity(A, C)

    return Average([c1, c2])


In [3]:
# For every case: load the BEFORE/AFTER images, extract the hooked features and
# write one CSV with the similarity per layer and per PCA setting.
for case in cases:

    images_before_treatment = open_images(path=path, case=case, type="BEFORE")
    images_after_treatment = open_images(path=path, case=case, type="AFTER")

    # The comparison below uses images 0, 1 and 2 of each set; fail early with
    # a clear message instead of an IndexError deep inside the loop.
    if len(images_before_treatment) < 3 or len(images_after_treatment) < 3:
        raise ValueError(
            f"Case {case!r} needs at least 3 BEFORE and 3 AFTER images, found "
            f"{len(images_before_treatment)} and {len(images_after_treatment)}"
        )

    # get the features: one list per image, holding one array per layer
    before_embeddings = extract_features(model=model, transform=transform, images=images_before_treatment, layers=LAYERS)
    after_embeddings = extract_features(model=model, transform=transform, images=images_after_treatment, layers=LAYERS)

    # Get the results for every layer and every pca
    data = []
    for pca_value in PCAS:
        if pca_value == 0:
            # 0 means "no PCA": compare the raw features
            pca_before_embeddings = before_embeddings
            pca_after_embeddings = after_embeddings
        else:
            # Reduce each set on its own, so the two sets may differ in size
            pca_before_embeddings = [
                [pca(array, dimension=pca_value) for array in per_image]
                for per_image in before_embeddings
            ]
            pca_after_embeddings = [
                [pca(array, dimension=pca_value) for array in per_image]
                for per_image in after_embeddings
            ]

        # Each per-image list holds one entry per layer, in the order of LAYERS
        for index, layer_id in enumerate(LAYERS):
            before = distance_metric(
                A=pca_before_embeddings[0][index],
                B=pca_before_embeddings[1][index],
                C=pca_before_embeddings[2][index],
            )

            after = distance_metric(
                A=pca_after_embeddings[0][index],
                B=pca_after_embeddings[1][index],
                C=pca_after_embeddings[2][index],
            )

            data.append((layer_id, pca_value, round(before, 3), round(after, 3)))

    # Create a dataframe
    df = pd.DataFrame(data, columns=['Layer', 'PCA', 'Before','After'])
    # create the csv file (and its folder, which may not exist yet for a new case)
    output_dir = f'../csv/{case}'
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(f'{output_dir}/{case}_{model_name}_cosine.csv', index=False)

### For every layer, calculate the cosine similarity and the Euclidean distance between the wounds

In [4]:
# from sklearn.metrics.pairwise import euclidean_distances
# wounds_data = []

# # function to transform distance to similarity:
# def transform_dist_to_sim(dist: float):
#     return 1 / (1 + dist)

# for index, layer in enumerate(before_embeddings[0]):
#     before_wound = before_embeddings[0][index].flatten()
#     after_wound = after_embeddings[0][index].flatten()


#     # calculate similarity and euclidean distance
#     sim = cosine_similarity(before_wound, after_wound)
#     dist = transform_dist_to_sim(np.linalg.norm(before_wound-after_wound))

#     wounds_data.append((LAYERS[index], sim, dist))

# # Create a dataframe
# df_wound = pd.DataFrame(wounds_data, columns=['Layer', 'cosine similarity', 'euclidean'])
# # create the csv file
# df_wound.to_csv(f'../csv/{case}/{case}_{model_name}_wounds.csv', index=False)