In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import sys
from skimage.transform import rescale, resize
from tqdm import tqdm

In [2]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import OneHotEncoder


def build_similarity_graph(X, var=1.0, eps=0.0, k=0, knn_or=True):
    n = X.shape[0]
    W = np.zeros((n, n))

    similarities = np.exp(- (euclidean_distances(X) ** 2) / (2 * var))
    similarities[np.arange(n), np.arange(n)] = 0

    if k == 0:
        W = similarities
        W[W < eps] = 0

    elif k != 0:
        for i in range(n):
            k_neighbors_indices = np.argsort(-similarities[i])[:k]

            if knn_or:
                for j in k_neighbors_indices:
                    W[i, j] = similarities[i, j]
                    W[j, i] = similarities[i, j]

            else:
                for j in k_neighbors_indices:
                    if i in np.argsort(-similarities[j])[:k]:
                        W[i, j] = similarities[i, j]
                        W[j, i] = similarities[i, j]
            
    return W


def build_laplacian(W, laplacian_normalization='unn'):
    D = np.diag(W.sum(axis=1))
    L = D - W

    if laplacian_normalization == "sym":
        D_inv_sqrt = np.sqrt(np.linalg.pinv(D))
        L = D_inv_sqrt.dot(L).dot(D_inv_sqrt)

    elif laplacian_normalization == "rw":
        D_inv = np.linalg.pinv(D)
        L = D_inv.dot(L)

    return L


def build_laplacian_regularized(X, laplacian_regularization=1.0, var=1.0, eps=0.0, k=0, laplacian_normalization="unn"):
    W = build_similarity_graph(X, var, eps, k)
    L = build_laplacian(W, laplacian_normalization)
    Q = L + laplacian_regularization*np.eye(W.shape[0])
    return Q


def mask_labels(Y, l, per_class=False):
    num_samples = np.size(Y, 0)
    min_label = Y.min()
    max_label = Y.max()
    assert min_label == 1

    if not per_class:
        Y_masked = np.zeros(num_samples)
        indices_to_reveal = np.arange(num_samples)
        np.random.shuffle(indices_to_reveal)
        indices_to_reveal = indices_to_reveal[:l]
        Y_masked[indices_to_reveal] = Y[indices_to_reveal]
    else:
        Y_masked = np.zeros(num_samples)
        for label in range(min_label, max_label+1):
            indices = np.where( Y == label)[0]
            np.random.shuffle(indices)
            indices = indices[:l]
            Y_masked[indices] = Y[indices]

    return Y_masked


def compute_hfs(L, Y, soft=False, c_l=0.99, c_u=0.01):
    num_samples = L.shape[0]
    l_idx = np.where(Y != 0)[0]
    u_idx = np.where(Y == 0)[0]
    y = OneHotEncoder().fit_transform(Y.reshape(-1, 1)).toarray()[:, 1:]

    if not soft:    
        f_l = y[l_idx]
        L_uu = L[u_idx][:, u_idx]
        L_ul = L[u_idx][:, l_idx]
        f_u = - (np.linalg.pinv(L_uu)).dot(L_ul.dot(f_l))
        f = np.zeros_like(y)
        f[l_idx] = f_l
        f[u_idx] = f_u

    else:
        C = np.zeros(num_samples)
        C[l_idx] = c_l
        C[u_idx] = c_u
        C = np.diag(C)
        f = np.linalg.pinv(((np.linalg.pinv(C)).dot(L) + np.identity(num_samples))).dot(y)

    labels = 1 + f.argmax(axis=1)
    return labels, f


In [3]:
def load_images(split_dir):
    images = {}

    for p in os.listdir(split_dir):
        p_dir = os.path.join(split_dir, p)
        if not os.path.isdir(p_dir):
            continue

        p_images = []
        for image_path in os.listdir(p_dir):
            p_images.append(os.path.join(p_dir, image_path))

        images[p] = p_images
    return images


def assign_labels(images_dict, labels_dict):
    images = []
    labels = []
    for p, p_images in images_dict.items():
        images.extend(p_images)
        labels.extend([labels_dict[p]] * len(p_images))
    return images, labels

In [4]:
from PIL import Image

def image_to_numpy(image_path):
    image_pil = Image.open(image_path)
    image_np = np.array(image_pil, dtype=np.float32) / 255.0
    return image_np

In [5]:
data_dir = "/kaggle/input/dlmi-lymphocytosis/dlmi-lymphocytosis-classification/"
train_dir = data_dir + "trainset/"
test_dir = data_dir + "testset/"

In [6]:
train_images = load_images(train_dir)
test_images = load_images(test_dir)
trainset_true_df = pd.read_csv(data_dir + "trainset/trainset_true.csv")
labels_dict = dict(zip(trainset_true_df["ID"], trainset_true_df["LABEL"]))
X_train, y_train = assign_labels(train_images, labels_dict)
X_train, y_train = np.array(X_train), np.array(y_train)
len(X_train)

13453

In [7]:
train_images = load_images(train_dir)
test_images = load_images(test_dir)
trainset_true_df = pd.read_csv(data_dir + "trainset/trainset_true.csv")
labels_dict = dict(zip(trainset_true_df["ID"], trainset_true_df["LABEL"]))
X_train, y_train = assign_labels(train_images, labels_dict)
X_train, y_train = np.array(X_train), np.array(y_train)
len(X_train)

13453

In [8]:
y_train_hfs = y_train + 1
masked_labels = mask_labels(y_train_hfs, 1000, per_class=True)

In [9]:
X_train_images = []
for image_path in tqdm(X_train):
    x = image_to_numpy(image_path)
    x = rescale(x, 0.20, anti_aliasing=False)
    x = x.flatten().tolist()
    X_train_images.append(x)
X_train_images = np.array(X_train_images)
X_train_images.shape

100%|██████████| 13453/13453 [00:45<00:00, 297.55it/s]


(13453, 2025)

In [10]:
laplacian_regularization = 1.0
var = 2025.0
eps = 0.0
k = 1000
laplacian_normalization="unn"
L = build_laplacian_regularized(
    X_train_images,
    laplacian_regularization=laplacian_regularization,
    var=var,
    eps=eps,
    k=k,
    laplacian_normalization=laplacian_normalization
)

In [11]:
labels, _ = compute_hfs(L, masked_labels, soft=False)

In [12]:
from sklearn import metrics
all_acc = metrics.accuracy_score(y_train_hfs, labels)
all_balanced_acc = metrics.balanced_accuracy_score(y_train_hfs, labels)
masked_acc = metrics.accuracy_score(y_train_hfs[masked_labels==0], labels[masked_labels==0])
masked_balanced_acc = metrics.balanced_accuracy_score(y_train_hfs[masked_labels==0], labels[masked_labels==0])

print(f"All - Acc = {all_acc:.4f} balanced acc = {all_balanced_acc:.4f}")
print(f"Masked - Acc = {masked_acc:.4f} balanced acc = {masked_balanced_acc:.4f}")

All - Acc = 0.3055 balanced acc = 0.5669
Masked - Acc = 0.1842 balanced acc = 0.5210
