# GDA on histopathological images

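We will apply GDA to the histopathologic cancer detection dataset: load the training images, fit the model on a training split, and evaluate its predictions on a held-out validation split.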

In [1]:
import numpy as np
import os
from PIL import Image


In [5]:
def load_data(force_load=False, size_considered = 2):
    """Load the training images as small centre patches, together with their labels.

    For each id listed in ``train_labels.csv`` the matching ``train/<id>.tif``
    is opened, its central 32x32 window is cropped out, and the crop is resized
    to ``size_considered`` x ``size_considered`` pixels. The stacked result is
    cached in ``x.npy`` inside the data directory.

    Parameters
    ----------
    force_load : bool
        When True, ignore any cached ``x.npy`` and rebuild from the image files.
    size_considered : int
        Side length (in pixels) of the resized patch kept for each image.

    Returns
    -------
    x : np.ndarray
        Stack of resized patches, one entry per successfully loaded image.
    labels : np.ndarray
        The ``label`` column values (strings, as read from the CSV) for the
        returned images, in the same order as ``x``.
    train_ids : list of str
        The ``id`` column values for the returned images, in the same order.

    Notes
    -----
    Images that are missing on disk or fail to load are skipped (best effort);
    their label and id are dropped too, so the three return values always stay
    aligned. A cached array is reused only if it has one row per CSV entry and
    the requested patch size; otherwise it is rebuilt.
    """
    import csv

    data_dir = "data/histopathologic-cancer-detection/"
    train_folder = data_dir + "train"
    cache_path = data_dir + "x.npy"
    crop_half = 16  # half the side of the central window that is cropped

    # Read the label table; the context manager makes sure the file is closed.
    with open(data_dir + "train_labels.csv", 'r', newline='') as labels_file:
        rows = list(csv.DictReader(labels_file))
    labels = np.array([row["label"] for row in rows])
    train_ids = [row["id"] for row in rows]  # IDs corresponding to training set

    if os.path.exists(cache_path) and not force_load:
        cached = np.load(cache_path)
        # Only trust the cache if it matches what this call would produce:
        # one row per labelled image, at the requested patch size.
        if cached.shape[:3] == (len(train_ids), size_considered, size_considered):
            return cached, labels, train_ids
        print(
            f"Cached array {cache_path} has shape {cached.shape}, which does not "
            f"match {len(train_ids)} images of size {size_considered}; rebuilding."
        )

    train_images = []
    kept_labels = []
    kept_ids = []
    missing = 0

    for img_id, label in zip(train_ids, labels):
        img_path = os.path.join(train_folder, f"{img_id}.tif")
        if not os.path.exists(img_path):
            missing += 1
            continue
        try:
            with Image.open(img_path) as pil_img:
                img = np.array(pil_img)
            height, width = img.shape[:2]
            # Centre window, computed per axis and clamped at the image border.
            top = max(height // 2 - crop_half, 0)
            left = max(width // 2 - crop_half, 0)
            img_cropped = img[top:height // 2 + crop_half, left:width // 2 + crop_half, :]

            resized_pil_image = Image.fromarray(img_cropped).resize((size_considered, size_considered))
            resized_np_img = np.array(resized_pil_image)
        except Exception as e:
            print(f"Error loading image {img_id} from train folder: {e}")
            continue
        # Record the label/id only once the image is known to be usable, so
        # that x, labels and train_ids cannot drift out of step.
        train_images.append(resized_np_img)
        kept_labels.append(label)
        kept_ids.append(img_id)

    if missing:
        print(f"Skipped {missing} ids with no image file in {train_folder}.")
    print(f"Loaded {len(train_images)} train images.")

    x = np.array(train_images)
    np.save(cache_path, x)
    return x, np.array(kept_labels), kept_ids
    
    

# Rebuild the image array from the .tif files (force_load=True bypasses the
# x.npy cache) and keep a single pixel per image (size_considered=1).
x, labels, train_ids = load_data(
    force_load=True,
    size_considered=1,
)

Loaded 220025 train images.


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation; the fixed random_state makes the
# split reproducible from run to run.
split_parts = train_test_split(x, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts


In [7]:
from GDA import GDA
import torch
# Flatten every training image into a single feature row and divide by 255
# (presumably mapping 8-bit pixel values into [0, 1] — confirm the image dtype).
n_train = X_train.shape[0]
X_flatten = X_train.reshape(n_train, -1) / 255
_, dim = X_flatten.shape
print("dim : ", dim)

# GDA is the project-local model class; the meaning of km_init / km_cov_init
# is defined in GDA.py (NOTE(review): presumably k-means based initialisation).
model = GDA(n_components=20, km_init=True, km_cov_init=True)
model.fit(X_flatten, y_train)




dim :  3


In [8]:
def print_metrics(y_val_pred,y_val):
    """Print a classification summary for a set of predictions.

    Prints accuracy, then precision / recall / F1 computed with
    ``average='weighted'``, then the confusion matrix and scikit-learn's
    classification report. Nothing is returned.

    Parameters
    ----------
    y_val_pred : array-like
        Predicted labels. Note that predictions come FIRST in this signature.
    y_val : array-like
        True labels.
    """
    from sklearn.metrics import (
        accuracy_score,
        classification_report,
        confusion_matrix,
        f1_score,
        precision_score,
        recall_score,
    )

    # Every scikit-learn scorer below is called as (true labels, predictions).
    truth_and_guess = (y_val, y_val_pred)

    headline_rows = [
        ("Accuracy:", accuracy_score(*truth_and_guess)),
        ("Precision (Weighted):", precision_score(*truth_and_guess, average='weighted')),
        ("Recall (Weighted):", recall_score(*truth_and_guess, average='weighted')),
        ("F1-Score (Weighted):", f1_score(*truth_and_guess, average='weighted')),
    ]
    matrix = confusion_matrix(*truth_and_guess)

    for caption, score in headline_rows:
        print(caption, score)
    print("\nConfusion Matrix:\n", matrix)

    # Per-class breakdown of precision / recall / F1 and support.
    print("\nClassification Report:\n", classification_report(*truth_and_guess))


In [9]:
# Flatten and rescale the validation images the same way as the training data.
# The result gets its own name instead of overwriting X_val: reassigning X_val
# would make this cell divide by 255 again every time it is re-run.
X_val_flatten = X_val.reshape(X_val.shape[0], -1) / 255
y_val_pred = model.predict(X_val_flatten)
print_metrics(y_val_pred,y_val)

Accuracy: 0.7732757641177139
Precision (Weighted): 0.7792093556987142
Recall (Weighted): 0.7732757641177139
F1-Score (Weighted): 0.7747572746426413

Confusion Matrix:
 [[20224  5953]
 [ 4024 13804]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.77      0.80     26177
           1       0.70      0.77      0.73     17828

    accuracy                           0.77     44005
   macro avg       0.77      0.77      0.77     44005
weighted avg       0.78      0.77      0.77     44005



In [None]:
import matplotlib.pyplot as plt
import scipy
def marginal_density(gmm_means, gmm_covs, gmm_probs, dim_to_keep, points):
    """
    Compute the marginal density of the GMM

    Each component is marginalised by keeping only the entries of its mean and
    the rows/columns of its covariance selected by ``dim_to_keep``; the
    resulting Gaussian densities are summed, weighted by ``gmm_probs``.

    Parameters
    ----------
    gmm_means : iterable of array-like
        One mean vector per component.
    gmm_covs : iterable of array-like
        One square covariance matrix per component.
    gmm_probs : iterable of float
        One mixture weight per component.
    dim_to_keep : int or sequence of int
        Index or indices of the dimensions to keep.
    points : array-like
        Points at which to evaluate the density; ``len(points)`` values are
        returned.

    Returns
    -------
    np.ndarray
        Marginal density evaluated at each point.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.stats` has been loaded on every SciPy version.
    from scipy.stats import multivariate_normal

    # Accept a single index as well as a list of indices.
    dims = np.atleast_1d(dim_to_keep)
    density_values = np.zeros(len(points))

    for mean, cov, prob in zip(gmm_means, gmm_covs, gmm_probs):
        # np.asarray lets plain nested lists be indexed like arrays.
        mean_marginal = np.asarray(mean)[dims]
        cov_marginal = np.asarray(cov)[np.ix_(dims, dims)]

        marginal_gaussian = multivariate_normal(mean=mean_marginal, cov=cov_marginal)

        # Add this component's weighted contribution.
        density_values += prob * marginal_gaussian.pdf(points)

    return density_values
# Compare the fitted model's marginal density along one feature with the
# empirical distribution of that feature in the training data.
channel = 1  # column of X_flatten to inspect; used for both the curve and the histogram
t = np.linspace(0,1,255)
density = marginal_density(model.means_, model.covariances_, model.weights_, [channel], t)

# X_flatten was divided by 255, so (assuming 8-bit source pixels — confirm) its
# values sit on the 256 levels k/255. Centre one bin on each level so that no
# value falls exactly on a bin edge.
bin_edges = (np.arange(257) - 0.5) / 255

fig, ax = plt.subplots(1, 1)
ax.plot(t, density, label="model marginal density")
ax.hist(X_flatten[:, channel], bins=bin_edges, density=True, alpha=0.6, label="training data")
ax.set_xlabel(f"feature {channel} (scaled pixel value)")
ax.set_ylabel("density")
ax.set_title(f"Marginal density of feature {channel}: model vs. training data")
ax.legend()
plt.show()