In [14]:
import torch
import numpy as np
from PIL import Image
import open_clip
import json
import os
import re

In [5]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset 938 through ucimlrepo (the package must be installed in the
# kernel's environment, otherwise this import raises ModuleNotFoundError).
regensburg_pediatric_appendicitis = fetch_ucirepo(id=938)

# Feature and target tables (pandas dataframes).
_appendicitis_tables = regensburg_pediatric_appendicitis.data
X = _appendicitis_tables.features
y = _appendicitis_tables.targets

# Dataset-level metadata first ...
print(regensburg_pediatric_appendicitis.metadata)

# ... then the per-variable information.
print(regensburg_pediatric_appendicitis.variables)

ModuleNotFoundError: No module named 'ucimlrepo'

In [7]:
file_path = '/home/RDC/simnacma/H:/simnacma/CITs/regensburg_pediatric_data/data/meta_data/diagnosis/imputed/final/app_data'

# Parse the JSON metadata file. The cells below treat it as a mapping whose
# values start with a list of image filenames (value[0]).
with open(file_path, 'r') as meta_file:
    meta_data = json.load(meta_file)

# Preview only the first five entries, in the order they appear in the file.
meta_data_short = dict(list(meta_data.items())[:5])
print(meta_data_short)

{'61': [['61.2_App_L_surr._tissue_reaction.png', '61.4_App.png', '61.1_App_T_surr._tissue_reaction.png', '61.3_App.png'], 1, [0.714277813869835, 0.0, 0.7144322810413849, 0.017146306454157642, -0.5319299200212423, 6.0, 6.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.7055608818768323, 0.0, 0.0, -0.819509714853897, 0.5755899390666621, -0.04053867504609838, 0.0, 0.0, 0.0, 1.0, 0.47159474849000954, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.47159474849000954, 0.0, 0.0, 0.0, 1.0]], '119': [['119.2_App_M.png', '119.3_App_D.png', '119.5_App_M.png', '119.4_App.png', '119.1_App.png'], 1, [0.7205136773514522, 1.0, -0.045378305960132655, 0.2482426226111042, 0.7292568761367859, 3.0, 2.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5840809909468845, 0.0, 1.0, -0.6324846251942166, -0.06217415473659873, -0.3956693704933981, 0.0, 0.0, 0.0, 1.0, 1.262948399962931, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [10]:
# Directory containing the images
image_dir = '/home/RDC/simnacma/H:/simnacma/CITs/regensburg_pediatric_data/data/preprocessed/constant_padding/deepfilled_cropped/'

# One path per metadata entry: value[0] is that entry's list of image
# filenames, and only its FIRST filename is used. Any further images of the
# same entry are deliberately not included here.
image_paths = [
    os.path.join(image_dir, value[0][0])
    for value in meta_data.values()
]
print(image_paths[0:5])

['/home/RDC/simnacma/H:/simnacma/CITs/regensburg_pediatric_data/data/preprocessed/constant_padding/deepfilled_cropped/61.2_App_L_surr._tissue_reaction.png', '/home/RDC/simnacma/H:/simnacma/CITs/regensburg_pediatric_data/data/preprocessed/constant_padding/deepfilled_cropped/119.2_App_M.png', '/home/RDC/simnacma/H:/simnacma/CITs/regensburg_pediatric_data/data/preprocessed/constant_padding/deepfilled_cropped/342.1_App_M.png', '/home/RDC/simnacma/H:/simnacma/CITs/regensburg_pediatric_data/data/preprocessed/constant_padding/deepfilled_cropped/774.1_App.png', '/home/RDC/simnacma/H:/simnacma/CITs/regensburg_pediatric_data/data/preprocessed/constant_padding/deepfilled_cropped/591.12_App_M.png']


In [11]:
# Load the pretrained OpenCLIP ViT-B-32 model together with its image transform.
# NOTE(review): the discarded second return value is presumably the
# training-time transform -- confirm against the open_clip documentation.
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
# The model is only used for inference in this notebook. open_clip's README
# notes that it is returned in train mode, so switch to eval mode explicitly.
# (Assigning the result keeps the cell from echoing the model repr.)
model = model.eval()

In [53]:
def process_image(image_path):
    """Return the L2-normalised image embedding for a single image file.

    Relies on the module-level ``preprocess`` transform and ``model`` created
    in an earlier cell.

    Parameters
    ----------
    image_path : str or path-like
        Location of an image file that PIL can open.

    Returns
    -------
    numpy.ndarray
        The first (and only) row of the embedding batch returned by
        ``model.encode_image``, scaled to unit L2 norm along the last axis.
    """
    # Open through a context manager so the underlying file handle is released
    # even if preprocessing raises part-way through.
    with Image.open(image_path) as img:
        # unsqueeze(0) adds the leading batch dimension of size one.
        img_preprocessed = preprocess(img).unsqueeze(0)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        embedding = model.encode_image(img_preprocessed).float()
        embedding /= embedding.norm(dim=-1, keepdim=True)

    # Drop the batch dimension again.
    return embedding.numpy()[0]

In [54]:
# Maps subject ID (string) -> embedding returned by process_image.
features = {}
# The subject ID is the run of digits at the start of the file name, up to the
# first dot (e.g. '61.2_App_....png' -> '61').
pattern = r'(\d+)\.'
for path in image_paths:
    # Match against the file name only, so that digits followed by a dot in a
    # directory component can never be mistaken for the subject ID.
    id_match = re.match(pattern, os.path.basename(path))
    if id_match is None:
        raise ValueError(f"Cannot extract a subject ID from image path: {path!r}")
    subject_id = id_match.group(1)
    features[subject_id] = process_image(path)

In [58]:
# Spot check: first component of the embedding stored under key '61'.
first_component = features['61'][0]
print(first_component)

-0.009007174


In [59]:
features_path = '/home/RDC/simnacma/H:/simnacma/CITs/regensburg_pediatric_data/data/feature_representations/preprocessed_constant_padding/features.npz'
# Create the target directory if needed, so the embeddings computed above are
# not lost to a FileNotFoundError at save time.
os.makedirs(os.path.dirname(features_path), exist_ok=True)
# Save the dictionary to a .npz file: each key of `features` becomes the name
# of one stored array.
np.savez(features_path, **features)