In [1]:
import os

import pandas as pd
import torch
from PIL import Image
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer, ViTFeatureExtractor, ViTImageProcessor, ViTModel

In [None]:
# df_train = pd.read_pickle("..\\data\\processed\\df_train.pkl")
# df_test = pd.read_pickle("..\\data\\processed\\df_test.pkl")

# The interim file has a .pkl extension (like the pickled frames above), so it is
# loaded with read_pickle; read_csv would try to parse the pickle bytes as CSV text.
# os.path.join keeps the relative path valid where "\\" is not a path separator.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
df = pd.read_pickle(
    os.path.join("..", "data", "interim", "train_interim_filtered_singlelabel_selectedcuis.pkl")
)


# Text embedding

### BERT


In [7]:
# Load the pretrained BERT tokenizer and encoder once; both come from the same checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)
model.eval()  # evaluation mode

def get_bert_embeddings(texts, tokenizer, model, batch_size=64, split="train", output_file='rocov2_captions_embeddings.pt'):
    """Embed texts with attention-masked mean pooling and save the stacked result.

    Each batch is tokenized with padding, so the average over token positions
    is weighted by the attention mask: padding positions are excluded and the
    embedding of a text does not depend on which other texts share its batch.

    Args:
        texts: Non-empty, sliceable sequence of strings (e.g. a list).
        tokenizer: Callable applied to each batch; its result must support
            ``.to(device)`` and provide an ``"attention_mask"`` entry.
        model: Encoder whose output exposes ``last_hidden_state``. It is moved
            to CUDA when available, otherwise to CPU.
        batch_size: Number of texts per forward pass.
        split: Label shown in the progress-bar description.
        output_file: Path the embeddings tensor is written to with ``torch.save``.

    Returns:
        CPU tensor with one row per input text, in the order of ``texts``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_texts = len(texts)
    embeddings = []
    # Move model to the selected device
    model.to(device)
    # The context manager closes the progress bar even if a batch raises
    with tqdm(total=num_texts, desc=f"Embedding texts {split}", unit="texts") as pbar:
        for i in range(0, num_texts, batch_size):
            batch_texts = texts[i:i+batch_size]
            # Tokenize batch of texts (shorter texts are padded to the longest one)
            inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
            # Process batch with the model
            with torch.no_grad():
                outputs = model(**inputs)
            # Masked mean pooling: zero out padding positions, then divide by the
            # number of real tokens instead of the padded sequence length
            hidden = outputs.last_hidden_state
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            batch_embeddings = (hidden * mask).sum(dim=1) / token_counts
            # Accumulate on the CPU so results do not pile up in GPU memory and
            # the saved file can be loaded on machines without CUDA
            embeddings.append(batch_embeddings.cpu())
            # Update progress bar
            pbar.update(len(batch_texts))
    # Concatenate embeddings of all batches
    embeddings = torch.cat(embeddings, dim=0)
    # Save embeddings directly as a tensor
    torch.save(embeddings, output_file)
    print(f"Embeddings saved to {output_file}")

    return embeddings

In [14]:
# text_embeddings_train = get_bert_embeddings(df_train["Caption"].tolist(),tokenizer,model,split="train", output_file="..\\data\\processed\\rocov2_captions_embeddings_train.pt")
# text_embeddings_test = get_bert_embeddings(df_test["Caption"].tolist(),tokenizer,model, split="test", output_file="..\\data\\processed\\rocov2_captions_embeddings_test.pt")

# Embed the captions of the filtered training frame. The output path is built with
# os.path.join so the cell also works where "\\" is not a path separator.
text_embeddings = get_bert_embeddings(
    df["Caption"].tolist(),
    tokenizer,
    model,
    split="train_filtered_vec",
    output_file=os.path.join("..", "data", "interim", "rocov2_captions_embeddings_train_filtered_vec.pt"),
)

Embedding texts train_filtered_vec:   0%|          | 0/2227 [00:00<?, ?texts/s]

Embeddings saved to ..\data\interim\rocov2_captions_embeddings_train_filtered_vec.pt


# Image embedding

### ViT

In [None]:
# Load the pretrained ViT encoder together with its matching image preprocessor.
model_name = "google/vit-base-patch16-224-in21k"  # or google/vit-base-patch16-224
# ViTFeatureExtractor is deprecated in favour of ViTImageProcessor. The variable keeps
# its name because the embedding cell below passes `feature_extractor` as the processor.
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
vit_model = ViTModel.from_pretrained(model_name)
vit_model.eval()  # evaluation mode
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vit_model.to(device)

def get_vit_embeddings(image_paths, model, processor, device='cuda', batch_size=32, split="train", output_file=os.path.join("..", "data", "processed", "rocov2_image_embeddings_train.pt"), base_dir=os.path.join("..", "data", "raw", "train")):
    """Embed images with a ViT-style encoder and save the stacked first-token states.

    Args:
        image_paths: Non-empty, sliceable sequence of image file names,
            resolved relative to ``base_dir``.
        model: Encoder whose output exposes ``last_hidden_state``; it is moved
            to the resolved device.
        processor: Image preprocessor called as
            ``processor(images=..., return_tensors="pt")``; it must return a
            mapping of tensors accepted by ``model(**inputs)``.
        device: Device to run on. A CUDA request falls back to the CPU when
            CUDA is not available instead of raising.
        batch_size: Number of images per forward pass.
        split: Label shown in the progress-bar description.
        output_file: Path the embeddings tensor is written to with ``torch.save``.
        base_dir: Directory that contains the image files.

    Returns:
        CPU tensor with one row per image, in the order of ``image_paths``.
    """
    # Resolve the device once and keep model and inputs on the same one
    device = torch.device(device)
    if device.type == "cuda" and not torch.cuda.is_available():
        device = torch.device("cpu")
    model.to(device)

    all_emb = []
    num_images = len(image_paths)
    # The context manager closes the progress bar even if a batch raises
    with tqdm(total=num_images, desc=f"Embedding images ({split})", unit="images") as pbar:
        for i in range(0, num_images, batch_size):
            batch_paths = image_paths[i:i+batch_size]
            images = []
            for path in batch_paths:
                # Close each file handle as soon as the pixels are converted
                with Image.open(os.path.join(base_dir, path)) as img:
                    images.append(img.convert("RGB"))

            # padding/truncation are tokenizer options, so they are not passed
            # to the image processor
            inputs = processor(images=images, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)

            # Keep only the hidden state at sequence position 0 (the [CLS] token for ViT)
            cls_tokens = outputs.last_hidden_state[:, 0, :]
            all_emb.append(cls_tokens.cpu())
            pbar.update(len(batch_paths))

    # Combine embeddings in one tensor
    full_embeddings = torch.cat(all_emb, dim=0)

    # Save file
    torch.save(full_embeddings, output_file)

    return full_embeddings



In [15]:
# image_embeddings_train = get_vit_embeddings(df_train["image_path"].tolist(), vit_model, feature_extractor, split="train", output_file="..\\data\\processed\\rocov2_image_embeddings_train.pt")
# image_embeddings_test = get_vit_embeddings(df_test["image_path"].tolist(), vit_model, feature_extractor, split="test", output_file="..\\data\\processed\\rocov2_image_embeddings_test.pt")

# Embed the images of the filtered training frame. `device` is passed explicitly so the
# inputs go to the device vit_model was moved to rather than the function's 'cuda' default.
# The output path is built with os.path.join so it is not tied to "\\" separators.
image_embeddings = get_vit_embeddings(
    df["Image"].tolist(),
    vit_model,
    feature_extractor,
    device=device,
    split="train_filtered_vec",
    output_file=os.path.join("..", "data", "interim", "rocov2_image_embeddings_train_filtered_vec.pt"),
)

Embedding images (train_filtered_vec):   0%|          | 0/2227 [00:00<?, ?images/s]

  return self.preprocess(images, **kwargs)
