In [1]:
import pandas as pd

# Loading dataset
ind_df = pd.read_csv('data/indigenous_collection_processed.csv', index_col='id')
print(f'Dataframe columns: \n{ind_df.columns}')

Dataframe columns: 
Index(['url', 'thumbnail', 'creation_date', 'modification_date',
       'numero_do_item', 'tripticos', 'categoria', 'nome_do_item',
       'nome_do_item_dic', 'colecao', 'coletor', 'doador', 'modo_de_aquisicao',
       'data_de_aquisicao', 'ano_de_aquisicao', 'data_de_confeccao', 'autoria',
       'nome_etnico', 'descricao', 'dimensoes', 'funcao', 'materia_prima',
       'tecnica_confeccao', 'descritor_tematico', 'descritor_comum',
       'numero_de_pecas', 'itens_relacionados', 'responsavel_guarda',
       'inst_detentora', 'povo', 'autoidentificacao', 'lingua',
       'estado_de_origem', 'geolocalizacao', 'pais_de_origem', 'exposicao',
       'referencias', 'disponibilidade', 'qualificacao', 'historia_adm',
       'notas_gerais', 'observacao', 'conservacao', 'image_path'],
      dtype='object')


In [2]:
from IPython.core.magic import register_cell_magic

# Creating skip cell command
@register_cell_magic
def skip(line, cell):
    return

# Image Clustering

Clustering experiments with image feature extractors. The idea is to fine-tune some pre-trained models on our dataset and then remove the last layer of the model to cluster on the embedding space projections.

## Dataset Preparation

For fine-tuning the model on our dataset, we are going to try a few different labels and study how they affect the generated emebdding space. For now, we focus *povo* and *categoria*.

In [3]:
from PIL import Image

# Filtering out corrupted images
corrupted_images = []
for index, row in ind_df.loc[ind_df['image_path'].notna()].iterrows():
    try:
        Image.open(row['image_path'])
    except Exception as e:
        corrupted_images.append(row['image_path'])
        ind_df.loc[index, 'image_path'] = pd.NA
print(f'{len(corrupted_images)} corrupted images')

# Creating 'image_path_br' column
ind_df['image_path_br'] = ind_df['image_path'].values
ind_df.loc[ind_df['image_path_br'].notna(), 'image_path_br'] = \
    ind_df.loc[ind_df['image_path_br'].notna(), \
               'image_path'].apply(lambda path: \
                                   f"data/br_images/{path.split('/')[-1].split('.')[0]}.png")

1 corrupted images


In [4]:
import torch
from torchvision import transforms
from training_utils import preparing_image_labels, ImageDataset

# Getting the proper device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building dataset for column 'povo' (though no specific column is used on off-the-shelf model)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])
labels, _ = preparing_image_labels(ind_df, 'povo')
dataset = ImageDataset("data/br_images/", labels, transform=transform)

## ViT Base Patch-16

### Pre-trained Embedding Space

In [5]:
# Projecting data onto the off-the-shelf pre-trained embedding space from ViT
from tqdm import tqdm
import numpy as np
from torch.utils.data import DataLoader
from transformers import ViTImageProcessor, ViTModel
from training_utils import get_vit_embeddings, data_projections

# Loading model
model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
model.to(device)

# Getting data
dataloader = DataLoader(dataset, batch_size=512, shuffle=True, num_workers=0, pin_memory=True)

# Computing image embeddings
image_embeddings = np.concatenate(get_vit_embeddings(model, dataloader, device), axis=0)

# Computing data projection
vanilla_vit_trimap, vanilla_vit_tsne, vanilla_vit_umap = data_projections(image_embeddings)

Computing embeddings: 100%|████| 23/23 [02:42<00:00,  7.08s/it]


In [6]:
from training_utils import clean_mem

# Cleaning up memory
clean_mem([model, image_embeddings])

### Fine-tuning Embedding Space

In [7]:
# Creating our own ViT classifier head for fine-tuning
import torch.nn as nn

class ViTClassifier(nn.Module):
    def __init__(self, num_classes):
        super(ViTClassifier, self).__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.classifier = nn.Linear(self.vit.config.hidden_size, num_classes)

    def forward(self, x):
        outputs = self.vit(x)
        
        # Do I get the last_hidden_state of CLS token or the pooler_output?
        embeddings = outputs['last_hidden_state'][:, 0, :]
        # embeddings = outputs['pooler_output']

        logits = self.classifier(embeddings)
        return logits

In [30]:
# LET'S THINK ABOUT CLASS IMBALANCE?
categories = {}
for l in labels.values():
    try:
        categories[l] += 1
    except:
        categories[l] = 1
categories_freq = np.array(list(categories.values()))
print(f'Mean: {np.mean(categories_freq)}')
print(f'Median: {np.quantile(categories_freq, 0.98)}')
print(f'Categories with more than the median {np.sum(categories_freq > 65)}')

Mean: 73.20779220779221
Median: 423.3799999999997


np.int64(39)

In [26]:
ind_df.groupby('categoria').count()

Unnamed: 0_level_0,url,thumbnail,creation_date,modification_date,numero_do_item,tripticos,nome_do_item,nome_do_item_dic,colecao,coletor,...,exposicao,referencias,disponibilidade,qualificacao,historia_adm,notas_gerais,observacao,conservacao,image_path,image_path_br
categoria,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"adornos de materiais ecléticos, indumentária e toucador",4510,2988,4510,4510,4510,3279,4510,4347,4286,1059,...,391,4502,4263,563,485,1519,2990,4375,2988,2988
adornos plumários,2082,585,2082,2082,2081,1375,2082,1945,1914,521,...,23,2080,1631,57,179,436,1129,2047,584,584
armas,2318,237,2318,2318,2318,994,2318,2281,2074,494,...,0,2317,1481,88,71,311,871,2303,237,237
cerâmica,2742,2191,2742,2742,2742,940,2742,2651,2040,923,...,48,2742,2666,377,56,259,738,2700,2191,2191
cordões e tecidos,1995,1268,1995,1995,1995,865,1995,1818,1907,498,...,33,1993,1851,138,111,642,965,1957,1268,1268
etnobotânica,98,33,98,98,98,53,98,46,88,27,...,0,98,75,17,6,23,50,92,33,33
instrumentos musicais e de sinalização,900,482,900,900,900,479,900,870,838,196,...,15,899,787,57,68,127,369,894,482,482
"objetos rituais, mágicos e lúdicos",2221,1370,2221,2221,2220,904,2221,1882,2051,362,...,48,2221,2064,93,118,293,1016,2192,1370,1370
trançados,2584,1166,2584,2584,2584,1347,2584,2501,2455,729,...,30,2576,2329,247,145,384,1174,2552,1166,1166
utensílios e implementos de materiais ecléticos,1515,955,1515,1515,1515,755,1515,1251,1401,448,...,6,1515,1366,65,166,217,705,1489,955,955


#### *povo* Column

In [None]:
from training_utils import get_train_val_split, train_loop, plot_train_curves 
from torch.utils.data import random_split
import torch.optim as optim

# Creating training and validation datasets
train_size = int(0.85*len(dataset))
batch_size = 32
train_dataloader, val_dataloader = get_train_val_split(dataset, train_size, batch_size)

# Training set-up and execution for 'povo'
num_classes = ind_df['povo'].nunique()
model = ViTClassifier(num_classes).to(device)
criterion = nn.CrossEntropyLoss()
opt = optim.Adam(model.parameters(), lr=5e-5, weight_decay=0)
epochs = 30

losses, accuracies, class_precisions, class_recalls = train_loop(model, num_classes, \
                                                                 train_dataloader, \
                                                                 val_dataloader, device, \
                                                                 criterion, opt, \
                                                                 'vit_povo', epochs)
plot_train_curves(losses, accuracies, "ViT Fine-Tuned on 'povo'")
print(f'Per class precision: {class_precisions[-1]}')
print(f'Per class recall: {class_recalls[-1]}')

# Computing image embeddings
model.classifier = nn.Identity()
image_embeddings = np.concatenate(get_vit_embeddings(model, dataloader, device, True), axis=0)

# Computing data projection
povo_vit_trimap, povo_vit_tsne, povo_vit_umap = data_projections(image_embeddings)

#### *categoria* Column

In [None]:
# Cleaning up memory
clean_mem([model, image_embeddings])

# Preparing dataset for next training process
labels, name_to_num = preparing_image_labels(ind_df, 'categoria')
dataset = ImageDataset("data/br_images/", labels, transform=transform)

train_size = int(0.85*len(dataset))
batch_size = 32
train_dataloader, val_dataloader = get_train_val_split(dataset, train_size, batch_size)

# Training set-up and execution for 'categoria'
num_classes = ind_df['categoria'].nunique()
model = ViTClassifier(num_classes).to(device)
criterion = nn.CrossEntropyLoss()
opt = optim.Adam(model.parameters(), lr=2e-5, weight_decay=0)
epochs = 30

losses, accuracies, class_precisions, class_recalls = train_loop(model, num_classes, \
                                                                 train_dataloader, \
                                                                 val_dataloader, device, \
                                                                 criterion, opt, \
                                                                 'vit_categoria', epochs)
plot_train_curves(losses, accuracies, "ViT Fine-Tuned on 'categoria'")
print(f'Per class precision: {class_precisions[-1]}')
print(f'Per class recall: {class_recalls[-1]}')

# Computing image embeddings
model.classifier = nn.Identity()
image_embeddings = np.concatenate(get_vit_embeddings(model, dataloader, device, True), axis=0)

# Computing data projection
categoria_vit_trimap, categoria_vit_tsne, \
categoria_vit_umap = data_projections(image_embeddings)

In [None]:
# Cleaning up memory
clean_mem([model, image_embeddings])

### Visualizing and Comparing Projections

In [None]:
from training_utils import normalize

# Normalizing data for later plot on tool
norm_factor = 12
vanilla_vit_trimap = normalize(vanilla_vit_trimap, norm_factor)
vanilla_vit_tsne = normalize(vanilla_vit_tsne, norm_factor)
vanilla_vit_umap = normalize(vanilla_vit_umap, norm_factor)

povo_vit_trimap = normalize(povo_vit_trimap, norm_factor)
povo_vit_tsne = normalize(povo_vit_tsne, norm_factor)
povo_vit_umap = normalize(povo_vit_umap, norm_factor)

categoria_vit_trimap = normalize(categoria_vit_trimap, norm_factor)
categoria_vit_tsne = normalize(categoria_vit_tsne, norm_factor)
categoria_vit_umap = normalize(categoria_vit_umap, norm_factor)

In [None]:
# Visualizing resulting projections
plt.figure(figsize=(12,8))
plt.suptitle('Comparing Projections of ViT Models')

# Plotting vanilla ViT projections
for i, (vanilla_vit, proj_name) in enumerate(zip([vanilla_vit_trimap, \
                                                  vanilla_vit_tsne, vanilla_vit_umap], \
                                                 ['TriMap', 't-SNE', 'UMAP'])):
    plt.subplot(3, 3, i+1)
    plt.scatter(vanilla_vit[:, 0], vanilla_vit[:, 1], c='b')
    plt.title("Vanilla ViT with " + proj_name)
    plt.xlabel("")
    plt.ylabel("")
    plt.xticks([])
    plt.yticks([])

# Plotting ViT fine-tuned on 'povo' projections
for i, (povo_vit, proj_name) in enumerate(zip([povo_vit_trimap, \
                                               povo_vit_tsne, povo_vit_umap], \
                                              ['TriMap', 't-SNE', 'UMAP'])):
    plt.subplot(3, 3, i+4)
    plt.scatter(povo_vit[:, 0], povo_vit[:, 1], c='r')
    plt.title("ViT Fine-Tuned on 'povo' with " + proj_name)
    plt.xlabel("")
    plt.ylabel("")
    plt.xticks([])
    plt.yticks([])

# Plotting ViT fine-tuned on 'categoria' projections
for i, (categoria_vit, proj_name) in enumerate(zip([categoria_vit_trimap, \
                                                    categoria_vit_tsne, categoria_vit_umap], \
                                                   ['TriMap', 't-SNE', 'UMAP'])):
    plt.subplot(3, 3, i+7)
    plt.scatter(categoria_vit[:, 0], categoria_vit[:, 1], c='g')
    plt.title("ViT Fine-Tuned on 'categoria' with " + proj_name)
    plt.xlabel("")
    plt.ylabel("")
    plt.xticks([])
    plt.yticks([])

plt.tight_layout()
plt.show()

### Visualizing Clusters

In [None]:
# Filtering dataframe to get only the part that contains images
filtered_df = ind_df.loc[ind_df['image_path'].notna()]

# Building colormap for cluster visualization
column = 'categoria' # 'povo', 'categoria', 'ano_de_aquisicao'
unique_values = filtered_df[column].unique()
colors = plt.cm.gnuplot(np.linspace(0, 1, len(unique_values)))
color_dict = {cluster: colors[i] for i, cluster in enumerate(unique_values)}

# Plotting projections with clusters
plt.figure(figsize=(10,4))

for cluster in unique_values:
    mask = filtered_df.index[filtered_df[column] == cluster].tolist()
    sequential_indices = np.array([filtered_df.index.get_loc(idx) for idx in mask])
    plt.scatter(categoria_vit_umap[sequential_indices, 0], \
                categoria_vit_umap[sequential_indices, 1], 
                color=color_dict[cluster], label=f"{cluster.title()}", alpha=0.7)

plt.title(f"Visualizing Clusters for Categoria on UMAP Projection")
plt.xlabel("")
plt.ylabel("")
plt.xticks([])
plt.yticks([])
plt.legend(title="Clusters", bbox_to_anchor=(1.05, 1), loc="upper left", \
           fontsize=8, frameon=True)

plt.tight_layout()
plt.show()

In [44]:
for a, b, c in dataloader:
    print(c)
    break

ValueError: not enough values to unpack (expected 3, got 2)

In [40]:
# Saving outputs for visualization tool
# print(labels)
mask1 = filtered_df.index[filtered_df[column] == unique_values[5]].tolist()
mask2 = filtered_df.index[filtered_df[column] == unique_values[8]].tolist()

sequence1 = np.array([filtered_df.index.get_loc(idx) for idx in mask1])
sequence2 = np.array([filtered_df.index.get_loc(idx) for idx in mask2])

print(len(set(sequence1).union(set(sequence2))) == len(sequence1)+len(sequence2))

True
