In [1]:
import pandas as pd

# Reading the processed collection, using the `id` column as the dataframe index
dataset_path = '../data/indigenous_collection_processed.csv'
ind_df = pd.read_csv(dataset_path, index_col='id')
print(f'Dataframe columns: \n{ind_df.columns}')

Dataframe columns: 
Index(['url', 'thumbnail', 'creation_date', 'modification_date',
       'numero_do_item', 'tripticos', 'categoria', 'nome_do_item',
       'nome_do_item_dic', 'colecao', 'coletor', 'doador', 'modo_de_aquisicao',
       'data_de_aquisicao', 'ano_de_aquisicao', 'data_de_confeccao', 'autoria',
       'nome_etnico', 'descricao', 'dimensoes', 'funcao', 'materia_prima',
       'tecnica_confeccao', 'descritor_tematico', 'descritor_comum',
       'numero_de_pecas', 'itens_relacionados', 'responsavel_guarda',
       'inst_detentora', 'povo', 'autoidentificacao', 'lingua',
       'estado_de_origem', 'geolocalizacao', 'pais_de_origem', 'exposicao',
       'referencias', 'disponibilidade', 'qualificacao', 'historia_adm',
       'notas_gerais', 'observacao', 'conservacao', 'image_path'],
      dtype='object')


In [2]:
import importlib
from IPython.core.magic import register_cell_magic

# Registering a `%%skip` cell magic
@register_cell_magic
def skip(line, cell):
    """Cell magic that discards its input.

    Args:
        line: text following `%%skip` on the magic line (ignored).
        cell: source of the cell body (ignored, never executed).

    Returns:
        None, so the cell produces no output.
    """
    return None

In [3]:
# Centralizing main imports so we can run the models separately
import random
from tqdm.notebook import tqdm
from PIL import Image

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms

import matplotlib.pyplot as plt
%matplotlib inline

from language_training_utils import *

# import language_training_utils
# importlib.reload(language_training_utils)
# from language_training_utils import *

# Language Clustering

Clustering experiments with text feature extractors. The idea is to fine-tune pre-trained transformer models on our dataset, remove the last layer of each model, and cluster the items in the resulting embedding space.

## BERTimbau

### Vanilla Model

In [5]:
from transformers import AutoModel, AutoTokenizer
from captum.attr import LayerIntegratedGradients

# Getting device
device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")

# Initializing model, turning it into eval mode and zeroing out the gradients.
# The checkpoint name is kept in one place so model and tokenizer cannot diverge
model_name = 'neuralmind/bert-base-portuguese-cased'
model = AutoModel.from_pretrained(model_name)
model = model.to(device)
model.eval()
model.zero_grad()

# Initializing tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)

# Getting sentences' dataset, dataloader and splits. Only rows that actually have a
# description are passed to the dataset, and the same max_length is used for the dataset
# and for the baseline built below
text_ind_df = ind_df[ind_df['descricao'].notna()]
max_length = 64
text_dataset = TextDataset(text_ind_df, tokenizer, max_length=max_length)
batch_size = 3
text_dataloader = get_dataloaders(text_dataset, batch_size)

# 80/10/10 train/val/test split; the test split takes the remainder so sizes add up
data_size = len(text_dataset)
train_size = int(0.8*data_size)
val_size = int(0.1*data_size)
test_size = data_size - train_size - val_size
splits = [train_size, val_size, test_size]
text_dataset_splits, text_dataloader_splits = get_dataloaders(text_dataset, batch_size, splits)

# Initializing baseline input_ids and attention_mask, and computing its embedding.
# The baseline is "[CLS] [PAD] ... [PAD] [SEP]" with attention only on [CLS] and [SEP].
# Tensors are created directly on the target device with an explicit integer dtype
baseline_input_ids = torch.full((1, max_length), tokenizer.pad_token_id,
                                dtype=torch.long, device=device)
baseline_input_ids[:, 0] = tokenizer.cls_token_id
baseline_input_ids[:, -1] = tokenizer.sep_token_id
baseline_attention_mask = torch.zeros_like(baseline_input_ids)
baseline_attention_mask[:, 0] = 1
baseline_attention_mask[:, -1] = 1
with torch.no_grad():
    baseline_outputs = model(input_ids=baseline_input_ids,
                             attention_mask=baseline_attention_mask)
    # Hidden state at position 0, i.e. the [CLS] token placed there above
    baseline_embedding = baseline_outputs.last_hidden_state[:, 0]

# Initializing captum compatible model and layer integrated gradients. We use layer integrated
# gradients here because we can't compute gradients with respect to (discrete) indices
# directly, so we compute gradients with respect to the embeddings of the input tokens
vanilla_wrapped_model = VanillaWrappedModel(model, tokenizer.pad_token_id,
                                            baseline_embedding, device, target_type='cos-sim')
vanilla_wrapped_model.eval()

lig = LayerIntegratedGradients(vanilla_wrapped_model, model.embeddings)

In [7]:
# Iterating through the dataloader to compute embeddings and token attributions.
# Only the first `max_batches` batches are processed (a quick run that prints the token
# importances of one sample); set `max_batches = None` to go through the whole dataloader
max_batches = 1

text_indices = []
text_embeddings = []
text_attributions = []
for batch_num, (indices, input_ids, labels) in enumerate(tqdm(text_dataloader), start=1):
    # Saving indices
    text_indices.append(indices)

    # Moving appropriate tensors to device (labels are not used in this cell)
    input_ids = input_ids.to(device)

    # Computing [CLS] token embeddings
    cls_embeddings = get_embeddings(model, tokenizer, input_ids, device)
    text_embeddings.append(cls_embeddings.cpu().detach())

    # Computing attributions
    attributions, delta = get_attributions(lig, tokenizer, input_ids, baseline_input_ids,
                                           attrib_aggreg_type='l2-norm', verbose=True,
                                           sample_num=0)
    text_attributions.append(attributions)

    # Stopping once the requested number of batches has been processed; checked at the end
    # of the iteration so no extra batch is fetched from the dataloader
    if max_batches is not None and batch_num >= max_batches:
        break

# Concatenating the batches
text_embeddings = torch.cat(text_embeddings, dim=0)
text_attributions = torch.cat(text_attributions, dim=0)
text_indices = torch.cat(text_indices, dim=0)

  0%|                                 | 0/6989 [00:00<?, ?it/s]

Token importances:
faca                 -> 0.0812
de                   -> 0.0394
material             -> 0.0217
orgânico             -> 0.0357
confeccionado        -> 0.0236
com                  -> 0.0080
ponta                -> 0.0117
lanceolada           -> 0.0295
de                   -> 0.0081
taquara              -> 0.0209
e                    -> 0.0076
cabo                 -> 0.0099
de                   -> 0.0073
madeira              -> 0.0084
roliça               -> 0.0213
.                    -> 0.0098
apresenta            -> 0.0125
engate               -> 0.0181
reforçado            -> 0.0146
com                  -> 0.0059
cera                 -> 0.0073
preta                -> 0.0070
revestido            -> 0.0152
de                   -> 0.0076
fios                 -> 0.0078
de                   -> 0.0068
algodão              -> 0.0085
,                    -> 0.0074
arrematado           -> 0.0308
com                  -> 0.0075
penas                -> 0.0095
vermelhas           




In [None]:
# from transformers import AutoModelForPreTraining

### SimCSE (Contrastive Learning with no Labels)