In [1]:
import pandas as pd

# Reading the processed collection table, using its `id` column as the dataframe index,
# and listing the available columns as a quick sanity check
ind_df = pd.read_csv(
    '../data/indigenous_collection_processed.csv',
    index_col='id',
)
print(f'Dataframe columns: \n{ind_df.columns}')

Dataframe columns: 
Index(['url', 'thumbnail', 'creation_date', 'modification_date',
       'numero_do_item', 'tripticos', 'categoria', 'nome_do_item',
       'nome_do_item_dic', 'colecao', 'coletor', 'doador', 'modo_de_aquisicao',
       'data_de_aquisicao', 'ano_de_aquisicao', 'data_de_confeccao', 'autoria',
       'nome_etnico', 'descricao', 'dimensoes', 'funcao', 'materia_prima',
       'tecnica_confeccao', 'descritor_tematico', 'descritor_comum',
       'numero_de_pecas', 'itens_relacionados', 'responsavel_guarda',
       'inst_detentora', 'povo', 'autoidentificacao', 'lingua',
       'estado_de_origem', 'geolocalizacao', 'pais_de_origem', 'exposicao',
       'referencias', 'disponibilidade', 'qualificacao', 'historia_adm',
       'notas_gerais', 'observacao', 'conservacao', 'image_path'],
      dtype='object')


In [2]:
import importlib
from IPython.core.magic import register_cell_magic

# Registering a `%%skip` cell magic: the body of a cell that starts with it is handed to
# this function as text instead of being executed
@register_cell_magic
def skip(line, cell):
    """Discard the cell without running it.

    Args:
        line: Text following `%%skip` on the magic line (ignored).
        cell: Source text of the cell body (ignored).

    Returns:
        None, so the cell produces no output.
    """
    return None

In [3]:
# Centralizing main imports so we can run the models separately
import random
from tqdm.notebook import tqdm
from PIL import Image

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms

import matplotlib.pyplot as plt
%matplotlib inline

from language_training_utils import *

# import language_training_utils
# importlib.reload(language_training_utils)
# from language_training_utils import *

# Language Clustering

Clustering experiments with text feature extractors. The idea is to fine-tune pre-trained transformer models on our dataset, then remove each model's last layer and cluster the resulting embedding-space projections.

In [4]:
from transformers import AutoModel, AutoTokenizer
from captum.attr import LayerIntegratedGradients

# Getting device (first GPU when CUDA is available, otherwise the CPU)
device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")

# Initializing model, placing it on the device, turning it into eval mode and zeroing out
# the gradients. The placement is explicit because the model is later called directly on
# inputs that were moved to `device`; `Module.to` works in place and is a no-op when the
# parameters are already there, so this is safe even if the wrapper below moves it too
model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
model.to(device)
model.eval()
model.zero_grad()

# Initializing tokenizer (same checkpoint as the model, keeping the original casing)
tokenizer = AutoTokenizer.from_pretrained(
    'neuralmind/bert-base-portuguese-cased',
    do_lower_case=False,
)

# Getting sentences' dataset and a dataloader over the whole dataset
# NOTE(review): `TextDataset` and `get_dataloaders` come from `language_training_utils`;
# presumably `max_length` is the padded/truncated token length -- confirm in that module
text_dataset = TextDataset(ind_df, tokenizer, max_length=64)
batch_size = 2
text_dataloader = get_dataloaders(text_dataset, batch_size)

# Getting the 80/10/10 train/validation/test splits; the test split takes whatever is left
# after the two truncating `int` conversions, so the three sizes always add up to the total
data_size = len(text_dataset)
train_size = int(0.8*data_size)
val_size = int(0.1*data_size)
test_size = data_size - train_size - val_size
splits = [train_size, val_size, test_size]
text_dataset_splits, text_dataloader_splits = get_dataloaders(text_dataset, batch_size, splits)

# Initializing captum compatible model and layer integrated gradients. We use layer integrated
# gradients here because we can't compute gradients with respect to (discrete) indices
# directly, so we compute gradients with respect to the embeddings of the input tokens
vanilla_wrapped_model = VanillaWrappedModel(model, tokenizer.pad_token_id, device)
vanilla_wrapped_model.eval()

lig = LayerIntegratedGradients(vanilla_wrapped_model, model.embeddings)


# Walking through the dataloader, collecting for each processed batch its indices, its [CLS]
# embeddings and its per-token attributions. The loop is cut short by the `break` at the
# bottom: it stops right after printing the first sample whose attributions are not all zero
text_indices = []
text_embeddings = []
text_attributions = []
for indices, input_ids, labels in text_dataloader:
    # Keeping track of which rows this batch holds
    text_indices.append(indices)

    # Only the token ids have to live on the device; the labels are not used here
    input_ids = input_ids.to(device)

    # Baseline for integrated gradients: a batch of the same shape made only of [PAD] ids
    # (`full_like` already allocates it on the same device as `input_ids`)
    baseline_input_ids = torch.full_like(input_ids, tokenizer.pad_token_id)

    # Attention mask (1 for real tokens, 0 for padding) and the embedding at position 0,
    # i.e. the [CLS] token. No gradients are needed for this forward pass
    attention_mask = input_ids.ne(tokenizer.pad_token_id).long()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        cls_embeddings = outputs.last_hidden_state[:, 0]
        text_embeddings.append(cls_embeddings.detach().cpu())

    # Attributions come out with one value per embedding dimension of every token, so they
    # are summed over the last (embedding) dimension to get a single score per token
    attributions, delta = lig.attribute(
        inputs=input_ids,
        baselines=baseline_input_ids,
        n_steps=50,
        return_convergence_delta=True,
    )
    attributions = attributions.sum(dim=-1).detach().cpu()
    text_attributions.append(attributions)

    # Decoding the tokens of the first sample in the batch and taking the magnitude of its
    # attributions: we care about how important a token is rather than about the sign
    sample_input_ids = input_ids[0]
    tokens = tokenizer.convert_ids_to_tokens(sample_input_ids.cpu().tolist())
    sample_attribution = torch.abs(attributions[0])

    # Nothing to normalize when every attribution is zero, so we move on to the next batch
    if sample_attribution.sum() == 0:
        continue

    # Normalizing so the scores add up to 1, which makes the output easier to interpret
    sample_attribution = sample_attribution/sample_attribution.sum()

    # Printing one line per token (the last token of the sequence is left out)
    print("Token importances:")
    for token, score in zip(tokens[:-1], sample_attribution):
        print(f"{token:10} -> {score:.4f}")

    break

# Stacking the per-batch results along the batch dimension
text_embeddings = torch.cat(text_embeddings, dim=0)
text_attributions = torch.cat(text_attributions, dim=0)
text_indices = torch.cat(text_indices, dim=0)

Token importances:
[CLS]      -> 0.0675
faca       -> 0.0079
de         -> 0.0313
material   -> 0.0319
org        -> 0.0102
##ânico    -> 0.0010
confec     -> 0.0094
##cionado  -> 0.0070
com        -> 0.0058
ponta      -> 0.0033
lance      -> 0.0088
##ola      -> 0.0024
##da       -> 0.0011
de         -> 0.0140
ta         -> 0.0139
##quara    -> 0.0513
e          -> 0.0132
cabo       -> 0.0138
de         -> 0.0084
madeira    -> 0.0112
ro         -> 0.0028
##li       -> 0.0011
##ça       -> 0.0078
.          -> 0.0029
apresenta  -> 0.0155
enga       -> 0.0120
##te       -> 0.0002
refor      -> 0.0037
##çado     -> 0.0029
com        -> 0.0105
cera       -> 0.0003
preta      -> 0.0046
revesti    -> 0.0005
##do       -> 0.0028
de         -> 0.0082
fios       -> 0.0042
de         -> 0.0092
algodão    -> 0.0136
,          -> 0.0145
arrem      -> 0.0121
##ata      -> 0.0068
##do       -> 0.0090
com        -> 0.0004
penas      -> 0.0002
vermelhas  -> 0.0001
presas     -> 0.0120
,          -> 0

In [5]:
# from transformers import AutoModelForPreTraining