In [1]:
import pandas as pd

# Reading the processed collection table, with the `id` column used as the row index
ind_df = pd.read_csv(
    '../data/indigenous_collection_processed.csv',
    index_col='id',
)

# Listing the column index as a quick check of what was loaded
print(f'Dataframe columns: \n{ind_df.columns}')

Dataframe columns: 
Index(['url', 'thumbnail', 'creation_date', 'modification_date',
       'numero_do_item', 'tripticos', 'categoria', 'nome_do_item',
       'nome_do_item_dic', 'colecao', 'coletor', 'doador', 'modo_de_aquisicao',
       'data_de_aquisicao', 'ano_de_aquisicao', 'data_de_confeccao', 'autoria',
       'nome_etnico', 'descricao', 'dimensoes', 'funcao', 'materia_prima',
       'tecnica_confeccao', 'descritor_tematico', 'descritor_comum',
       'numero_de_pecas', 'itens_relacionados', 'responsavel_guarda',
       'inst_detentora', 'povo', 'autoidentificacao', 'lingua',
       'estado_de_origem', 'geolocalizacao', 'pais_de_origem', 'exposicao',
       'referencias', 'disponibilidade', 'qualificacao', 'historia_adm',
       'notas_gerais', 'observacao', 'conservacao', 'image_path'],
      dtype='object')


In [2]:
import importlib
from IPython.core.magic import register_cell_magic

# Registering a `skip` cell magic, so that a cell can be kept in the notebook without being run
@register_cell_magic
def skip(line, cell):
    """Do nothing with the cell: both the magic line and the cell body are discarded.

    Args:
        line: text that follows the magic name on the first line of the cell (unused).
        cell: source of the cell body (unused, and never executed here).

    Returns:
        None.
    """
    return None

In [3]:
# Centralizing main imports so we can run the models separately
import random
from tqdm.notebook import tqdm
from PIL import Image

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms

import matplotlib.pyplot as plt

from language_training_utils import *

# import language_training_utils
# importlib.reload(language_training_utils)
# from language_training_utils import *

# Language Clustering

Clustering experiments with text feature extractors. The idea is to fine-tune pre-trained transformer models on our dataset and then remove each model's last layer, so that the items can be clustered using their projections in the embedding space.

In [43]:
from transformers import AutoModel, AutoTokenizer
# from transformers import AutoModelForPreTraining
from captum.attr import LayerIntegratedGradients

# Picking the first CUDA device when one is available, and the CPU otherwise
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device("cpu")

# Loading the pre-trained encoder, moving it to the device and switching it to eval mode.
# Gradients are cleared before any attribution is computed.
model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
model = model.to(device).eval()
model.zero_grad()

# Loading the matching tokenizer, with lower-casing explicitly disabled
tokenizer = AutoTokenizer.from_pretrained(
    'neuralmind/bert-base-portuguese-cased',
    do_lower_case=False,
)

# Getting the sentence encoding as PyTorch tensors and keeping only the token ids
sentence = 'Haverá uma festa amanhã'
inputs = tokenizer(sentence, return_tensors='pt')
input_ids = inputs['input_ids'].to(device)

# Building the baseline: a tensor shaped like the input ids, filled with the padding token id
baseline_input_ids = torch.full_like(input_ids, tokenizer.pad_token_id, device=device)

# Captum compatible wrapper model. Notice that we need to use a scalar representation for the
# [CLS] token so we can evaluate how it varies with the other tokens (integrated gradients, as
# the name suggests). In this case, we decided to go for the norm of the embedding vector
class WrappedModel(nn.Module):
    """Expose an encoder as a module that returns one scalar per input sequence.

    The scalar is the L2 norm of the hidden state at the first position of each sequence.

    Args:
        model: callable taking `input_ids` and `attention_mask` keyword arguments and
            returning an object with a `last_hidden_state` attribute.
        pad_token_id: token id treated as padding when the attention mask is built.
    """

    def __init__(self, model, pad_token_id):
        super().__init__()
        self.model = model
        self.pad_token_id = pad_token_id

    def forward(self, input_ids):
        """Return the norm of the first-position hidden state for each row of `input_ids`.

        Args:
            input_ids: tensor of token ids, indexed as (sequence, position).

        Returns:
            Tensor with one value per row of `input_ids`.
        """
        # The comparison result already lives on the same device as `input_ids`, so the mask
        # needs no explicit transfer and the class does not depend on a global `device`
        attention_mask = input_ids != self.pad_token_id
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # Hidden state at position 0 of every sequence, reduced over the embedding axis
        cls_embedding = outputs.last_hidden_state[:, 0]
        cls_scalar = cls_embedding.norm(p=2, dim=1)
        return cls_scalar

# Wrapping the encoder so that captum gets a module returning one scalar per sentence
wrapped_model = WrappedModel(model, tokenizer.pad_token_id).to(device)

# Layer integrated gradients, taken with respect to the output of the embedding layer
lig = LayerIntegratedGradients(wrapped_model, model.embeddings)

# Computing the attributions between the padding baseline and the real sentence. The
# convergence delta is requested as well, although it is not used in this cell.
attributions, delta = lig.attribute(
    inputs=input_ids,
    baselines=baseline_input_ids,
    n_steps=50,
    return_convergence_delta=True,
)

# Attributions come out for every dimension of the tokens' embeddings, so they are summed over
# the embedding axis in order to identify tokens individually
attributions = attributions.sum(dim=-1)

# Decoding the token strings and bringing the per-token attributions to numpy
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
attribs = attributions.squeeze().detach().cpu().numpy()

# Normalizing to make the output easier to interpret: we are more interested in the absolute
# importance of the tokens than in their sign, so magnitudes are rescaled to sum to 1
attribs = np.abs(attribs)
attribs /= attribs.sum()

# NOTE(review): the listing leaves out the last token (presumably [SEP]), but its score is still
# part of the normalization above, so the printed values do not add up to 1 - confirm intent
print("Token importances:")
for token, score in zip(tokens[:-1], attribs):
    print(f"{token:10} -> {score:.4f}")

Token importances:
[CLS]      -> 0.0375
Ha         -> 0.0529
##ver      -> 0.0526
##á        -> 0.1022
uma        -> 0.0885
festa      -> 0.1248
aman       -> 0.0471
##hã       -> 0.0980
