In [1]:
import pandas as pd

# Read the processed collection CSV, using its 'id' column as the row index,
# then list the available columns
ind_df = pd.read_csv(
    'data/indigenous_collection_processed.csv',
    index_col='id',
)
print(f'Dataframe columns: \n{ind_df.columns}')

Dataframe columns: 
Index(['url', 'thumbnail', 'creation_date', 'modification_date',
       'numero_do_item', 'tripticos', 'categoria', 'nome_do_item',
       'nome_do_item_dic', 'colecao', 'coletor', 'doador', 'modo_de_aquisicao',
       'data_de_aquisicao', 'ano_de_aquisicao', 'data_de_confeccao', 'autoria',
       'nome_etnico', 'descricao', 'dimensoes', 'funcao', 'materia_prima',
       'tecnica_confeccao', 'descritor_tematico', 'descritor_comum',
       'numero_de_pecas', 'itens_relacionados', 'responsavel_guarda',
       'inst_detentora', 'povo', 'autoidentificacao', 'lingua',
       'estado_de_origem', 'geolocalizacao', 'pais_de_origem', 'exposicao',
       'referencias', 'disponibilidade', 'qualificacao', 'historia_adm',
       'notas_gerais', 'observacao', 'conservacao', 'image_path'],
      dtype='object')


In [2]:
from IPython.core.magic import register_cell_magic

# Creating skip cell command
@register_cell_magic
def skip(line, cell):
    """Cell magic ``%%skip``: do nothing with the cell it is applied to.

    Both arguments are supplied by the magic machinery (the text after the
    magic name and the cell body); neither is used, so the cell body is
    never evaluated.
    """
    return None

## Baseline Clustering

### Random Orthogonal Projection

For this step, we are going to use a few categorical features of the data. Since we don't have any kind of taxonomy on the categories of each feature, the approach will be to use **random orthogonal projections** so that the categories of a feature are equidistant in the embedding space. This way, we keep the representation unbiased: no category is arbitrarily placed closer to some categories than to others.

Selected features (*italic features are being evaluated*):
- categoria (10 categories)
- *ano_de_confeccao (75 categories)*
- tipo_materia_prima (4 categories)
- *povo (187 categories)*
- *lingua (41 categories)*
- estado_de_origem (27 categories)

In [3]:
import ast

# Creating feature 'tipo_materia_prima'

# Labels of the material groups, in the positional order in which a parsed
# 'materia_prima' value lists them (group 0 -> 'animal', ..., group 3 -> 'sintetico').
MATERIA_PRIMA_LABELS = ('animal', 'vegetal', 'mineral', 'sintetico')


def classify_materia_prima(raw):
    """Return the material-type labels present in one 'materia_prima' cell.

    Parameters
    ----------
    raw : str or float
        Either the Python-literal text of a sequence of material groups
        (one group per label in ``MATERIA_PRIMA_LABELS``, same order), or a
        float, which is how a missing cell shows up here (NaN).

    Returns
    -------
    list of str
        The labels whose group is non-empty, in ``MATERIA_PRIMA_LABELS``
        order. Empty for a missing cell.
    """
    # Missing cells are floats (NaN); there is nothing to parse for them.
    if isinstance(raw, float):
        return []

    material_groups = ast.literal_eval(raw)
    # Pair each group with its label positionally; groups beyond the known
    # labels are ignored, and a label without a group counts as absent.
    return [
        label
        for label, group in zip(MATERIA_PRIMA_LABELS, material_groups)
        if len(group) > 0
    ]


# Only one column is needed, so iterate over it directly instead of building
# a full row Series per item with iterrows().
tipo_materia_prima = [classify_materia_prima(raw) for raw in ind_df['materia_prima']]

ind_df['tipo_materia_prima'] = tipo_materia_prima
ind_df['tipo_materia_prima']

id
55663    [animal, vegetal]
55668            [vegetal]
55673                   []
55678                   []
55688                   []
               ...        
41913            [vegetal]
41918                   []
41923            [vegetal]
41928                   []
41933            [vegetal]
Name: tipo_materia_prima, Length: 20965, dtype: object

In [4]:
import numpy as np

# Fixing experiments for testing: seed NumPy's global RNG so that the
# np.random draws made further down in this cell are the same on every
# fresh run
np.random.seed(42)

# Generating random orthonormal projection (equidistant categorical representation)
def rand_ortho_mat(k):
    """Return a random ``(k, k-1)`` embedding matrix with equidistant rows.

    Row ``i`` is the vector used to represent category ``i``. The columns
    are orthonormal, and every pair of distinct rows is at Euclidean
    distance ``sqrt(2)``, so no two categories are closer than any others.

    Parameters
    ----------
    k : int
        Number of categories.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(k, k-1)`` drawn using NumPy's global RNG.
    """
    q = np.random.randn(k, k-1)
    # Orthonormal columns alone do not make the rows equidistant: with v the
    # unit vector orthogonal to the column space, the squared distance
    # between rows i and j is 2 - (v_i - v_j)**2. Removing each column's mean
    # forces the column space to be orthogonal to the all-ones vector, so v
    # is constant and every pairwise squared distance is exactly 2.
    q -= q.mean(axis=0, keepdims=True)
    q, _ = np.linalg.qr(q)
    return q

# Getting embeddings for each category in each feature
tipos_materia_prima = ['animal', 'vegetal', 'mineral', 'sintetico']
estados = ['AC', 'AL', 'AP', 'AM', 'BA', 'CE', 'DF', 'ES', 'GO', 'MA',
           'MT', 'MS', 'MG', 'PA', 'PB', 'PR', 'PE', 'PI', 'RJ', 'RN',
           'RS', 'RO', 'RR', 'SC', 'SP', 'SE', 'TO']

features = ['categoria', 'tipo_materia_prima', 'estado_de_origem']

# Features whose category vocabulary is fixed up front rather than read from
# the column values.
fixed_categories = {
    'tipo_materia_prima': tipos_materia_prima,
    'estado_de_origem': estados,
}

# For each feature (same order as `features`):
#   qs[n]                -> embedding matrix, one row per category
#   feature_index_map[n] -> {category value: row index into qs[n]}
feature_index_map = []
qs = []
for feature in features:
    if feature in fixed_categories:
        categories = fixed_categories[feature]
    else:
        # Drop missing values before listing the categories: unique() keeps
        # NaN while nunique() ignores it, so sizing the matrix with one and
        # indexing with the other can yield an index past the last row.
        categories = list(ind_df[feature].dropna().unique())

    qs.append(rand_ortho_mat(len(categories)))
    feature_index_map.append({k: i for i, k in enumerate(categories)})

# Projecting datapoints
# Sum of the category counts of the three features. Not used further down in
# this cell. NOTE(review): each matrix from rand_ortho_mat(k) has k-1
# columns, so this is not the width of the array built below - confirm how
# later cells use it.
features_size = ind_df['categoria'].nunique() + len(tipos_materia_prima) + len(estados)

# Each item becomes the concatenation of one vector per feature. A missing
# value (seen here as a float) contributes an all-zero vector.
data_points = []
for _, item in ind_df.iterrows():
    parts = []

    # categoria: single-valued, so its embedding row is used as-is
    categoria = item['categoria']
    if type(categoria) is float:
        parts.append(np.zeros_like(qs[0][0]))
    else:
        parts.append(qs[0][feature_index_map[0][categoria]])

    # tipo_materia_prima: a list of labels; add up the rows of all of them
    materias = item['tipo_materia_prima']
    materia_vec = np.zeros_like(qs[1][0])
    if type(materias) is not float:
        for materia in materias:
            materia_vec += qs[1][feature_index_map[1][materia]]
    parts.append(materia_vec)

    # estado_de_origem: Python-literal text that is parsed into the states
    # to iterate over; add up the rows of all of them
    estados_raw = item['estado_de_origem']
    estado_vec = np.zeros_like(qs[2][0])
    if type(estados_raw) is not float:
        for estado in ast.literal_eval(estados_raw):
            estado_vec += qs[2][feature_index_map[2][estado]]
    parts.append(estado_vec)

    data_points.append(np.concatenate(parts))

# One row per item
data_points = np.stack(data_points)
print(f'Shape of the final array: {data_points.shape}')

Shape of the final array: (20965, 38)


### Projecting Data Points Onto Lower Dimensional Space

Here we try two main techniques, chosen mainly for their ability to preserve distances between points (or rather, the global structure of the dataset). The first one is MDS, which should work well for low-dimensional data but is probably not the one to use in higher dimensions. The second one is TriMap, a more general approach that should also work well enough on high-dimensional data.