In [1]:
import pandas as pd

# Loading dataset
ind_df = pd.read_csv('data/indigenous_collection.csv', index_col='id')
print(f'Dataframe columns: \n{ind_df.columns}')

Dataframe columns: 
Index(['url', 'thumbnail', 'creation_date', 'modification_date',
       'numero_do_item', 'tripticos', 'categoria', 'nome_do_item',
       'nome_do_item_dic', 'colecao', 'coletor', 'doador', 'modo_de_aquisicao',
       'data_de_aquisicao', 'ano_de_aquisicao', 'data_de_confeccao', 'autoria',
       'nome_etnico', 'descricao', 'dimensoes', 'funcao', 'materia_prima',
       'tecnica_confeccao', 'descritor_tematico', 'descritor_comum',
       'numero_de_pecas', 'itens_relacionados', 'responsavel_guarda',
       'inst_detentora', 'povo', 'autoidentificacao', 'lingua',
       'estado_de_origem', 'geolocalizacao', 'pais_de_origem', 'exposicao',
       'referencias', 'disponibilidade', 'qualificacao', 'historia_adm',
       'notas_gerais', 'observacao', 'conservacao', 'image_path'],
      dtype='object')


In [2]:
from IPython.core.magic import register_cell_magic

@register_cell_magic
def skip(line, cell):
    """Cell magic `%%skip`: accept the cell's text and do nothing with it, so the cell is not run."""
    return None

## Exploring the Numbers

In [3]:
%%skip

# How many items do we have?
print(f'Amount of items: {ind_df.shape[0]}')

# How many items per category? (non-null 'url' entries counted within each category)
print(f'Amount of items per category: \n{ind_df.groupby(["categoria"])["url"].count()}')

In [4]:
%%skip

# How many items with images do we have? (an item has an image when 'thumbnail' is not null)
print(f'Amount of items with images: {int(ind_df["thumbnail"].notna().sum())}')

# How many items with images do we have per category?
print(f'Amount of items with image per category: \n{ind_df.groupby(["categoria"])["thumbnail"].count()}')

In [5]:
%%skip

# Lift the row limit so the full per-community listing is shown
pd.set_option('display.max_rows', None)

# How many items do we have per community?
print(f'Amount of items per community: \n{ind_df.groupby(["povo"])["url"].count()}')

In [6]:
%%skip

# How many items do we have per category per community?
print(f'Amount of items per category per community: \n{ind_df.groupby(["categoria", "povo"])["url"].count()}')

In [7]:
%%skip

# How many items with image do we have per category per community?
print(f'Amount of items with image per category per community: \n{ind_df.groupby(["categoria", "povo"])["thumbnail"].count()}')

In [8]:
%%skip

# Function to answer three basic questions about a column
# 1. How many unique values of the column do we have?
# 2. How many items have that column?
# 3. How many items do we have per unique value of the column?
def column_basic_info(column):
    """Print basic statistics about one column of the global dataframe `ind_df`.

    Prints the number of distinct values, the number of rows where the column is
    not null and, when the column has fewer distinct values than a quarter of the
    number of rows, the number of rows for each distinct value.

    Parameters
    ----------
    column : str
        Name of a column of `ind_df`.
    """
    values = ind_df[column]
    n_unique_items = values.nunique()
    print('Amount of unique ' + column + f': {n_unique_items}')
    print('Amount of items with an associated ' + column + f': {values.notna().sum()}')

    # Per-value listing only for low-cardinality columns, otherwise the output is unreadable
    if n_unique_items < len(ind_df)/4:
        # size() counts the rows of each group directly. Counting through the 'url' column
        # raised KeyError for column == 'url', because the grouping column is removed from
        # the frame returned by count().
        # NOTE(review): size() also counts rows whose 'url' is null; identical to the former
        # count as long as every item has a url - confirm.
        print('Amount of items per ' + column + f': \n{ind_df.groupby(column).size()}')

    print()

# Exploring basic info about every column of the dataframe
other_columns = list(ind_df.columns)

for col in other_columns:
    column_basic_info(col)

## Studying, Organizing and Filtering Dataset

In [9]:
# Explanation of each column
# url               -> Link to the object in Tainacan's archive.

# thumbnail         -> Link to thumbnail image of element. NaN if there's no image associated with the object.

# creation_date     -> Creation date of the object in Tainacan's archive. This is an internal variable from the platform.

# modification_date -> Date of last modification of the object in Tainacan's archive. This is an internal variable from the platform.

# numero_do_item    -> String identifying the item. It has multiple formats (DD.D.DD, DD.D.DDC, D, DD, DDD, DDDD, DDDDC, DDDDCCC and more).

# tripticos         -> No relevant explanation (D.DD or DD.DD).

# categoria         -> Category of the item. There are ten distinct, well-defined categories.

# nome_do_item      -> Name of the object. Sometimes it is followed by an observation in parentheses.

# nome_do_item_dic  -> New version of 'nome_do_item_de_acordo_com_o_dicionario'. Name of the item according to dictionary. 
#                      A second and more generic name of the object.

# colecao           -> Name of the collection the item belongs to.

# coletor           -> Person or institution responsible for collecting the item.

# doador            -> Person or institution responsible for donating the item to the museum.

# modo_de_aquisicao -> How the item was obtained, either bought, donated, exchanged, another way or unknown.

# data_de_aquisicao -> When the item was acquired by the museum.

# ano_de_aquisicao  -> New version of 'ano_de_aquisicao_do_objeto'. The year in which the item was acquired by the museum.

# data_de_confeccao -> New version of 'data_de_confeccao_do_item'. When the item was made.

# autoria           -> Person or institution that made the item. Sometimes it is followed by an observation in parentheses.

# nome_etnico       -> New version of 'nome_etnico_do_item'. Indigenous name of the item. Normally the noun itself is inside quotes and
#                      sometimes more information can be found on the string.

# descricao         -> Description of the object. Commonly includes material, parts that it is made of (or how it was made), and 
#                      functionality.

# dimensoes         -> Dimensions of the object.

# funcao            -> Function of the object.

# materia_prima     -> New version of 'materia-prima'. Material the object is made of. Materials are subdivided in a few categories:
#                      'animal', 'vegetal', 'mineral' and 'sintetico'.

# tecnica_confeccao -> New version of 'tecnica_de_confeccao'. Techniques used to make the item.

# descritor_tematico-> A few words that describe themes to which the item is related.

# descritor_comum   -> A few words that describe categories related to the item more generically. It is not clear how it is actually 
#                      different from 'descritor_tematico'.

# numero_de_pecas   -> Number of pieces for an item with an associated string, normally a short 'description' of the pieces.

# itens_relacionados-> New version of 'itens_relacionados_ao_objeto'. List of other items (in 'numero_do_item' form) related to the item.

# responsavel_guarda-> New version of 'responsavel_pela_guarda'. Museum responsible for the item (either Museu do Indio or musees
#                      d'historie naturalle, industriel commerciel e d'ethnographiede).

# inst_detentora    -> New version of 'instituicao_detentora'. Museum that has the item (always Museu do Indio).

# povo              -> Community associated to the item.

# autoidentificacao -> List of communities associated to the item by the original owner of the item.

# lingua            -> Language of the community associated to the item.

# estado_de_origem  -> List of Brazilian states associated with the item.

# geolocalizacao    -> New version of 'localizacao_geografica_especifica'. String describing specifics of the location where the item comes
#                      from (either city, community, description of the location or something else).

# pais_de_origem    -> Country where the item is from.

# exposicao         -> New version of 'participacao_em_exposicao'. Exhibition in which the item took part, possibly together with the date
#                      it was returned to the museum.

# referencias       -> New version of 'referencia_bibliografica'. String containing all the bibliographic references to the item.

# disponibilidade   -> New version of 'disponibilidade_do_objeto'. Whether the object is not accessible, locally accessible or completely
#                      accessible.

# qualificacao      -> Yet another description column.

# historia_adm      -> New version of 'historia_administrativa'. String normally containing the history of how the item was acquired by
#                      the museum. Sometimes it contains random information either about the object or about a person related to the object.

# notas_gerais      -> String containing all sorts of information, directly related or not to the item.

# observacao        -> New version of 'observacao_sobre_o_item'. Yet another string containing all sorts of information, directly related
#                      or not to the item.

# conservacao       -> New version of 'estado_de_conservacao'. Conservation state of the item. Either good, regular or bad.

# image_path        -> Local path to associated image. 

In [10]:
%%skip

import re

# Fixing data types: both platform timestamps are converted to datetimes
for date_column in ('creation_date', 'modification_date'):
    ind_df[date_column] = pd.to_datetime(ind_df[date_column])

# Fixing name
ind_df.rename(columns={'nome_do_item_de_acordo_com_o_dicionario': 'nome_do_item_dic'}, inplace=True)

# Extracting correct date when possible
def parse_dates_1(date_str):
    """Best-effort conversion of an acquisition-date entry into a pandas timestamp.

    The value is first handed to `pd.to_datetime` as is. If that fails and the value
    is a string, its dots are removed and a list of known malformed layouts
    (year/month padded with zeros, trailing 'E7') is tried in order.

    Parameters
    ----------
    date_str : object
        Raw cell value (typically a string or NaN).

    Returns
    -------
    Whatever `pd.to_datetime` returns for the first layout that parses, or `pd.NaT`
    when nothing matches.
    """
    try:
        return pd.to_datetime(date_str)
    except Exception:
        # Not parseable as is: fall through to the explicit formats below
        pass

    # The fallbacks are string manipulations; anything else has no date to recover
    if not isinstance(date_str, str):
        return pd.NaT

    digits_only = date_str.replace('.', '')

    # Possible formats that are not completely random
    # ("%Y" was previously misspelled "$Y", which can never match a year)
    for fmt in ("%Y%m%d", "%Y%m00", "%Y", "%Y0", "%Y00", "%Y000", "%Y0000", "%Y00000", "%Y000000", "%Y%m%dE7"):
        try:
            return pd.to_datetime(digits_only, format=fmt)
        except ValueError:
            continue
    return pd.NaT
ind_df['data_de_aquisicao'] = ind_df['data_de_aquisicao'].apply(parse_dates_1)

# Fixing name and extracting correct year when possible
def extract_year(date):
    """Return the year of `date` (parsed with `pd.to_datetime`), or `pd.NaT` when `date` is null."""
    if pd.isnull(date):
        return pd.NaT
    parsed = pd.to_datetime(date)
    return parsed.year
ind_df.rename(columns={'ano_de_aquisicao_do_objeto': 'ano_de_aquisicao'}, inplace=True)
ind_df['ano_de_aquisicao'] = ind_df['ano_de_aquisicao'].apply(extract_year)

# Fixing name and extracting correct date
def parse_dates_2(date):
    """Best-effort conversion of a 'data_de_confeccao' entry into calendar date(s).

    Strategies are tried in this order:
      1. the value is directly parseable by `pd.to_datetime`;
      2. a bare four-digit year, possibly marked with '?' (mapped to January 1st);
      3. a Portuguese month name together with a four-digit year (mapped to the 1st of that month);
      4. several years separated by '|' or '/' (each mapped to January 1st).

    Parameters
    ----------
    date : object
        Raw cell value (typically a string or NaN).

    Returns
    -------
    A `datetime.date`, a list of `datetime.date` (multi-year entries), or `pd.NaT`
    when nothing can be extracted.
    """
    # Is it already a date?
    try:
        return pd.to_datetime(date).date()
    except Exception:
        # Not directly parseable: try the hand-written strategies below
        pass

    # Everything below is string manipulation
    if not isinstance(date, str):
        return pd.NaT

    # Remove '?' and try to match a year
    date = date.replace('?', '').strip()
    if len(date) == 4 and date.isdigit():
        try:
            return pd.to_datetime(f'{date}-01-01').date()
        except ValueError:
            # e.g. a year outside the range pandas can represent
            pass

    # Do we have a date of the type '<month_in_portuguese> of YYYY'?
    # (this branch used to read an undefined name, so it silently never matched)
    date = date.lower()
    months_pt = {'janeiro': '01', 'fevereiro': '02', 'março': '03', 'marco': '03', 'abril': '04', 'maio': '05',
                 'junho': '06', 'julho': '07', 'agosto': '08', 'setembro': '09', 'outubro': '10',
                 'novembro': '11', 'dezembro': '12'}
    for month_pt, month_num in months_pt.items():
        if month_pt in date:
            year = ''.join(filter(str.isdigit, date))
            # Only trust the digits when they form exactly one four-digit year
            if len(year) == 4:
                try:
                    return pd.to_datetime(f'{year}-{month_num}-01').date()
                except ValueError:
                    pass
            break

    # Do we have multiple years split by '|' or, failing that, by '/'?
    for separator in ('|', '/'):
        if separator in date:
            years = [year.strip() for year in date.split(separator) if year.strip().isdigit()]
            return [pd.to_datetime(f'{year}-01-01').date() for year in years]

    return pd.NaT
ind_df.rename(columns={'data_de_confeccao_do_item': 'data_de_confeccao'}, inplace=True)
ind_df['data_de_confeccao'] = ind_df['data_de_confeccao'].apply(parse_dates_2)

# Fixing name
ind_df.rename(columns={'nome_etnico_do_item': 'nome_etnico'}, inplace=True)

# Extracting dimensions of the object [length, width, height, diameter]
def _extract_dimensions(value):
    """Parse one free-text 'dimensoes' entry into [length, width, height, diameter].

    Each decimal number found in the text is assigned to a slot according to the
    keyword ('comprimento', 'largura', 'altura', 'diametro') that appears in the text
    right after it. A number followed by a bare 'm' (not 'cm') is multiplied by 100.

    Returns `pd.NA` when `value` is not a string or when no labelled measurement is found.
    """
    # Do I even have information to be extracted?
    if type(value) is not str:
        return pd.NA

    dimensions = [.0, .0, .0, .0]

    # Fix typos and change ',' to '.' for type casting later
    string_dimensions = value.replace(',o', ',0').replace(',', '.')

    # Matching numbers and studying context to decide what we are measuring.
    # finditer gives the position of *each* match; looking the number up again with
    # str.find always returned the first occurrence, so a repeated value (e.g. equal
    # length and width) was labelled twice with the first keyword.
    # '\d+' after the dot keeps all decimals ("12.55" used to be cut to "12.5").
    # NOTE(review): integer measurements without a decimal part are still not matched.
    for match in re.finditer(r'\d+\.\d+', string_dimensions):
        start, end = match.span()
        number = float(match.group())
        context = string_dimensions[start:end + 18].lower()

        # Checking if we have meters or centimeters: an 'm' not preceded by 'c' close to the number
        if re.search(r'(?<!c)m', string_dimensions[start:end + 4]):
            number *= 100

        if 'comprimento' in context:
            dimensions[0] = number
        elif 'largura' in context:
            dimensions[1] = number
        elif 'altura' in context:
            dimensions[2] = number
        elif 'diametro' in context or 'diâmetro' in context:
            dimensions[3] = number

    # Last sanity check to account for degenerate strings
    return dimensions if sum(dimensions) != .0 else pd.NA

all_dimensions = [_extract_dimensions(value) for value in ind_df['dimensoes']]
ind_df['dimensoes'] = all_dimensions


# Fixing name and parsing 'materia_prima' into a list of lists with materials
# [[<'animal' materials>], [<'vegetal' materials>], [<'sintetico' materials>]]
def parse_materials(row):
    """Split a 'materia_prima' entry into per-category lists of materials.

    The entry is a '|'-separated list of '<category> > <material>' items, where '>'
    may be HTML-escaped as '&gt;'. Items that do not follow this shape, or whose
    category is not one of the known ones, are ignored.

    Parameters
    ----------
    row : object
        Raw cell value.

    Returns
    -------
    `[animal, vegetal, sintetico]` (three lists of strings), or `pd.NA` when `row`
    is not a string.
    """
    if type(row) is not str:
        return pd.NA

    material_categories = {'animal': [], 'vegetal': [], 'mineral': [], 'sintetico': []}

    for material in row.split('|'):
        # Preprocessing string
        material = material.strip().replace('&gt;', '>')

        # Matching categories and materials
        match = re.match(r'(\w+)\s*>\s*(.*)', material)
        if match is None:
            # Not of the form '<category> > <material>'
            continue

        category, name = match.groups()
        if category not in material_categories:
            # Unknown category: skipped, as before
            continue
        material_categories[category].append(name.strip())

    # NOTE(review): 'mineral' materials are collected above but not returned; the list keeps
    # its documented three-slot shape here - confirm whether a fourth slot is wanted.
    return [material_categories['animal'], material_categories['vegetal'], material_categories['sintetico']]
ind_df.rename(columns={'materia-prima': 'materia_prima'}, inplace=True)
ind_df['materia_prima'] = ind_df['materia_prima'].apply(parse_materials)

# Generic parsing function to deal with more well-behaved columns
def parse_generic(row):
    """Turn a '|'-separated string into a list of whitespace-stripped parts; anything else becomes pd.NA."""
    if type(row) is str:
        return list(map(str.strip, row.split('|')))
    return pd.NA

# Fixing name of the techniques column
ind_df.rename(columns={'tecnica_de_confeccao': 'tecnica_confeccao'}, inplace=True)

# Parsing the '|'-separated columns into lists, in this order:
#   'tecnica_confeccao'  -> techniques used to make the item
#   'descritor_tematico' -> words of themes related to the item
#   'descritor_comum'    -> generic words related to the item
for list_column in ('tecnica_confeccao', 'descritor_tematico', 'descritor_comum'):
    ind_df[list_column] = ind_df[list_column].apply(parse_generic)

# Parsing 'numero_de_pecas' into a list containing the number of pieces and the associated 'description' of the pieces
def parse_number_of_pieces(row):
    """Split a 'numero_de_pecas' entry into `[count, description]`.

    The text up to the first space must be an integer; the remainder (possibly empty)
    is the description.

    Parameters
    ----------
    row : object
        Raw cell value.

    Returns
    -------
    `[int, str]`, or `pd.NA` when `row` is not a string or does not start with an integer.
    """
    if type(row) is not str:
        return pd.NA

    # Processing number of pieces string
    number, _, description = row.strip().partition(' ')
    try:
        return [int(number), description.strip()]
    except ValueError:
        # The leading token is not an integer
        return pd.NA
ind_df['numero_de_pecas'] = ind_df['numero_de_pecas'].apply(parse_number_of_pieces)

# Fixing names of the related-items, custody and holding-institution columns in one pass
ind_df.rename(columns={'itens_relacionados_ao_objeto': 'itens_relacionados',
                       'responsavel_pela_guarda': 'responsavel_guarda',
                       'instituicao_detentora': 'inst_detentora'}, inplace=True)

# Parsing 'itens_relacionados' into a list with all the related items (in 'numero_do_item' form)
ind_df['itens_relacionados'] = ind_df['itens_relacionados'].apply(parse_generic)

# Parsing 'autoidentificacao' into a list with all communities associated to the item by the original owner of the item
ind_df['autoidentificacao'] = ind_df['autoidentificacao'].apply(parse_generic)

# Processing language information to standardize it
def processing_language(row):
    """Return the stripped language name, or pd.NA for non-strings and the 'unidentified' placeholders."""
    if type(row) is not str:
        return pd.NA

    language = row.strip()
    return pd.NA if language in ('sem identificacao', 'sem identificação') else language
ind_df['lingua'] = ind_df['lingua'].apply(processing_language)

# Processing state information to standardize it
def processing_state(row):
    """Convert a '|'-separated list of state names into a list of two-letter acronyms.

    Returns pd.NA when `row` is not a string or is the 's/ procedencia' placeholder.
    Raises KeyError when a name is not present in the mapping below.
    """
    # Mapping states to acronyms
    state_to_acronym = {
        'acre': 'AC', 'alagoas': 'AL', 'amapa': 'AP', 'amazonas': 'AM', 'bahia': 'BA', 'ceara': 'CE',
        'distrito federal': 'DF', 'espirito santo': 'ES', 'goias': 'GO', 'maranhao': 'MA', 'mato grosso': 'MT',
        'mato grosso do sul': 'MS', 'minas gerais': 'MG', 'para': 'PA', 'paraiba': 'PB', 'parana': 'PR',
        'pernambuco': 'PE', 'piaui': 'PI', 'rio de janeiro': 'RJ', 'rio grande do norte': 'RN',
        'rio grande do sul': 'RS', 'rondonia': 'RO', 'roraima': 'RR', 'santa catarina': 'SC',
        'sao paulo': 'SP', 'sergipe': 'SE', 'tocantins': 'TO',
    }

    if type(row) is str and row.strip() != 's/ procedencia':
        return [state_to_acronym[state_name.strip()] for state_name in row.split('|')]
    return pd.NA
ind_df['estado_de_origem'] = ind_df['estado_de_origem'].apply(processing_state)

# Fixing name
ind_df.rename(columns={'localizacao_geografica_especifica': 'geolocalizacao'}, inplace=True)

# Parsing 'pais_de_origem' into a list with all countries associated with the origin of the item
ind_df['pais_de_origem'] = ind_df['pais_de_origem'].apply(parse_generic)

# Fixing name and processing 'exposicao' information to standardize it
def processing_exhibition(row):
    """Split an 'exposicao' entry into the exhibition text and, when present, a date.

    The exhibition is the text before the marker 'data de retorno das pecas' (or the
    whole text when the marker is absent). The date is the first occurrence of
    '<month in Portuguese> <day>, <year>' anywhere in the text.

    Parameters
    ----------
    row : object
        Raw cell value.

    Returns
    -------
    `[exhibition, date]` when a date is found, `[exhibition]` otherwise, or `pd.NA`
    when `row` is not a string.

    Raises
    ------
    ValueError
        If the matched day/month/year do not form a valid calendar date.
    """
    if type(row) is not str:
        return pd.NA

    # Cleaning text
    text = re.sub(r'\s+', ' ', row).strip()

    # Extracting exhibition (the '$' alternative guarantees a match)
    exhibition = re.match(r'^(.*?)\s*(?=data de retorno das pecas|$)', text).group(1).strip()

    # Possibly extracting and converting date.
    # March is accepted with and without cedilla: the pattern used to match only 'marco'
    # while the table only knew 'março', so no March date could ever be converted.
    months_pt = {'janeiro': '01', 'fevereiro': '02', 'marco': '03', 'março': '03', 'abril': '04', 'maio': '05',
                 'junho': '06', 'julho': '07', 'agosto': '08', 'setembro': '09', 'outubro': '10',
                 'novembro': '11', 'dezembro': '12'}
    date = re.search(r'\b(janeiro|fevereiro|marco|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro) (\d{1,2}), (\d{4})\b', text)
    if date:
        month_name, day, year = date.groups()
        date = pd.to_datetime(f'{months_pt[month_name]} {day}, {year}', format='%m %d, %Y').date()

        return [exhibition, date]

    return [exhibition, ]
ind_df.rename(columns={'participacao_em_exposicao': 'exposicao'}, inplace=True)
ind_df['exposicao'] = ind_df['exposicao'].apply(processing_exhibition)

# Fixing names of the remaining long-named columns in one pass
ind_df.rename(columns={'referencia_bibliografica': 'referencias',
                       'disponibilidade_do_objeto': 'disponibilidade',
                       'historia_administrativa': 'historia_adm',
                       'observacao_sobre_o_item': 'observacao',
                       'estado_de_conservacao': 'conservacao'}, inplace=True)

# Saving new version of dataframe
# NOTE(review): this is the same path the notebook reads in its first cell, so the raw file is
# overwritten with the renamed/parsed columns. Running this cell again would then start from the
# already-processed file - presumably why the cell is guarded by %%skip; confirm.
ind_df.to_csv('data/indigenous_collection.csv')