## Exploring the Numbers

In [None]:
import pandas as pd

# Read the collection CSV, using its 'id' column as the row index
ind_df = pd.read_csv(
    'data/indigenous_collection.csv',
    index_col='id',
)
print(f'Dataframe columns: \n{ind_df.columns}')

In [None]:
# How many items do we have?
total_items = len(ind_df)
print(f'Amount of items: {total_items}')

# How many items per category? (non-null 'url' entries in each group)
items_per_category = ind_df.groupby(["categoria"]).count()["url"]
print(f'Amount of items per category: \n{items_per_category}')

In [None]:
# Rows whose 'thumbnail' is present, i.e. items with an image
has_image = ind_df["thumbnail"].notna()
print(f'Amount of items with images: {len(ind_df[has_image])}')

# Same count split by category (count() ignores missing thumbnails)
images_per_category = ind_df.groupby(["categoria"]).count()["thumbnail"]
print(f'Amount of items with image per category: \n{images_per_category}')

In [None]:
# Lift pandas' row limit so long listings are printed in full
pd.set_option('display.max_rows', None)

# How many items do we have per community ('povo')?
items_per_community = ind_df.groupby(["povo"]).count()["url"]
print(f'Amount of items per community: \n{items_per_community}')

In [None]:
# Item count for every (category, community) pair
items_per_category_community = ind_df.groupby(["categoria", "povo"]).count()["url"]
print(f'Amount of items per category per community: \n{items_per_category_community}')

In [None]:
# Count of items that have a thumbnail, for every (category, community) pair
images_per_category_community = ind_df.groupby(["categoria", "povo"]).count()["thumbnail"]
print(f'Amount of items with image per category per community: \n{images_per_category_community}')

In [None]:
def column_basic_info(column):
    """Print a short summary of one column of the global ``ind_df``.

    Answers three basic questions about the column:
      1. How many unique values does it have?
      2. How many items have a value in it?
      3. How many items are there per unique value? This is only printed
         when the number of unique values is below a quarter of the number
         of rows, to avoid dumping near-unique columns.

    Args:
        column: Name of a column of ``ind_df``.
    """
    values = ind_df[column]
    n_unique_items = values.nunique()
    print(f'Amount of unique {column}: {n_unique_items}')
    print(f'Amount of items with an associated {column}: {values.notna().sum()}')

    if n_unique_items < len(ind_df) / 4:
        # size() counts the rows of each group directly; the previous
        # count()["url"] lookup raised KeyError when summarizing "url" itself,
        # because the grouping column is not part of count()'s output.
        print(f'Amount of items per {column}: \n{ind_df.groupby(column).size()}')

    print()

In [None]:
# Print the basic summary for every column of the dataframe.
# Candidate exclusion lists, currently unused:
# avoid_columns = ['url', 'thumbnail', 'categoria', 'povo']
# unique_columns = ['numero_do_item', 'descricao', 'image_path']
other_columns = list(ind_df.columns)

for col in other_columns:
    column_basic_info(col)

In [None]:
# Display the dataframe's column labels (shown as the cell's output)
ind_df.columns

## Organizing and Filtering Dataset

In [None]:
# Explanation of each column
# url               -> Link to the object in Tainacan's archive.
# thumbnail         -> Link to thumbnail image of element. NaN if there's no image associated with the object.
# creation_date     -> Creation date of the object in Tainacan's archive. This is an internal variable from the platform.
# modification_date -> Date of last modification of the object in Tainacan's archive. This is an internal variable from the platform.
# numero_do_item    -> String identifying the item. It has multiple formats (DD.D.DD, DD.D.DDC, D, DD, DDD, DDDD, DDDDC, DDDDCCC and more).
# tripticos         -> No relevant explanation (D.DD or DD.DD).
# categoria         -> Category of the item. There are ten distinct, well-defined categories.
# nome_do_item      -> Name of the object. Sometimes it's followed by an observation in parentheses.
# nome_do_item_dic  -> New version of 'nome_do_item_de_acordo_com_o_dicionario'. Name of the item according to dictionary. 
#                      A second and more generic name of the object.
# colecao           -> Name of the collection the item belongs to.
# coletor           -> Person or institution responsible for collecting the item.
# doador            -> Person or institution responsible for donating the item to the museum.
# modo_de_aquisicao -> How the item was obtained, either bought, donated, exchanged, another way or unknown.
# data_de_aquisicao -> When the item was acquired by the museum.
# ano_de_aquisicao  -> New version of 'ano_de_aquisicao_do_objeto'. The year in which the item was acquired by the museum.
# data_de_confeccao -> New version of 'data_de_confeccao_do_item'. When the item was made.
# autoria           -> Person or institution that made the item. Sometimes it's followed by an observation in parentheses.
# nome_etnico       -> New version of 'nome_etnico_do_item'. Indigenous name of the item. Normally the noun itself is inside quotes and
#                      sometimes more information can be found on the string.
# descricao         -> Description of the object. Commonly includes material, parts that it is made of (or how it was made), and 
#                      functionality
# dimensoes         -> Dimensions of the object.
# funcao            -> Function of the object.
# materia-prima     -> Material the object is made of. Materials are subdivided in a few categories: animal, plant and synthetic.
# 

In [None]:
import re

# Convert the two platform timestamp columns to datetimes
for timestamp_col in ('creation_date', 'modification_date'):
    ind_df[timestamp_col] = pd.to_datetime(ind_df[timestamp_col])

# Shorten the name of the dictionary-name column
ind_df.rename(
    columns={'nome_do_item_de_acordo_com_o_dicionario': 'nome_do_item_dic'},
    inplace=True,
)

# Extracting correct date when possible
def parse_dates_1(date_str):
    """Parse a raw date value into a timestamp, falling back to ``pd.NaT``.

    ``pd.to_datetime`` is tried first. If it rejects the value and the value
    is a string, the dots are removed and a list of known digit layouts is
    tried in order (year/month/day, year/month, or a bare year, optionally
    padded with zeros or suffixed with "E7").

    Args:
        date_str: Raw cell value.

    Returns:
        The result of ``pd.to_datetime`` when a parse succeeds, otherwise
        ``pd.NaT``.
    """
    try:
        return pd.to_datetime(date_str)
    except (ValueError, TypeError, OverflowError):
        pass

    # Only strings can be matched against the fallback layouts; anything else
    # that pandas rejected is treated as missing instead of crashing on .replace.
    if not isinstance(date_str, str):
        return pd.NaT

    compact = date_str.replace('.', '')

    # Possible formats that are not completely random
    fallback_formats = (
        "%Y%m%d", "%Y%m00", "%Y", "%Y0", "%Y00", "%Y000",
        "%Y0000", "%Y00000", "%Y000000", "%Y%m%dE7",
    )
    for fmt in fallback_formats:
        try:
            return pd.to_datetime(compact, format=fmt)
        except ValueError:
            continue
    return pd.NaT
# Replace the raw acquisition dates with the values returned by parse_dates_1
acquisition_dates = ind_df['data_de_aquisicao']
ind_df['data_de_aquisicao'] = acquisition_dates.apply(parse_dates_1)

# Shorter name for the acquisition-year column; its values are cleaned next
ind_df.rename(
    columns={'ano_de_aquisicao_do_objeto': 'ano_de_aquisicao'},
    inplace=True,
)
def extract_year(date):
    """Return the year of a raw acquisition-year value, or ``pd.NaT``.

    Args:
        date: Raw cell value: a number, a date-like string, a timestamp or a
            missing value.

    Returns:
        The year as an ``int`` when it can be determined, otherwise ``pd.NaT``.
    """
    if not pd.notnull(date):
        return pd.NaT

    # A plain number such as 1975 already is the year. Passing it to
    # pd.to_datetime would read it as nanoseconds since the epoch (-> 1970).
    if pd.api.types.is_integer(date) or pd.api.types.is_float(date):
        if float(date).is_integer() and 1 <= date <= 9999:
            return int(date)

    try:
        return pd.to_datetime(date).year
    except (ValueError, TypeError, OverflowError):
        # "when possible": values that cannot be parsed are treated as missing
        return pd.NaT
# Keep only the year of each acquisition-year entry
acquisition_years = ind_df['ano_de_aquisicao']
ind_df['ano_de_aquisicao'] = acquisition_years.apply(extract_year)

# Shorter name for the crafting-date column; its values are parsed next
ind_df.rename(
    columns={'data_de_confeccao_do_item': 'data_de_confeccao'},
    inplace=True,
)
# Portuguese month names mapped to their two-digit month number
_MONTHS_PT = {
    'janeiro': '01', 'fevereiro': '02', 'março': '03', 'marco': '03',
    'abril': '04', 'maio': '05', 'junho': '06', 'julho': '07',
    'agosto': '08', 'setembro': '09', 'outubro': '10', 'novembro': '11',
    'dezembro': '12',
}
# A standalone four-digit year
_YEAR_RE = re.compile(r'(?<!\d)\d{4}(?!\d)')


def _iso_to_date(iso_text):
    """Return the ``datetime.date`` for an ISO date string, or None if invalid."""
    try:
        return pd.to_datetime(iso_text).date()
    except (ValueError, TypeError, OverflowError):
        return None


def parse_dates_2(date):
    """Best-effort parsing of a free-text crafting date.

    Handled layouts, tried in order:
      1. Anything ``pd.to_datetime`` already understands.
      2. A four-digit year, possibly marked as uncertain with '?'.
      3. '<month in Portuguese> ... YYYY' (day set to 1).
      4. Several years separated by '|' or '/'.

    Args:
        date: Raw cell value.

    Returns:
        A ``datetime.date``, a list of ``datetime.date`` (one per year, for
        multi-year entries), or ``pd.NaT`` when nothing can be extracted.
    """
    # Is it already a date?
    try:
        return pd.to_datetime(date).date()
    except (ValueError, TypeError, OverflowError, AttributeError):
        pass

    # The text heuristics below only make sense for strings.
    if not isinstance(date, str):
        return pd.NaT

    # Remove '?' and try to match a year
    date = date.replace('?', '').lower().strip()
    if len(date) == 4 and date.isdigit():
        parsed = _iso_to_date(f'{date}-01-01')
        if parsed is not None:
            return parsed

    # Do we have a date of the type '<month_in_portuguese> of YYYY'?
    year_match = _YEAR_RE.search(date)
    if year_match:
        for month_pt, month_num in _MONTHS_PT.items():
            if month_pt in date:
                parsed = _iso_to_date(f'{year_match.group()}-{month_num}-01')
                if parsed is not None:
                    return parsed

    # Do we have multiple years split by '|' or (failing that) by '/'?
    for separator in ('|', '/'):
        if separator in date:
            parts = (part.strip() for part in date.split(separator))
            candidates = (
                _iso_to_date(f'{part}-01-01')
                for part in parts
                if len(part) == 4 and part.isdigit()
            )
            years = [parsed for parsed in candidates if parsed is not None]
            # No usable year means the entry is missing, not an empty list
            return years if years else pd.NaT

    return pd.NaT
# Replace the raw crafting dates with the values returned by parse_dates_2
crafting_dates = ind_df['data_de_confeccao']
ind_df['data_de_confeccao'] = crafting_dates.apply(parse_dates_2)

# Shorter name for the indigenous-name column
ind_df.rename(
    columns={'nome_etnico_do_item': 'nome_etnico'},
    inplace=True,
)

# Extracting dimensions of the object [length, width, height, diameter]

# Keyword found after a number -> slot of the [length, width, height, diameter] list.
# Checked in this order; the first keyword present in the context wins.
_DIMENSION_SLOTS = (
    ('comprimento', 0),
    ('largura', 1),
    ('altura', 2),
    ('diametro', 3),
    ('diâmetro', 3),
)
# Integers as well as decimals with any number of decimal places
_DIMENSION_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')
# An 'm' that is not part of 'cm'
_METERS_RE = re.compile(r'(?<!c)m')


def _extract_dimensions(value):
    """Extract [length, width, height, diameter] in centimeters from free text.

    Each number in the text is assigned to a dimension according to the
    keyword found in the 18 characters that follow it. A number followed
    (within 4 characters) by an 'm' that is not part of 'cm' is taken to be
    in meters and converted to centimeters.

    Args:
        value: Raw cell value of the 'dimensoes' column.

    Returns:
        A list of four floats, or ``pd.NA`` when the value is not a string or
        no dimension could be extracted.
    """
    # Do I even have information to be extracted?
    if not isinstance(value, str):
        return pd.NA

    # Fix typos and change ',' to '.' for type casting later
    text = value.replace(',o', ',0').replace(',', '.')

    dimensions = [.0, .0, .0, .0]
    # finditer gives the position of each occurrence, so a number that appears
    # twice gets its own context rather than the first occurrence's.
    for match in _DIMENSION_NUMBER_RE.finditer(text):
        start, end = match.span()
        context = text[start:end + 18].lower()

        # Checking if we have meters or centimeters
        number = float(match.group())
        if _METERS_RE.search(text[start:end + 4]):
            number *= 100

        for keyword, slot in _DIMENSION_SLOTS:
            if keyword in context:
                dimensions[slot] = number
                break

    # Last sanity check to account for degenerate strings
    return dimensions if sum(dimensions) != .0 else pd.NA


all_dimensions = [_extract_dimensions(value) for value in ind_df['dimensoes']]
ind_df['dimensoes'] = all_dimensions