# Data Exploration

This notebook is a personal sandbox for exploring the joined metadata, used to decide how to prepare the dataset and which target to select.

### Constants and imports

In [1]:
import os
import pandas as pd
import pycountry
import pygwalker as pyg
from tqdm.notebook import tqdm

# Directory that contains metadata_joined.csv; the path is relative, so it is
# resolved against the kernel's working directory.
METADATA_DIRECTORY = 'metadata'
# Full path of the joined metadata CSV that is read into `df` below.
JOINED_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'metadata_joined.csv')

### Loading common dataset

In [2]:
# Read the joined metadata table from disk.
df = pd.read_csv(JOINED_METADATA_FILE)

# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path
0,cma_122190,Long Sampler,c. 1650–70,,,"England, 17th century",embroidery: silk on linen tabby ground,Sampler,Textiles,Textiles,dataset/raw/cma_122190.jpg
1,cma_171485,"Vanity Fair: Statesman, No. 57 ""A man of Fashi...",1870,,,"Italy, 19th century",color lithograph,Print,Prints,PR - Lithograph,dataset/raw/cma_171485.jpg
2,cma_161711,Cellarette,c. 1840,,,"America, New York",chiefly rosewood veneer with pine and poplar s...,Furniture and woodwork,Decorative Art and Design,Furniture,dataset/raw/cma_161711.jpg
3,cma_102749,Gradual,c. 1520,,,"Italy, Verona","Ink, tempera, and gold on vellum; wood binding",Manuscript,Medieval Art,MED - Manuscript Illuminations,dataset/raw/cma_102749.jpg
4,cma_101229,Fragment,1800s,,,"China, 19th century",silk,Textile,Textiles,Textiles,dataset/raw/cma_101229.jpg


### Checking type values

In [3]:
# Open the pygwalker explorer on the whole frame to browse the column values interactively.
walker = pyg.walk(df)

Box(children=(HTML(value='<div id="ifr-pyg-00061054f872c52fHx6XR9SpmJNFKiYL" style="height: auto">\n    <head>…

### Extracting culture values

In [5]:
# Exact names known to pycountry: current countries first, then historic ones.
country_names = [
    record.name
    for database in (pycountry.countries, pycountry.historic_countries)
    for record in database
]

# Hand-curated overrides from culture labels to country names, consulted before any
# pycountry lookup. Built up over many iterations to minimize wrong matches and unknowns.
# Some targets are deliberately not modern countries (e.g. 'Flanders', 'Byzantium').
# NOTE: the mapping functions below also store their results in this dict, so it
# doubles as a memoization cache and grows while the culture column is being mapped.
map_cache = {
    'America': 'United States',
    'American': 'United States',
    'Flanders': 'Flanders',
    'Venetian': 'Italy',
    'Florentine': 'Italy',
    'Tuscan': 'Italy',
    'Flemings': 'Flemings',
    'Flemish': 'Flemings',
    'Milanese': 'Italy',
    'Genoese': 'Italy',
    'Eastern India': 'Eastern India',
    'Mughal India': 'Mughal India',
    'Sardinia': 'Italy',
    'Byzantium': 'Byzantium',
    'Byzantium (style of)': 'Byzantium',
    'Bohemia': 'Czechia',
    'North Italy': 'Italy',
    'Ferrarese': 'Italy',
    'Sienese': 'Italy',
    'North coast of Peru': 'Peru',
    'Frankish': 'France',
    'Western India': 'Western India',
    'South German': 'Germany',
    'Italy(?)': 'Italy',
    'Chinese': 'China',
    'Netherlandish': 'Netherlands',
    'Austrian': 'Austria',
    'Bohemian': 'Czechia',
    'Paduan': 'Italy',
    'Veronese': 'Italy',
    'Southern France': 'France',
    'Japanese': 'Japan',
    'Greek': 'Greece',
    'Bolognese': 'Italy',
    'Mantuan': 'Italy',
    'North Italian': 'Italy',
    'Norwegian': 'Norway',
    'Canadian': 'Canada',
    'Belgian': 'Belgium',
    'Nuremberg': 'Germany',
    'Turkey': 'Turkey',
    'Swedish': 'Sweden'
}

def attempt_raw_mapping(x):
    """Resolve a culture label to a country using exact lookups only.

    The checks run in this order:

    1. the full label in ``map_cache``;
    2. the full label as an exact entry of ``country_names``;
    3. the text before the first comma (``'Italy'`` for ``'Italy, Verona'``)
       in ``map_cache``;
    4. that same prefix as an exact entry of ``country_names``.

    Every successful lookup is written back to ``map_cache`` so the next
    occurrence of the same label is answered by step 1.

    Args:
        x: Culture label to resolve.

    Returns:
        The mapped country name, or ``None`` when no exact match exists.
    """
    if x in map_cache:
        return map_cache[x]
    if x in country_names:
        map_cache[x] = x
        return x

    # Retry with only the leading, comma-separated part of the label.
    prefix = x.split(',')[0]
    if prefix in map_cache:
        map_cache[x] = map_cache[prefix]
        return map_cache[prefix]
    # Fix: this step previously compared the full label again, which had already
    # failed in step 2, so a prefix such as 'France' could never match here.
    if prefix in country_names:
        map_cache[x] = prefix
        map_cache[prefix] = prefix
        return prefix
    return None

def _fuzzy_country_name(query):
    """Return the name of the best fuzzy match for `query`, or None if nothing matches.

    Searches ``pycountry.countries`` first and ``pycountry.historic_countries``
    second, taking the first hit of whichever database answers.
    """
    for database in (pycountry.countries, pycountry.historic_countries):
        try:
            match = database.search_fuzzy(query)[0]
            # If the hit carries a `country` attribute, report that country's name;
            # otherwise use the hit's own name. NOTE(review): presumably meant for
            # subdivision results - confirm against the installed pycountry version.
            owner = getattr(match, 'country', None)
            return (owner if owner is not None else match).name
        except Exception:
            # Best effort: a failed lookup simply moves on to the next database.
            # Catching Exception (not a bare `except:`) keeps Ctrl-C working
            # during the long progress_map run.
            continue
    return None


def attempt_fuzzy_mapping(x):
    """Resolve a culture label to a country name via pycountry's fuzzy search.

    The full label is searched first. If that fails, the text before the first
    comma is tried: an existing ``map_cache`` entry for it wins, otherwise it
    is fuzzy-searched as well. The outcome, including the ``'unknown'``
    fallback, is stored in ``map_cache`` under the full label.

    Args:
        x: Culture label to resolve.

    Returns:
        The resolved country name, or ``'unknown'`` when every lookup fails.
    """
    if x in map_cache:
        return map_cache[x]

    result = _fuzzy_country_name(x)
    if result is None:
        prefix = x.split(',')[0]
        if prefix in map_cache:
            result = map_cache[prefix]
        elif prefix != x:
            # A label without a comma was already searched in full above,
            # so repeating the identical query would only cost time.
            result = _fuzzy_country_name(prefix)
    if result is None:
        result = 'unknown'

    map_cache[x] = result
    return result

def mapping(x):
    """Map one value of the ``culture`` column to a country name.

    Exact lookups (``attempt_raw_mapping``) are tried first; only when they
    fail is the slower fuzzy search (``attempt_fuzzy_mapping``) used.

    Args:
        x: A culture label, or a missing value (``None`` / NaN).

    Returns:
        The resolved country name, or ``'unknown'`` when the value is missing
        or cannot be resolved.
    """
    # A missing culture must not be stringified: str(nan) is the text 'nan',
    # which would then be fuzzy-searched as if it were a place name.
    if pd.isna(x):
        return 'unknown'

    x = str(x)
    raw = attempt_raw_mapping(x)
    if raw is not None:
        return raw
    return attempt_fuzzy_mapping(x)

# Register tqdm's progress_* methods on pandas objects (needed for progress_map below).
tqdm.pandas()

# Derive a country for every row from its culture label, showing a progress bar.
df['country'] = df['culture'].progress_map(mapping)

  0%|          | 0/61621 [00:00<?, ?it/s]

SubdivisionHierarchy(code='GB-ENG', country_code='GB', name='England', parent_code=None, type='Country')
SubdivisionHierarchy(code='GB-ENG', country_code='GB', name='England', parent_code=None, type='Country')
SubdivisionHierarchy(code='GB-ENG', country_code='GB', name='England', parent_code=None, type='Country')
SubdivisionHierarchy(code='MX-MEX', country_code='MX', name='México', parent_code=None, type='State')
SubdivisionHierarchy(code='MX-MEX', country_code='MX', name='México', parent_code=None, type='State')
SubdivisionHierarchy(code='US-CA', country_code='US', name='California', parent_code=None, type='State')
SubdivisionHierarchy(code='MX-MEX', country_code='MX', name='México', parent_code=None, type='State')
SubdivisionHierarchy(code='US-CA', country_code='US', name='California', parent_code=None, type='State')
SubdivisionHierarchy(code='MX-MEX', country_code='MX', name='México', parent_code=None, type='State')
SubdivisionHierarchy(code='MX-MEX', country_code='MX', name='México

In [6]:
# Spot-check the freshly mapped column.
df['country']

0        United Kingdom
1                 Italy
2         United States
3                 Italy
4                 China
              ...      
61616     United States
61617            France
61618     United States
61619     United States
61620            France
Name: country, Length: 61621, dtype: object

In [7]:
# Re-open the pygwalker explorer, this time with the new `country` column included.
walker = pyg.walk(df)

Box(children=(HTML(value='<div id="ifr-pyg-00061055007b3b41Sp0iRfkn7EsBQX3M" style="height: auto">\n    <head>…

In [8]:
# Rows whose culture label could not be resolved to any country.
df.loc[df['country'].eq('unknown')]

Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path,country
35,cma_132296,Tomb Cover,"1640–1950 (radiocarbon date, 95% probability)",,,"Iraq or Iran, in the style of the Buyid period...",compound tabby; silk,Textile,Textiles,T - Islamic,dataset/raw/cma_132296.jpg,unknown
148,cma_117504,Vest,late 19th century,,,"Dalmatia, Late 19th century","velvet, gold braid, sequins",Garment,Textiles,Textiles,dataset/raw/cma_117504.jpg,unknown
194,cma_153279,Fragment of a Band,1300–1350,,,"Egypt or Syria, Mamluk period","compound tabby, double cloth, and bands of ext...",Textile,Textiles,T - Islamic,dataset/raw/cma_153279.jpg,unknown
207,cma_170907,"Nanda Solves Vasudeva’s Dilemma, from a Bhagav...",c. 1610,,,"Northwestern India, Rajasthan, Rajput Kingdom ...",Gum tempera and gold on paper,Painting,Indian and Southeast Asian Art,Indian Art,dataset/raw/cma_170907.jpg,unknown
222,cma_148128,Passion Set: Christ before Herod,1596–98,,,"Netherlandss, 16th century",engraving,Print,Prints,PR - Engraving,dataset/raw/cma_148128.jpg,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...
60231,met_11348,Bather III (Baigneuse de Dos),1896,1896.0,1896.0,Swedish,,Print,,Rosenwald Collection,dataset/raw/nga_11348.jpg,unknown
60232,met_11349,Precipice,1909,1909.0,1909.0,Swedish,,Print,,Rosenwald Collection,dataset/raw/nga_11349.jpg,unknown
60274,met_11585,Two Peasants Fighting,c. 1475/1480,1475.0,1480.0,Middle Rhenish,,Print,,Rosenwald Collection,dataset/raw/nga_11585.jpg,unknown
60280,met_11596,Saint Sebastian,c. 1470/1480,1470.0,1480.0,Emilian,niello print,Print,,Rosenwald Collection,dataset/raw/nga_11596.jpg,unknown


In [9]:
# Ten most frequent culture labels among the rows that stayed 'unknown' -
# the best candidates for new hand-written map_cache entries.
(
    df[df['country'] == 'unknown']
    .groupby(['culture'])['culture']
    .count()
    .reset_index(name='count')
    .sort_values(['count'], ascending=False)
    .head(10)
)

Unnamed: 0,culture,count
210,Danube,98
1100,[],87
1053,Unknown,70
768,Nuremberg,40
77,Canadian,35
782,Parthian Empire (247 BCE–224 CE),34
279,Etruscan,33
47,Belgian,31
170,"China or Japan, 8th century",28
1019,"Turkey, 19th century",23


In [None]:
# Dump pycountry's historic country records for reference.
list(pycountry.historic_countries)