# Data Exploration

This notebook is more or less a sandbox made for me to understand and decide how to prepare the dataset and what target to select.

### Constants and imports

In [20]:
import os
import shutil
import pandas as pd
import pycountry
import pygwalker as pyg
from tqdm.notebook import tqdm

METADATA_DIRECTORY = 'metadata'
JOINED_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'metadata_joined.csv')
CLASS_EXAMPLES_DIRECTORY = 'class_examples'
EXAMPLES_TO_EXPORT_PER_CLASS = 10

### Loading common dataset

In [2]:
df = pd.read_csv(JOINED_METADATA_FILE, lineterminator='\n')

df.head()

  df = pd.read_csv(JOINED_METADATA_FILE, lineterminator='\n')


Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path
0,cma_96021,Crossbow Bolt,1500s–1600s,,,"Germany, 16th-17th century","wood, leather, steel",Arms and Armor,Medieval Art,MED - Arms & Armor,dataset/raw/cma_96021.jpg
1,cma_384221,"West and South Sides of the Vimana Walls, Grea...",1858,,,"England, 19th century",albumenized salt print from a waxed paper nega...,Photograph,Photography,PH - British 19th Century,dataset/raw/cma_384221.jpg
2,cma_99725,Sword Guard (Tsuba),1615–1868,,,"Japan, Edo period (1615-1868)",iron,Arms and Armor,Japanese Art,Japanese Art,dataset/raw/cma_99725.jpg
3,cma_138286,"The Gotha Missal: Fol. 7r, Text",c. 1375,,,"France, Paris","ink, tempera, and gold on vellum; blind-tooled...",Bound Volume,Medieval Art,MED - Manuscript Illuminations,dataset/raw/cma_138286.jpg
4,cma_102552,Fragment,c. 1900–1910,,,"Italy, Florence, 16th century",glazed terracotta,Sculpture,European Painting and Sculpture,Sculpture,dataset/raw/cma_102552.jpg


### Checking type values

In [3]:
walker = pyg.walk(df)

Box(children=(HTML(value='<div id="ifr-pyg-0006111d2e44b544GVnpw0MuUHDSsQC3" style="height: auto">\n    <head>…

### Extracting culture values

In [None]:
country_names = [x.name for x in list(pycountry.countries)] + [x.name for x in list(pycountry.historic_countries)]

map_cache = { # I've iterated a lot on this cache, to minimize false values and unknowns
    'America': 'United States',
    'American': 'United States',
    'Flanders': 'Flanders',
    'Venetian': 'Italy',
    'Florentine': 'Italy',
    'Tuscan': 'Italy',
    'Flemings': 'Flemings',
    'Flemish': 'Flemings',
    'Milanese': 'Italy',
    'Genoese': 'Italy',
    'Eastern India': 'Eastern India',
    'Mughal India': 'Mughal India',
    'Sardinia': 'Italy',
    'Byzantium': 'Byzantium',
    'Byzantium (style of)': 'Byzantium',
    'Bohemia': 'Czechia',
    'North Italy': 'Italy',
    'Ferrarese': 'Italy',
    'Sienese': 'Italy',
    'North coast of Peru': 'Peru',
    'Frankish': 'France',
    'Western India': 'Western India',
    'South German': 'Germany',
    'Italy(?)': 'Italy',
    'Chinese': 'China',
    'Netherlandish': 'Netherlands',
    'Austrian': 'Austria',
    'Bohemian': 'Czechia',
    'Paduan': 'Italy',
    'Veronese': 'Italy',
    'Southern France': 'France',
    'Japanese': 'Japan',
    'Greek': 'Greece',
    'Bolognese': 'Italy',
    'Mantuan': 'Italy',
    'North Italian': 'Italy',
    'Norwegian': 'Norway',
    'Canadian': 'Canada',
    'Belgian': 'Belgium',
    'Nuremberg': 'Germany',
    'Turkey': 'Turkey',
    'Swedish': 'Sweden'
}

def attempt_raw_mapping(x):
    if x in map_cache:
        return map_cache[x]
    for name in country_names:
        if x == name:
            map_cache[x] = name
            return name
    y = x.split(',')[0]
    if y in map_cache:
        map_cache[x] = map_cache[y]
        return map_cache[y]
    for name in country_names:
        if x == name:
            map_cache[x] = name
            map_cache[y] = name
            return name
    return None

def attempt_fuzzy_mapping(x): 
    if x in map_cache:
        return map_cache[x]
    try:
        result = pycountry.countries.search_fuzzy(x)[0]
        if 'country' in result:
            result = result.country.name
        else:
            result = result.name
        map_cache[x] = result
        return result
    except:
        try:
            result = pycountry.historic_countries.search_fuzzy(x)[0]
            if 'country' in result:
                result = result.country.name
            else:
                result = result.name
            map_cache[x] = result
            return result
        except:
            y = x.split(',')[0]
            if y in map_cache:
                map_cache[x] = map_cache[y]
                return map_cache[y]
            try:
                result = pycountry.countries.search_fuzzy(y)[0]
                if 'country' in result:
                    result = result.country.name
                else:
                    result = result.name
                map_cache[x] = result
                return result
            except:
                try:
                    result = pycountry.historic_countries.search_fuzzy(y)[0]
                    if 'country' in result:
                        result = result.country.name
                    else:
                        result = result.name
                    map_cache[x] = result
                    return result
                except:
                    map_cache[x] = 'unknown'
                    return 'unknown'
    map_cache[x] = 'unknown'
    return 'unknown'

def mapping(x):
    x = str(x)
    raw = attempt_raw_mapping(x)
    if raw is not None:
        return raw
    
    fuzzy = attempt_fuzzy_mapping(x)
    return fuzzy

tqdm.pandas()

df['country'] = df['culture'].progress_map(mapping)

  0%|          | 0/145008 [00:00<?, ?it/s]

SubdivisionHierarchy(code='GB-ENG', country_code='GB', name='England', parent_code=None, type='Country')
SubdivisionHierarchy(code='GB-ENG', country_code='GB', name='England', parent_code=None, type='Country')
SubdivisionHierarchy(code='GB-ENG', country_code='GB', name='England', parent_code=None, type='Country')
SubdivisionHierarchy(code='GB-ENG', country_code='GB', name='England', parent_code=None, type='Country')
SubdivisionHierarchy(code='GB-ENG', country_code='GB', name='England', parent_code=None, type='Country')
SubdivisionHierarchy(code='GB-ENG', country_code='GB', name='England', parent_code=None, type='Country')
SubdivisionHierarchy(code='GB-ENG', country_code='GB', name='England', parent_code=None, type='Country')
SubdivisionHierarchy(code='GB-ENG', country_code='GB', name='England', parent_code=None, type='Country')
SubdivisionHierarchy(code='GB-ENG', country_code='GB', name='England', parent_code=None, type='Country')
SubdivisionHierarchy(code='GB-ENG', country_code='GB', 

In [None]:
df['country']

In [None]:
walker = pyg.walk(df)

In [None]:
df[df['country'] == 'unknown']

In [None]:
df[df['country'] == 'unknown'].groupby(['culture'])['culture'].count().reset_index(name='count').sort_values(['count'], ascending=False).head(10)

In [None]:
list(pycountry.historic_countries)

### Extracting classes: Attempt 2
First attempt didn't cause the network to learn anything after dozen of epochs. This time, I'll pick type.

In [2]:
df = pd.read_csv(JOINED_METADATA_FILE, lineterminator='\n')

df.head()

  df = pd.read_csv(JOINED_METADATA_FILE, lineterminator='\n')


Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path
0,cma_96021,Crossbow Bolt,1500s–1600s,,,"Germany, 16th-17th century","wood, leather, steel",Arms and Armor,Medieval Art,MED - Arms & Armor,dataset/raw/cma_96021.jpg
1,cma_384221,"West and South Sides of the Vimana Walls, Grea...",1858,,,"England, 19th century",albumenized salt print from a waxed paper nega...,Photograph,Photography,PH - British 19th Century,dataset/raw/cma_384221.jpg
2,cma_99725,Sword Guard (Tsuba),1615–1868,,,"Japan, Edo period (1615-1868)",iron,Arms and Armor,Japanese Art,Japanese Art,dataset/raw/cma_99725.jpg
3,cma_138286,"The Gotha Missal: Fol. 7r, Text",c. 1375,,,"France, Paris","ink, tempera, and gold on vellum; blind-tooled...",Bound Volume,Medieval Art,MED - Manuscript Illuminations,dataset/raw/cma_138286.jpg
4,cma_102552,Fragment,c. 1900–1910,,,"Italy, Florence, 16th century",glazed terracotta,Sculpture,European Painting and Sculpture,Sculpture,dataset/raw/cma_102552.jpg


In [3]:
walker = pyg.walk(df)

Box(children=(HTML(value='<div id="ifr-pyg-0006111d39fbbeecdQ7G24Wl6wyKsRnO" style="height: auto">\n    <head>…

In [29]:
grouping = {
    'Ceramics': 'Ceramic',
    'Drawings': 'Drawing',
    'Helmets': 'Armor',
    'Manuscripts and Illuminations': 'Manuscript',
    'Miniatures': 'Miniature',
    'Paintings': 'Painting',
    'Photographs': 'Photograph',
    'Prints': 'Print',
    'Textiles': 'Textile',
    'Woodwork': 'Wood',
    'Woodblock': 'Wood',
    'Leatherwork': 'Leather',
    'Bone': 'Bone/Ivory',
    'Ivory': 'Bone/Ivory',
    'Ivories': 'Bone/Ivory',
    'Enamels': 'Enamel',
    'Silver': 'Gold and Silver',
    'Furniture and woodwork': 'Wood',
    'Stone Sculpture': 'Sculpture',
    'Jade': 'Gems',
    'Codices': 'Books',
    'Shafted Weapons': 'Arms and Armor',
    'Swords': 'Arms and Armor'
}

def get_first_part(x):
    x = str(x)
    if '-' in x:
        return x.split('-')[0]
    return x

def mapping(x):
    x = get_first_part(x)
    if 'Armor' in x:
        return 'Arms and Armor'
    if x in grouping:
        return grouping[x]
    return x

tqdm.pandas()

df['type'] = df['type'].progress_map(mapping)


  0%|          | 0/133163 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['type'] = df['type'].progress_map(mapping)


In [30]:
types_to_select = list(df['type'].value_counts()[df['type'].value_counts() > 500].index)
types_to_select.remove('nan')

df = df[df['type'].isin(types_to_select)]

ValueError: list.remove(x): x not in list

In [31]:
types_to_select

['Print',
 'Photograph',
 'Drawing',
 'Ceramic',
 'Textile',
 'Index of American Design',
 'Painting',
 'Sculpture',
 'Vases',
 'Metalwork',
 'Bound Volume',
 'Glass',
 'Arms and Armor',
 'Manuscript',
 'Jewelry',
 'Wood',
 'Lace',
 'Books',
 'Miscellaneous',
 'Gold and Silver',
 'Coins',
 'Gems',
 'Metal',
 'Embroidery',
 'Portfolio']

In [27]:
if not os.path.exists(CLASS_EXAMPLES_DIRECTORY):
    os.mkdir(CLASS_EXAMPLES_DIRECTORY)



In [28]:
for type in types_to_select:
    dir_path = os.path.join(CLASS_EXAMPLES_DIRECTORY, type.replace('/', ', '))
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
        entries_of_type = df[df['type'] == type]
        df_to_export = entries_of_type.sample(n=EXAMPLES_TO_EXPORT_PER_CLASS)
        for _, row in df_to_export.iterrows():
            shutil.copyfile(row['path'], os.path.join(dir_path, os.path.basename(row['path'])))

In [32]:
df['type'].value_counts()

type
Print                       39877
Photograph                  12513
Drawing                     11647
Ceramic                      7663
Textile                      7575
Index of American Design     7423
Painting                     6690
Sculpture                    5587
Vases                        5109
Metalwork                    3680
Bound Volume                 2795
Glass                        2063
Arms and Armor               1722
Manuscript                   1429
Jewelry                      1316
Wood                         1216
Lace                         1153
Books                        1049
Miscellaneous                 880
Gold and Silver               788
Coins                         766
Gems                          727
Metal                         646
Embroidery                    591
Portfolio                     524
Bronzes                       418
Sword Furniture               410
Stucco                        400
Volume                        397
Lacquer  