# Data Exploration

This notebook is a sandbox for exploring the dataset: it helps me understand the data, decide how to prepare it, and choose which target to predict.

### Constants and imports

In [1]:
import os
import shutil
import pandas as pd
import pycountry
import pygwalker as pyg
from tqdm.notebook import tqdm

# Folder that holds the metadata CSV files.
METADATA_DIRECTORY = 'metadata'
# Joined metadata table; this is the file the loading cells read into `df`.
JOINED_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'metadata_joined.csv')
# Folder that receives one sub-folder of sample images per class.
CLASS_EXAMPLES_DIRECTORY = 'class_examples'
# Number of rows sampled per class when exporting example images.
EXAMPLES_TO_EXPORT_PER_CLASS = 10

### Loading common dataset

In [2]:
# Read the joined metadata table into the working frame.
# NOTE(review): the saved output echoes this line (presumably a pandas warning whose
# first line was cut off) and shows an 'Unnamed: 0' column, which suggests the CSV was
# written together with its index — confirm before relying on the column layout.
df = pd.read_csv(JOINED_METADATA_FILE, lineterminator='\n')

# Preview the first rows.
df.head()

  df = pd.read_csv(JOINED_METADATA_FILE, lineterminator='\n')


Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path
0,cma_96021,Crossbow Bolt,1500s–1600s,,,"Germany, 16th-17th century","wood, leather, steel",Arms and Armor,Medieval Art,MED - Arms & Armor,dataset/raw/cma_96021.jpg
1,cma_384221,"West and South Sides of the Vimana Walls, Grea...",1858,,,"England, 19th century",albumenized salt print from a waxed paper nega...,Photograph,Photography,PH - British 19th Century,dataset/raw/cma_384221.jpg
2,cma_99725,Sword Guard (Tsuba),1615–1868,,,"Japan, Edo period (1615-1868)",iron,Arms and Armor,Japanese Art,Japanese Art,dataset/raw/cma_99725.jpg
3,cma_138286,"The Gotha Missal: Fol. 7r, Text",c. 1375,,,"France, Paris","ink, tempera, and gold on vellum; blind-tooled...",Bound Volume,Medieval Art,MED - Manuscript Illuminations,dataset/raw/cma_138286.jpg
4,cma_102552,Fragment,c. 1900–1910,,,"Italy, Florence, 16th century",glazed terracotta,Sculpture,European Painting and Sculpture,Sculpture,dataset/raw/cma_102552.jpg


### Checking type values

In [3]:
# Open the pygwalker interactive explorer on the frame to inspect the column values by hand.
walker = pyg.walk(df)

Box(children=(HTML(value='<div id="ifr-pyg-000612c3550a693bcYim71LagkvdZG6W" style="height: auto">\n    <head>…

### Extracting culture values

In [4]:
# Names of every current country followed by every historic country known to pycountry;
# used for exact-name matching of `culture` values.
country_names = [
    country.name
    for database in (pycountry.countries, pycountry.historic_countries)
    for country in database
]

# Hand-curated overrides from a raw `culture` value (or the part before its first comma)
# to the label used as the country.
#
# This dict is also the memo cache for the lookups: attempt_raw_mapping and
# attempt_fuzzy_mapping both write every resolved (or 'unknown') value back into it, so
# it grows while the column is being mapped and must be re-created to start from scratch.
#
# NOTE(review): some entries keep a non-country label on purpose or by accident
# ('Flanders', 'Flemings', 'Byzantium', 'Eastern India', ...), and 'Flanders' and
# 'Flemish' end up under two different labels — confirm that is intended.
map_cache = { # I've iterated a lot on this cache, to minimize false values and unknowns
    'America': 'United States',
    'American': 'United States',
    'Flanders': 'Flanders',
    'Venetian': 'Italy',
    'Florentine': 'Italy',
    'Tuscan': 'Italy',
    'Flemings': 'Flemings',
    'Flemish': 'Flemings',
    'Milanese': 'Italy',
    'Genoese': 'Italy',
    'Eastern India': 'Eastern India',
    'Mughal India': 'Mughal India',
    'Sardinia': 'Italy',
    'Byzantium': 'Byzantium',
    'Byzantium (style of)': 'Byzantium',
    'Bohemia': 'Czechia',
    'North Italy': 'Italy',
    'Ferrarese': 'Italy',
    'Sienese': 'Italy',
    'North coast of Peru': 'Peru',
    'Frankish': 'France',
    'Western India': 'Western India',
    'South German': 'Germany',
    'Italy(?)': 'Italy',
    'Chinese': 'China',
    'Netherlandish': 'Netherlands',
    'Austrian': 'Austria',
    'Bohemian': 'Czechia',
    'Paduan': 'Italy',
    'Veronese': 'Italy',
    'Southern France': 'France',
    'Japanese': 'Japan',
    'Greek': 'Greece',
    'Bolognese': 'Italy',
    'Mantuan': 'Italy',
    'North Italian': 'Italy',
    'Norwegian': 'Norway',
    'Canadian': 'Canada',
    'Belgian': 'Belgium',
    'Nuremberg': 'Germany',
    'Turkey': 'Turkey',
    'Swedish': 'Sweden'
}

def attempt_raw_mapping(x, cache=None, names=None):
    """Resolve a culture string by exact lookup only (no fuzzy search).

    Lookup order:
      1. the whole string in the cache;
      2. the whole string as an exact country name;
      3. the part before the first comma in the cache;
      4. the part before the first comma as an exact country name.

    Every successful resolution is written back into the cache so the next
    occurrence of the same string is answered by step 1.

    Args:
        x: Culture string to resolve (already converted to ``str`` by the caller).
        cache: Dict used both as override table and memo cache. Defaults to the
            module-level ``map_cache``.
        names: Collection of accepted country names. Defaults to the module-level
            ``country_names``.

    Returns:
        The resolved label, or ``None`` when none of the four steps matches.
    """
    if cache is None:
        cache = map_cache
    if names is None:
        names = country_names

    if x in cache:
        return cache[x]
    if x in names:
        cache[x] = x
        return x

    # e.g. "Germany, 16th-17th century" -> "Germany"
    head = x.split(',')[0]
    if head in cache:
        cache[x] = cache[head]
        return cache[head]
    # The original compared the full string `x` here a second time, so this
    # fallback could never match; it has to test the part before the comma.
    if head in names:
        cache[x] = head
        cache[head] = head
        return head
    return None

def _fuzzy_lookup(query, databases):
    """Return the name of the first fuzzy match for `query`, or None if every database fails."""
    for database in databases:
        try:
            match = database.search_fuzzy(query)[0]
            # Read the parent country through attribute access. The original used
            # `'country' in result`, which asks whether the result *contains* that
            # string, not whether it has such an attribute.
            parent = getattr(match, 'country', None)
            return parent.name if parent is not None else match.name
        except Exception:
            # Best effort: any lookup failure just moves on to the next database.
            # `Exception` (not a bare `except:`) so Ctrl-C / kernel interrupt still
            # stops a long progress_map run.
            continue
    return None


def attempt_fuzzy_mapping(x, cache=None, databases=None):
    """Resolve a culture string with fuzzy country search, memoizing the answer.

    Lookup order:
      1. the whole string in the cache;
      2. fuzzy search of the whole string in each database, in order;
      3. the part before the first comma in the cache;
      4. fuzzy search of the part before the first comma in each database;
      5. give up and record ``'unknown'``.

    Whatever is returned is stored in the cache under the full string ``x``.

    Args:
        x: Culture string to resolve (already converted to ``str`` by the caller).
        cache: Dict used both as override table and memo cache. Defaults to the
            module-level ``map_cache``.
        databases: Sequence of objects exposing ``search_fuzzy(query)``, tried in
            order. Defaults to ``(pycountry.countries, pycountry.historic_countries)``.

    Returns:
        The resolved country name, or the string ``'unknown'``.
    """
    if cache is None:
        cache = map_cache
    if databases is None:
        databases = (pycountry.countries, pycountry.historic_countries)

    if x in cache:
        return cache[x]

    result = _fuzzy_lookup(x, databases)
    if result is None:
        head = x.split(',')[0]
        # Without a comma `head` is `x` itself, which has just failed; skip the repeat.
        if head != x:
            if head in cache:
                cache[x] = cache[head]
                return cache[head]
            result = _fuzzy_lookup(head, databases)

    if result is None:
        result = 'unknown'
    cache[x] = result
    return result

def mapping(x):
    """Map one raw `culture` value to a country label.

    The value is converted to ``str`` first, then resolved by exact lookup
    (``attempt_raw_mapping``); only when that yields ``None`` is the fuzzy
    lookup (``attempt_fuzzy_mapping``) used.

    Note: a later cell of this notebook defines another function called
    ``mapping``; after that cell runs, this definition is shadowed.
    """
    culture = str(x)
    exact = attempt_raw_mapping(culture)
    return exact if exact is not None else attempt_fuzzy_mapping(culture)

# Register tqdm's progress_* methods (such as progress_map) on pandas objects.
tqdm.pandas()

# Left disabled: would derive a 'country' column from 'culture' through mapping().
# df['country'] = df['culture'].progress_map(mapping)

In [5]:
# df['country']

In [6]:
# walker = pyg.walk(df)

In [7]:
#df[df['country'] == 'unknown']

In [8]:
# df[df['country'] == 'unknown'].groupby(['culture'])['culture'].count().reset_index(name='count').sort_values(['count'], ascending=False).head(10)

In [9]:
# list(pycountry.historic_countries)

### Extracting classes: Attempt 2
The first attempt did not get the network to learn anything, even after dozens of epochs. This time, I'll use the `type` column as the target.

In [10]:
# Re-read the metadata from disk so this section starts from the unmodified table,
# independent of anything earlier cells did to `df`.
df = pd.read_csv(JOINED_METADATA_FILE, lineterminator='\n')

# Preview the first rows.
df.head()

  df = pd.read_csv(JOINED_METADATA_FILE, lineterminator='\n')


Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path
0,cma_96021,Crossbow Bolt,1500s–1600s,,,"Germany, 16th-17th century","wood, leather, steel",Arms and Armor,Medieval Art,MED - Arms & Armor,dataset/raw/cma_96021.jpg
1,cma_384221,"West and South Sides of the Vimana Walls, Grea...",1858,,,"England, 19th century",albumenized salt print from a waxed paper nega...,Photograph,Photography,PH - British 19th Century,dataset/raw/cma_384221.jpg
2,cma_99725,Sword Guard (Tsuba),1615–1868,,,"Japan, Edo period (1615-1868)",iron,Arms and Armor,Japanese Art,Japanese Art,dataset/raw/cma_99725.jpg
3,cma_138286,"The Gotha Missal: Fol. 7r, Text",c. 1375,,,"France, Paris","ink, tempera, and gold on vellum; blind-tooled...",Bound Volume,Medieval Art,MED - Manuscript Illuminations,dataset/raw/cma_138286.jpg
4,cma_102552,Fragment,c. 1900–1910,,,"Italy, Florence, 16th century",glazed terracotta,Sculpture,European Painting and Sculpture,Sculpture,dataset/raw/cma_102552.jpg


In [11]:
# Open the pygwalker interactive explorer on the freshly loaded frame.
walker = pyg.walk(df)

Box(children=(HTML(value='<div id="ifr-pyg-000612c355443d39a41m2W8TKqpekN9j" style="height: auto">\n    <head>…

In [12]:
# Raw `type` value (after get_first_part has cut it at the first '-') -> class label.
# Values that are not listed here keep their own name as the class label.
#
# NOTE(review): the class list printed further down still contains near-duplicates that
# this table does not merge, e.g. 'Vessel' next to 'Vessels' and 'Gem' next to
# 'Gem-based wares' — confirm whether those should be folded together.
grouping = {
    # Clay, plaster and similar
    'Ceramics': 'Sediment-based wares',
    'Ceramic': 'Sediment-based wares',
    'Stucco': 'Sediment-based wares',
    'Terracottas': 'Terracottas',

    'Miniatures': 'Miniature',

    'Photographs': 'Photograph',
    'Card photograph': 'Photograph',

    # Everyday and ceremonial objects
    'Basketry': 'Basketry',
    'Vases': 'Vases',
    'Spindle Whorl': 'Spindle Whorl',
    'Implements': 'Common Items and Festives',
    'Vessels': 'Vessels',
    'Horology': 'Common Items and Festives',
    'Funerary Equipment': 'Funerary Equipment',
    'Decorative Art': 'Common Items and Festives',
    'Coins': 'Coins',
    'Coin': 'Coins',
    'Equestrian Equipment': 'Equestrian Equipment',

    'Leatherwork': 'Leather',

    'Enamels': 'Enamels',
    'Enamel': 'Enamels',
    'Glass': 'Glass and glaze',
    'Lacquer': 'Lacquer',

    'Stone Sculpture': 'Sculpture',

    'Jade': 'Gem-based wares',
    'Stone': 'Stone',
    'Gems': 'Gem-based wares',

    'Woodwork': 'Organic-based wares',
    'Wood': 'Organic-based wares',
    'Woodblock': 'Organic-based wares',
    'Furniture and woodwork': 'Organic-based wares',

    'Bone/Ivory': 'Bone/Ivory',
    'Bone': 'Bone/Ivory',
    'Ivory': 'Bone/Ivory',
    'Ivories': 'Bone/Ivory',
    'Ivories and Bone': 'Bone/Ivory',

    # Weapons ('Shafted Weapons' was listed twice in the original; one entry suffices)
    'Arms': 'Arms and Armor',
    'Daggers': 'Arms and Armor',
    'Helmets': 'Arms and Armor',
    'Shafted Weapons': 'Arms and Armor',
    'Swords': 'Arms and Armor',
    'Sword Furniture': 'Arms and Armor',
    'Archery Equipment': 'Arms and Armor',
    'Firearms': 'Arms and Armor',

    'Garment': 'Clothing',
    'Costumes': 'Clothing',

    'Textiles': 'Fabrics and fabric-based manufacture',
    'Textile': 'Fabrics and fabric-based manufacture',
    'Velvet': 'Fabrics and fabric-based manufacture',

    'Lace': 'Lace',

    'Aerophone': 'Music instruments',
    'Chordophone': 'Music instruments',
    'Idiophone': 'Music instruments',
    'Musical Instrument': 'Music instruments',
    'Membranophone': 'Music instruments',

    'Metalwork': 'Metalurgy',
    'Metal': 'Metalurgy',
    'Bronzes': 'Metalurgy',
    'Gold and Silver': 'Metalurgy',
    'Silver': 'Metalurgy',

    # Works on paper
    'Drawings': 'Drawings',
    'Drawing': 'Drawings',
    'Drawing and Watercolor': 'Drawings',
    'Illustrated Books': 'Drawings',

    'Portrait Miniature': 'Portrait Miniature',
    'Miniature Painting': 'Portrait Miniature',

    'Albums': 'Drawings',
    'Index of American Design': 'Drawings',

    'Prints': 'Prints',
    'Print': 'Prints',

    'Portfolio': 'Portfolio',

    'Paintings': 'Paintings',
    'Painting': 'Paintings',

    'Calligraphy': 'Calligraphy',

    'Chess Sets': 'Gaming pieces',

    'Book': 'Books',
    'Books': 'Books',
    'Codices': 'Books',
    'Bound Volume': 'Books',
    'Volume': 'Books',

    'Manuscript': 'Manuscripts and illuminations',
    'Manuscripts and Illuminations': 'Manuscripts and illuminations',

    # Small personal objects
    'Jewelry': 'Jewelry',
    'Medals and Plaquettes': 'Medals and Plaquettes',
    'Medals': 'Medals and Plaquettes',
    'Netsuke': 'Netsuke',
    'Snuff Bottles': 'Snuff Bottles',
    'Amulets': 'Amulets',

    'Fans': 'Fans',
    'Inrō': 'Inrō',
}
def get_first_part(x):
    """Return the text of `x` that precedes its first '-'.

    `x` is converted with ``str`` first, so a missing value (NaN) comes back as the
    string ``'nan'``. A value without any '-' is returned whole.
    """
    head, _, _ = str(x).partition('-')
    return head

def mapping(x):
    """Normalize one raw `type` value to its class label.

    The value is first cut at its first '-' (``get_first_part``). Anything whose
    remaining text contains 'Armor' becomes 'Arms and Armor'; otherwise the label
    is looked up in ``grouping`` and, when absent there, kept as it is.

    Note: this reuses the name ``mapping`` from the earlier culture-to-country
    cell and replaces that definition once this cell has run.
    """
    head = get_first_part(x)
    if 'Armor' in head:
        return 'Arms and Armor'
    return grouping.get(head, head)

# Register tqdm's progress_* methods on pandas objects (provides progress_map below).
tqdm.pandas()

# Overwrite 'type' in place with the normalized class label. The raw values are gone
# from `df` after this line; reload the CSV to get them back.
df['type'] = df['type'].progress_map(mapping)


  0%|          | 0/279293 [00:00<?, ?it/s]

In [13]:
# A class is kept only when it has strictly more rows than this.
MIN_ROWS_PER_TYPE = 10

type_counts = df['type'].value_counts()

# 'nan' is the label get_first_part produces for missing values (str() of NaN).
# It is filtered out here rather than with list.remove(), which raises ValueError when
# the label is absent — for instance when this cell is re-run on the already filtered df.
types_to_select = [
    label
    for label in type_counts[type_counts > MIN_ROWS_PER_TYPE].index
    if label != 'nan'
]

# Keep only the rows whose class survived the selection.
df = df[df['type'].isin(types_to_select)]

In [14]:
# Show the retained class labels (ordered by descending row count, as produced by value_counts).
types_to_select

['Prints',
 'Drawings',
 'Photograph',
 'Fabrics and fabric-based manufacture',
 'Sediment-based wares',
 'Paintings',
 'Sculpture',
 'Metalurgy',
 'Vases',
 'Books',
 'Stereograph',
 'Glass and glaze',
 'Arms and Armor',
 'Coins',
 'Vessel',
 'Manuscripts and illuminations',
 'Jewelry',
 'Organic-based wares',
 'Costume and Accessories',
 'Folio',
 'Decorative Arts',
 'Lace',
 'Miscellaneous',
 'Gem-based wares',
 'Common Items and Festives',
 'Music instruments',
 'Portfolio',
 'Embroidery',
 'Bone/Ivory',
 'Cased object',
 'Stone',
 'Medals and Plaquettes',
 'Enamels',
 'Lacquer',
 'Terracottas',
 'Clothing',
 'Furniture',
 'Netsuke',
 'Religious/Ritual Object',
 'Funerary Equipment',
 'Portrait Miniature',
 'Stencils',
 'Basketry',
 'Inrō',
 'Equestrian Equipment',
 'Snuff Bottles',
 'Architectural Drawing',
 'Lamp',
 'Fans',
 'Architectural fragment',
 'Female figure',
 'Gem',
 'Spindle Whorl',
 'Calligraphy',
 'Frames',
 'Vessels',
 'Gaming pieces',
 'Amulets',
 'Cup',
 'Lapidary

In [15]:
# Create the export folder if needed; exist_ok avoids the separate existence check
# (and the error if the folder appears between the check and the mkdir).
os.makedirs(CLASS_EXAMPLES_DIRECTORY, exist_ok=True)


In [16]:
# Copying example images touches the file system, so the export is opt-in.
# The original disabled it with a bare `break` as the first loop statement, which left
# the whole body unreachable; the flag keeps the same default (nothing is exported).
EXPORT_CLASS_EXAMPLES = False
# Seed for the per-class sample so a re-export picks the same images.
EXAMPLES_SAMPLE_SEED = 0

if EXPORT_CLASS_EXAMPLES:
    # `type_name` rather than `type`, so the builtin is not shadowed in the kernel.
    for type_name in types_to_select:
        # '/' is not usable inside a folder name ('Bone/Ivory' -> 'Bone, Ivory').
        dir_path = os.path.join(CLASS_EXAMPLES_DIRECTORY, type_name.replace('/', ', '))
        if os.path.exists(dir_path):
            # Classes that already have a folder are left alone.
            continue
        os.mkdir(dir_path)

        entries_of_type = df[df['type'] == type_name]
        # Never ask for more rows than the class has; sample() would raise otherwise.
        sample_size = min(EXAMPLES_TO_EXPORT_PER_CLASS, len(entries_of_type))
        df_to_export = entries_of_type.sample(n=sample_size, random_state=EXAMPLES_SAMPLE_SEED)

        for source_path in df_to_export['path']:
            shutil.copyfile(source_path, os.path.join(dir_path, os.path.basename(source_path)))

In [17]:

# Raise the display limit so the full per-class table is shown instead of a truncated one.
# Note: set_option changes the setting for the rest of the kernel session.
pd.set_option('display.max_rows', 150)
# Rows per class after filtering.
df['type'].value_counts()

type
Prints                                          92419
Drawings                                        42808
Photograph                                      27342
Fabrics and fabric-based manufacture            13985
Sediment-based wares                            10874
Paintings                                       10430
Sculpture                                        8786
Metalurgy                                        7344
Vases                                            6670
Books                                            5192
Stereograph                                      4497
Glass and glaze                                  3668
Arms and Armor                                   3450
Coins                                            2065
Vessel                                           1868
Manuscripts and illuminations                    1490
Jewelry                                          1426
Organic-based wares                              1403
Costume and Accessories