# Data Preparation

In this notebook, we'll read the joined metadata file and do some data preprocessing to get it ready for the ResNet to be trained on.
To see how these decisions were made, see the Data Exploration notebook.

### Constants and imports

In [1]:
import os
import pandas as pd
import pycountry
from tqdm.notebook import tqdm
import magic

# --- Directories ---
METADATA_DIRECTORY = 'metadata'
DATASET_DIRECTORY = 'dataset'
PREPROCESSED_DIRECTORY = os.path.join(DATASET_DIRECTORY, 'preprocessed')

# --- Files ---
# Input: the joined metadata read at the start of the notebook.
JOINED_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'metadata_joined.csv')
# Output: the filtered metadata written at the end of the notebook.
OUTPUT_FILE = os.path.join(METADATA_DIRECTORY, 'dataset.csv')

# --- Class selection ---
# 'culture' derives the class from the 'culture' column; any other value uses the 'type' column.
USE_MAPPING = 'type'
# A class is kept only when it has strictly more than this many rows.
MIN_CLASS_SAMPLES_COUNT = 300

# --- Image validation ---
# When True, an image that also exists under PREPROCESSED_DIRECTORY is accepted
# without inspecting its MIME type.
HAVING_IT_PREPROCESSED_IS_PROOF_OF_VALIDITY = True

### Loading the metadata

In [2]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the saved output of this cell looks like the tail of a warning
# raised on this line (presumably DtypeWarning) — confirm it is gone after a re-run.
df = pd.read_csv(JOINED_METADATA_FILE, lineterminator='\n', low_memory=False)


  df = pd.read_csv(JOINED_METADATA_FILE, lineterminator='\n')


### Setting up the 'class' column, based on culture or type

#### Override cache values

In [3]:
# Hand-written overrides from a culture string to the class name it should get.
# The mapping functions below consult this dict first and also write their own
# results back into it, so it doubles as a memo cache while the notebook runs.
# Some entries deliberately map to non-country labels (e.g. 'Byzantium').
map_cache = { # I've iterated a lot on this cache, to minimize false values and unknowns
    'America': 'United States',
    'American': 'United States',
    'Flanders': 'Flanders',
    'Venetian': 'Italy',
    'Florentine': 'Italy',
    'Tuscan': 'Italy',
    'Flemings': 'Flemings',
    'Flemish': 'Flemings',
    'Milanese': 'Italy',
    'Genoese': 'Italy',
    'Eastern India': 'Eastern India',
    'Mughal India': 'Mughal India',
    'Sardinia': 'Italy',
    'Byzantium': 'Byzantium',
    'Byzantium (style of)': 'Byzantium',
    'Bohemia': 'Czechia',
    'North Italy': 'Italy',
    'Ferrarese': 'Italy',
    'Sienese': 'Italy',
    'North coast of Peru': 'Peru',
    'Frankish': 'France',
    'Western India': 'Western India',
    'South German': 'Germany',
    'Italy(?)': 'Italy',
    'Chinese': 'China',
    'Netherlandish': 'Netherlands',
    'Austrian': 'Austria',
    'Bohemian': 'Czechia',
    'Paduan': 'Italy',
    'Veronese': 'Italy',
    'Southern France': 'France',
    'Japanese': 'Japan',
    'Greek': 'Greece',
    'Bolognese': 'Italy',
    'Mantuan': 'Italy',
    'North Italian': 'Italy',
    'Norwegian': 'Norway',
    'Canadian': 'Canada',
    'Belgian': 'Belgium',
    'Nuremberg': 'Germany',
    'Turkey': 'Turkey',
    'Swedish': 'Sweden'
}

#### Mapping functions

In [4]:
# Names of every current country followed by every historic country known to
# pycountry; used for exact-match lookups.
country_names = [
    country.name
    for database in (pycountry.countries, pycountry.historic_countries)
    for country in database
]

def attempt_raw_mapping(x):
    """Resolve a culture string to a class name using exact matches only.

    Lookup order:
      1. ``x`` in ``map_cache``
      2. ``x`` in ``country_names``
      3. the text before the first comma in ``map_cache``
      4. the text before the first comma in ``country_names``

    Every successful lookup is memoised in ``map_cache``.

    Args:
        x: Culture string to resolve.

    Returns:
        The mapped name, or ``None`` when no exact match exists.
    """
    if x in map_cache:
        return map_cache[x]
    if x in country_names:
        map_cache[x] = x
        return x

    head = x.split(',')[0]
    if head in map_cache:
        map_cache[x] = map_cache[head]
        return map_cache[head]
    # Match the comma-stripped prefix here. The full string already failed the
    # country-name check above, so comparing it again could never succeed.
    if head in country_names:
        map_cache[x] = head
        map_cache[head] = head
        return head
    return None

def _fuzzy_country_name(query):
    """Return the name of the best fuzzy match for ``query``, or ``None``.

    Current countries are searched before historic ones; the first database
    that yields a usable result wins.
    """
    for database in (pycountry.countries, pycountry.historic_countries):
        try:
            best = database.search_fuzzy(query)[0]
            # Prefer the owning country when the record exposes one; otherwise
            # the record itself is the country.
            owner = getattr(best, 'country', None)
            return (owner if owner is not None else best).name
        except Exception:
            # Best effort: a failed lookup just means trying the next database.
            # Catching Exception (not a bare except) keeps Ctrl-C working
            # during a long progress_map run.
            continue
    return None


def attempt_fuzzy_mapping(x):
    """Resolve a culture string to a country name using pycountry's fuzzy search.

    The full string is tried first. If that fails, the text before the first
    comma is tried: first against ``map_cache``, then with the fuzzy search.
    The outcome, including the ``'unknown'`` fallback, is memoised in
    ``map_cache`` under ``x``.

    Args:
        x: Culture string to resolve.

    Returns:
        The matched country name, or ``'unknown'`` when nothing matches.
    """
    if x in map_cache:
        return map_cache[x]

    result = _fuzzy_country_name(x)
    if result is None:
        head = x.split(',')[0]
        if head in map_cache:
            result = map_cache[head]
        elif head != x:
            # Only search again when stripping the comma suffix changed the query.
            result = _fuzzy_country_name(head)
    if result is None:
        result = 'unknown'

    map_cache[x] = result
    return result

def country_mapping(x):
    """Map a raw 'culture' value to a class name.

    The value is stringified, resolved through the exact-match lookup, and
    only handed to the fuzzy lookup when the exact one returns ``None``.
    """
    text = str(x)
    exact = attempt_raw_mapping(text)
    return exact if exact is not None else attempt_fuzzy_mapping(text)

# Enable tqdm's pandas integration so Series.progress_map is available to the mapping cell.
tqdm.pandas()

In [5]:
# Maps a raw object type (the text before the first '-') to the class it is
# grouped into. Types that are not listed keep their own name as the class.
# Keys must be unique: a repeated key in a dict literal silently overrides the
# earlier entry.
# NOTE(review): the saved outputs list 'Vessel', 'Gem' and 'Decorative Arts' as
# standalone classes, while 'Vessels', 'Gems' and 'Decorative Art' are remapped
# here — confirm whether those spellings should be merged as well.
grouping = {
    'Ceramics': 'Sediment-based wares',
    'Ceramic': 'Sediment-based wares',
    'Stucco': 'Sediment-based wares',
    'Terracottas': 'Terracottas',

    'Miniatures': 'Miniature',
    'Miniature': 'Miniature',
    # Previously written with the key 'Miniature', which overrode the entry
    # above and sent singular 'Miniature' to a different class than 'Miniatures'.
    'Miniature room': 'Miniature room',

    'Photographs': 'Photograph',
    'Card photograph': 'Photograph',

    'Basketry': 'Basketry',
    'Vases': 'Vases',
    'Vase': 'Vases',
    'Plasting vase': 'Vases',
    'Spindle Whorl': 'Spindle Whorl',
    'Implements': 'Common Items and Festives',
    'Jar': 'Vessels',
    'Vessels': 'Vessels',
    'Horology': 'Common Items and Festives',
    'Funerary Equipment': 'Funerary Equipment',
    'Funerary Object': 'Funerary Equipment',
    'Tomb Pottery': 'Funerary Equipment',
    'Decorative Art': 'Common Items and Festives',
    'Coins': 'Coins',
    'Coin': 'Coins',
    'Equestrian Equipment': 'Equestrian Equipment',

    'Leatherwork': 'Leather',

    'Enamels': 'Enamels',
    'Enamel': 'Enamels',
    'Glass': 'Glass and glaze',
    'Stained glass': 'Glass and glaze',
    'Lacquer': 'Lacquer',

    'Stone Sculpture': 'Sculpture',

    'Jade': 'Gem-based wares',
    'Stone': 'Stone',
    'Gems': 'Gem-based wares',

    'Woodwork': 'Organic-based wares',
    'Wood': 'Organic-based wares',
    'Woodblock': 'Organic-based wares',
    'Furniture and woodwork': 'Organic-based wares',

    'Bone/Ivory': 'Bone/Ivory',
    'Bone': 'Bone/Ivory',
    'Ivory': 'Bone/Ivory',
    'Ivories': 'Bone/Ivory',
    'Ivories and Bone': 'Bone/Ivory',

    # Any type containing 'Armor' is routed to this class by type_mapping itself.
    'Arms': 'Arms and Armor',
    'Daggers': 'Arms and Armor',
    'Helmets': 'Arms and Armor',
    'Shafted Weapons': 'Arms and Armor',
    'Swords': 'Arms and Armor',
    'Sword Furniture': 'Arms and Armor',
    'Archery Equipment': 'Arms and Armor',
    'Firearms': 'Arms and Armor',

    'Garment': 'Clothing',
    'Costumes': 'Clothing',

    'Textiles': 'Fabrics and fabric-based manufacture',
    'Textile': 'Fabrics and fabric-based manufacture',
    'Velvet': 'Fabrics and fabric-based manufacture',

    'Lace': 'Lace',

    'Aerophone': 'Music instruments',
    'Chordophone': 'Music instruments',
    'Idiophone': 'Music instruments',
    'Musical Instrument': 'Music instruments',
    'Membranophone': 'Music instruments',

    'Metalwork': 'Metalurgy',
    'Metal': 'Metalurgy',
    'Bronzes': 'Metalurgy',
    'Gold and Silver': 'Metalurgy',
    'Silver': 'Metalurgy',

    'Drawings': 'Drawings',
    'Drawing': 'Drawings',
    'Drawing and Watercolor': 'Drawings',
    'Illustrated Books': 'Drawings',

    'Portrait Miniature': 'Portrait Miniature',
    'Miniature Painting': 'Portrait Miniature',

    'Albums': 'Albums',
    'Album': 'Albums',

    'Index of American Design': 'Drawings',

    'Prints': 'Prints',
    'Print': 'Prints',

    'Portfolio': 'Portfolio',

    'Paintings': 'Paintings',
    'Painting': 'Paintings',
    'Female portrait': 'Paintings',
    'Male portrait': 'Paintings',

    'Calligraphy': 'Calligraphy',

    'Chess Sets': 'Gaming pieces',

    'Book': 'Books',
    'Books': 'Books',
    'Codices': 'Books',
    'Bound Volume': 'Books',
    'Volume': 'Books',

    'Manuscript': 'Manuscripts and illuminations',
    'Manuscripts and Illuminations': 'Manuscripts and illuminations',

    'Jewelry': 'Jewelry',
    'Earring': 'Jewelry',
    'Necklace': 'Jewelry',
    'Ring': 'Jewelry',
    'Bracelet': 'Jewelry',

    'Medals and Plaquettes': 'Medals and Plaquettes',
    'Medals': 'Medals and Plaquettes',
    'Netsuke': 'Netsuke',
    'Snuff Bottles': 'Snuff Bottles',
    'Pendant': 'Amulets',
    'Amulets': 'Amulets',

    'Fans': 'Fans',
    'Inrō': 'Inrō',

    'Scarabs': 'Scarab',
    'Scarab': 'Scarab'
}

def get_first_part(x):
    """Return the text of ``x`` that precedes its first '-'.

    ``x`` is stringified first; a value without any '-' is returned whole.
    """
    head, _, _ = str(x).partition('-')
    return head

def type_mapping(x):
    """Map a raw 'type' value to its class name.

    Only the part before the first '-' is considered. Anything containing
    'Armor' becomes 'Arms and Armor'; otherwise the ``grouping`` table is
    consulted, and an unlisted type is returned unchanged.
    """
    base_type = get_first_part(x)
    if 'Armor' in base_type:
        return 'Arms and Armor'
    return grouping.get(base_type, base_type)

# Enable tqdm's pandas integration so Series.progress_map is available to the mapping cell.
tqdm.pandas()


#### Running mapping

In [6]:
# Enable tqdm's pandas integration (provides Series.progress_map).
tqdm.pandas()

# Derive the 'class' column from whichever source column USE_MAPPING selects;
# any value other than 'culture' falls back to the 'type' column.
if USE_MAPPING == 'culture':
    df['class'] = df['culture'].progress_map(country_mapping)
else:
    df['class'] = df['type'].progress_map(type_mapping)

# Count each class once, keep those with strictly more than
# MIN_CLASS_SAMPLES_COUNT rows, and drop the placeholder labels:
# 'nan' is a stringified missing value, 'unknown' an unresolved culture.
class_counts = df['class'].value_counts()
classes_to_select = [
    label
    for label in class_counts[class_counts > MIN_CLASS_SAMPLES_COUNT].index
    if label not in ('nan', 'unknown')
]

# .copy() gives later cells an independent frame, so assigning a column to it
# does not operate on a slice of the unfiltered data.
df = df[df['class'].isin(classes_to_select)].copy()

  0%|          | 0/341710 [00:00<?, ?it/s]

#### Filter out rows whose image was not downloaded or is not a JPEG/PNG

In [7]:
def check_image_existance_and_type(path):
    """Return ``path`` if it points to a usable image, otherwise ``'unknown'``.

    A path is accepted when the file exists and either
      * HAVING_IT_PREPROCESSED_IS_PROOF_OF_VALIDITY is set and a file with the
        same base name exists in PREPROCESSED_DIRECTORY, or
      * its MIME type is ``image/jpeg`` or ``image/png``.

    Args:
        path: Location of the downloaded image. Values that are not paths
            (e.g. a NaN from a missing CSV field) are treated as missing.

    Returns:
        ``path`` unchanged when the image is usable, else ``'unknown'``.
    """
    # os.path.exists raises TypeError for float/None, so reject those up front.
    if not isinstance(path, (str, os.PathLike)):
        return 'unknown'
    if not os.path.exists(path):
        return 'unknown'

    if HAVING_IT_PREPROCESSED_IS_PROOF_OF_VALIDITY:
        # If it's preprocessed, then the file is valid (as it had to be to get preprocessed)
        preprocessed_path = os.path.join(PREPROCESSED_DIRECTORY, os.path.basename(path))
        if os.path.exists(preprocessed_path):
            return path

    mime_type = magic.from_file(path, mime=True)
    if mime_type in ('image/jpeg', 'image/png'):
        return path
    return 'unknown'

# Validate every path once; unusable ones come back as the sentinel 'unknown'.
checked_paths = df['path'].progress_map(check_image_existance_and_type)

# Filter through the computed mask instead of first writing the sentinel into
# df: usable paths are returned unchanged, so the kept rows are identical, and
# no column is assigned on a frame that is itself a filtered slice.
df = df[checked_paths != 'unknown'].copy()

  0%|          | 0/320358 [00:00<?, ?it/s]

### Saving the metadata to a CSV file

In [8]:
# Write the filtered metadata; index=False leaves the DataFrame index out of the file.
df.to_csv(OUTPUT_FILE, index=False)

In [9]:
# Classes that passed the sample-count threshold, most frequent first.
classes_to_select

['Prints',
 'Drawings',
 'Photograph',
 'Stereograph',
 'Fabrics and fabric-based manufacture',
 'Paintings',
 'Sediment-based wares',
 'Sculpture',
 'Metalurgy',
 'Vases',
 'Books',
 'Folio',
 'Glass and glaze',
 'Arms and Armor',
 'Coins',
 'Cased object',
 'Vessel',
 'Jewelry',
 'Manuscripts and illuminations',
 'Organic-based wares',
 'Costume and Accessories',
 'Decorative Arts',
 'Lace',
 'Miscellaneous',
 'Common Items and Festives',
 'Gem-based wares',
 'Music instruments',
 'Portfolio',
 'Female figure',
 'Lamp',
 'Embroidery',
 'Gem',
 'Bone/Ivory',
 'Stone',
 'Furniture',
 'Medals and Plaquettes',
 'Enamels',
 'Lacquer',
 'Terracottas',
 'Funerary Equipment',
 'Clothing',
 'Cup',
 'Netsuke',
 'Albums',
 'Religious/Ritual Object']

In [10]:
# Number of classes kept.
len(classes_to_select)

45

In [11]:
# Number of rows left after class selection and image validation.
len(df)

318563

In [12]:
# Rows per class in the final dataset.
df['class'].value_counts()

class
Prints                                  120022
Drawings                                 45271
Photograph                               32104
Stereograph                              18229
Fabrics and fabric-based manufacture     13465
Paintings                                10887
Sediment-based wares                     10872
Sculpture                                 9019
Metalurgy                                 7326
Vases                                     6715
Books                                     5439
Folio                                     5084
Glass and glaze                           3704
Arms and Armor                            3395
Coins                                     3119
Cased object                              1914
Vessel                                    1849
Jewelry                                   1595
Manuscripts and illuminations             1585
Organic-based wares                       1402
Costume and Accessories                   1288
Decorat