# Data Preparation

In this notebook, we'll read the joined metadata file and do some data preprocessing to get it ready for the ResNet to be trained on.
To see how these decisions were made, see the Data Exploration notebook.

### Constants and imports

In [5]:
import os
import pandas as pd
import pycountry
from tqdm.notebook import tqdm

# Directory that holds the metadata CSV files read and written by this notebook.
METADATA_DIRECTORY = 'metadata'
# Input: the joined metadata file loaded into `df`.
JOINED_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'metadata_joined.csv')
# Output: where the prepared dataset is saved at the end of the notebook.
OUTPUT_FILE = os.path.join(METADATA_DIRECTORY, 'dataset.csv')
# Selects how the 'class' column is derived: 'culture' maps the 'culture' column
# through country_mapping; any other value maps the 'type' column through type_mapping.
USE_MAPPING = 'type'

### Loading the metadata

In [6]:
# Load the joined metadata CSV into the working DataFrame used by the rest of the notebook.
df = pd.read_csv(JOINED_METADATA_FILE)


### Setting up 'class' column, based on culture or type

#### Override cache values

In [7]:
# Hand-curated overrides from a raw culture string to the class label it should receive.
# These entries were iterated on to minimize wrong matches and 'unknown' results.
# The mapping functions also write their own results into this dict, so it grows
# while the mapping runs and doubles as a memoization cache.
# NOTE(review): 'Flanders' maps to 'Flanders' while 'Flemings'/'Flemish' map to 'Flemings',
# which yields two separate labels — confirm that this split is intended.
# NOTE(review): some targets ('Byzantium', 'Eastern India', 'Mughal India', 'Western India')
# are kept as their own labels rather than being mapped to a present-day country.
map_cache = {
    'America': 'United States',
    'American': 'United States',
    'Flanders': 'Flanders',
    'Venetian': 'Italy',
    'Florentine': 'Italy',
    'Tuscan': 'Italy',
    'Flemings': 'Flemings',
    'Flemish': 'Flemings',
    'Milanese': 'Italy',
    'Genoese': 'Italy',
    'Eastern India': 'Eastern India',
    'Mughal India': 'Mughal India',
    'Sardinia': 'Italy',
    'Byzantium': 'Byzantium',
    'Byzantium (style of)': 'Byzantium',
    'Bohemia': 'Czechia',
    'North Italy': 'Italy',
    'Ferrarese': 'Italy',
    'Sienese': 'Italy',
    'North coast of Peru': 'Peru',
    'Frankish': 'France',
    'Western India': 'Western India',
    'South German': 'Germany',
    'Italy(?)': 'Italy',
    'Chinese': 'China',
    'Netherlandish': 'Netherlands',
    'Austrian': 'Austria',
    'Bohemian': 'Czechia',
    'Paduan': 'Italy',
    'Veronese': 'Italy',
    'Southern France': 'France',
    'Japanese': 'Japan',
    'Greek': 'Greece',
    'Bolognese': 'Italy',
    'Mantuan': 'Italy',
    'North Italian': 'Italy',
    'Norwegian': 'Norway',
    'Canadian': 'Canada',
    'Belgian': 'Belgium',
    'Nuremberg': 'Germany',
    'Turkey': 'Turkey',
    'Swedish': 'Sweden'
}

#### Mapping functions

In [8]:
# Names of every current pycountry country, followed by every historic one;
# used for exact-name matching in attempt_raw_mapping.
country_names = [
    entry.name
    for registry in (pycountry.countries, pycountry.historic_countries)
    for entry in registry
]

def attempt_raw_mapping(x):
    """Resolve a culture string to a class label using exact matches only.

    Lookup order:
      1. ``x`` in ``map_cache``.
      2. ``x`` equal to a name in ``country_names``.
      3. The text before the first comma of ``x`` in ``map_cache``.
      4. The text before the first comma of ``x`` equal to a name in ``country_names``.

    Every successful lookup is written back to ``map_cache`` so repeated values
    are answered by step 1.

    Args:
        x: culture string to resolve.

    Returns:
        The matched label, or ``None`` when no exact match exists.
    """
    if x in map_cache:
        return map_cache[x]
    if x in country_names:
        map_cache[x] = x
        return x

    # Retry with only the leading part, e.g. 'France, Paris' -> 'France'.
    y = x.split(',')[0]
    if y in map_cache:
        map_cache[x] = map_cache[y]
        return map_cache[y]
    # Compare the prefix (not the full string again) against the country names;
    # the full string was already checked above and cannot match here.
    if y in country_names:
        map_cache[x] = y
        map_cache[y] = y
        return y
    return None

def _fuzzy_country_name(query):
    """Return the name of the best fuzzy match for ``query``, or ``None``.

    ``pycountry.countries`` is tried first and ``pycountry.historic_countries``
    second. A database whose lookup fails for any reason is skipped.
    """
    for database in (pycountry.countries, pycountry.historic_countries):
        try:
            best = database.search_fuzzy(query)[0]
            # A match that carries a `country` attribute is reported under that
            # country's name. hasattr() is used because `'country' in best` is a
            # membership test on the object, not an attribute check.
            if hasattr(best, 'country'):
                return best.country.name
            return best.name
        except Exception:
            # Best-effort lookup: no match, or a database without fuzzy search,
            # simply moves on. Unlike a bare `except:`, KeyboardInterrupt still
            # propagates so a long mapping run can be stopped.
            continue
    return None


def attempt_fuzzy_mapping(x):
    """Resolve a culture string to a class label using pycountry's fuzzy search.

    Lookup order:
      1. ``x`` in ``map_cache``.
      2. Fuzzy search on ``x``.
      3. The text before the first comma of ``x`` in ``map_cache``.
      4. Fuzzy search on the text before the first comma of ``x``.

    The outcome, including ``'unknown'``, is stored in ``map_cache[x]``.

    Args:
        x: culture string to resolve.

    Returns:
        The matched country name, or ``'unknown'`` when nothing matches.
    """
    if x in map_cache:
        return map_cache[x]

    result = _fuzzy_country_name(x)
    if result is None:
        # Retry with only the leading part, e.g. 'Italia, nord' -> 'Italia'.
        y = x.split(',')[0]
        if y in map_cache:
            result = map_cache[y]
        else:
            result = _fuzzy_country_name(y)
    if result is None:
        result = 'unknown'

    map_cache[x] = result
    return result

def country_mapping(x):
    """Map one 'culture' cell to a class label.

    Exact matching (``attempt_raw_mapping``) is tried first; fuzzy matching
    (``attempt_fuzzy_mapping``) is the fallback.

    Args:
        x: raw cell value; it is converted with ``str`` before matching.

    Returns:
        The matched label, or ``'unknown'`` for missing values and for values
        that nothing matches.
    """
    # A missing cell (None/NaN) would otherwise be stringified to 'nan' and sent
    # to the fuzzy matcher, which may return an unrelated country for that text.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return 'unknown'

    x = str(x)
    raw = attempt_raw_mapping(x)
    if raw is not None:
        return raw
    return attempt_fuzzy_mapping(x)

# Register tqdm's progress-reporting methods on pandas objects; the mapping cell
# relies on `progress_map`.
tqdm.pandas()

In [9]:
# Maps a raw object type (the text before the first '-') to the class label it is
# grouped under. Types not listed here keep their own name as the label.
grouping = {
    'Ceramics': 'Ceramic',
    'Drawings': 'Drawing',
    # Use the same label that type_mapping assigns to every type containing
    # 'Armor'; the previous target 'Armor' produced a separate class of its own.
    'Helmets': 'Arms and Armor',
    'Manuscripts and Illuminations': 'Manuscript',
    'Miniatures': 'Miniature',
    'Paintings': 'Painting',
    'Photographs': 'Photograph',
    'Prints': 'Print',
    'Textiles': 'Textile',
    'Woodwork': 'Wood',
    'Woodblock': 'Wood',
    'Leatherwork': 'Leather',
    'Bone': 'Bone/Ivory'
}

def get_first_part(x):
    """Return the text of ``str(x)`` that precedes the first '-'.

    When there is no '-', the whole string is returned.
    """
    head, _, _ = str(x).partition('-')
    return head

def type_mapping(x):
    """Map one 'type' cell to a class label.

    Only the part before the first '-' is considered. Anything containing
    'Armor' becomes 'Arms and Armor'; otherwise the ``grouping`` table is
    consulted, and types it does not list are returned unchanged.
    """
    label = get_first_part(x)
    return 'Arms and Armor' if 'Armor' in label else grouping.get(label, label)

# Register tqdm's progress-reporting methods on pandas objects.
# NOTE(review): the same call appears in other cells too; one call looks sufficient — confirm.
tqdm.pandas()


#### Running mapping

In [11]:
tqdm.pandas()

# A class is kept only when it has more than this many rows.
MIN_SAMPLES_PER_CLASS = 100

# 'culture' labels each row by country; any other USE_MAPPING value labels by type.
if USE_MAPPING == 'culture':
    df['class'] = df['culture'].progress_map(country_mapping)
else:
    df['class'] = df['type'].progress_map(type_mapping)

# Count once, then keep the sufficiently large classes, leaving out the
# placeholder labels for missing ('nan') and unresolved ('unknown') values.
class_counts = df['class'].value_counts()
classes_to_select = [
    label
    for label in class_counts[class_counts > MIN_SAMPLES_PER_CLASS].index
    if label not in ('nan', 'unknown')
]

# .copy() makes the filtered frame independent of the unfiltered one, so later
# column assignments on `df` do not trigger pandas' SettingWithCopyWarning.
df = df[df['class'].isin(classes_to_select)].copy()

  0%|          | 0/61621 [00:00<?, ?it/s]

#### Filter out rows that lack downloaded images

In [12]:
def check_image_existance(path):
    """Return ``path`` if it exists on disk, otherwise the sentinel ``'unknown'``.

    Args:
        path: value of one 'path' cell; expected to be a filesystem path.

    Returns:
        ``path`` unchanged when it exists, ``'unknown'`` when it does not exist
        or is not a path at all (e.g. a missing value).
    """
    # Missing cells arrive as None/NaN, and os.path.exists raises TypeError for
    # those; treat anything that is not a path as "no image".
    if not isinstance(path, (str, bytes, os.PathLike)):
        return 'unknown'
    if os.path.exists(path):
        return path
    return 'unknown'

# Replace every path that is not on disk with the 'unknown' sentinel,
# then keep only the rows whose image was found.
df['path'] = df['path'].progress_map(check_image_existance)

image_was_found = df['path'].ne('unknown')
df = df.loc[image_was_found]

  0%|          | 0/60224 [00:00<?, ?it/s]

### Saving the metadata into csv

In [13]:
# Write the filtered, labelled metadata to OUTPUT_FILE.
# NOTE(review): to_csv is called with default arguments, so the DataFrame index is
# written as the first column — confirm that whatever reads this file expects it.
df.to_csv(OUTPUT_FILE)