# Common Data Format Conversion
Every of our datasources keep their metadata in different formats and have different sets of parameters. In this notebook, we'll join them all into a proper .csv file for the future data processing and learning.

### Constants and imports

In [1]:
import pandas as pd
import json
import os
from tqdm.notebook import tqdm


METADATA_DIRECTORY = 'metadata'
CMA_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'cma.json')
MET_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'met.json')
NGA_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'nga.json')
AIC_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'aic.json')
GET_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'get.json')
RESULT_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'metadata_joined.csv')

### Preparing output dataframe

In [2]:
result = pd.DataFrame()

### Loading CMA data

In [3]:
if os.path.exists(CMA_METADATA_FILE):
    with open(CMA_METADATA_FILE, 'r') as f:
        cma_metadata = json.load(f)

In [4]:
if cma_metadata is not None:
    rows = []
    for art_id in cma_metadata:
        row = {}
        metadata = cma_metadata[art_id]
        row['id'] = f'cma_{art_id}'
        row['title'] = metadata['title']
        row['date'] = metadata['creation_date']
        row['begin_date'] = None
        row['end_date'] = None
        row['culture'] = metadata['culture']
        row['technique'] = metadata['technique']
        row['type'] = metadata['type']
        row['department'] = metadata['department']
        row['collection'] = metadata['collection']
        row['path'] = metadata['path']
        rows.append(row)
        
    df_cma = pd.DataFrame(rows)
    result = pd.concat([result, df_cma], ignore_index=True)

In [5]:
result.describe()

Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path
count,38599,38599,36989,0.0,0.0,38599,38255,38599,38599,38599,38599
unique,38599,27130,5317,0.0,0.0,4844,7144,63,19,165,38599
top,cma_96021,Page from Tales of a Parrot (Tuti-nama): text ...,1800s,,,"France, 19th century",etching,Print,Prints,PR - Etching,dataset/raw/cma_96021.jpg
freq,1,424,1093,,,3571,2586,10110,9936,3744,1


### Loading MET data

In [6]:
if os.path.exists(MET_METADATA_FILE):
    with open(MET_METADATA_FILE, 'r') as f:
        met_metadata = json.load(f)

In [7]:
if met_metadata is not None:
    rows = []
    for art_id in met_metadata:
        row = {}
        metadata = met_metadata[art_id]
        if 'path' not in metadata:
            continue
        row['id'] = f'met_{art_id}'
        row['title'] = metadata['objectName']
        row['date'] = metadata['objectDate']
        row['begin_date'] = metadata['objectBeginDate']
        row['end_date'] = metadata['objectEndDate']
        row['culture'] = f"{metadata['culture']}, {metadata['period']}"
        row['technique'] = metadata['medium']
        row['type'] = metadata['classification']
        row['department'] = metadata['department']
        row['collection'] = None
        row['path'] = metadata['path']
        rows.append(row)
        
    df_met = pd.DataFrame(rows)
    result = pd.concat([result, df_met], ignore_index=True)

In [8]:
result.describe()

Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path
count,119297,119297,117687,80698,80698,119297,118953,119297,119297,38599,119297
unique,119297,35890,14986,1422,1421,7834,22995,581,35,165,118780
top,cma_96021,Print,19th century,1700,1899,",",Terracotta,Prints,Drawings and Prints,PR - Etching,dataset/raw/met_4208.jpg
freq,1,12763,3319,4364,3850,40461,7199,13782,21330,3744,12


### Loading NGA data

In [9]:
if os.path.exists(NGA_METADATA_FILE):
    with open(NGA_METADATA_FILE, 'r') as f:
        nga_metadata = json.load(f)

In [10]:
if nga_metadata is not None:
    rows = []
    for art_id in nga_metadata:
        row = {}
        metadata = nga_metadata[art_id]
        if 'path' not in metadata:
            continue
        row['id'] = f'nga_{art_id}'
        row['title'] = metadata['title']
        row['date'] = metadata['displaydate']
        row['begin_date'] = metadata['beginyear']
        row['end_date'] = metadata['endyear']
        row['culture'] = metadata['terms']['School'] if 'School' in metadata['terms'] else None
        row['technique'] =  metadata['terms']['Technique'] if 'Technique' in metadata['terms'] else None
        row['type'] = metadata['classification']
        row['department'] = None
        row['collection'] = metadata['creditline']
        row['path'] = metadata['path']
        rows.append(row)
        
    df_nga = pd.DataFrame(rows)
    result = pd.concat([result, df_nga], ignore_index=True)

In [11]:
result.describe()

Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path
count,228869,228869,217953,190135,190124.0,217304,160882,228869,119297,148151,228869
unique,228869,110536,21765,1519,1485.0,8025,23195,586,35,3548,228352
top,cma_96021,Print,c. 1936,1936,1936.0,American,gelatin silver print,Print,Drawings and Prints,Index of American Design,dataset/raw/met_4208.jpg
freq,1,12763,4010,5056,4983.0,57783,12697,57610,21330,18200,12


### Loading AIC data

In [12]:
if os.path.exists(AIC_METADATA_FILE):
    with open(AIC_METADATA_FILE, 'r') as f:
        aic_metadata = json.load(f)

In [13]:
if aic_metadata is not None:
    rows = []
    for art_id in aic_metadata:
        row = {}
        metadata = aic_metadata[art_id]
        if 'path' not in metadata:
            continue
        row['id'] = f'aic_{art_id}'
        row['title'] = metadata['title']
        row['date'] = metadata['date_display']
        row['begin_date'] = metadata['date_start']
        row['end_date'] = metadata['date_end']
        row['culture'] = metadata['place_of_origin']
        row['technique'] =  metadata['medium_display']
        row['type'] = metadata['artwork_type_title']
        row['department'] = metadata['department_title']
        row['collection'] = metadata['credit_line']
        row['path'] = metadata['path']
        rows.append(row)
        
    df_aic = pd.DataFrame(rows)
    result = pd.concat([result, df_aic], ignore_index=True)

In [14]:
result.describe()

Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path
count,270820,270820,259786,231911,231900,259134,202315,270660,161236,190099,270820
unique,270820,137443,28107,1842,1795,8722,36825,606,45,7420,270198
top,cma_96021,Print,19th century,1800,1899,American,gelatin silver print,Print,Drawings and Prints,Index of American Design,dataset/raw/met_4208.jpg
freq,1,12763,4359,5213,5421,57783,12697,74729,21330,18200,12


### Loading GET data

In [15]:
if os.path.exists(GET_METADATA_FILE):
    with open(GET_METADATA_FILE, 'r') as f:
        get_metadata = json.load(f)

In [16]:
if get_metadata is not None:
    rows = []
    for art_id in tqdm(get_metadata):
        row = {}
        metadata = get_metadata[art_id]
        if 'path' not in metadata:
            continue
        row['id'] = metadata['id']
        row['title'] = metadata['title']
        row['type'] = metadata['type'] 
        row['path'] = metadata['path']
        row['department'] = metadata['department']
        row['collection'] = metadata['collection']
        row['culture'] = metadata['culture']
        row['technique'] = metadata['technique']
                
        row['date'] = metadata['date']
        row['begin_date'] = metadata['begin_date']
        row['end_date'] = metadata['end_date']
        rows.append(row)
        
    df_get = pd.DataFrame(rows)
    result = pd.concat([result, df_get], ignore_index=True)

  0%|          | 0/156511 [00:00<?, ?it/s]

In [17]:
result.describe()

Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path
count,341710,341710,330676,302674,302788,320040,273066,341344,232126,260987,341710
unique,341710,195595,32261,2677,2628,8957,38253,915,51,13034,341088
top,cma_96021,Print,n.d.,1865-01-01 00:00:00,1899,American,Albumen silver print,Print,Photographs (Curatorial Department),"The J. Paul Getty Museum, Los Angeles",dataset/raw/met_4208.jpg
freq,1,12763,4451,6404,5421,79245,36768,106677,56911,47653,12


### Saving the .csv file

In [18]:
result.to_csv(RESULT_METADATA_FILE, index=False)

In [19]:
result.tail(10)

Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path
341700,fff29df2-d221-48a5-bca4-42cdda47af2b,July Calendar Page; Reaping; Leo (Ms. Ludwig I...,early 16th century,1500-01-01 00:00:00,1525-12-31 23:59:59,German,Tempera colors,Folio,Manuscripts (Curatorial Department),"The J. Paul Getty Museum, Los Angeles, Ms. Lud...",dataset/raw/get_fff29df2-d221-48a5-bca4-42cdda...
341701,fff40fe9-3a63-43b6-8be8-155ec8ad247a,The Blind Beggar.,about 1862,1862-01-01 00:00:00,1862-12-31 23:59:59,British,Albumen silver print,Print,Photographs (Curatorial Department),"The J. Paul Getty Museum, Los Angeles",dataset/raw/get_fff40fe9-3a63-43b6-8be8-155ec8...
341702,fff5a494-354a-4259-a68f-ad86328fee6a,[Group of Tycoon's Officers],1866–1867,1866-01-01 00:00:00,1867-12-31 23:59:59,English,Hand-colored albumen silver print,Print,Photographs (Curatorial Department),"The J. Paul Getty Museum, Los Angeles",dataset/raw/get_fff5a494-354a-4259-a68f-ad8632...
341703,fff90fdc-1428-411d-97de-e1fa46d3a14b,"Old Bridge of Garry, Pass of Killiecrankie.","September 29, 1865",1865-01-01 00:00:00,1865-12-31 23:59:59,Scottish,Albumen silver print,Card photograph,Photographs (Curatorial Department),"The J. Paul Getty Museum, Los Angeles",dataset/raw/get_fff90fdc-1428-411d-97de-e1fa46...
341704,fff9971a-0432-4b31-ab42-81979e6f6240,[Man in pensive mood seated at riverbank],1857–1859,1857-01-01 00:00:00,1859-12-31 23:59:59,English,Hand colored albumen silver print,Stereograph,Photographs (Curatorial Department),Gift of Weston J. and Mary M. Naef,dataset/raw/get_fff9971a-0432-4b31-ab42-81979e...
341705,fff9f072-d6b9-4bd3-9c29-abff16d335bf,"Inhabited Initial S (Ms. 2 (84.ML.67), fol. 124)",about 1450–1455,1450-01-01 00:00:00,1455-12-31 23:59:59,Flemish,"Tempera colors, gold leaf, and ink",Folio,Manuscripts (Curatorial Department),"The J. Paul Getty Museum, Los Angeles, Ms. 2, ...",dataset/raw/get_fff9f072-d6b9-4bd3-9c29-abff16...
341706,fffb8b83-efe8-4ebd-bfeb-45ce34d8b6ec,"Boulevard des Capucines, Corner of Rue Louis l...",about 1870s,1870-01-01 00:00:00,1879-12-31 23:59:59,English,Albumen silver print,Print,Photographs (Curatorial Department),Gift of Mr. and Mrs. Otto Wittmann,dataset/raw/get_fffb8b83-efe8-4ebd-bfeb-45ce34...
341707,fffc063a-14a7-4981-be5c-cd629e3a6392,"[Town of Walkerton, Ontario, Canada]",1870s,1870-01-01 00:00:00,1879-12-31 23:59:59,American,Albumen silver print,Stereograph,Photographs (Curatorial Department),Gift of Weston J. and Mary M. Naef,dataset/raw/get_fffc063a-14a7-4981-be5c-cd629e...
341708,fffcf776-9cbe-465f-a8c6-409f4d60a682,Female Head,4th century B.C.,-0399-01-01T00:00:00,-0300-12-31T23:59:59,Greek (South Italian),Terracotta,Female figure,Antiquities (Curatorial Department),"The J. Paul Getty Museum, Villa Collection, Ma...",dataset/raw/get_fffcf776-9cbe-465f-a8c6-409f4d...
341709,ffff349d-6af7-4a24-8f7a-2038c431f401,"[John Drew, Jr. and woman] (84.XP.751.42)",negative 1885–1903; print 1903–1920,1885-01-01 00:00:00,1920-12-31 23:59:59,American,Gelatin silver print,Print,Photographs (Curatorial Department),"The J. Paul Getty Museum, Los Angeles",dataset/raw/get_ffff349d-6af7-4a24-8f7a-2038c4...
