# Common Data Format Conversion
Every of our datasources keep their metadata in different formats and have different sets of parameters. In this notebook, we'll join them all into a proper .csv file for the future data processing and learning.

### Constants and imports

In [1]:
import pandas as pd
import json
import os

METADATA_DIRECTORY = 'metadata'
CMA_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'cma.json')
MET_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'met.json')
NGA_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'nga.json')
RESULT_METADATA_FILE = os.path.join(METADATA_DIRECTORY, 'metadata_joined.csv')

### Preparing output dataframe

In [2]:
result = pd.DataFrame()

### Loading CMA data

In [3]:
if os.path.exists(CMA_METADATA_FILE):
    with open(CMA_METADATA_FILE, 'r') as f:
        cma_metadata = json.load(f)

In [4]:
if cma_metadata is not None:
    rows = []
    for art_id in cma_metadata:
        row = {}
        metadata = cma_metadata[art_id]
        row['id'] = f'cma_{art_id}'
        row['title'] = metadata['title']
        row['date'] = metadata['creation_date']
        row['begin_date'] = None
        row['end_date'] = None
        row['culture'] = metadata['culture']
        row['technique'] = metadata['technique']
        row['type'] = metadata['type']
        row['department'] = metadata['department']
        row['collection'] = metadata['collection']
        row['path'] = metadata['path']
        rows.append(row)
        
    df_cma = pd.DataFrame(rows)
    result = pd.concat([result, df_cma], ignore_index=True)

In [5]:
result.describe()

Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path
count,29745,29745,28507,0.0,0.0,29745,29470,29745,29745,29745,29745
unique,29745,21530,4656,0.0,0.0,4105,5976,61,19,161,29745
top,cma_122190,Page from Tales of a Parrot (Tuti-nama): text ...,1800s,,,"France, 19th century",etching,Print,Prints,PR - Etching,dataset/raw/cma_122190.jpg
freq,1,335,839,,,2807,1978,7808,7670,2870,1


### Loading MET data

In [6]:
if os.path.exists(MET_METADATA_FILE):
    with open(MET_METADATA_FILE, 'r') as f:
        met_metadata = json.load(f)

In [7]:
if met_metadata is not None:
    rows = []
    for art_id in met_metadata:
        row = {}
        metadata = met_metadata[art_id]
        if 'path' not in metadata:
            continue
        row['id'] = f'met_{art_id}'
        row['title'] = metadata['objectName']
        row['date'] = metadata['objectDate']
        row['begin_date'] = metadata['objectBeginDate']
        row['end_date'] = metadata['objectEndDate']
        row['culture'] = f"{metadata['culture']}, {metadata['period']}"
        row['technique'] = metadata['medium']
        row['type'] = metadata['classification']
        row['department'] = metadata['department']
        row['collection'] = None
        row['path'] = metadata['path']
        rows.append(row)
        
    df_met = pd.DataFrame(rows)
    result = pd.concat([result, df_met], ignore_index=True)

In [8]:
result.describe()

Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path
count,31649,31649,30411,1904,1904,31649,31374,31649,31649,29745,31649
unique,31649,22071,5385,479,473,4394,6717,221,35,161,31648
top,cma_122190,Page from Tales of a Parrot (Tuti-nama): text ...,1800s,1800,1899,"France, 19th century",etching,Print,Prints,PR - Etching,dataset/raw/met_190192.jpg
freq,1,335,839,98,88,2807,1978,7808,7670,2870,2


### Loading NGA data

In [9]:
if os.path.exists(NGA_METADATA_FILE):
    with open(NGA_METADATA_FILE, 'r') as f:
        nga_metadata = json.load(f)

In [12]:
if nga_metadata is not None:
    rows = []
    for art_id in nga_metadata:
        row = {}
        metadata = nga_metadata[art_id]
        if 'path' not in metadata:
            continue
        row['id'] = f'met_{art_id}'
        row['title'] = metadata['title']
        row['date'] = metadata['displaydate']
        row['begin_date'] = metadata['beginyear']
        row['end_date'] = metadata['endyear']
        row['culture'] = metadata['terms']['School'] if 'School' in metadata['terms'] else None
        row['technique'] =  metadata['terms']['Technique'] if 'Technique' in metadata['terms'] else None
        row['type'] = metadata['classification']
        row['department'] = None
        row['collection'] = metadata['creditline']
        row['path'] = metadata['path']
        rows.append(row)
        
    df_nga = pd.DataFrame(rows)
    result = pd.concat([result, df_nga], ignore_index=True)

In [13]:
result.describe()

Unnamed: 0,id,title,date,begin_date,end_date,culture,technique,type,department,collection,path
count,61621,61621,57851,31838.0,31836.0,58539,42861,61621,31649,59712,61621
unique,61512,45266,8495,784.0,768.0,4535,6867,226,35,1926,61620
top,met_36726,Untitled,c. 1936,1936.0,1936.0,American,gelatin silver print,Print,Prints,Index of American Design,dataset/raw/met_190192.jpg
freq,2,644,1071,1346.0,1316.0,15678,3441,20536,7670,4885,2


### Saving the .csv file

In [15]:
result.to_csv(RESULT_METADATA_FILE)