##### Import metadata files with pyDataverse and create Dataverse objects to upload

In [1]:
import os

import pandas as pd
import requests
from pyDataverse.utils import read_csv_as_dicts

In [2]:
# Parse the dataset-level metadata CSV into the records that the Dataset
# objects are built from further down. The 'utf-8-sig' codec drops a leading
# byte-order mark if the file has one.
ds_data = read_csv_as_dicts(
    "test/levy_illustrations_dataset.csv",
    encoding="utf-8-sig",
)

In [3]:
# Parse the per-file metadata CSV into the records that the Datafile objects
# are built from further down. The 'utf-8-sig' codec drops a leading
# byte-order mark if the file has one.
df_data = read_csv_as_dicts(
    "test/levy_illustrations_datafiles.csv",
    encoding="utf-8-sig",
)

In [4]:
# Wrap every dataset metadata record in a pyDataverse Dataset object.
from pyDataverse.models import Dataset


def _build_dataset(record):
    """Return a new Dataset populated from one metadata record."""
    dataset = Dataset()
    dataset.set(record)
    return dataset


ds_lst = [_build_dataset(record) for record in ds_data]

In [5]:
# Wrap every data file metadata record in a pyDataverse Datafile object.
from pyDataverse.models import Datafile


def _build_datafile(record):
    """Return a new Datafile populated from one metadata record."""
    datafile = Datafile()
    datafile.set(record)
    return datafile


df_lst = [_build_datafile(record) for record in df_data]

##### POST pyDataverse objects to Harvard Dataverse

In [6]:
import itertools
from time import sleep
import json

In [7]:
# Connection settings. Each value can be overridden through an environment
# variable so that a real API token never has to be saved inside the notebook;
# the fallbacks are the original placeholder values.
BASE_URL = os.environ.get('DATAVERSE_BASE_URL', 'http://localhost:8080')
API_TOKEN = os.environ.get('DATAVERSE_API_TOKEN', 'xxx-yyy-zzz')
# Alias of the Dataverse collection the datasets are created in.
dv_alias = os.environ.get('DATAVERSE_ALIAS', 'levy_test')

In [8]:
# Smoke-test the connection: build the API client and ask the server for its
# version info. The parsed reply is the cell's displayed output.
from pyDataverse.api import NativeApi

api = NativeApi(BASE_URL, API_TOKEN)
resp = api.get_info_version()
version_info = resp.json()
version_info

{'status': 'OK', 'data': {'version': '5.9', 'build': None}}

In [9]:
#create dataset and add file
POLL_SECONDS = 10        # pause after a failure and between index checks
MAX_INDEX_CHECKS = 60    # stop waiting for a reindex after ~10 minutes


def wait_until_indexed(pid, max_checks=MAX_INDEX_CHECKS, pause=POLL_SECONDS):
    """Poll the /timestamps API until the dataset's index is no longer stale.

    Sleeps `pause` seconds before every check, so there is always at least one
    pause before the first request. A reply without a 'data' key counts as a
    failed check and is retried.

    Parameters
    ----------
    pid : str
        Persistent identifier of the dataset to watch.
    max_checks : int
        Upper bound on the number of checks, so a dataset that never finishes
        indexing cannot hang the notebook.
    pause : int or float
        Seconds to sleep before each check.

    Returns
    -------
    bool
        True once both 'hasStaleIndex' and 'hasStalePermissionIndex' are
        falsy; False if that did not happen within `max_checks` checks.
    """
    timestamp_url = "{0}/datasets/:persistentId/timestamps?persistentId={1}".format(api.base_url_api_native, pid)
    for _ in range(max_checks):
        print('sleeping...')
        sleep(pause)
        reply = api.get_request(timestamp_url, True).json()
        if 'data' not in reply:
            print('failed to get a response from /timestamps api; will sleep and try again.')
            continue
        stamps = reply['data']
        if not (stamps['hasStaleIndex'] or stamps['hasStalePermissionIndex']):
            return True
    return False


# Maps each dataset's 'org.dataset_id' to the persistent id the server assigned.
dataset_id_2_pid = {}

# zip() stops at the shorter list, so report a mismatch instead of silently
# dropping the unpaired entries.
if len(ds_lst) != len(df_lst):
    print('warning: {0} datasets but {1} data files; unpaired entries will be skipped'.format(len(ds_lst), len(df_lst)))

for (dataset, datafile) in zip(ds_lst, df_lst):
    org_filename = datafile.get()['org.filename']

    # Parse the reply once; a missing or non-OK status means creation failed.
    ds_reply = api.create_dataset(dv_alias, dataset.json()).json()
    if ds_reply.get('status') != 'OK':
        print('failed to create dataset for '+org_filename+' (skipping)')
        sleep(POLL_SECONDS)
        continue
    dataset_id_2_pid[dataset.get()['org.dataset_id']] = ds_reply['data']['persistentId']

    # The file names its target dataset through its own 'org.dataset_id'.
    pid = dataset_id_2_pid.get(datafile.get()['org.dataset_id'])
    if pid is None:
        print('no dataset was created for the dataset id of '+org_filename+' (skipping)')
        continue

    # Only upload once the new dataset has been reindexed.
    if not wait_until_indexed(pid):
        print('gave up waiting for '+pid+' to be indexed; not uploading '+org_filename)
        continue

    filename = os.path.join(os.getcwd(), 'illustrations', org_filename)
    datafile.set({"pid": pid, "filename": filename})
    df_reply = api.upload_datafile(pid, filename, datafile.json()).json()
    if df_reply.get('status') != 'OK':
        print('failed to upload file '+org_filename)
        sleep(POLL_SECONDS)
        continue
    print('successfully uploaded file '+org_filename)

    # Let the dataset finish reindexing before moving on to the next file.
    if not wait_until_indexed(pid):
        print('warning: '+pid+' still shows a stale index after the upload of '+org_filename)


Dataset with pid 'doi:10.70122/FK2/SYWNGZ' created.
sleeping...
successfully uploaded file DwgID10463.jpg
sleeping...
Dataset with pid 'doi:10.70122/FK2/HQUBZV' created.
sleeping...
successfully uploaded file DwgID10464.jpg
sleeping...
Dataset with pid 'doi:10.70122/FK2/N96IC3' created.
sleeping...
successfully uploaded file DwgID10465.jpg
sleeping...


In [14]:
# Show the JSON of the third data file as a spot check of what was set on it.
third_datafile = df_lst[2]
print(third_datafile.json())

{
  "description": "http://pi.lib.uchicago.edu/1001/org/ochre/6256105d-a4f7-cd66-1f0a-fd6dda185986",
  "categories": [
    "Illustration"
  ],
  "label": "DwgID10465.jpg",
  "pid": "doi:10.70122/FK2/65",
  "filename": "/Users/landreev/wrk/katie/batch/batch_02/illustrations/DwgID10465.jpg"
}


In [None]:
#save list of DOIs
# One row per created dataset: the index is the original dataset id and the
# single 'doi' column is the persistent id. Naming the column up front also
# keeps the cell working when no dataset was created (empty mapping).
doi_table = pd.DataFrame.from_dict(dataset_id_2_pid, orient="index", columns=["doi"])
# NOTE(review): Series.replace matches whole cell values, not substrings, so
# this only rewrites a cell that is exactly a backslash followed by 'n'. If the
# intent was to strip embedded newlines, .str.replace is needed -- confirm.
doi_table["doi"] = doi_table["doi"].replace('\\n',' ')
doi_table.to_csv("2016_dois.csv")
doi_table.head()