In [14]:
# Importing required python libraries
import requests
import json
import pandas as pd
import re
from datetime import datetime

In [15]:
# Configuration - UPDATE THESE VALUES AS REQUIRED

# Date stamp that is embedded in the names of both output files
todays_date = datetime.today().date()

# Name/location of the csv file that holds the UUIDs.
# (Hint: keep the file in the same folder as this notebook and no path is needed.)
uuid_csv_filename = "UUID_CEDA.csv"

# Zero-based position of the column the UUIDs are listed in
# (first column = 0, second column = 1, and so on)
uuid_column_number = 0

# Output csv file: the details exactly as returned by the CEDA catalogue
ceda_catalogue_details_filename = "CEDA_metadata_catalogue_extract_{}.csv".format(
    todays_date)

# Output csv file: the CEDA details mapped onto our schema
mdw_import_filename = "CEDA_bulk_import_mapping_{}.csv".format(
    todays_date)

In [16]:
# Open the csv file, read the UUIDs and, if successful, display them below.

# Read the UUID column as text so pandas never type-infers an ID that happens
# to look like a number (e.g. an all-digit UUID).
dataset_details = pd.read_csv(
    uuid_csv_filename, header=None, dtype={uuid_column_number: str})

# Drop empty cells and trim stray whitespace so that only usable IDs are sent
# to the API.
uuid_list = [
    uuid_text
    for uuid_text in dataset_details[uuid_column_number].dropna().str.strip()
    if uuid_text
]
uuid_list

['35a7ee81a50c4b95ab59f9bd128f9b63',
 '118104a74a5e460b8cea189c67558e0b',
 '76cad0b4f6f141ada1c44a4ce9e7d4bd',
 'cfe938e70f8f4e98b0622296743f7913']

In [17]:
# Request the metadata for each UUID from the CEDA API and write the
# CEDA details to a csv file.

# If this cell fails, the most likely reason is that one of the UUIDs is wrong,
# or that the record is still in draft and its details are not available.
# The error raised below names the UUID that could not be found.

# Collecting details from API
data_list = []
for uuid in uuid_list:
    response = requests.get(
        "http://api.catalogue.ceda.ac.uk/api/v2/observations/?uuid={}".format(uuid),
        timeout=30)  # seconds; without a timeout a stalled connection hangs the notebook
    # Surface HTTP errors (404, 500, ...) here rather than as a confusing
    # KeyError/JSON error further down.
    response.raise_for_status()
    results = response.json().get('results') or []
    if not results:
        raise ValueError(
            "No CEDA catalogue record found for UUID {!r}: check that the UUID "
            "is correct and that the record is published.".format(uuid))
    data_list.append(results[0])

# Converting details to a Pandas Dataframe
ceda_df = pd.DataFrame(data_list)

# Sending details to csv file
ceda_df.to_csv(ceda_catalogue_details_filename)

In [18]:
# Map the details from the CEDA archive onto the IPCC metadata schema version 1.
# The mapping is then sent to a csv file.

import_file_df = pd.DataFrame()

import_file_df['Title'] = ceda_df["title"].copy()
import_file_df['Abstract'] = ''
import_file_df['Contact Point'] = 'ipcc.ddc.datasupport@metadata.atlassian.net'


def _split_keywords(raw_keywords):
    """Split a comma-separated keyword string into a list of trimmed keywords.

    A missing (non-string) value gives an empty list, and empty entries
    produced by stray commas are dropped.
    """
    if not isinstance(raw_keywords, str):
        return []
    return [kw for kw in re.split(r'\s*,\s*', raw_keywords.strip()) if kw]


# Split on the comma itself (with any surrounding whitespace); the previous
# ' ,' pattern only matched a space *before* a comma, so normal
# "a, b, c" keyword strings were never split.
import_file_df['Keywords'] = ceda_df['keywords'].apply(_split_keywords)
import_file_df['DOI Name'] = ''
import_file_df['Alternate Identifier'] = ''

# Parse the publication timestamp and keep only its date part
ceda_df['dataPublishedTime'] = pd.to_datetime(ceda_df['dataPublishedTime'])
import_file_df['Publication Date'] = ceda_df['dataPublishedTime'].dt.date

# Fixed publisher details, identical for every record
import_file_df['Identifier'] = 'https://ror.org/02b5d8509'
import_file_df['Name'] = 'NERC EDS Centre for Environmental Data Analysis'
import_file_df['Logo'] = ''
import_file_df['P_Description'] = 'https://nerc.ukri.org/'
import_file_df['P_Contact Point'] = 'support@ceda.ac.uk'

import_file_df['Description'] = ceda_df["abstract"].copy()
import_file_df['Associated Media'] = ''

# 'Is Part Of': for every dataset, follow each observation-collection URL and
# record the title of that collection.
cat_list = []
for collection_urls in ceda_df['observationcollection_set']:
    collection_titles = []
    # A dataset with no collection set (missing value) gets an empty list
    if isinstance(collection_urls, (list, tuple)):
        for collection_url in collection_urls:
            response = requests.get(collection_url, timeout=30)  # seconds
            # Fail with the HTTP error itself rather than a KeyError on 'title'
            response.raise_for_status()
            collection_titles.append(response.json()['title'])
    cat_list.append(collection_titles)
import_file_df['Is Part Of'] = cat_list

import_file_df['Spatial Coverage']= ''
import_file_df['Spatial Aggregation'] = ''
import_file_df['Spatial Resolution'] = ''

# Start/End Date: follow each dataset's time-period URL and keep the date part
# (the text before the 'T') of its start and end times. A single space is the
# placeholder whenever the period, or one of its bounds, is missing.
start_t_list = []
end_t_list = []
for t in ceda_df['timePeriod']:
    if pd.isna(t):
        # No time period recorded for this dataset
        start_t_list.append(' ')
        end_t_list.append(' ')
        continue
    response = requests.get(t, timeout=30)  # seconds
    response.raise_for_status()
    data = response.json()
    start_time = data.get('startTime')
    end_time = data.get('endTime')
    # Either bound may be null (e.g. an open-ended period); treat both the same way
    start_t_list.append(start_time.split('T')[0] if start_time else ' ')
    end_t_list.append(end_time.split('T')[0] if end_time else ' ')
import_file_df['Start Date'] = start_t_list
import_file_df['End Date'] = end_t_list
import_file_df['Temporal Resolution'] = ''
# Bounding box: follow each dataset's geographic-extent URL and collect the
# four bounds. A dataset without an extent gets empty (None) coordinates
# instead of stopping the whole run.
ll_lat_l = []
ll_long_l = []
ur_lat_l = []
ur_long_l = []
for bb in ceda_df['geographicExtent']:
    if pd.isna(bb):
        ll_lat_l.append(None)
        ll_long_l.append(None)
        ur_lat_l.append(None)
        ur_long_l.append(None)
        continue
    response = requests.get(bb, timeout=30)  # seconds
    response.raise_for_status()
    data = response.json()
    # Lower left = (south, west); upper right = (north, east)
    ll_lat_l.append(data['southBoundLatitude'])
    ll_long_l.append(data['westBoundLongitude'])
    ur_lat_l.append(data['northBoundLatitude'])
    ur_long_l.append(data['eastBoundLongitude'])
import_file_df['Lower Left Latitude'] = ll_lat_l
import_file_df['Lower Left Longitude'] = ll_long_l
import_file_df['Upper Right Latitude'] = ur_lat_l
import_file_df['Upper Right Longitude'] = ur_long_l

import_file_df['Purpose'] = ''
import_file_df['Source'] = ceda_df['dataLineage'].copy()
import_file_df['License'] = 'http://creativecommons.org/licenses/by/4.0/'

# Resource Creator: for every dataset, follow each responsible-party URL and
# keep the parties of type 'individual', formatted as "Lastname, F.".
resource_creator_list = []
for rpis in ceda_df['responsiblepartyinfo_set']:
    # The list must be created once per dataset. Creating it inside the inner
    # loop discarded every individual except the last party looked at, and left
    # it undefined/stale for a dataset with no parties.
    author_list = []
    for rp in (rpis if isinstance(rpis, (list, tuple)) else []):
        response = requests.get(rp, timeout=30)  # seconds
        response.raise_for_status()
        party = response.json()['party']
        if party['partyType'] == 'individual':
            # Guard against null name parts coming back from the API
            f_name = party['firstName'] or ''
            l_name = party['lastName'] or ''
            author_list.append(l_name + ", " + f_name[0:1] + ".")
    resource_creator_list.append(author_list)
import_file_df['Resource Creator'] = resource_creator_list
import_file_df['Investigations'] = ''
import_file_df['Is Referenced By'] = ''
import_file_df['References'] = ''

import_file_df['Access URL'] = ''
import_file_df['Access Service'] = 'This data is publically avaliable for download. When using these data you must cite them correctly using the citation given on the Data Catalogue record.'
import_file_df['Jurisdiction'] = ''
# Every record gets the same single-entry language list
import_file_df['Language']= pd.Series([['en']] * len(import_file_df))

# Format: take the file format out of each result field, or leave it blank
# when the dataset has no result field.
file_formats = [
    "" if result_field is None else result_field['fileFormat']
    for result_field in ceda_df['result_field']
]
import_file_df['Format']= file_formats
import_file_df['Qualified Relation'] = ''
import_file_df['Tools'] = ''

# Write the finished mapping out to its csv file
import_file_df.to_csv(mdw_import_filename)
