In [None]:
pip install -r requirements.txt

In [1]:
import json
import re
import pandas as pd
import requests

In [None]:
#Search body taken from the curl command of the search request.
#Structure: a bool/must of two term clauses (publicState and versionState both
#"RELEASED") plus a nested bool/should over the two creator identifierPath fields.

def _term(field, value):
    """Build one term clause of the form {"term": {field: {"value": value}}}."""
    return {"term": {field: {"value": value}}}


ou_id = "ou_persistent25"       #identifier used in both creator clauses

released_clauses = [
    _term("publicState", "RELEASED"),
    _term("versionState", "RELEASED"),
]

creator_clause = {
    "bool": {
        "should": [
            _term("metadata.creators.person.organizations.identifierPath", ou_id),
            _term("metadata.creators.organization.identifierPath", ou_id),
        ]
    }
}

query = {
    "query": {"bool": {"must": released_clauses + [creator_clause]}},
    "sort": [{"metadata.title.keyword": {"order": "asc"}}],
    "size": "5000",
    "from": "0",
}

In [None]:
#https://requests.readthedocs.io/en/latest/user/quickstart/#passing-parameters-in-urls
#Sends search request to PuRe REST API with the parameter defined above

r = requests.post(
    "https://pure.mpg.de/rest/items/search?format=json",
    json=query,
    timeout=120,                #requests waits indefinitely unless a timeout is given
)
r.raise_for_status()            #stop here on an HTTP error instead of parsing an error body as results

data = r.json()                 #Output is the same as the JSON-Export file

In [None]:
#Create an empty DataFrame; the cells below add the metadata columns to it one after another
df = pd.DataFrame() 

In [None]:
#Extract PuRe_ID from JSON and add to DataFrame

#One 'persistenceId' per record, in the order the records were returned
ID = [record['persistenceId'] for record in data['records']]

df['ID'] = ID

In [None]:
#Extract Creators from JSON and add to DataFrame

def _creator_name(creator):
    """Return the display name for a single entry of a record's 'creators' list.

    An entry with a 'person' key becomes "givenName familyName", or just the
    family name when no 'givenName' is present. Any other entry is read as an
    organization and contributes its 'name'.
    """
    if 'person' not in creator:
        return creator['organization']['name']
    person = creator['person']
    if 'givenName' in person:
        return person['givenName'] + " " + person['familyName']     #first name + space + last name
    return person['familyName']


#All creators of one item are combined into a single string with the separator "; "
creators = [
    "; ".join(_creator_name(creator) for creator in record['data']['metadata']['creators'])
    for record in data['records']
]

df['Creators'] = creators

In [None]:
#Extract Title from JSON and add to DataFrame

title = []
for record in data['records']:
    metadata = record['data']['metadata']
    title.append(metadata['title'])

df['Title'] = title

In [None]:
#Extract Genre from JSON and add to DataFrame

#One 'genre' value per record, read from the record's metadata
genre = [record['data']['metadata']['genre'] for record in data['records']]

df['Genre'] = genre

In [None]:
#Extract Date from JSON and add to DataFrame

#Date fields in order of priority, each paired with the status label stored next to it.
#Only the first field present in a record's metadata is used.
date_priority = [
    ('datePublishedInPrint', "Published in Print"),
    ('datePublishedOnline', "Published Online"),
    ('dateAccepted', "Accepted"),
    ('dateSubmitted', "Submitted"),
    ('dateCreated', "Created"),
]

date = []
status = []

for record in data['records']:
    metadata = record['data']['metadata']
    for field, label in date_priority:
        if field in metadata:
            date.append(metadata[field])
            status.append(label)
            break
    else:
        #None of the date fields is present in this record
        date.append('NA')
        status.append('NA')

df['Status'] = status
df['Date'] = date

In [None]:
#Extract OA Status from JSON and add to DataFrame

OA = []

#Iterate over the records themselves rather than a hand-maintained index, so that
#records without files cannot stop the loop from advancing to the next record.
for record in data['records']:
    item = record['data']
    if "files" in item:                                       #Only if there are files
        #OA status of every attached file ("NA" where a file has none), separated by "; "
        OA.append("; ".join(f['metadata'].get('oaStatus', "NA") for f in item['files']))
    else:
        OA.append("NA")                                       #Needed so index of existing df and column match

df['OAStatus'] = OA

In [None]:
#Extract local tag from JSON and add to DataFrame

local_tag = []

for record in data['records']:
    item = record['data']
    if 'localTags' in item:
        local_tag.append("; ".join(item['localTags']))        #all tags of the item separated by "; "
    else:
        #Same convention as the OAStatus column: a missing key is recorded as "NA"
        #so the column keeps one entry per record instead of raising KeyError
        local_tag.append("NA")

df["Local Tag"] = local_tag

In [None]:
#Display the assembled DataFrame as the cell output to check the extracted columns
df

In [None]:
#Write the DataFrame to "<institute>.csv" (path relative to the current working directory)

institute = "MPDL"
csv_path = f"{institute}.csv"
df.to_csv(csv_path)