In [1]:
import json
import re
import pandas as pd
import requests

In [58]:
# Search body for the PuRe REST API, taken from the curl command of the search request.
# It selects released items that belong to one organizational unit and carry at least
# one date inside the requested year range.
ORG_UNIT = "ou_2421692"
YEAR_RANGE = {"gte": "2018||/y", "lte": "2023||/y"}

# The organizational unit may be attached to a person or be the creator itself
ORG_FIELDS = [
    "metadata.creators.person.organizations.identifierPath",
    "metadata.creators.organization.identifierPath",
]

# A hit in any one of these date fields is sufficient ("should")
DATE_FIELDS = [
    "metadata.datePublishedInPrint",
    "metadata.datePublishedOnline",
    "metadata.dateAccepted",
    "metadata.dateSubmitted",
    "metadata.dateModified",
    "metadata.dateCreated",
]

query = {
    "query": {
        "bool": {
            "must": [
                {"term": {"publicState": {"value": "RELEASED"}}},
                {"term": {"versionState": {"value": "RELEASED"}}},
                {
                    "bool": {
                        "must": [
                            {
                                "bool": {
                                    "should": [
                                        {"term": {field: {"value": ORG_UNIT}}}
                                        for field in ORG_FIELDS
                                    ]
                                }
                            },
                            {
                                "bool": {
                                    "should": [
                                        {"range": {field: dict(YEAR_RANGE)}}
                                        for field in DATE_FIELDS
                                    ]
                                }
                            },
                        ]
                    }
                },
            ]
        }
    },
    "sort": [{"metadata.event.startDate": {"order": "asc"}}],
    "size": "5000",
    "from": "0",
}

In [59]:
#https://requests.readthedocs.io/en/latest/user/quickstart/
#Sends the search request to the PuRe REST API with the query body defined above

r = requests.post(
    "https://pure.mpg.de/rest/items/search?format=json",
    json=query,
    timeout=300,            #fail instead of blocking the notebook forever if the server stops answering
)
r.raise_for_status()        #stop on HTTP errors (4xx/5xx) instead of parsing an error body as if it were a result

data = r.json()             #Output is the same as the JSON-Export file; the cells below read data['records']

In [60]:
#Start from an empty DataFrame; each of the following cells adds one metadata column to it
df = pd.DataFrame(data=None)

In [61]:
#Collect the PuRe ID ('persistenceId') of every record and add it to the DataFrame

ID = [record['persistenceId'] for record in data['records']]   #one entry per record, in response order

df['ID'] = ID

In [62]:
#Build one string of creator names per record and add it to the DataFrame

creators = []

for record in data['records']:
    names = []
    for creator in record['data']['metadata']['creators']:      #every entry listed under creators
        if 'person' not in creator:
            #no person entry: take the name of the organization instead
            names.append(creator['organization']['name'])
            continue
        person = creator['person']
        if 'givenName' in person:
            #first name + space + last name
            names.append(person['givenName'] + " " + person['familyName'])
        else:
            #no given name recorded: last name only
            names.append(person['familyName'])
    creators.append("; ".join(names))      #all creators of this item, joined with the separator "; "

df['Creators'] = creators

In [63]:
#Read the title of every record from its metadata and add it to the DataFrame

title = [record['data']['metadata']['title'] for record in data['records']]

df['Title'] = title

In [64]:
#Read the genre of every record from its metadata and add it to the DataFrame

genre = [record['data']['metadata']['genre'] for record in data['records']]

df['Genre'] = genre

In [65]:
#Extract Date and the matching Status from JSON and add both to the DataFrame

#Only ever one date is selected per record, by priority:
#Published in Print before Published Online before Accepted before Submitted before Created
DATE_PRIORITY = [
    ('datePublishedInPrint', "Published in Print"),
    ('datePublishedOnline', "Published Online"),
    ('dateAccepted', "Accepted"),
    ('dateSubmitted', "Submitted"),
    ('dateCreated', "Created"),
]

date = []
status = []

for record in data['records']:
    metadata = record['data']['metadata']
    for field, label in DATE_PRIORITY:
        if field in metadata:
            date.append(metadata[field])
            status.append(label)
            break                       #first match wins, lower priorities are ignored
    else:
        #None of the fields above is present (e.g. the record was matched by the search
        #through 'dateModified' only). Both lists get a placeholder so they stay the same
        #length as the DataFrame; otherwise the column assignment below fails.
        date.append('NA')
        status.append('NA')

df['Status'] = status
df['Date'] = date

In [66]:
#Extract OA Status from JSON and add to DataFrame

OA = []

#Iterate over the records themselves instead of a manual counter, so that a record
#without files cannot stall the position and be evaluated again for every later row.
for record in data['records']:
    if "files" in record['data']:                             #Only if there are files
        #one status per attached file, "NA" for a file without an 'oaStatus' entry
        oa_in = [attachment['metadata'].get('oaStatus', "NA") for attachment in record['data']['files']]
        OA.append("; ".join(oa_in))                           #combines OA status for all files, separated by "; "
                                                              #(an empty 'files' list results in an empty string)
    else:
        OA.append("NA")                                       #Needed so index of existing df and column match

df['OAStatus'] = OA

In [67]:
#Inspect a slice of the DataFrame (rows at positions 500 up to, but not including, 1000)
df.iloc[500:1000]

Unnamed: 0,ID,Creators,Title,Genre,Status,Date,OAStatus
500,item_3211966_1,Markus Rabus; Régis Lachaume; Andrés Jordán; R...,A discontinuity in the T<SUB>eff</SUB>-radius ...,ARTICLE,Published in Print,2019,
501,item_3211440_1,Mariya Lyubenova; Athanassia Tsatsi,Nuclear angular momentum of early-type galaxie...,ARTICLE,Published in Print,2019,
502,item_3211192_1,Paul Torrey; Mark Vogelsberger; Federico Marin...,The evolution of the mass-metallicity relation...,ARTICLE,Published in Print,2019,
503,item_3211204_1,Yoshiki Toba; Takuji Yamashita; Tohru Nagao; W...,A Wide and Deep Exploration of Radio Galaxies ...,ARTICLE,Published in Print,2019,
504,item_3211232_1,Thomas Stanke; Henrik Beuther; Jens Kauffmann;...,The warm and dense Galaxy - tracing the format...,ARTICLE,Published in Print,2019,
...,...,...,...,...,...,...,...
995,item_3206046_1,Chengliang Wei; Guoliang Li; Xi Kang; Xiangkun...,The correspondence between convergence peaks f...,ARTICLE,Published in Print,2018,
996,item_3212164_1,Kyu-Ha Hwang; Yoon-Hyun Ryu; Hyoun-Woo Kim; Mi...,KMT-2016-BLG-1107: A New Hollywood-planet Clos...,ARTICLE,Published in Print,2019,
997,item_3212266_1,Thayne Currie; Christian Marois; Lucas Cieza; ...,"No Clear, Direct Evidence for Multiple Protopl...",ARTICLE,Published in Print,2019,
998,item_3212326_1,Samantha B. Brown Sevilla; Faustine Cantalloub...,High-contrast Imaging Study on the Candidate C...,ARTICLE,Published in Print,2019,


In [92]:
#Export the DataFrame as a csv file named "<institute>.csv"

institute = ""                                  #file name stem; while it is empty the file is written as ".csv"
df.to_csv("{}.csv".format(institute))