# Looking for unintended copyfraud via DDB (Deutsche Digitale Bibliothek)

## Prerequisites :

* python 3.7 or higher
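* packages:
    * requests
    * json (part of the Python standard library, no installation needed)
    * pandas (plus openpyxl, which pandas needs to write the final .xlsx file)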

In [None]:
import requests
import json
import pandas as pd

In [None]:
def dumpjson(pathname, benderDict):
    """Write ``benderDict`` as JSON to the file ``pathname`` + ".json".

    The file is encoded as UTF-8 and non-ASCII characters are stored
    unescaped. A confirmation line naming the written file is printed.
    """
    target = pathname + ".json"
    with open(target, mode="w", encoding="utf-8") as handle:
        json.dump(benderDict, handle, ensure_ascii=False)
        print(f"dumped: {target}")

## To use the API of DDB you need to create an OAuth Access key.
How to get it:
    - Create an account here: https://www.deutsche-digitale-bibliothek.de/login
    - Go to Meine DDB/My DDB -> Account
    - Create an Access Token

In [None]:
# Paste your personal DDB access token between the quotation marks
# (the quotation marks themselves must stay).
accessToken = "<insert access-token here>"

### if you don't have the time to let the script run through...

There are json-dumps of each major step. To load the json-files use this:

In [None]:
# The loader code below is wrapped in a string literal, so it does not run.
# To use it, remove the surrounding triple quotes and fill in the path of the
# JSON dump you want to load.
'''with open("path_to_json_in_question.json", "r") as jsonfile:
    deadauthorList = json.load(jsonfile)
    
with open("path_to_json_in_question.json", "r") as jsonfile:
    deadauthorsWorkDict = json.load(jsonfile)'''

## Test Query
Should result in a list of JSON objects looking similar to this:

{"id":"INLVDM4I3AMZLTG6AE6C5GZRJKGOF75K","name":"Badische Landesbibliothek","sector":"sec_02","latitude":"49.00794","longitude":"8.398618","locationDisplayName":"Badische Landesbibliothek, 15, Erbprinzenstraße, Innenstadt-West Östlicher Teil, Innenstadt-West, Karlsruhe, Baden-Württemberg, 76133, Deutschland","hasItems":true,"numberOfItems":307236,"children":[],"level":-1},{"id":"PE423JPDSCU6C72BAC2PUBOHAINDRGFO","name":"Bayerische Staatsbibliothek","sector":"sec_02","latitude":"48.14761","longitude":"11.58082","locationDisplayName":"Bayerische Staatsbibliothek, 16, Ludwigstraße, Bezirksteil Universität, Stadtbezirk 03 Maxvorstadt, München, Oberbayern, Bayern, 80539, Deutschland","hasItems":true,"numberOfItems":1296223,"children":[],"level":-1}

In [None]:
# Query parameters of the test request: institutions that have items, in
# sector "sec_02", authenticated with the access token.
params = {
    "hasItems": True,
    "sector": "sec_02",
    "oauth_consumer_key": accessToken,
}

In [None]:
# Send the test request to the institutions endpoint with the current `params`.
response = requests.get('https://api.deutsche-digitale-bibliothek.de/institutions', params=params)

In [None]:
# Print the HTTP status code of the test request.
print(response.status_code)

In [None]:
# Raw response body of the test request (shown as the cell output in a notebook).
response.text

## Look for authors who have been dead for at least 70 years
(might take some time)

In the time available I could not find a way to use a condition on the index, so this code iterates over the years of death one by one.

In [None]:
# Collect the IDs of all persons whose year of death lies at least 70 years
# before the current year, querying the person search once per year of death.
from datetime import date

deadauthorList=[]
params=dict(query='',facet='dateOfDeath_de' ,oauth_consumer_key=accessToken)

# Derive the cutoff from today's date instead of a hard-coded year so the
# selection stays correct when the notebook is re-run later. range() excludes
# its upper bound, so the last year queried is (current year - 71).
firstExcludedYear = date.today().year - 70

for dateOfDeath in range(500, firstExcludedYear):
    params["dateOfDeath_de"]=str(dateOfDeath)
    # The timeout keeps a stalled connection from blocking the loop forever.
    response = requests.get('https://api.deutsche-digitale-bibliothek.de/search/person', params=params, timeout=60)
    # Fail with a clear HTTP error instead of an obscure KeyError/JSON error
    # when the API answers with an error status.
    response.raise_for_status()
    data = response.json()
    docs = data["results"][0]["docs"]
    deadauthorList.extend(item["id"] for item in docs)
    print(dateOfDeath," done ", "number results: ", len(docs))

In [None]:
# Save the collected person IDs to deadauthorlist.json so this step can be skipped later.
dumpjson("deadauthorlist", deadauthorList)

In [None]:
# Number of collected person IDs.
len(deadauthorList)
# Recorded result of the run on 08.01.2022: 169149

## Items associated with searched author
This search uses the "affiliate_fct_involved" facet. A match does not necessarily mean that the person is the author, or the only author, of the item.

In [None]:
# For every collected person, search the items that reference the person via
# the "affiliate_fct_involved" role and remember item id -> item title.
params=dict(oauth_consumer_key=accessToken)
deadauthorsWorkDict = {}
# Persons whose request did not return HTTP 200; kept so they can be retried.
failedPersonIds = []
for personId in deadauthorList:
    params['query'] = 'affiliate_fct_role_normdata:("'+personId+'_1_affiliate_fct_involved")'
    # The timeout keeps a stalled connection from blocking the loop forever.
    response = requests.get('https://api.deutsche-digitale-bibliothek.de/search', params=params, timeout=60)
    # Check the status BEFORE parsing the body. Previously the check sat inside
    # the per-item loop, so it never ran for failed or empty responses and was
    # repeated for every item otherwise.
    if response.status_code != 200:
        print(personId, " failed ", "status code: ", response.status_code)
        failedPersonIds.append(personId)
        continue
    data = response.json()
    docs = data["results"][0]["docs"]
    print(personId," done ", "number results: ", len(docs))
    for item in docs:
        deadauthorsWorkDict[item["id"]] = item["title"]

if failedPersonIds:
    print("requests failed for", len(failedPersonIds), "person ids (see failedPersonIds)")
    
    

In [None]:
# Save the id -> title mapping to deadauthorsWork.json so this step can be skipped later.
dumpjson("deadauthorsWork",deadauthorsWorkDict)

In [None]:
# Number of distinct item IDs collected.
len(deadauthorsWorkDict)
# Recorded result of the run on 09.01.2022: 24705

### Creates dict of works from dead authors
There are quite a lot of requests to be made, so this might take several hours. This was a quick write-up, so I just worked around occurring interruptions of the connection. (See next cell.)

In [None]:
# Fetch the detail view of every collected item and gather the columns of the
# result table. dictTopd holds one list per column; all five lists are
# appended to together at the end of an iteration, so they stay equally long
# even when an iteration aborts with an exception.
params=dict(oauth_consumer_key=accessToken)
dictTopd ={'origin':[],'label':[],'license':[],'institution':[],'isdigitalisat':[]}
for key in deadauthorsWorkDict:
    # The timeout keeps a stalled connection from blocking the loop forever.
    response = requests.get('https://api.deutsche-digitale-bibliothek.de/items/'+key+'/view', params=params, timeout=60)
    # Stop with a clear HTTP error instead of an obscure KeyError/JSON error
    # when the API answers with an error status.
    response.raise_for_status()
    data = response.json()
    itemData = data["item"]

    label = itemData["label"]
    if not label:
        print(key,"empty label")

    origin = itemData["origin"]
    # Named itemLicense so the interactive builtin `license` is not shadowed.
    itemLicense = str(itemData["license"]) if 'license' in itemData else 'no license found'
    institution = itemData["institution"]["name"]
    if not institution:
        print(key,"empty institution")

    # NOTE(review): assumes the second entry of "fields" is the group with
    # @usage == 'index' -- confirm against the API response schema.
    isdigitalisat = itemData["fields"][1]
    isdigitalisatbool = False
    if isdigitalisat["@usage"]=='index':
        # True as soon as one field of the group carries the id 'digitalisat'.
        isdigitalisatbool = any(fielddict["@id"]=='digitalisat' for fielddict in isdigitalisat["field"])
    else:
        print("List[1] not index")

    dictTopd["label"].append(label)
    dictTopd["origin"].append(origin)
    dictTopd["license"].append(itemLicense)
    dictTopd["institution"].append(institution)
    dictTopd["isdigitalisat"].append(isdigitalisatbool)
    

In [None]:
# Number of items already stored in dictTopd, i.e. the position in
# deadauthorsWorkDict at which an interrupted download stopped.
lenDictTopd = len(dictTopd["license"])
lenDictTopd

In [None]:
# Resume the item download after an interruption: skip every item that
# already has a row in dictTopd and fetch only the remaining ones.
# Build the request parameters here so the cell does not depend on whatever
# an earlier cell left in `params`.
params=dict(oauth_consumer_key=accessToken)
# Recompute the resume offset on every run. With an offset taken from another
# cell, re-running this cell after a second interruption would start from a
# stale position and append duplicate rows.
lenDictTopd = len(dictTopd["license"])
for index, key in enumerate(deadauthorsWorkDict):
    if index < lenDictTopd:
        continue

    # The timeout keeps a stalled connection from blocking the loop forever.
    response = requests.get('https://api.deutsche-digitale-bibliothek.de/items/'+key+'/view', params=params, timeout=60)
    # Stop with a clear HTTP error instead of an obscure KeyError/JSON error
    # when the API answers with an error status.
    response.raise_for_status()
    data = response.json()
    itemData = data["item"]

    label = itemData["label"]
    if not label:
        print(key,"empty label")

    origin = itemData["origin"]
    # Named itemLicense so the interactive builtin `license` is not shadowed.
    itemLicense = str(itemData["license"]) if 'license' in itemData else 'no license found'
    institution = itemData["institution"]["name"]
    if not institution:
        print(key,"empty institution")

    # NOTE(review): assumes the second entry of "fields" is the group with
    # @usage == 'index' -- confirm against the API response schema.
    isdigitalisat = itemData["fields"][1]
    isdigitalisatbool = False
    if isdigitalisat["@usage"]=='index':
        # True as soon as one field of the group carries the id 'digitalisat'.
        isdigitalisatbool = any(fielddict["@id"]=='digitalisat' for fielddict in isdigitalisat["field"])
    else:
        print("List[1] not index")

    # Append all columns together so the lists always stay equally long.
    dictTopd["label"].append(label)
    dictTopd["origin"].append(origin)
    dictTopd["license"].append(itemLicense)
    dictTopd["institution"].append(institution)
    dictTopd["isdigitalisat"].append(isdigitalisatbool)
        

In [None]:
# Save the collected columns to dictopd.json.
dumpjson('dictopd', dictTopd)

In [None]:
# Build a DataFrame from the column lists; each key of dictTopd becomes a column.
pd_cf = pd.DataFrame.from_dict(dictTopd)

In [None]:
# Display the resulting table (shown as the cell output in a notebook).
pd_cf

In [None]:
# Export the table to an Excel file in the current working directory.
pd_cf.to_excel('pd_list.xlsx')
