# Top DOAJ Geology journals
The purpose of this notebook is to extract pdf files of top doaj journals.
#### Top open source journals  https://doaj.org/
* Minerals 
* Solid Earth
* Solid Earth Sciences 
* Geosphere
* Geosciences 
* Lithosphere (American version)
* Geoscience Frontiers
* Geochemical Perspectives Letters 
* Frontiers in Earth Science
* Advances in Geosciences 
* Quaternary
* SOIL

### Status
* Minerals  (Done)
* Solid Earth (Done)
* Solid Earth Sciences (Failed. Elsvier, Elsvier Open source journals have mechanisms preventing data crawls)
* Geosphere (Done)
* Geosciences (https://pubs.geoscienceworld.org/. Anti-scraping rules via captcha)
* Lithosphere (Not tested but, https://pubs.geoscienceworld.org/)
* Geoscience Frontiers (Not tested but, Elsvier)
* Geochemical Perspectives Letters (Done, https://www.geochemicalperspectivesletters.org/)
* Frontiers in Earth Science(TODO,https://www.frontiersin.org/journals/earth-science)
* Advances in Geosciences (TODO: https://www.advances-in-geosciences.net/)
* Quaternary (Done, https://www.mdpi.com/journal/quaternary)
* SOIL (TODO, https://www.soil-journal.net/)

In [34]:
import pandas as pd
import numpy as np
from os import listdir,environ,rename 
from sys import argv
from os.path import isfile,join,basename
from shutil import rmtree,move
import glob
from bs4 import BeautifulSoup
import os
from zipfile import ZipFile
import requests
from requests_html import HTMLSession,AsyncHTMLSession
import json

In [88]:
root_dir = "/nrcan_p2"
data_dir = join(root_dir,"data")
doaj_root_dir=join(data_dir,"01_raw","20201221","doaj")
doaj_root_dir_2=join(data_dir,"01_raw","20201221","doaj_extra_2")
doaj_root_dir_3=join(data_dir,"01_raw","20201221","doaj_extra_3")
doaj_root_dir_4=join(data_dir,"01_raw","20201221","doaj_extra_4")
doaj_root_dir_5=join(data_dir,"01_raw","20201221","doaj_extra_5")
tmp_dir = join(doaj_root_dir,"tmp")
material_dir = join(doaj_root_dir,"material")
material_dir_new = join(doaj_root_dir_4,"material_redo")

solid_earth_dir = join(doaj_root_dir,"solid_earth")
solid_earth_dir_new = join(doaj_root_dir_4,"solid_earth_redo")

solid_earth_sciences_dir = join(doaj_root_dir,"solid_earth_sciences")

geosciences_dir = join(doaj_root_dir,"geosciences")
geosciences_dir_new = join(doaj_root_dir_4,"geosciences_redo")

gpl_dir = join(doaj_root_dir,"geochemical_perspective_letters")
gpl_dir_new = join(doaj_root_dir_5,"geochemical_perspective_letters_redo")
quaternary_dir = join(doaj_root_dir,"quaternary")


In [79]:
with open('unfinished_files_doaj_3.txt', 'r') as f:
    unfinished_files = f.readlines()
    
unfinished_files

['geosciences/_2076-3263_8_8_308_pdf.pdf\n',
 'geochemical_perspective_letters/_documents_GPL1724cor_noSI.pdf.pdf\n',
 'geochemical_perspective_letters/_documents_GPL1802err_noSI.pdf.pdf\n',
 'geochemical_perspective_letters/_documents_GPL1825cor_noSI.pdf.pdf\n',
 'geochemical_perspective_letters/_documents_GPL1907cor_noSI.pdf.pdf\n',
 'geochemical_perspective_letters/_documents_GPL1922cor_noSI.pdf.pdf\n',
 'geochemical_perspective_letters/_documents_GPL1925cor_noSI.pdf.pdf\n']

In [80]:
import pathlib
from collections import defaultdict
unfinished_files_name_map = defaultdict(list)
for elem in unfinished_files:
    elem = pathlib.Path(elem.strip())
    name = elem.name
    parent = elem.parent.name
    print(name, parent)
    unfinished_files_name_map[parent].append(name)
#unfinished_files_name = [pathlib.Path(x.strip()).name for x in unfinished_files]
#unfinished_files_name[0]

_2076-3263_8_8_308_pdf.pdf geosciences
_documents_GPL1724cor_noSI.pdf.pdf geochemical_perspective_letters
_documents_GPL1802err_noSI.pdf.pdf geochemical_perspective_letters
_documents_GPL1825cor_noSI.pdf.pdf geochemical_perspective_letters
_documents_GPL1907cor_noSI.pdf.pdf geochemical_perspective_letters
_documents_GPL1922cor_noSI.pdf.pdf geochemical_perspective_letters
_documents_GPL1925cor_noSI.pdf.pdf geochemical_perspective_letters


In [90]:
#os.mkdir(material_dir_new)
#os.mkdir(solid_earth_dir_new)
os.mkdir(doaj_root_dir_5)
#os.mkdir(geosciences_dir_new)
os.mkdir(gpl_dir_new)

In [45]:
MAKE_DIRS=False
GET_LINKS=True
DOWNLOAD = True

if MAKE_DIRS:
    os.mkdir(doaj_root_dir)
    os.mkdir(tmp_dir)
    os.mkdir(material_dir)
    os.mkdir(solid_earth_dir)
    os.mkdir(solid_earth_sciences_dir)
    os.mkdir(geosciences_dir)
    os.mkdir(gpl_dir)
    os.mkdir(quaternary_dir)

In [96]:
def get_parsed_content(url):
    html_content = requests.get(url, verify=False).text
    soup = BeautifulSoup(html_content, "html.parser")
    return soup

async def get_session_content(url):
    asession = AsyncHTMLSession()
    r = await asession.get(url)
    await r.html.arender()
    html_content = r.html.raw_html
    soup = BeautifulSoup(html_content, "html.parser")
    return soup

def download_articles(article_links,http_root,destination_dir,list_to_download=None,max_length=100):
    i = 0
    print(destination_dir)
    for article_link in article_links:
        
        article_name = article_link.replace(http_root,"").replace("/","_")+".pdf"
        article_name = article_name[-1*max_length:]
        
        if list_to_download is not None:
            if article_name not in list_to_download:
                continue
                
        print(article_name)
        i += 1
        r = requests.get(article_link,stream=True, verify=False)
        chunk_size=10000000        
        with open (join(destination_dir,article_name),"wb") as f:
            for chunk in r.iter_content(chunk_size):
                f.write(chunk)
    print(i)

In [38]:
class BaseJournal:
    def __init_(self):
        self.issue_links = None
        self.article_links = None
    
    def get_articles_count(self):
        if self.article_links:
            return len(self.article_links)
        return 0
    
    def create_issue_links(self,issue_links):
        issue_links=list(set(issue_links))
        issue_links.sort()
        self.issue_links = issue_links
        return issue_links
    
    def create_article_links(self,article_links):
        article_links=list(set(article_links))
        article_links.sort()
        self.article_links = article_links
        return article_links

In [39]:
class MaterialsJournal(BaseJournal):
    def __init__(self):
        super().__init__()
        self.http_root="https://www.mdpi.com"
        self.link_root = "/2075-163X"
        
    def get_issue_links(self):
        """
        Get all issues
        """
        issue_links = list()
        for i in range(1,12):
            volume_content= get_parsed_content(f'{self.http_root}{self.link_root}/{i}')
            for link in volume_content.find_all('a', href=True):
                if (href := link['href']).startswith(f'{self.link_root}/{i}/'):
                    issue_links.append(href)

        return self.create_issue_links(issue_links)

    def get_article_links(self):
        """
        Get all articles links
        """
        article_links = list()
        for issue_link in self.issue_links:
            article_content = get_parsed_content(f'{self.http_root}{issue_link}')
            for link in article_content.find_all('a', href=True):
                if (href := link['href']).startswith(f'{issue_link}') and "pdf" in href:
                    article_links.append(f'{self.http_root}{href}')

        return self.create_article_links(article_links)

    
    

In [68]:
class SolidEarthJournal(BaseJournal):
    def __init__(self):
        super().__init__()
        self.http_root="https://se.copernicus.org/articles"
        
    def get_issue_links(self):
        """
        Get all issues
        """
        issue_links = list()
        volume_content= get_parsed_content(f'{self.http_root}/volumes.html')
        for link in volume_content.find_all('a', href=True):
                if (href := link['href']).startswith(self.http_root) and "issue" in href:
                    issue_links.append(href)
                    
        return self.create_issue_links(issue_links)
    
    def get_article_links(self):
        """
        Get all articles links
        """
        article_links = list()
        for issue_link in self.issue_links:
            article_list_content = get_parsed_content(issue_link)
            for link in article_list_content.find_all('a', href=True):
                if (href := link['href']).startswith(f'{issue_link.split("issue")[0]}'):
                    if not ".jpg" in href and not ".pdf" in href and not ".png" in href:
                        """
                        if ".pdf" in href:
                            article_links.append(href)
                        if "-avatar-web" in href:
                            article_links.append(f'{href.split("-avatar-web")[0]}.pdf')
                        """
                        atricle_content = get_parsed_content(href)
                        for link2 in atricle_content.find_all('a', href=True):
                            if (href2 := link2['href']).startswith(f'{issue_link.split("issue")[0]}') \
                                and ".pdf" in href2 and not "high-res" in href2 and not "supplement" in href2:
                                article_links.append(href2)

        return self.create_article_links(article_links)
    

# NOT WORKING
# Ok tested with everything, wasted lots of time. Elsvier Open source journals have mechanisms preventing data crawls 

class SolidEarthSciencesJournal(BaseJournal):
    def __init__(self):
        super().__init__()
        self.http_root="https://www.sciencedirect.com"
    
    def get_issue_links(self):
        """
        Get all issues
        """
        issue_links = list()
        for i in range(1,6):
            for j in range (1,5):
                if i == 1 and j == 4:
                    pass
                else:
                    issue_links.append(f'https://www.sciencedirect.com/journal/solid-earth-sciences/vol/{i}/issue/{j}')
            
        return self.create_issue_links(issue_links)
    
    async def get_article_links(self):
        """
        Get all articles links
        """
        article_links = list()
        for issue_link in self.issue_links:
            article_content = await get_session_content(issue_link)
            for link in article_content.find_all('a', href=True):
                if (href := link['href']).startswith(f'/science/article/pii/') and ".pdf" in href:
                    article_links.append(f'{self.http_root}{href}')

        return self.create_article_links(article_links)

# NOT WORKING Anti-scraping rules (Captcha)
class GeosphereJournal(BaseJournal):
    def __init__(self):
        super().__init__()
        self.http_root="https://pubs.geoscienceworld.org/geosphere"
    
    async def get_issue_links(self):
        """
        Get all issues
        """
        issue_links = list()
        volume_content= await get_session_content(f'{self.http_root}/list-of-years')
        print(volume_content)
        for link in volume_content.find_all('a', href=True):
            print(link)
            if (href := link['href']).startswith(f'{self.http_root}/list-of-issues/'):
                volume_details = await get_session_content(href)
                for detail_link in volume_content.find_all('a', href=True):
                    if (detail_href := detail_link['href']).startswith(f'{self.http_root}/issue/'):
                        #TODO 
        return self.create_issue_links(issue_links)
    
    async def get_article_links(self):
        """
        Get all articles links
        """
        article_links = list()
        for issue_link in self.issue_links:
            article_content = await get_session_content(issue_link)
            for link in article_content.find_all('a', href=True):
                if (href := link['href']).startswith(f'/science/article/pii/') and ".pdf" in href:
                    article_links.append(f'{self.http_root}{href}')

        return self.create_article_links(article_links)

In [41]:
class GeosciencesJournal(BaseJournal):
    def __init__(self):
        super().__init__()
        self.http_root="https://www.mdpi.com"
        self.link_root="/2076-3263"
    
    def get_issue_links(self):
        """
        Get all issues
        """
        issue_links = list()
        for i in range(1,12):
            volume_content= get_parsed_content(f'{self.http_root}{self.link_root}/{i}')
            for link in volume_content.find_all('a', href=True):
                if (href := link['href']).startswith(f'{self.link_root}/{i}/'):
                    issue_links.append(href)

        return self.create_issue_links(issue_links)

    def get_article_links(self):
        """
        Get all articles links
        """
        article_links = list()
        for issue_link in self.issue_links:
            article_content = get_parsed_content(f'{self.http_root}{issue_link}')
            for link in article_content.find_all('a', href=True):
                if (href := link['href']).startswith(f'{issue_link}') and "pdf" in href:
                    article_links.append(f'{self.http_root}{href}')

        return self.create_article_links(article_links)

In [42]:
class GeochemicalPerspectivesLettersJournal(BaseJournal):
    def __init__(self):
        super().__init__()
        self.http_root="https://www.geochemicalperspectivesletters.org"
        
    def get_issue_links(self):
        """
        Get all issues
        """
        issue_links = list()
        volume_content= get_parsed_content(f'{self.http_root}/all_issues/')
        for link in volume_content.find_all('a', href=True):
                if (href := link['href']).startswith("../v"):
                    issue_links.append(f'{self.http_root}/{href[3:]}')
                    
        return self.create_issue_links(issue_links)
    
    def get_article_links(self):
        """
        Get all articles links
        """
        article_links = list()
        for issue_link in self.issue_links:
            article_list_content = get_parsed_content(issue_link)
            for link in article_list_content.find_all('a', href=True):
                if "noSI.pdf" in (href := link['href']):
                    article_links.append(f'{self.http_root}/{href[3:]}')
        return self.create_article_links(article_links)
  
    

In [43]:
class QuaternaryJournal(BaseJournal):
    def __init__(self):
        super().__init__()
        self.http_root="https://www.mdpi.com"
        self.link_root = "/2571-550X"
        
    def get_issue_links(self):
        """
        Get all issues
        """
        issue_links = list()
        for i in range(1,5):
            volume_content= get_parsed_content(f'{self.http_root}{self.link_root}/{i}')
            for link in volume_content.find_all('a', href=True):
                #print(link['href'])
                if (href := link['href']).startswith(f'{self.link_root}/{i}/'):
                    issue_links.append(href)

        return self.create_issue_links(issue_links)

    def get_article_links(self):
        """
        Get all articles links
        """
        article_links = list()
        for issue_link in self.issue_links:
            article_content = get_parsed_content(f'{self.http_root}{issue_link}')
            for link in article_content.find_all('a', href=True):
                if (href := link['href']).startswith(f'{issue_link}') and "pdf" in href:
                    article_links.append(f'{self.http_root}{href}')

        return self.create_article_links(article_links)

In [183]:
#res = await get_session_content("https://pubs.geoscienceworld.org/geosphere/list-of-years")
#"https://pubs.geoscienceworld.org/geosphere/list-of-issues/2017".starts

In [89]:
# Interesting
# https://pubs.geoscienceworld.org/pages/opengeosci

In [91]:
geosciences_j.article_links[0]

'https://www.mdpi.com/2076-3263/1/1/1/pdf'

### Get links

In [97]:
#materials_j = MaterialsJournal()
#solidearth_j = SolidEarthJournal()
#geosciences_j=GeosciencesJournal()
gpl_j = GeochemicalPerspectivesLettersJournal()
#quaternary_j = QuaternaryJournal()

if GET_LINKS:
#     materials_j.get_issue_links()
#     materials_j.get_article_links()
#     solidearth_j.get_issue_links()
#     solidearth_j.get_article_links()
#     geosciences_j.get_issue_links()
#     geosciences_j.get_article_links()
    gpl_j.get_issue_links()
    gpl_j.get_article_links()
#     quaternary_j.get_issue_links()
#     quaternary_j.get_article_links()

In [67]:
unfinished_files_name_map


defaultdict(list,
            {'geosciences': ['_2076-3263_8_8_308_pdf.pdf'],
             'geochemical_perspective_letters': ['_documents_GPL1724cor_noSI.pdf.pdf',
              '_documents_GPL1802err_noSI.pdf.pdf',
              '_documents_GPL1825cor_noSI.pdf.pdf',
              '_documents_GPL1907cor_noSI.pdf.pdf',
              '_documents_GPL1922cor_noSI.pdf.pdf',
              '_documents_GPL1925cor_noSI.pdf.pdf']})

### Download 

In [98]:
#print(material_dir_new)
DOWNLOAD_REDO=True
if DOWNLOAD_REDO:
    #print('materials')
    #download_articles(materials_j.article_links,materials_j.http_root,material_dir_new,unfinished_files_name_map['materials'])
    #print('gs')
    #download_articles(geosciences_j.article_links,geosciences_j.http_root,geosciences_dir_new, unfinished_files_name_map['geosciences'])
    print('gpl')
    download_articles(gpl_j.article_links,gpl_j.http_root,gpl_dir_new, unfinished_files_name_map['geochemical_perspective_letters'])

gpl
/nrcan_p2/data/01_raw/20201221/doaj_extra_5/geochemical_perspective_letters_redo
_documents_GPL1724cor_noSI.pdf.pdf
_documents_GPL1802err_noSI.pdf.pdf
_documents_GPL1825cor_noSI.pdf.pdf
_documents_GPL1907cor_noSI.pdf.pdf
_documents_GPL1922cor_noSI.pdf.pdf
_documents_GPL1925cor_noSI.pdf.pdf
6


In [219]:
if DOWNLOAD:
    download_articles(materials_j.article_links,materials_j.http_root,material_dir)
    download_articles(geosciences_j.article_links,geosciences_j.http_root,geosciences_dir)
    download_articles(solidearth_j.article_links,solidearth_j.http_root,solid_earth_dir)
    download_articles(gpl_j.article_links,gpl_j.http_root,gpl_dir)

download_articles(quaternary_j.article_links,quaternary_j.http_root,quaternary_dir)


















### Number of issues

In [7]:
print(f'Materials Journal: {materials_j.get_articles_count()} ') 

Materials Journal: 1600 


In [156]:
print(f'Solid Earth Journal: {solidearth_j.get_articles_count()} ') 

Solid Earth Journal: 796 


In [98]:
print(f'Geosciences Journal: {geosciences_j.get_articles_count()} ') 

Solid Earth Sciences Journal: 1315 


In [194]:
print(f'Geochemical Perspectives Letters Journal: {gpl_j.get_articles_count()} ') 

Geochemical Perspectives Letters Journal: 185 


In [220]:
print(f'Quaternary Journal: {quaternary_j.get_articles_count()} ') 

Quaternary Journal: 102 


In [225]:
1600+796+1315+185+102

3998

### Verifying that the files were downloaded properly

In [155]:
%%bash
ls -1 /nrcan_p2/data/01_raw/20201221/doaj/material | wc -l

1597


In [154]:
%%bash
ls -1 /nrcan_p2/data/01_raw/20201221/doaj/solid_earth | wc -l

799


In [96]:
%%bash
ls -1 /nrcan_p2/data/01_raw/20201221/doaj/geosciences| wc -l

1315


In [198]:
%%bash
ls -1 /nrcan_p2/data/01_raw/20201221/doaj/geochemical_perspective_letters| wc -l


185


In [221]:
%%bash 
ls -1 /nrcan_p2/data/01_raw/20201221/doaj/quaternary| wc -l

102


In [223]:
%%bash
ls -al /nrcan_p2/data/03_primary/v1/all_text.txt

-rw-r--r-- 1 toolkit toolkit 2171268128 Dec 23 19:00 /nrcan_p2/data/03_primary/v1/all_text.txt


# CORE
Trials with core

In [230]:
import json
url = 'https://core.ac.uk:443/api-v2/repositories/search/brunel?page=1&pageSize=10&apiKey=20hIsS1F5j4D2C2iXrg4Wxf7VTp4Xt1j'
result_as_json = json.loads(requests.get(url, verify=False).text)
result_as_json



{'status': 'OK',
 'totalHits': 7,
 'data': [{'name': 'Brunel University Research Archive',
   'uri': 'https://bura.brunel.ac.uk/oai/request',
   'urlOaipmh': 'https://bura.brunel.ac.uk/oai/request',
   'urlHomepage': 'https://bura.brunel.ac.uk',
   'id': 14,
   'type': 'repository'},
  {'name': 'Brunel University Research Archive',
   'uri': 'https://bura.brunel.ac.uk/oai/driver',
   'urlOaipmh': 'https://bura.brunel.ac.uk/oai/driver',
   'id': 16016,
   'type': 'repository'},
  {'name': 'Brunel University Research Archive',
   'uri': 'https://bura.brunel.ac.uk/oai/request',
   'urlOaipmh': 'https://bura.brunel.ac.uk/oai/request',
   'id': 15613,
   'type': 'repository'},
  {'name': 'Brunel University Research Archive',
   'uri': 'https://bura.brunel.ac.uk/oai/request?verb=ListRecords&metadataPrefix=oai_dc',
   'urlOaipmh': 'https://bura.brunel.ac.uk/oai/request?verb=ListRecords&metadataPrefix=oai_dc',
   'id': 11241,
   'OpenDoarId': 900,
   'type': 'repository'},
  {'name': 'Brunel U

In [231]:
# Core API
# https://github.com/oacore/or2016-api-demo/blob/master/OR2016%20API%20demo.ipynb

In [257]:
import json
url = 'https://core.ac.uk:443/api-v2/journals/search/Quaternary?page=1&pageSize=20&apiKey=20hIsS1F5j4D2C2iXrg4Wxf7VTp4Xt1j'
result_as_json = json.loads(requests.get(url, verify=False).text)
result_as_json



{'status': 'OK',
 'totalHits': 3,
 'data': [{'title': 'Quaternary and Environmental Geosciences',
   'identifiers': ['oai:doaj.org/journal:f602a3ee899640b6acd05183ca9841e0',
    'issn:2176-6142',
    'url:https://doaj.org/toc/2176-6142'],
   'subjects': ['environmental geosciences',
    'anthropocene',
    'environmental changes',
    'coastal geology',
    'paleoclimate',
    'quaternary period',
    'Geology',
    'QE1-996.5'],
   'language': 'ES',
   'rights': 'CC BY-NC-SA',
   'publisher': 'OJS'},
  {'title': 'E&G : Quaternary Science Journal',
   'identifiers': ['oai:doaj.org/journal:85ac749cbd244ebca1879dbf28c88ea8',
    'issn:0424-7116',
    'url:https://doaj.org/toc/0424-7116'],
   'subjects': ['quaternary geology',
    'palaeoecology',
    'soil science',
    'palaeoclimatology',
    'geomorphology',
    'Geology',
    'QE1-996.5',
    'Science',
    'Q'],
   'language': 'German',
   'rights': 'CC BY',
   'publisher': 'Geozon Science Media'},
  {'title': 'Baltica : An Internat

In [256]:
import json
url = 'https://core.ac.uk:443/api-v2/journals/get/issn:1680-7340?page=1&pageSize=20&apiKey=20hIsS1F5j4D2C2iXrg4Wxf7VTp4Xt1j'
result_as_json = json.loads(requests.get(url, verify=False).text)
result_as_json



{'status': 'OK',
 'data': {'title': 'Advances in Geosciences',
  'identifiers': ['oai:doaj.org/journal:bc17f22961d6449f8d6966900d234663',
   'issn:1680-7340',
   'issn:1680-7359',
   'url:https://doaj.org/toc/1680-7359'],
  'subjects': ['Earth sciences',
   'Planetary sciences',
   'Solar system sciences',
   'Science',
   'Q',
   'Geology',
   'QE1-996.5',
   'Dynamic and structural geology',
   'QE500-639.5'],
  'language': 'EN',
  'rights': 'CC BY',
  'publisher': 'Copernicus Publications'}}

In [None]:
Journals API has:
Minerals
Solid Earth
Geosciences
Frontiers in Earth Science
Advances in Geosciences

Not have:
Solid Earth Sciences
Geosphere
Lithosphere
Geoscience Frontiers
Geochemical Perspectives Letters
Quaternary
SOIL

In [259]:
import json
url = "https://core.ac.uk:443/api-v2/search/Economics?page=1&pageSize=10&apiKey=20hIsS1F5j4D2C2iXrg4Wxf7VTp4Xt1j"
result_as_json = json.loads(requests.get(url, verify=False).text)
result_as_json



{'status': 'OK',
 'totalHits': 3092526,
 'data': [{'_index': 'articles_2019_06_05',
   '_type': 'article',
   '_id': '23846629',
   '_score': 16.51658,
   '_source': {'id': '23846629',
    'authors': ['Economics', 'Danilo Lopomo Beteto', 'Danilo Lopomo Beteto'],
    'citations': [],
    'contributors': ['The Pennsylvania State University CiteSeerX Archives'],
    'datePublished': '2012',
    'deleted': 'ALLOWED',
    'description': 'This paper studies the effect on equilibrium prices adventing from the presence of a safety net during financial crises. It is shown that, by inflating prices with the insurance provided through its intervention policy, a government might be sowing the seeds of a crisis that its intention is to prevent in the first place. The model developed is one with risk-neutral agents facing a static decision problem, under different (i) frameworks- with and without the possibility of intervention- and (ii) informational scenarios- imperfect, perfect and common prior i

In [272]:
import urllib.request
import urllib.parse
import json
import pprint

class CoreApiRequestor:

    def __init__(self, endpoint, api_key):
        self.endpoint = endpoint
        self.api_key = api_key
        #defaults
        self.pagesize = 5
        self.page = 1

    def parse_response(self, decoded):
        res = []
        for item in decoded['data']:
            doi = None
            if 'identifiers' in item:
                for identifier in item['identifiers']:
                    if identifier and identifier.startswith('doi:'):
                        doi = identifier
                        break
            res.append([item['title'], doi])
        return res

    def request_url(self, url):
        with urllib.request.urlopen(url) as response:
            html = response.read()
        return html

    def get_method_query_request_url(self,method,query,fullText,page):
        if (fullText):
            fullText = 'true'
        else:
            fullText = 'false'
        params = {
            'apiKey':self.api_key,
            'page':page,
            'pageSize':self.pagesize,
            'fulltext':fullText
        }
        return self.endpoint + method + '/' + urllib.parse.quote(query) + '?' + urllib.parse.urlencode(params)

    def get_up_to_20_pages_of_query(self,method,query,fulltext):
        url = self.get_method_query_request_url(method,query,fulltext,1)
        all_articles=[]
        resp = self.request_url(url)
        result = json.loads(resp.decode('utf-8'))
        all_articles.append(result)
        if (result['totalHits']>100):
            numOfPages = int(result['totalHits']/self.pagesize)  #rounds down
            if (numOfPages>3):
                numOfPages=3
            for i in range(2,numOfPages):
                url = self.get_method_query_request_url(method,query,False,i)
                print(url)
                resp =self.request_url(url)
                all_articles.append(json.loads(resp.decode('utf-8')))
        return all_articles

In [273]:
'''
Example invokation
'''
# init 
endpoint = 'https://core.ac.uk/api-v2'

'''
********************************************
Add your own api key below
'''
api_key = 'nTo627BU8jPNth4EbsrDue9IXWzAfZiY'
'''
********************************************
'''
method = '/articles/search'
topic = 'deep AND learning'

api = CoreApiRequestor(endpoint,api_key)

In [274]:
url = api.get_method_query_request_url(method,topic,False,1)
url

'https://core.ac.uk/api-v2/articles/search/deep%20AND%20learning?apiKey=nTo627BU8jPNth4EbsrDue9IXWzAfZiY&page=1&pageSize=5&fulltext=false'

In [263]:
result = api.request_url(url)

In [275]:
multi_page_result = api.get_up_to_20_pages_of_query(method,topic,False)

https://core.ac.uk/api-v2/articles/search/deep%20AND%20learning?apiKey=nTo627BU8jPNth4EbsrDue9IXWzAfZiY&page=2&pageSize=5&fulltext=false


In [276]:
multi_page_result[0]

{'status': 'OK',
 'totalHits': 1022522,
 'data': [{'id': '190331523',
   'authors': ['Barclay, Kezia M.'],
   'contributors': ['Templeton, Elizabeth'],
   'description': 'This artistic inquiry asked, what is my experience as a DMT intern who worked with children who have experienced trauma? I was the only participant in the study. The data was collected using the practice of Authentic Movement, which is a codified expressive, improvisational movement process. Data was also collected using journaling. The data was analyzed using movement and identifying themes in the written data. The results can be summarized in three main themes: boundaries and limitations, polarities and balance and self-care.\nThe main implications of the study are to integrate the explicit use of movement in DMT supervision and self-care practices. Additionally, the practice of sharing one’s cohesive experienced is recommended. For future research, I propose the inclusion of more participants, both DMT interns and 

In [277]:
topic = 'deep AND learning'
ftopic = topic.replace(" ","")

queryOnlyFT = '(title:('+topic+') OR description:('+topic+') OR fullText:('+topic+')) AND fullText:*'
res = api.get_up_to_20_pages_of_query(method,queryOnlyFT,False)

https://core.ac.uk/api-v2/articles/search/%28title%3A%28deep%20AND%20learning%29%20OR%20description%3A%28deep%20AND%20learning%29%20OR%20fullText%3A%28deep%20AND%20learning%29%29%20AND%20fullText%3A%2A?apiKey=nTo627BU8jPNth4EbsrDue9IXWzAfZiY&page=2&pageSize=5&fulltext=false


In [278]:
res

[{'status': 'OK',
  'totalHits': 970510,
  'data': [{'id': '190331523',
    'authors': ['Barclay, Kezia M.'],
    'contributors': ['Templeton, Elizabeth'],
    'description': 'This artistic inquiry asked, what is my experience as a DMT intern who worked with children who have experienced trauma? I was the only participant in the study. The data was collected using the practice of Authentic Movement, which is a codified expressive, improvisational movement process. Data was also collected using journaling. The data was analyzed using movement and identifying themes in the written data. The results can be summarized in three main themes: boundaries and limitations, polarities and balance and self-care.\nThe main implications of the study are to integrate the explicit use of movement in DMT supervision and self-care practices. Additionally, the practice of sharing one’s cohesive experienced is recommended. For future research, I propose the inclusion of more participants, both DMT interns

In [291]:
queryOnlyFT = '(title:(Geology) OR description:(Geology)) AND fullText:*'
res = api.get_up_to_20_pages_of_query(method,queryOnlyFT,False)
res

https://core.ac.uk/api-v2/articles/search/%28title%3A%28Geology%29%20OR%20description%3A%28Geology%29%29%20AND%20fullText%3A%2A?apiKey=nTo627BU8jPNth4EbsrDue9IXWzAfZiY&page=2&pageSize=5&fulltext=false


[{'status': 'OK',
  'totalHits': 16798,
  'data': [{'id': '291851657',
    'authors': ['Eka Putra, Doni Prakasa',
     'Sekar Rianda, Adelide Asriati',
     'Wilopo, Wahyu'],
    'contributors': [''],
    'datePublished': '2020-08-07T00:00:00',
    'description': 'ABSTRACT. Batu Hijau mine pit is known as one of the largest copper pit mine in Nusa Tenggara Barat, Indonesia. Similiar as other copper mine pits in Indonesia, This site also faces acid mine water (AMD) problem. Based on the mine management plan, the AMD generated from this pit is being collected into Santong ponds in the southwest of the pit located in the upstrean area if Sejorong watershed. By the next decade, Batu Hijau mine will be in the closure mine period and it is important to understand the probable move- ment of AMD under the Santong ponds whether the AMD leaked to groundwater or not. This research aims to develop a numerical model of groundwater ﬂow and predict the movement of AMD by applying particle tracking me

In [286]:
queryOnlyFT = 'journal:(1932-6203) AND (title:(Geology) OR description:(Geology)) AND fullText:*'
res = api.get_up_to_20_pages_of_query(method,queryOnlyFT,False)
res

[{'status': 'Not found', 'totalHits': 0, 'data': None}]

In [173]:
url="https://core.ac.uk/api-v2/articles/search/geology?journal=2045-2322&from_Year=2001&to_Year=2020&apiKey=nTo627BU8jPNth4EbsrDue9IXWzAfZiY&hasFullText=True&urls=True"
res = requests.get(url, verify=False)
result_as_json = json.loads(res.text)
result_as_json



{'status': 'OK',
 'totalHits': 522822,
 'data': [{'id': '291755033',
   'authors': ['Ealey, Peter John'],
   'contributors': [],
   'datePublished': '10000-01-01',
   'description': '144 p.Thesis (Ph.D.)--University of Illinois at Urbana-Champaign, 1969.U of I OnlyRestricted to the U of I community idenfinitely during batch ingest of legacy ETD',
   'identifiers': ['oai:www.ideals.illinois.edu:2142/61442', None],
   'relations': [],
   'repositories': [{'id': '7595',
     'openDoarId': 0,
     'name': 'IDEALS @ Illinois',
     'uri': None,
     'urlHomepage': None,
     'urlOaipmh': None,
     'uriJournals': None,
     'physicalName': 'noname',
     'source': None,
     'software': None,
     'metadataFormat': None,
     'description': None,
     'journal': None,
     'roarId': 0,
     'baseId': 0,
     'pdfStatus': None,
     'nrUpdates': 0,
     'disabled': False,
     'lastUpdateTime': None,
     'repositoryLocation': None}],
   'repositoryDocument': {'pdfStatus': 0,
    'metadataAd

In [172]:
result_as_json["scrollId"]

KeyError: 'scrollId'

In [298]:
result_as_json["data"][0]

{'id': '19962119',
 'authors': ['Lantz, Rik E.'],
 'contributors': [],
 'description': 'Thesis (B.S.) in Geology--University of Illinois at Urbana-Champaign, 1982.Bibliography: leaf [15]U of I OnlyTheses restricted to UIUC community onl',
 'identifiers': ['oai:www.ideals.illinois.edu:2142/48374', None],
 'relations': [],
 'repositories': [{'id': '198',
   'openDoarId': 0,
   'name': 'Illinois Digital Environment for Access to Learning and Scholarship Repository',
   'urlHomepage': None,
   'urlOaipmh': None,
   'uriJournals': None,
   'physicalName': 'noname',
   'source': None,
   'software': None,
   'metadataFormat': None,
   'description': None,
   'journal': None,
   'roarId': 0,
   'baseId': 0,
   'pdfStatus': None,
   'nrUpdates': 0,
   'disabled': False,
   'lastUpdateTime': None,
   'repositoryLocation': None}],
 'repositoryDocument': {'pdfStatus': 0,
  'metadataAdded': 1396502191000,
  'metadataUpdated': 1608008811000,
  'depositedDate': 1458259200000},
 'subjects': ['text'],

In [285]:
url="https://core.ac.uk/api-v2/articles/search/geology?language=en&page=1&pageSize=100&apiKey=nTo627BU8jPNth4EbsrDue9IXWzAfZiY&fulltext=true&metadata=false&hasFullText=true&urls=true"
result_as_json = json.loads(requests.get(url, verify=False).text)
result_as_json



{'status': 'OK',
 'totalHits': 525831,
 'data': [{'id': '32244476',
   'fullText': " Procedia - Social and Behavioral Sciences  186 ( 2015 )  732 – 738 \nAvailable online at www.sciencedirect.com\n1877-0428 © 2015 The Authors. Published by Elsevier Ltd. This is an open access article under the CC BY-NC-ND license \n(http://creativecommons.org/licenses/by-nc-nd/4.0/).\nPeer-review under responsibility of Academic World Education and Research Center\ndoi: 10.1016/j.sbspro.2015.04.103 \nScienceDirect\n5th World Conference on Learning, Teaching and Educational Leadership, WCLTA 2014 \nPathways to Change: Improving The Quality of Education in Timor-\nLeste \nMargarida Lucasa\r, Isabel Cabritaa, Adriana Ferreiraa \naResearch Centre Didactics and Technology in Education of Trainers, Dep. of Education,University of Aveiro, 3810-193 Aveiro, Portugal \nAbstract \nAfter more than 400 years of Portuguese colonization and a 24-year period of Indonesian occupation, Timor-Leste became a fully \nindep

In [327]:
len(result_as_json["data"])

100

In [328]:
result_as_json["data"][0]

{'id': '291755033',
 'fullText': '',
 'journals': [{}, {}],
 'relations': [],
 'urls': ['http://hdl.handle.net/2142/61442']}

In [287]:
url="https://core.ac.uk/api-v2/articles/search/%22Elsevier%20BV%22%20AND%20Geology%20?page=2&pageSize=10&apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&hasFullText=true&urls=true"
result_as_json2 = json.loads(requests.get(url, verify=False).text)
result_as_json2



{'status': 'OK',
 'totalHits': 213,
 'data': [{'id': '217199129',
   'authors': ['Cordie, David Russell'],
   'contributors': [],
   'datePublished': '2019-05-01T07:00:00',
   'description': 'The early Cambrian represents an important transition in the evolution of life, perhaps most vividly exemplified by reef ecosystems as they changed from microbial-supported to metazoan-supported framework reefs. Microbial reefs were initially composed of Renalcis- and Epiphyton-group calcifying microbes. Subsequent reefs began to incorporate archaeocyathan sponges within this framework. This represents a shift in the source of carbonate production, which can be quantified using thin section point counts. In archaeocyathan reefs from the western USA, carbonate contribution from metazoan framework builders increased from zero to 29.7%. Similar reefs from Mongolia increased from zero to 5.0%. Increases in Laurentian archaeocyath contributions are not associated with shifts in carbon isotopic composit

In [330]:
result_as_json2["data"][0]

{'id': '291755033',
 'authors': ['Ealey, Peter John'],
 'contributors': [],
 'datePublished': '10000-01-01',
 'description': '144 p.Thesis (Ph.D.)--University of Illinois at Urbana-Champaign, 1969.U of I OnlyRestricted to the U of I community idenfinitely during batch ingest of legacy ETD',
 'identifiers': ['oai:www.ideals.illinois.edu:2142/61442', None],
 'relations': [],
 'repositories': [{'id': '7595',
   'openDoarId': 0,
   'name': 'IDEALS @ Illinois',
   'uri': None,
   'urlHomepage': None,
   'urlOaipmh': None,
   'uriJournals': None,
   'physicalName': 'noname',
   'source': None,
   'software': None,
   'metadataFormat': None,
   'description': None,
   'journal': None,
   'roarId': 0,
   'baseId': 0,
   'pdfStatus': None,
   'nrUpdates': 0,
   'disabled': False,
   'lastUpdateTime': None,
   'repositoryLocation': None}],
 'repositoryDocument': {'pdfStatus': 0,
  'metadataAdded': 1586089228000,
  'metadataUpdated': 1594511926000,
  'depositedDate': 1458216533000},
 'subjects': 

In [331]:
url="https://core.ac.uk/api-v2/articles/search/geology?language=en&pageSize=100&apiKey=nTo627BU8jPNth4EbsrDue9IXWzAfZiY&fulltext=true&metadata=false&hasFullText=false&urls=true"
result_as_json3 = json.loads(requests.get(url, verify=False).text)
result_as_json3



{'status': 'OK',
 'totalHits': 522432,
 'data': [{'id': '291755033',
   'fullText': '',
   'journals': [{}, {}],
   'relations': [],
   'urls': ['http://hdl.handle.net/2142/61442']},
  {'id': '291750312',
   'fullText': '',
   'journals': [{}, {}],
   'relations': [],
   'urls': ['http://hdl.handle.net/2142/56669']},
  {'id': '291750395',
   'fullText': '',
   'journals': [{}, {}],
   'relations': [],
   'urls': ['http://hdl.handle.net/2142/56752']},
  {'id': '291764172',
   'fullText': '',
   'journals': [{}, {}],
   'relations': [],
   'urls': ['http://hdl.handle.net/2142/70643']},
  {'id': '291742253',
   'fullText': '',
   'journals': [{}, {}],
   'relations': [],
   'urls': ['http://hdl.handle.net/2142/47970']},
  {'id': '291791471',
   'fullText': '',
   'journals': [{}, {}],
   'relations': [],
   'urls': ['http://hdl.handle.net/2142/98580']},
  {'id': '15564608',
   'fullText': 'Deep-Sea Research II 58 (2011) 2433–2447Contents lists available at ScienceDirectDeep-Sea Research I

In [332]:
result_as_json2["data"][1]

{'id': '291750312',
 'authors': ['Eveland, Harmon Edwin, Jr.'],
 'contributors': [],
 'datePublished': '10000-01-01',
 'description': '100 p.Thesis (Ph.D.)--University of Illinois at Urbana-Champaign, 1950.U of I OnlyRestricted to the U of I community idenfinitely during batch ingest of legacy ETD',
 'identifiers': ['oai:www.ideals.illinois.edu:2142/56669', None],
 'relations': [],
 'repositories': [{'id': '7595',
   'openDoarId': 0,
   'name': 'IDEALS @ Illinois',
   'uri': None,
   'urlHomepage': None,
   'urlOaipmh': None,
   'uriJournals': None,
   'physicalName': 'noname',
   'source': None,
   'software': None,
   'metadataFormat': None,
   'description': None,
   'journal': None,
   'roarId': 0,
   'baseId': 0,
   'pdfStatus': None,
   'nrUpdates': 0,
   'disabled': False,
   'lastUpdateTime': None,
   'repositoryLocation': None}],
 'repositoryDocument': {'pdfStatus': 0,
  'metadataAdded': 1586089185000,
  'metadataUpdated': 1594511906000,
  'depositedDate': 1458216336000},
 'su

In [223]:
url="https://core.ac.uk/api-v2/journals/search/geosciences?pageSize=10&page=1&metadata=true&hasFullText=true&urls=true&apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG"
result_as_json2 = json.loads(requests.get(url, verify=False).text)
result_as_json2



{'status': 'OK',
 'totalHits': 10,
 'data': [{'title': 'Geosciences',
   'identifiers': ['oai:doaj.org/journal:a8dbecea9bb1473ca4ba7ff0531b45e9',
    'issn:2076-3263',
    'url:https://doaj.org/toc/2076-3263'],
   'subjects': ['crystallography',
    'mineralogy',
    'energy and mineral deposits',
    'geochemistry',
    'planetary geology',
    'geodynamics',
    'geographic information science',
    'hydrology',
    'paleontology',
    'Geology',
    'QE1-996.5',
    'Science',
    'Q'],
   'language': 'English',
   'rights': 'CC BY',
   'publisher': 'MDPI AG'},
  {'title': 'Bulletin of Geosciences',
   'identifiers': ['oai:doaj.org/journal:164044b81b154e3783739d1558020fbc',
    'issn:1214-1119',
    'issn:1802-8225',
    'url:https://doaj.org/toc/1802-8225'],
   'subjects': ['palaeobiology',
    'sedimentology',
    'palaeoclimatology',
    'stratigraphy',
    'palaeogeography',
    'Geology',
    'QE1-996.5',
    'Science',
    'Q'],
   'language': 'English',
   'publisher': 'Czech

In [280]:
url="https://core.ac.uk/api-v2/journals/search/Geochemistry%20Geology%20Geomorphology%20Geophysics%20Geoscience%20Geosciences%20Hydrogeology%20Hydrology%20Igneous%20Lithology%20Lithosphere%20Lithospheres%20Metamorphic%20Mineralogy%20Paleontology%20Pedology%20Petrology%20Plutonic%20Sedimentary%20Stratigraphy%20Tectonic%20Volcanic?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10&page=1"
result_as_json2 = json.loads(requests.get(url, verify=False,headers={'Cache-Control': 'no-cache'}).text)
result_as_json2



{'status': 'OK',
 'totalHits': 41,
 'data': [{'title': 'Hydrology',
   'identifiers': ['oai:doaj.org/journal:c7ef0e9cc18241a8b09b36563d4289ef',
    'issn:2306-5338',
    'url:https://doaj.org/toc/2306-5338'],
   'subjects': ['Surface water hydrology',
    'Hydrochemistry',
    'Hydroinformatics',
    'Isotope hydrology',
    'Water management',
    'Water quality',
    'Science',
    'Q'],
   'language': 'EN',
   'rights': 'CC BY',
   'publisher': 'MDPI AG'},
  {'title': 'Geoscience Letters',
   'identifiers': ['oai:doaj.org/journal:dfdd42be68ba47b7831ef9e57e3d5719',
    'issn:2196-4092',
    'url:https://doaj.org/toc/2196-4092'],
   'subjects': ['"Atmospheric Sciences and Ocean Sciences',
    'Biogeosciences',
    'Geography. Anthropology. Recreation',
    'G'],
   'language': 'EN',
   'rights': 'CC BY',
   'publisher': 'SpringerLink'},
  {'title': 'ISRN Geophysics',
   'identifiers': ['oai:doaj.org/journal:2bb29e9e3eb7484d8597e19b5d6734c1',
    'issn:2090-8946',
    'url:https://doaj

In [284]:
url="https://core.ac.uk/api-v2/journals/search/Geochemistry%20Geology%20Geomorphology%20Geophysics%20Geoscience%20Geosciences%20Hydrogeology%20Hydrology%20Igneous%20Lithology%20Lithosphere%20Lithospheres%20Metamorphic%20Mineralogy%20Paleontology%20Pedology%20Petrology%20Plutonic%20Sedimentary%20Stratigraphy%20Tectonic%20Volcanic?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&page=3"
result_as_json2 = json.loads(requests.get(url, verify=False,headers={'Cache-Control': 'no-cache'}).text)
result_as_json2



{'status': 'OK',
 'totalHits': 41,
 'data': [{'title': 'Hydrology',
   'identifiers': ['oai:doaj.org/journal:c7ef0e9cc18241a8b09b36563d4289ef',
    'issn:2306-5338',
    'url:https://doaj.org/toc/2306-5338'],
   'subjects': ['Surface water hydrology',
    'Hydrochemistry',
    'Hydroinformatics',
    'Isotope hydrology',
    'Water management',
    'Water quality',
    'Science',
    'Q'],
   'language': 'EN',
   'rights': 'CC BY',
   'publisher': 'MDPI AG'},
  {'title': 'Geoscience Letters',
   'identifiers': ['oai:doaj.org/journal:dfdd42be68ba47b7831ef9e57e3d5719',
    'issn:2196-4092',
    'url:https://doaj.org/toc/2196-4092'],
   'subjects': ['"Atmospheric Sciences and Ocean Sciences',
    'Biogeosciences',
    'Geography. Anthropology. Recreation',
    'G'],
   'language': 'EN',
   'rights': 'CC BY',
   'publisher': 'SpringerLink'},
  {'title': 'ISRN Geophysics',
   'identifiers': ['oai:doaj.org/journal:2bb29e9e3eb7484d8597e19b5d6734c1',
    'issn:2090-8946',
    'url:https://doaj

In [260]:
len(result_as_json2["data"])

10

In [144]:
# For SEPRENDITY
# 326764687 last part
# at 100:100 it is 326039651
# at 100:101 it is 200818811
# but from 10k results it start caching as 10

# FOR GEOLOFY
# 100:101 291755033
len(result_as_json2["data"])

10

In [341]:
%%bash
pwd

/nrcan_p2/data/01_raw/20201221


In [43]:
from pandas import json_normalize
df = json_normalize(result_as_json2["data"])

In [42]:
df.columns

Index(['id', 'authors', 'contributors', 'datePublished', 'description',
       'fullText', 'identifiers', 'relations', 'repositories', 'subjects',
       'title', 'topics', 'types', 'year', 'fulltextUrls', 'oai',
       'downloadUrl', 'repositoryDocument.pdfStatus',
       'repositoryDocument.metadataAdded',
       'repositoryDocument.metadataUpdated',
       'repositoryDocument.depositedDate'],
      dtype='object')

In [36]:
df.head()

Unnamed: 0,id,authors,contributors,datePublished,description,fullText,identifiers,relations,repositories,subjects,...,repositoryDocument.metadataAdded,repositoryDocument.metadataUpdated,repositoryDocument.depositedDate,journals,publisher,fulltextIdentifier,doi,language.code,language.id,language.name
0,291755033,"[Ealey, Peter John]",[],10000-01-01,144 p.Thesis (Ph.D.)--University of Illinois a...,,"[oai:www.ideals.illinois.edu:2142/61442, None]",[],"[{'id': '7595', 'openDoarId': 0, 'name': 'IDEA...",[text],...,1586089228000,1594511926000,1458216533000,,,,,,,
1,291750312,"[Eveland, Harmon Edwin, Jr.]",[],10000-01-01,100 p.Thesis (Ph.D.)--University of Illinois a...,,"[oai:www.ideals.illinois.edu:2142/56669, None]",[],"[{'id': '7595', 'openDoarId': 0, 'name': 'IDEA...",[text],...,1586089185000,1594511906000,1458216336000,,,,,,,
2,291750395,"[Smith, William Calhoun]",[],10000-01-01,222 p.Thesis (Ph.D.)--University of Illinois a...,,"[oai:www.ideals.illinois.edu:2142/56752, None]",[],"[{'id': '7595', 'openDoarId': 0, 'name': 'IDEA...",[text],...,1586089186000,1594511906000,1458216432000,,,,,,,
3,291764172,"[Miller, Michael Vernon]",[],10000-01-01,222 p.Thesis (Ph.D.)--University of Illinois a...,,"[oai:www.ideals.illinois.edu:2142/70643, None]",[],"[{'id': '7595', 'openDoarId': 0, 'name': 'IDEA...",[text],...,1586089315000,1594511965000,1458216129000,,,,,,,
4,291742253,"[Schulmeister, Marcia Kay]",[],10000-01-01,Typescript.Thesis (B.S.) in Geology--Universit...,,"[oai:www.ideals.illinois.edu:2142/47970, None]",[],"[{'id': '7595', 'openDoarId': 0, 'name': 'IDEA...",[text],...,1586089104000,1594511871000,1458338932000,,,,,,,


In [37]:
df[["id","language.code","fulltextUrls","downloadUrl"]]

Unnamed: 0,id,language.code,fulltextUrls,downloadUrl
0,291755033,,"[https://core.ac.uk/display/291755033, http://...",
1,291750312,,"[https://core.ac.uk/display/291750312, http://...",
2,291750395,,"[https://core.ac.uk/display/291750395, http://...",
3,291764172,,"[https://core.ac.uk/display/291764172, http://...",
4,291742253,,"[https://core.ac.uk/display/291742253, http://...",
5,291791471,,"[https://core.ac.uk/display/291791471, http://...",
6,15564608,,"[https://core.ac.uk/download/pdf/15564608.pdf,...",https://core.ac.uk/download/15564608.pdf
7,32244476,en,"[https://core.ac.uk/download/pdf/32244476.pdf,...",https://core.ac.uk/download/32244476.pdf
8,158299734,,"[https://core.ac.uk/display/158299734, http://...",
9,158306126,,"[https://core.ac.uk/display/158306126, http://...",


In [202]:
ndjson = [json.dumps(record) for record in result_as_json2["data"]]

with open(join(tmp_dir,"test.ndjson"),"w") as f:
    f.write("\n".join(ndjson))

In [203]:
with open(join(tmp_dir,"test.ndjson"),"r") as f:
    load_nd_json = f.readlines() 

In [204]:
import flatten_json

In [205]:
json_records = [flatten_json.flatten(json.loads(x)) for x in load_nd_json]

In [206]:
df = json_normalize(json_records)

In [207]:
df["fullText"].replace('', np.nan, inplace=True)

KeyError: 'fullText'

In [89]:
df["fullText"].dropna().count()

11

In [209]:
df

Index(['id', 'authors_0', 'contributors', 'datePublished', 'description',
       'identifiers_0', 'identifiers_1', 'relations', 'repositories_0_id',
       'repositories_0_openDoarId', 'repositories_0_name',
       'repositories_0_uri', 'repositories_0_urlHomepage',
       'repositories_0_urlOaipmh', 'repositories_0_uriJournals',
       'repositories_0_physicalName', 'repositories_0_source',
       'repositories_0_software', 'repositories_0_metadataFormat',
       'repositories_0_description', 'repositories_0_journal',
       'repositories_0_roarId', 'repositories_0_baseId',
       'repositories_0_pdfStatus', 'repositories_0_nrUpdates',
       'repositories_0_disabled', 'repositories_0_lastUpdateTime',
       'repositories_0_repositoryLocation', 'repositoryDocument_pdfStatus',
       'repositoryDocument_metadataAdded',
       'repositoryDocument_metadataUpdated',
       'repositoryDocument_depositedDate', 'subjects_0', 'title', 'topics_0',
       'types', 'year', 'fulltextUrls_0', 'ful

In [211]:
df["journals_0_title"]

0                                                  NaN
1                                                  NaN
2                                                  NaN
3                                                  NaN
4                                                  NaN
5                                                  NaN
6    Deep Sea Research Part II Topical Studies in O...
7            Procedia - Social and Behavioral Sciences
8                                                  NaN
9                                                  NaN
Name: journals_0_title, dtype: object

In [293]:
import urllib.request
import urllib.parse
import json
import pprint

class CoreApiRequestor:

    def __init__(self, endpoint, api_key):
        self.endpoint = endpoint
        self.api_key = api_key
        #defaults
        self.pagesize = 100

    def parse_response(self, decoded):
        res = []
        for item in decoded['data']:
            doi = None
            if 'identifiers' in item:
                for identifier in item['identifiers']:
                    if identifier and identifier.startswith('doi:'):
                        doi = identifier
                        break
            res.append([item['title'], doi])
        return res

    def request_url(self, url):
        with urllib.request.urlopen(url) as response:
            html = response.read()
        return html

    def get_method_query_request_url(self,method,query,fullText=True,page=1):
        if (fullText):
            fullText = 'true'
        else:
            fullText = 'false'
        params = {
            'apiKey':self.api_key,
            'page':page,
            'pageSize':self.pagesize,
            'fulltext':fullText
        }
        return self.endpoint + method + '/' + urllib.parse.quote(query) + '?' + urllib.parse.urlencode(params)

    def get_geology_journals(self):
        ### Make extended topics
        all_articles = []
        # damn pagination does not work for journals and only 10 can be retrieved. The only way is to get them
        # one by one
        for topic in ["Geochemistry","Geology","Geomorphology","Geophysics","Geoscience","Geosciences", \
                      "Hydrogeology","Hydrology","Igneous","Lithology","Lithosphere","Lithospheres", \
                      "Metamorphic","Mineralogy","Paleontology","Pedology","Petrology","Plutonic", \
                      "Sedimentary","Stratigraphy","Tectonic","Volcanic"]:
        
            all_articles += self.get_data(f'https://core.ac.uk/api-v2/journals/search/{urllib.parse.quote(topic)}?apiKey={self.api_key}',10)
        
        return all_articles
        
    def get_data(self,pageless_url,page_size=10):
        all_items = []
        url_ps = f'{pageless_url}&pageSize={page_size}'
        print(url_ps)
        result = json.loads(requests.get(url_ps, verify=False).text)
        all_articles = result["data"]
        if result['totalHits'] == 0:
            return []
        if (result['totalHits']>page_size):
            numOfPages = int(result['totalHits']/page_size) +1 #rounds down
            for i in range(2,numOfPages+1):
                url = f'{url_ps}&page={i}'
                print(url)
                result = json.loads(requests.get(url, verify=False).text)
                all_articles += result["data"]
            
        return all_articles
    
    def get_all_articles_metadata(self,method,query,fulltext_dir,metadata_dir):
        url = self.get_method_query_request_url(method,query,fulltext,1)
    
    def get_articles(self,method,query,fulltext=True,numOfPages=20):
        url = self.get_method_query_request_url(method,query,fulltext,1)
        all_articles=[]
        resp = self.request_url(url)
        result = json.loads(requests.get(url, verify=False).text)
        all_articles.append(result)
        if (result['totalHits']>100):
            numOfPages = int(result['totalHits']/self.pagesize) +1 #rounds down
            for i in range(2,numOfPages):
                url = self.get_method_query_request_url(method,query,fulltext,i)
                resp =self.request_url(url)
                all_articles.append(json.loads(resp.decode('utf-8')))
        return all_articles

In [294]:

endpoint = 'https://core.ac.uk/api-v2'
api_key = 'u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG'


api = CoreApiRequestor(endpoint,api_key)

In [295]:
geology_journals = api.get_geology_journals()


https://core.ac.uk/api-v2/journals/search/Geochemistry?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Geology?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Geomorphology?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Geophysics?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Geoscience?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Geosciences?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Hydrogeology?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Hydrology?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Igneous?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Lithology?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Lithosphere?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Lithospheres?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Metamorphic?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Mineralogy?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Paleontology?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Pedology?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Petrology?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Plutonic?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Sedimentary?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Stratigraphy?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Tectonic?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




https://core.ac.uk/api-v2/journals/search/Volcanic?apiKey=u1dCfVKqTOIWDF8nHBvEXc2soYpUigzG&pageSize=10




In [299]:
len(geology_journals)

41

In [300]:
unique_jourals = set()
for journal in geology_journals:
    print(journal["identifiers"][1])
    unique_jourals.add(journal["identifiers"][1])

issn:2356-7406
issn:0718-7092
issn:1874-2629
issn:0029-196X
issn:2008-7306
issn:1698-6180
issn:2161-7570
issn:1916-9779
issn:1825-6635
issn:0067-3064
issn:2090-8946
issn:2038-9655
issn:1593-5213
issn:1687-885X
issn:1023-5809
issn:2198-5634
issn:2196-4092
issn:1802-5420
issn:2355-9314
issn:2299-8179
issn:2076-3263
issn:1214-1119
issn:1802-6222
issn:2391-5447
issn:2163-3967
issn:2176-6142
issn:1680-7340
issn:2156-8359
issn:0976-4380
issn:2179-2321
issn:2306-5338
issn:1874-3781
issn:1058-3912
issn:0042-790X
issn:2163-0461
issn:1027-5606
issn:1812-2108
issn:2356-7058
issn:2314-4289
issn:1543-8740
issn:2008-7888


In [298]:
len(unique_jourals)

41

In [None]:
# journals pagination and page size do not work