In [1]:
date_start = "2005-09-01"
date_end = "2022-12-31"
issn = '1476-4687'  #ISSN for Nature

global_api_key = "06653b182ca2ae299e9ce3d3d5d676fd"

# use this link:    https://dev.springernature.com/adding-constraints

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 17 17:24:43 2020
Search in Spring Nature using API by keywords for NIS lab reserch projects.

@author oudeng, Graduate School of Human Sciences, Waseda University

Python program for searching info from Springer Nature via API by XML.
Results includes the title, pdf url and abstract of destination articles. 

https://dev.springernature.com/signup.  
registerred by WasedaID of Ou,DENG, got necessary API.

How to use?
1) Run this program in Python eviroment, including bs4 and requests lib.
2) Input keywords for searching.
3) Searching results will in Python console window.
4) Read in console window directly if not too many results,
   or copy contents to any other more confortable browsers.
5) Copy url of artitle you like in results, use broswer to read PDF.
 
Can use for other API?
Yes. By modifying base_url, api_key, total and content identification tags.
Just confirm the url you try in other API. 

Can for JSON?
No, this programe for XML only.

"""

from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import itertools

# Use a for-loop to flatten a list of lists
def flatten_lists(listToFlatten):
    flat_list = list()
    for sub_list in listToFlatten:
        flat_list += sub_list
    return flat_list

def getXML(url):
    try:
        r = requests.get(url, timeout = 30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "Scraping Errors. Oooop!!"

def spList(total): 
    S=1 
    P=50
    if total%P != 0:
        s = list(range(S, total//P*P, P)) + [total//P*P+1]
        p = list([P]*(len(s)-1)) + [total%P]
    else:
        s = list(range(S, total//P*P, P)) 
        p = list([P]*len(s))
    # print(s)
    # print(p)
    return s, p

def search_function():
    base_url = "http://api.springernature.com/metadata/pam?q="
    api_key = "&api_key=" + global_api_key
    #Please register at SpringerNature as the following url for your API key.
    # https://www.springernature.com/gp/campaign/librarian-covid-tdm?sap-outbound-id=64AF2B62DCE26C591DAA9263090CDAF763E0CD1F
    
    #input keywords to search in API, url_1 for confirm 'total' value    
    
    date_range = "onlinedatefrom:" + date_start + "%20 onlinedateto:" + date_end
    journalISSN = " issn: " + issn
    
    # ================================================================= change keywords
    #keywords = input("Input '+' in searching keywords:    ")
    # =================================================================
    
    url_1 = base_url + date_range + journalISSN + api_key
    xml_1 = getXML(url_1)
    soup = bs(xml_1,'html.parser')
    total = int(soup.find('total').string)
    
    print('Search Results Totally =',total)
    
    s, p = spList(total)
    
    full_list_of_papers = []
    
    ## you can only download 100 at a time so loop through the download 
    for i in range(len(s)):
    # ================================================================= change keywords
        url = base_url + date_range + journalISSN + '&s=' + str(s[i]) + '&p=' + str(p[i]) + api_key
    # =================================================================
        xml = getXML(url)
        soup = bs(xml, "html.parser")
        current_list_of_papers = soup.find_all('pam:message')
        full_list_of_papers.append(current_list_of_papers)
    
            
    print('Total=',total)
    return flatten_lists(full_list_of_papers)

## pull data 
## get citation number 
#https://stackoverflow.com/questions/69067691/retrieve-number-of-citations-of-a-scientific-paper-in-a-given-year
# pull all the data
# get data from allometric 

def search_to_dataframe(flattened_papers_list):
    all_papers = flattened_papers_list
    first_paper = all_papers[0]
    list_of_fields = list(set([tag.name for tag in first_paper.find_all()]))

    ## create a dictionary with a set of keys that is column titles
    dict_of_paper_fields = {}
    for i in list_of_fields: 
        appenddict = {i:[]}
        dict_of_paper_fields.update(appenddict)

    ## find the fields for each paper
    for i in all_papers: 
        for j in list_of_fields: 
            dict_of_paper_fields[j].append(i.find_all(j))

    return pd.DataFrame(dict_of_paper_fields)

def csv_of_data(dataframeCreated):
    dataframeCreated.to_csv('data/' +"nature"+ "__"+ date_start+"_"+date_end+ '.csv',index=False)

def main():
    list_of_search = search_function()
    dataframe_of_search = search_to_dataframe(list_of_search)
    csv_of_data(dataframe_of_search)

In [3]:
main()

Search Results Totally = 67518
Total= 67518


In [7]:
df = pd.read_csv('data/nature__2005-09-01_2022-12-31.csv')

In [8]:
df

Unnamed: 0,superscript,dc:publisher,journalid,xhtml:head,prism:number,openaccess,dc:language,prism:genre,citationref,prism:startingpage,...,dc:identifier,prism:url,dc:subject,pam:article,prism:publicationname,prism:issn,dc:title,prism:copyright,h1,dc:creator
0,[<superscript><citationref additionalcitationi...,[<dc:publisher>Nature</dc:publisher>],[<journalid>41586</journalid>],[<xhtml:head>\n<pam:article><prism:contenttype...,[<prism:number>7978</prism:number>],[<openaccess>true</openaccess>],[<dc:language>en</dc:language>],"[<prism:genre>OriginalPaper</prism:genre>, <pr...","[<citationref additionalcitationids=""CR2"" cita...",[<prism:startingpage>365</prism:startingpage>],...,[<dc:identifier>doi:10.1038/s41586-022-05279-8...,[<prism:url>http://dx.doi.org/10.1038/s41586-0...,"[<dc:subject>Science, Humanities and Social Sc...",[<pam:article><prism:contenttype>Article</pris...,[<prism:publicationname>Nature</prism:publicat...,[<prism:issn>1476-4687</prism:issn>],[<dc:title>Inferring and perturbing cell fate ...,[<prism:copyright>©2022 The Author(s)</prism:c...,[<h1>Abstract</h1>],"[<dc:creator>Fleck, Jonas Simon</dc:creator>, ..."
1,"[<superscript><citationref citationid=""CR1"">1<...",[<dc:publisher>Nature</dc:publisher>],[<journalid>41586</journalid>],[<xhtml:head>\n<pam:article><prism:contenttype...,[<prism:number>7962</prism:number>],[<openaccess>false</openaccess>],[<dc:language>en</dc:language>],"[<prism:genre>OriginalPaper</prism:genre>, <pr...","[<citationref citationid=""CR1"">1</citationref>]",[<prism:startingpage>792</prism:startingpage>],...,[<dc:identifier>doi:10.1038/s41586-022-04967-9...,[<prism:url>http://dx.doi.org/10.1038/s41586-0...,"[<dc:subject>Science, Humanities and Social Sc...",[<pam:article><prism:contenttype>Article</pris...,[<prism:publicationname>Nature</prism:publicat...,[<prism:issn>1476-4687</prism:issn>],[<dc:title>Induction of mouse totipotent stem ...,"[<prism:copyright>©2022 The Author(s), under e...",[<h1>Abstract</h1>],"[<dc:creator>Hu, Yanyan</dc:creator>, <dc:crea..."
2,[<superscript><citationref additionalcitationi...,[<dc:publisher>Nature</dc:publisher>],[<journalid>41586</journalid>],[<xhtml:head>\n<pam:article><prism:contenttype...,[<prism:number>7953</prism:number>],[<openaccess>true</openaccess>],[<dc:language>en</dc:language>],"[<prism:genre>OriginalPaper</prism:genre>, <pr...","[<citationref additionalcitationids=""CR2"" cita...",[<prism:startingpage>687</prism:startingpage>],...,[<dc:identifier>doi:10.1038/s41586-022-05531-1...,[<prism:url>http://dx.doi.org/10.1038/s41586-0...,"[<dc:subject>Science, Humanities and Social Sc...",[<pam:article><prism:contenttype>Article</pris...,[<prism:publicationname>Nature</prism:publicat...,[<prism:issn>1476-4687</prism:issn>],[<dc:title>Non-viral precision T cell receptor...,[<prism:copyright>©2022 The Author(s)</prism:c...,[<h1>Abstract</h1>],"[<dc:creator>Foy, Susan P.</dc:creator>, <dc:c..."
3,[<superscript><citationref additionalcitationi...,[<dc:publisher>Nature</dc:publisher>],[<journalid>41586</journalid>],[<xhtml:head>\n<pam:article><prism:contenttype...,[<prism:number>7950</prism:number>],[<openaccess>true</openaccess>],[<dc:language>en</dc:language>],"[<prism:genre>OriginalPaper</prism:genre>, <pr...","[<citationref additionalcitationids=""CR2 CR3 C...",[<prism:startingpage>151</prism:startingpage>],...,[<dc:identifier>doi:10.1038/s41586-022-05628-7...,[<prism:url>http://dx.doi.org/10.1038/s41586-0...,"[<dc:subject>Science, Humanities and Social Sc...",[<pam:article><prism:contenttype>Article</pris...,[<prism:publicationname>Nature</prism:publicat...,[<prism:issn>1476-4687</prism:issn>],[<dc:title>Active eosinophils regulate host de...,[<prism:copyright>©2022 The Author(s)</prism:c...,[<h1>Abstract</h1>],"[<dc:creator>Gurtner, Alessandra</dc:creator>,..."
4,"[<superscript><citationref citationid=""CR1"">1<...",[<dc:publisher>Nature</dc:publisher>],[<journalid>41586</journalid>],[<xhtml:head>\n<pam:article><prism:contenttype...,[<prism:number>7950</prism:number>],[<openaccess>true</openaccess>],[<dc:language>en</dc:language>],"[<prism:genre>OriginalPaper</prism:genre>, <pr...","[<citationref citationid=""CR1"">1</citationref>...",[<prism:startingpage>134</prism:startingpage>],...,[<dc:identifier>doi:10.1038/s41586-022-05594-0...,[<prism:url>http://dx.doi.org/10.1038/s41586-0...,"[<dc:subject>Science, Humanities and Social Sc...",[<pam:article><prism:contenttype>Article</pris...,[<prism:publicationname>Nature</prism:publicat...,[<prism:issn>1476-4687</prism:issn>],[<dc:title>FXR inhibition may protect from SAR...,[<prism:copyright>©2022 The Author(s)</prism:c...,[<h1>Abstract</h1>],"[<dc:creator>Brevini, Teresa</dc:creator>, <dc..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67513,[],[<dc:publisher>Nature</dc:publisher>],[<journalid>41586</journalid>],[<xhtml:head>\n<pam:article><prism:contenttype...,[<prism:number></prism:number>],[<openaccess>false</openaccess>],[<dc:language>en</dc:language>],[<prism:genre>News</prism:genre>],[],[<prism:startingpage></prism:startingpage>],...,[<dc:identifier>doi:10.1038/news050829-17</dc:...,[<prism:url>http://dx.doi.org/10.1038/news0508...,"[<dc:subject>Science, Humanities and Social Sc...",[<pam:article><prism:contenttype>Article</pris...,[<prism:publicationname>Nature</prism:publicat...,[<prism:issn>1476-4687</prism:issn>],[<dc:title>British duo probes origin of mad co...,[<prism:copyright>©2005 Springer Nature Limite...,[],"[<dc:creator>Khamsi, Roxanne</dc:creator>]"
67514,[],[<dc:publisher>Nature</dc:publisher>],[<journalid>41586</journalid>],[<xhtml:head>\n<pam:article><prism:contenttype...,[<prism:number></prism:number>],[<openaccess>false</openaccess>],[<dc:language>en</dc:language>],[<prism:genre>News</prism:genre>],[],[<prism:startingpage></prism:startingpage>],...,[<dc:identifier>doi:10.1038/news050829-16</dc:...,[<prism:url>http://dx.doi.org/10.1038/news0508...,"[<dc:subject>Science, Humanities and Social Sc...",[<pam:article><prism:contenttype>Article</pris...,[<prism:publicationname>Nature</prism:publicat...,[<prism:issn>1476-4687</prism:issn>],[<dc:title>Spaghetti filters cleanse water sup...,[<prism:copyright>©2005 Springer Nature Limite...,[],"[<dc:creator>von Bubnoff, Andreas</dc:creator>]"
67515,[],[<dc:publisher>Nature</dc:publisher>],[<journalid>41586</journalid>],[<xhtml:head>\n<pam:article><prism:contenttype...,[<prism:number></prism:number>],[<openaccess>false</openaccess>],[<dc:language>en</dc:language>],[<prism:genre>News</prism:genre>],[],[<prism:startingpage></prism:startingpage>],...,[<dc:identifier>doi:10.1038/news050829-15</dc:...,[<prism:url>http://dx.doi.org/10.1038/news0508...,"[<dc:subject>Science, Humanities and Social Sc...",[<pam:article><prism:contenttype>Article</pris...,[<prism:publicationname>Nature</prism:publicat...,[<prism:issn>1476-4687</prism:issn>],[<dc:title>Aerobot aims for Titan</dc:title>],[<prism:copyright>©2005 Springer Nature Limite...,[],"[<dc:creator>Peplow, Mark</dc:creator>]"
67516,[],[<dc:publisher>Nature</dc:publisher>],[<journalid>41586</journalid>],[<xhtml:head>\n<pam:article><prism:contenttype...,[<prism:number>7055</prism:number>],[<openaccess>false</openaccess>],[<dc:language>en</dc:language>],[<prism:genre>BriefCommunication</prism:genre>],[],[<prism:startingpage>164</prism:startingpage>],...,[<dc:identifier>doi:10.1038/nj7055-164a</dc:id...,[<prism:url>http://dx.doi.org/10.1038/nj7055-1...,"[<dc:subject>Science, Humanities and Social Sc...",[<pam:article><prism:contenttype>Article</pris...,[<prism:publicationname>Nature</prism:publicat...,[<prism:issn>1476-4687</prism:issn>],[<dc:title>Expression of interest</dc:title>],[<prism:copyright>©2005 Springer Nature Limite...,[],"[<dc:creator>Hoag, Hannah</dc:creator>]"


In [9]:
df.columns

Index(['superscript', 'dc:publisher', 'journalid', 'xhtml:head',
       'prism:number', 'openaccess', 'dc:language', 'prism:genre',
       'citationref', 'prism:startingpage', 'prism:publicationdate', 'p',
       'prism:contenttype', 'xhtml:body', 'prism:endingpage', 'prism:volume',
       'prism:doi', 'dc:identifier', 'prism:url', 'dc:subject', 'pam:article',
       'prism:publicationname', 'prism:issn', 'dc:title', 'prism:copyright',
       'h1', 'dc:creator'],
      dtype='object')

In [10]:
df['p']

0        [<p>Self-organizing neural organoids grown fro...
1        [<p>In mice, only the zygotes and blastomeres ...
2        [<p>T cell receptors (TCRs) enable T cells to ...
3        [<p>In the past decade, single-cell transcript...
4        [<p>Preventing SARS-CoV-2 infection by modulat...
                               ...                        
67513    [<p>Did human remains in food spawn the infect...
67514    [<p>Chemists package bacteria to eliminate per...
67515    [<p>Robotic plane could survey alien moons or ...
67516    [<p>Scientists willing to tackle membrane prot...
67517                                                   []
Name: p, Length: 67518, dtype: object

In [13]:
df['dc:publisher'].unique()

array(['[<dc:publisher>Nature</dc:publisher>]'], dtype=object)

In [14]:
df_nature = df.copy()