# How to work your way with Scopus API using Python 

First, you need to install the package *pybliometrics*.

In [None]:
#pip install pybliometrics

Note: you may need to restart the kernel to use updated packages.


In [20]:
import pybliometrics
from pybliometrics.scopus import ScopusSearch
import pandas as pd


PATH_TO_DATA = r'"C:\Users\jacob\OneDrive - Université Laval\biophilo\Data\"'
PATH_TO_DATA

'"C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\\Data\\"'

Next, you need to initialize your file. That's were you will put your API token and your Insttoken. 

In [21]:
pybliometrics.scopus.init()

# Queries type

There is multiple types of files you might want to get. For the articles, they are about :
1. Authors informations ; 
2. Affiliation informations ; 
3. General information about the paper (title, published year, etc.)

For the following, we will use an example with the journal "Philosophy of Science". 

## Articles informations

In [17]:
query = "EXACTSRCTITLE("r'\"European Journal for Philosophy of Science\"'")"
print(query)
s = ScopusSearch(query, verbose=True, subscriber = True, view = "COMPLETE")

EXACTSRCTITLE(\"European Journal for Philosophy of Science\")
Downloading results for query "EXACTSRCTITLE(\"European Journal for Philosophy of Science\")":


100%|██████████| 25/25 [00:17<00:00,  1.39it/s]


In [18]:
result = s.results
result_df = pd.DataFrame(result)

In [19]:
result_df
result_df.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\EUROPEAN_JOURNAL_FOR_PHILOSOPHY_OF_SCIENCE.csv")

  result_df.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\EUROPEAN_JOURNAL_FOR_PHILOSOPHY_OF_SCIENCE.csv")


In [16]:

for eid in result_df.eid:  
        ref_query = s = AbstractRetrieval(eid, id_type="eid", view="FULL")
        
        # Convert references to DataFrame and add the EID column
        ref_df = pd.DataFrame(ref_query.references)
        ref_df['citing_eid'] = result_df.eid  # Add the source EID column
        
        # Concatenate the current DataFrame to the citation list
        citation_list = pd.concat([citation_list, ref_df], ignore_index=True)
        citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\EUROPEAN_JOURNAL_OF_PHILOSOPHY_refs.csv")

  citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\EUROPEAN_JOURNAL_OF_PHILOSOPHY_refs.csv")
  citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\EUROPEAN_JOURNAL_OF_PHILOSOPHY_refs.csv")


KeyboardInterrupt: 

## Function for multiple queries

First, define your query input. Here we will go with journal name.

In [107]:
philo_of_bio_journals = [
#  r"\"BIOLOGY & PHILOSOPHY\"", 
#  r"\"BIOLOGY AND PHILOSOPHY\"",
#  r"\"BIOLOGICAL THEORY\"",
#  r"\"STUDIES IN HISTORY AND PHILOSOPHY OF SCIENCE PART C :STUDIES IN HISTORY AND PHILOSOPHY OF BIOLOGICAL AND BIOMEDICAL SCIENCES\"",
#  r"\"HISTORY AND PHILOSOPHY OF THE LIFE SCIENCES\""
]



'"BIOLOGY & PHILOSOPHY"'

In [110]:
for_name_philo_of_bio_journals = [
"BIOLOGY_&_PHILOSOPHY",
"BIOLOGY_AND_PHILOSOPHY",
"BIOLOGICAL_THEORY",
"STUDIES_IN_HISTORY_AND_PHILOSOPHY_OF_SCIENCE_PART_C__STUDIES_IN_HISTORY_AND_PHILOSOPHY_OF_BIOLOGICAL_AND_BIOMEDICAL_SCIENCES",
"HISTORY_AND_PHILOSOPHY_OF_THE_LIFE_SCIENCES"]

for_name_philo_of_bio_journals

['BIOLOGY_&_PHILOSOPHY',
 'BIOLOGY_AND_PHILOSOPHY',
 'BIOLOGICAL_THEORY',
 'STUDIES_IN_HISTORY_AND_PHILOSOPHY_OF_SCIENCE_PART_C__STUDIES_IN_HISTORY_AND_PHILOSOPHY_OF_BIOLOGICAL_AND_BIOMEDICAL_SCIENCES',
 'HISTORY_AND_PHILOSOPHY_OF_THE_LIFE_SCIENCES']

In [124]:
pybliometrics.scopus.init()

from pybliometrics.scopus import ScopusSearch
article_list = pd.DataFrame()  # Initialize an empty DataFrame to store results

for i in range(len(philo_of_bio_journals)):  
    pybliometrics.scopus.init()
    query = "EXACTSRCTITLE(" + philo_of_bio_journals[i] + ")" 
    print(query)
    s = ScopusSearch(query, verbose=True, subscriber = True, view = "COMPLETE")
    result = s.results
    result_df = pd.DataFrame(result)
    result_df.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_philo_of_bio_journals[i] + ".csv")
    


  result_df.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_philo_of_bio_journals[i] + ".csv")


EXACTSRCTITLE("BIOLOGY & PHILOSOPHY")
EXACTSRCTITLE(\"BIOLOGY AND PHILOSOPHY\")
EXACTSRCTITLE(\"BIOLOGICAL THEORY\")
Downloading results for query "EXACTSRCTITLE(\"BIOLOGICAL THEORY\")":


100%|██████████| 29/29 [00:21<00:00,  1.29it/s]


EXACTSRCTITLE(\"STUDIES IN HISTORY AND PHILOSOPHY OF SCIENCE PART C :STUDIES IN HISTORY AND PHILOSOPHY OF BIOLOGICAL AND BIOMEDICAL SCIENCES\")
Downloading results for query "EXACTSRCTITLE(\"STUDIES IN HISTORY AND PHILOSOPHY OF SCIENCE PART C :STUDIES IN HISTORY AND PHILOSOPHY OF BIOLOGICAL AND BIOMEDICAL SCIENCES\")":


100%|██████████| 42/42 [00:28<00:00,  1.44it/s]


EXACTSRCTITLE(\"HISTORY AND PHILOSOPHY OF THE LIFE SCIENCES\")
Downloading results for query "EXACTSRCTITLE(\"HISTORY AND PHILOSOPHY OF THE LIFE SCIENCES\")":


100%|██████████| 42/42 [00:30<00:00,  1.35it/s]


# References informations

In [163]:
from pybliometrics.scopus import AbstractRetrieval
import pandas as pd

citation_list = pd.DataFrame()  # Initialize an empty DataFrame to store results
for i in range(len(philo_of_bio_journals)):
    pybliometrics.scopus.init()
    result_df = pd.read_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_philo_of_bio_journals[i] + ".csv")
    for j in range(len(result_df.eid)):  
        ref_query = AbstractRetrieval(result_df.eid[j], id_type="eid", view="REF")
        
        # Convert references to DataFrame and add the EID column
        ref_df = pd.DataFrame(ref_query.references)
        ref_df['source_eid'] = result_df.eid[j]  # Add the source EID column
        
        # Concatenate the current DataFrame to the citation list
        citation_list = pd.concat([citation_list, ref_df], ignore_index=True)
        citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_philo_of_bio_journals[i] + "_references.csv")

# Display the resulting citation list
print(citation_list)

  result_df = pd.read_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_philo_of_bio_journals[i] + ".csv")
  citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_philo_of_bio_journals[i] + "_references.csv")


                source_eid position           id  \
0        2-s2.0-0010726742        1  85050417424   
1        2-s2.0-0010726742        2  77949540109   
2        2-s2.0-0010726742        3   0038882957   
3        2-s2.0-0010726742        4   2342643968   
4        2-s2.0-0010726742        5   0003397032   
...                    ...      ...          ...   
205086  2-s2.0-33747626043       50   4243582634   
205087  2-s2.0-33747626043       51   8844257136   
205088  2-s2.0-33747626043       52  79957200316   
205089  2-s2.0-33747626043       53   0003771448   
205090  2-s2.0-33747626043       54   0141918296   

                                 doi  \
0                               None   
1                               None   
2       10.1016/0039-3681(81)90015-7   
3                               None   
4                               None   
...                              ...   
205086                          None   
205087                          None   
205088         

# General Philosophy of Science

In [22]:
general_philo_of_science =[
# GENERAL PHILOSOPHY OF SCIENCE JOURNALS 
  #r"\"PHILOSOPHY_OF_SCIENCE\"",
  #r"\"BRITISH_JOURNAL_FOR_THE_PHILOSOPHY_OF_SCIENCE\"", 
  #r"\"SYNTHESE\"", 
  #r"\"ERKENNTNIS\"", 
  #r"\"EUROPEAN JOURNAL FOR THE PHILOSOPHY OF SCIENCE\"", 
  #r"\"INTERNATIONAL_STUDIES_IN_THE_PHILOSOPHY_OF_SCIENCE\"", 
  #r"\"JOURNAL_FOR_GENERAL_PHILOSOPHY_OF_SCIENCE\"", 
  #r"\"FOUNDATIONS_OF_SCIENCE\"",

# SPECIALIZED PHILOSOPHY OF BIOLOGY JOURNALS
  #r"\"BIOLOGY AND PHILOSOPHY\"",
  #r"\"BIOLOGICAL THEORY\"",
  #r"\"STUDIES IN HISTORY AND PHILOSOPHY OF SCIENCE PART C\"",
  #r"\"HISTORY AND PHILOSOPHY OF THE LIFE SCIENCES\"",
  r"\"ACTA BIOTHEORETICA\"",
  r"\"BEHAVIORAL AND BRAIN SCIENCES\"",
  r"\"BIOESSAYS\"",
  r"\"BIOSEMIOTICS\"",
]

In [23]:
for_name_general_philo_of_science =[
    
# GENERAL PHILOSOPHY OF SCIENCE JOURNALS    
  #"PHILOSOPHY_OF_SCIENCE",
  #"BRITISH_JOURNAL_FOR_THE_PHILOSOPHY_OF_SCIENCE", 
  #"SYNTHESE", 
  #"ERKENNTNIS", 
  #"EUROPEAN_JOURNAL_FOR_THE_PHILOSOPHY_OF_SCIENCE", 
  #"INTERNATIONAL_STUDIES_IN_THE_PHILOSOPHY_OF_SCIENCE", 
  #"JOURNAL_FOR_GENERAL_PHILOSOPHY_OF_SCIENCE", 
  #"FOUNDATIONS_OF_SCIENCE"

# SPECIALIZED PHILOSOPHY OF BIOLOGY JOURNALS
  #"BIOLOGY AND PHILOSOPHY",
  #"BIOLOGICAL THEORY",
  #"STUDIES IN HISTORY AND PHILOSOPHY OF SCIENCE PART C",
  #"HISTORY AND PHILOSOPHY OF THE LIFE SCIENCES"
  "ACTA_BIOTHEORETICA",
  "BEHAVIORAL_AND_BRAIN_SCIENCES",
  "BIOESSAYS",
  "BIOSEMIOTICS",
]


In [None]:
pybliometrics.scopus.init()

from pybliometrics.scopus import ScopusSearch
article_list = pd.DataFrame()  # Initialize an empty DataFrame to store results

for i in range(len(general_philo_of_science)):  
    pybliometrics.scopus.init()
    query = "EXACTSRCTITLE(" + general_philo_of_science[i] + ")" 
    print(query)
    s = ScopusSearch(query, verbose=True, subscriber = True, view = "COMPLETE")
    result = s.results
    result_df = pd.DataFrame(result)
    result_df.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + ".csv")

  result_df.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + ".csv")


EXACTSRCTITLE(\"ACTA BIOTHEORETICA\")
Downloading results for query "EXACTSRCTITLE(\"ACTA BIOTHEORETICA\")":


100%|██████████| 53/53 [00:36<00:00,  1.41it/s]


EXACTSRCTITLE(\"BEHAVIORAL AND BRAIN SCIENCES\")
Downloading results for query "EXACTSRCTITLE(\"BEHAVIORAL AND BRAIN SCIENCES\")":


100%|██████████| 576/576 [07:02<00:00,  1.36it/s]


EXACTSRCTITLE(\"BIOESSAYS\")
Downloading results for query "EXACTSRCTITLE(\"BIOESSAYS\")":


 92%|█████████▏| 223/243 [03:15<00:12,  1.61it/s]

In [6]:
from pybliometrics.scopus import AbstractRetrieval
import pandas as pd

citation_list = pd.DataFrame()  # Initialize an empty DataFrame to store results
for i in range(len(philo_of_bio_journals)):
    pybliometrics.scopus.init()
    result_df = pd.read_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + ".csv")
    for j in range(len(result_df.eid)):  
        ref_query = AbstractRetrieval(result_df.eid[j], id_type="eid", view="REF")
        
        # Convert references to DataFrame and add the EID column
        ref_df = pd.DataFrame(ref_query.references)
        ref_df['source_eid'] = result_df.eid[j]  # Add the source EID column
        
        # Concatenate the current DataFrame to the citation list
        citation_list = pd.concat([citation_list, ref_df], ignore_index=True)
        citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + "_references.csv")

# Display the resulting citation list
print(citation_list)

  result_df = pd.read_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + ".csv")
  citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + "_references.csv")
  result_df = pd.read_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + ".csv")
  citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + "_references.csv")


NameError: name 'philo_of_bio_journals' is not defined

In [140]:
import os
import pandas as pd
from pybliometrics.scopus import AbstractRetrieval

# Initialize Pybliometrics once
pybliometrics.scopus.init()

# Define base directory
base_dir = "C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\\Data\\"

# Initialize an empty list to collect DataFrames
citation_data = []

# Iterate over journal names
for i in range(len(general_philo_of_science)):
    # Read the CSV file
    file_path = os.path.join(base_dir, f"{for_name_general_philo_of_science[i]}.csv")
    result_df = pd.read_csv(file_path)
    print(result_df)
    # Iterate over EIDs in the DataFrame
    for eid in result_df.eid:
        # Retrieve references using AbstractRetrieval
        ref_query = AbstractRetrieval(eid, id_type="eid", view="REF")
        
        # Convert references to DataFrame and add the source EID
        ref_df = pd.DataFrame(ref_query.references)
        if not ref_df.empty:
            ref_df['source_eid'] = eid
            citation_data.append(ref_df)

    # Combine all collected DataFrames for the current journal
    if citation_data:
        combined_df = pd.concat(citation_data, ignore_index=True)
        output_path = os.path.join(base_dir, f"{for_name_general_philo_of_science[i]}_references.csv")
        combined_df.to_csv(output_path, index=False)

        # Clear the list for the next journal
        citation_data.clear()


       Unnamed: 0                 eid                          doi  \
0               0  2-s2.0-85214385293   10.1007/s13194-024-00631-3   
1               1  2-s2.0-85213719065   10.1007/s13194-024-00630-4   
2               2  2-s2.0-85212689131   10.1007/s13194-024-00627-z   
3               3  2-s2.0-85212070800   10.1007/s13194-024-00628-y   
4               4  2-s2.0-85214468909  10.1016/j.shpsa.2024.12.013   
...           ...                 ...                          ...   
13161       13161  2-s2.0-73149086369         10.1093/bjps/I.2.134   
13162       13162   2-s2.0-0009044701         10.1093/bjps/I.2.117   
13163       13163  2-s2.0-84958115445               10.1086/287041   
13164       13164  2-s2.0-78651025270               10.1086/287035   
13165       13165  2-s2.0-66249134722               10.1086/287042   

                     pii   pubmed_id  \
0                    NaN         NaN   
1                    NaN         NaN   
2                    NaN         NaN   

Scopus429Error: Quota Exceeded

In [7]:
from pybliometrics.scopus import AbstractRetrieval
import pandas as pd

citation_list = pd.DataFrame()  # Initialize an empty DataFrame to store results
for i in range(len(general_philo_of_science)):
    pybliometrics.scopus.init()
    result_df = pd.read_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + ".csv")
    for j in range(len(result_df.eid)):  
        ref_query = AbstractRetrieval(result_df.eid[j], id_type="eid", view="REF")
        
        # Convert references to DataFrame and add the EID column
        ref_df = pd.DataFrame(ref_query.references)
        ref_df['source_eid'] = result_df.eid[j]  # Add the source EID column
        
        # Concatenate the current DataFrame to the citation list
        citation_list = pd.concat([citation_list, ref_df], ignore_index=True)
        citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + "_references.csv")

# Display the resulting citation list
print(citation_list)

  result_df = pd.read_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + ".csv")
  citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + "_references.csv")


AttributeError: 'DataFrame' object has no attribute 'eid'