# How to work your way with Scopus API using Python 

First, you need to install the package *pybliometrics* (pip install pybliometrics ). Once this is done, import some packages that will be useful as well as your data path.  

In [25]:
import pybliometrics
from pybliometrics.scopus import ScopusSearch
import pandas as pd


PATH_TO_DATA = r'"C:\Users\jacob\OneDrive - Université Laval\biophilo\Data\"'
PATH_TO_DATA

'"C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\\Data\\"'

Next, you need to initialize your file. That's were you will put your API token and your Insttoken. 

In [26]:
pybliometrics.scopus.init()

# Queries type

There is multiple types of files you might want to get. For the articles, they are about :
1. Authors informations ; 
2. Affiliation informations ; 
3. General information about the paper (title, published year, etc.)

For the following, we will use an example with the journal "Philosophy of Science". 

## Articles informations : example with European Journal for Philosophy of Science

This first part is an exemple so you get how it is done. After, we will automate the workflow to get articles from various journals. 

In [17]:
query = "EXACTSRCTITLE("r'\"European Journal for Philosophy of Science\"'")"
print(query)
s = ScopusSearch(query, verbose=True, subscriber = True, view = "COMPLETE")

EXACTSRCTITLE(\"European Journal for Philosophy of Science\")
Downloading results for query "EXACTSRCTITLE(\"European Journal for Philosophy of Science\")":


100%|██████████| 25/25 [00:17<00:00,  1.39it/s]


In [18]:
result = s.results
result_df = pd.DataFrame(result)

In [19]:
result_df
result_df.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\EUROPEAN_JOURNAL_FOR_PHILOSOPHY_OF_SCIENCE.csv")

  result_df.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\EUROPEAN_JOURNAL_FOR_PHILOSOPHY_OF_SCIENCE.csv")


## References informations : example with European Journal for Philosophy of Science

In [16]:

for eid in result_df.eid:  
        ref_query = s = AbstractRetrieval(eid, id_type="eid", view="FULL")
        
        # Convert references to DataFrame and add the EID column
        ref_df = pd.DataFrame(ref_query.references)
        ref_df['citing_eid'] = result_df.eid  # Add the source EID column
        
        # Concatenate the current DataFrame to the citation list
        citation_list = pd.concat([citation_list, ref_df], ignore_index=True)
        citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\EUROPEAN_JOURNAL_OF_PHILOSOPHY_refs.csv")

  citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\EUROPEAN_JOURNAL_OF_PHILOSOPHY_refs.csv")
  citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\EUROPEAN_JOURNAL_OF_PHILOSOPHY_refs.csv")


KeyboardInterrupt: 

## Function for multiple queries

First, define your query input. Here we will go with journal name.

In [107]:
philo_of_bio_journals = [
#  r"\"BIOLOGY & PHILOSOPHY\"", 
#  r"\"BIOLOGY AND PHILOSOPHY\"",
#  r"\"BIOLOGICAL THEORY\"",
#  r"\"STUDIES IN HISTORY AND PHILOSOPHY OF SCIENCE PART C :STUDIES IN HISTORY AND PHILOSOPHY OF BIOLOGICAL AND BIOMEDICAL SCIENCES\"",
#  r"\"HISTORY AND PHILOSOPHY OF THE LIFE SCIENCES\""
]



'"BIOLOGY & PHILOSOPHY"'

In [110]:
for_name_philo_of_bio_journals = [
"BIOLOGY_&_PHILOSOPHY",
"BIOLOGY_AND_PHILOSOPHY",
"BIOLOGICAL_THEORY",
"STUDIES_IN_HISTORY_AND_PHILOSOPHY_OF_SCIENCE_PART_C__STUDIES_IN_HISTORY_AND_PHILOSOPHY_OF_BIOLOGICAL_AND_BIOMEDICAL_SCIENCES",
"HISTORY_AND_PHILOSOPHY_OF_THE_LIFE_SCIENCES"]

for_name_philo_of_bio_journals

['BIOLOGY_&_PHILOSOPHY',
 'BIOLOGY_AND_PHILOSOPHY',
 'BIOLOGICAL_THEORY',
 'STUDIES_IN_HISTORY_AND_PHILOSOPHY_OF_SCIENCE_PART_C__STUDIES_IN_HISTORY_AND_PHILOSOPHY_OF_BIOLOGICAL_AND_BIOMEDICAL_SCIENCES',
 'HISTORY_AND_PHILOSOPHY_OF_THE_LIFE_SCIENCES']

In [124]:
pybliometrics.scopus.init()

from pybliometrics.scopus import ScopusSearch
article_list = pd.DataFrame()  # Initialize an empty DataFrame to store results

for i in range(len(philo_of_bio_journals)):  
    pybliometrics.scopus.init()
    query = "EXACTSRCTITLE(" + philo_of_bio_journals[i] + ")" 
    print(query)
    s = ScopusSearch(query, verbose=True, subscriber = True, view = "COMPLETE")
    result = s.results
    result_df = pd.DataFrame(result)
    result_df.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_philo_of_bio_journals[i] + ".csv")
    


  result_df.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_philo_of_bio_journals[i] + ".csv")


EXACTSRCTITLE("BIOLOGY & PHILOSOPHY")
EXACTSRCTITLE(\"BIOLOGY AND PHILOSOPHY\")
EXACTSRCTITLE(\"BIOLOGICAL THEORY\")
Downloading results for query "EXACTSRCTITLE(\"BIOLOGICAL THEORY\")":


100%|██████████| 29/29 [00:21<00:00,  1.29it/s]


EXACTSRCTITLE(\"STUDIES IN HISTORY AND PHILOSOPHY OF SCIENCE PART C :STUDIES IN HISTORY AND PHILOSOPHY OF BIOLOGICAL AND BIOMEDICAL SCIENCES\")
Downloading results for query "EXACTSRCTITLE(\"STUDIES IN HISTORY AND PHILOSOPHY OF SCIENCE PART C :STUDIES IN HISTORY AND PHILOSOPHY OF BIOLOGICAL AND BIOMEDICAL SCIENCES\")":


100%|██████████| 42/42 [00:28<00:00,  1.44it/s]


EXACTSRCTITLE(\"HISTORY AND PHILOSOPHY OF THE LIFE SCIENCES\")
Downloading results for query "EXACTSRCTITLE(\"HISTORY AND PHILOSOPHY OF THE LIFE SCIENCES\")":


100%|██████████| 42/42 [00:30<00:00,  1.35it/s]


# References informations

In [163]:
from pybliometrics.scopus import AbstractRetrieval
import pandas as pd

citation_list = pd.DataFrame()  # Initialize an empty DataFrame to store results
for i in range(len(philo_of_bio_journals)):
    pybliometrics.scopus.init()
    result_df = pd.read_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_philo_of_bio_journals[i] + ".csv")
    for j in range(len(result_df.eid)):  
        ref_query = AbstractRetrieval(result_df.eid[j], id_type="eid", view="REF")
        
        # Convert references to DataFrame and add the EID column
        ref_df = pd.DataFrame(ref_query.references)
        ref_df['source_eid'] = result_df.eid[j]  # Add the source EID column
        
        # Concatenate the current DataFrame to the citation list
        citation_list = pd.concat([citation_list, ref_df], ignore_index=True)
        citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_philo_of_bio_journals[i] + "_references.csv")

# Display the resulting citation list
print(citation_list)

  result_df = pd.read_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_philo_of_bio_journals[i] + ".csv")
  citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_philo_of_bio_journals[i] + "_references.csv")


                source_eid position           id  \
0        2-s2.0-0010726742        1  85050417424   
1        2-s2.0-0010726742        2  77949540109   
2        2-s2.0-0010726742        3   0038882957   
3        2-s2.0-0010726742        4   2342643968   
4        2-s2.0-0010726742        5   0003397032   
...                    ...      ...          ...   
205086  2-s2.0-33747626043       50   4243582634   
205087  2-s2.0-33747626043       51   8844257136   
205088  2-s2.0-33747626043       52  79957200316   
205089  2-s2.0-33747626043       53   0003771448   
205090  2-s2.0-33747626043       54   0141918296   

                                 doi  \
0                               None   
1                               None   
2       10.1016/0039-3681(81)90015-7   
3                               None   
4                               None   
...                              ...   
205086                          None   
205087                          None   
205088         

# General Philosophy of Science

In [22]:
general_philo_of_science =[
# GENERAL PHILOSOPHY OF SCIENCE JOURNALS 
  #r"\"PHILOSOPHY_OF_SCIENCE\"",
  #r"\"BRITISH_JOURNAL_FOR_THE_PHILOSOPHY_OF_SCIENCE\"", 
  #r"\"SYNTHESE\"", 
  #r"\"ERKENNTNIS\"", 
  #r"\"EUROPEAN JOURNAL FOR THE PHILOSOPHY OF SCIENCE\"", 
  #r"\"INTERNATIONAL_STUDIES_IN_THE_PHILOSOPHY_OF_SCIENCE\"", 
  #r"\"JOURNAL_FOR_GENERAL_PHILOSOPHY_OF_SCIENCE\"", 
  #r"\"FOUNDATIONS_OF_SCIENCE\"",

# SPECIALIZED PHILOSOPHY OF BIOLOGY JOURNALS
  #r"\"BIOLOGY AND PHILOSOPHY\"",
  #r"\"BIOLOGICAL THEORY\"",
  #r"\"STUDIES IN HISTORY AND PHILOSOPHY OF SCIENCE PART C\"",
  #r"\"HISTORY AND PHILOSOPHY OF THE LIFE SCIENCES\"",
  r"\"ACTA BIOTHEORETICA\"",
  r"\"BEHAVIORAL AND BRAIN SCIENCES\"",
  r"\"BIOESSAYS\"",
  r"\"BIOSEMIOTICS\"",
]

In [23]:
for_name_general_philo_of_science =[
    
# GENERAL PHILOSOPHY OF SCIENCE JOURNALS    
  #"PHILOSOPHY_OF_SCIENCE",
  #"BRITISH_JOURNAL_FOR_THE_PHILOSOPHY_OF_SCIENCE", 
  #"SYNTHESE", 
  #"ERKENNTNIS", 
  #"EUROPEAN_JOURNAL_FOR_THE_PHILOSOPHY_OF_SCIENCE", 
  #"INTERNATIONAL_STUDIES_IN_THE_PHILOSOPHY_OF_SCIENCE", 
  #"JOURNAL_FOR_GENERAL_PHILOSOPHY_OF_SCIENCE", 
  #"FOUNDATIONS_OF_SCIENCE"

# SPECIALIZED PHILOSOPHY OF BIOLOGY JOURNALS
  #"BIOLOGY AND PHILOSOPHY",
  #"BIOLOGICAL THEORY",
  #"STUDIES IN HISTORY AND PHILOSOPHY OF SCIENCE PART C",
  #"HISTORY AND PHILOSOPHY OF THE LIFE SCIENCES"
  "ACTA_BIOTHEORETICA",
  "BEHAVIORAL_AND_BRAIN_SCIENCES",
  "BIOESSAYS",
  "BIOSEMIOTICS",
]


In [24]:
pybliometrics.scopus.init()

from pybliometrics.scopus import ScopusSearch
article_list = pd.DataFrame()  # Initialize an empty DataFrame to store results

for i in range(len(general_philo_of_science)):  
    pybliometrics.scopus.init()
    query = "EXACTSRCTITLE(" + general_philo_of_science[i] + ")" 
    print(query)
    s = ScopusSearch(query, verbose=True, subscriber = True, view = "COMPLETE")
    result = s.results
    result_df = pd.DataFrame(result)
    result_df.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + ".csv")

  result_df.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + ".csv")


EXACTSRCTITLE(\"ACTA BIOTHEORETICA\")
Downloading results for query "EXACTSRCTITLE(\"ACTA BIOTHEORETICA\")":


100%|██████████| 53/53 [00:36<00:00,  1.41it/s]


EXACTSRCTITLE(\"BEHAVIORAL AND BRAIN SCIENCES\")
Downloading results for query "EXACTSRCTITLE(\"BEHAVIORAL AND BRAIN SCIENCES\")":


100%|██████████| 576/576 [07:02<00:00,  1.36it/s]


EXACTSRCTITLE(\"BIOESSAYS\")
Downloading results for query "EXACTSRCTITLE(\"BIOESSAYS\")":


100%|██████████| 243/243 [03:28<00:00,  1.16it/s]


EXACTSRCTITLE(\"BIOSEMIOTICS\")
Downloading results for query "EXACTSRCTITLE(\"BIOSEMIOTICS\")":


100%|██████████| 22/22 [00:15<00:00,  1.40it/s]


In [6]:
from pybliometrics.scopus import AbstractRetrieval
import pandas as pd

citation_list = pd.DataFrame()  # Initialize an empty DataFrame to store results
for i in range(len(philo_of_bio_journals)):
    pybliometrics.scopus.init()
    result_df = pd.read_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + ".csv")
    for j in range(len(result_df.eid)):  
        ref_query = AbstractRetrieval(result_df.eid[j], id_type="eid", view="REF")
        
        # Convert references to DataFrame and add the EID column
        ref_df = pd.DataFrame(ref_query.references)
        ref_df['source_eid'] = result_df.eid[j]  # Add the source EID column
        
        # Concatenate the current DataFrame to the citation list
        citation_list = pd.concat([citation_list, ref_df], ignore_index=True)
        citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + "_references.csv")

# Display the resulting citation list
print(citation_list)

  result_df = pd.read_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + ".csv")
  citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + "_references.csv")
  result_df = pd.read_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + ".csv")
  citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + "_references.csv")


NameError: name 'philo_of_bio_journals' is not defined

In [18]:
import os

PATH_TO_DATA = r'C:/Users/jacob/OneDrive - Université Laval/biophilo/Data/pybiblio'

for name in for_name_biology_journals:
    file_path = os.path.join(PATH_TO_DATA, f"{name}.csv")  # Assuming CSV format

    if os.path.exists(file_path):  # Check if file exists
        globals()[name] = pd.read_csv(file_path)  # Assign to a variable with the file name
    else:
        print(f"Warning: {file_path} not found!")

In [None]:
import os
import pandas as pd
from pybliometrics.scopus import AbstractRetrieval

# Initialize Pybliometrics once
pybliometrics.scopus.init()

# Define base directory
base_dir = "C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\\Data\\"

# Initialize an empty list to collect DataFrames
citation_data = []

# Iterate over journal names
for i in range(len(bio)):
    # Read the CSV file
    file_path = os.path.join(base_dir, f"{for_name_general_philo_of_science[i]}.csv")
    result_df = pd.read_csv(file_path)
    print(result_df)
    # Iterate over EIDs in the DataFrame
    for eid in result_df.eid:
        # Retrieve references using AbstractRetrieval
        ref_query = AbstractRetrieval(eid, id_type="eid", view="REF")
        
        # Convert references to DataFrame and add the source EID
        ref_df = pd.DataFrame(ref_query.references)
        if not ref_df.empty:
            ref_df['source_eid'] = eid
            citation_data.append(ref_df)

    # Combine all collected DataFrames for the current journal
    if citation_data:
        combined_df = pd.concat(citation_data, ignore_index=True)
        output_path = os.path.join(base_dir, f"{for_name_general_philo_of_science[i]}_references.csv")
        combined_df.to_csv(output_path, index=False)

        # Clear the list for the next journal
        citation_data.clear()


NameError: name 'general_philo_of_science' is not defined

In [7]:
from pybliometrics.scopus import AbstractRetrieval
import pandas as pd

citation_list = pd.DataFrame()  # Initialize an empty DataFrame to store results
for i in range(len(general_philo_of_science)):
    pybliometrics.scopus.init()
    result_df = pd.read_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + ".csv")
    for j in range(len(result_df.eid)):  
        ref_query = AbstractRetrieval(result_df.eid[j], id_type="eid", view="REF")
        
        # Convert references to DataFrame and add the EID column
        ref_df = pd.DataFrame(ref_query.references)
        ref_df['source_eid'] = result_df.eid[j]  # Add the source EID column
        
        # Concatenate the current DataFrame to the citation list
        citation_list = pd.concat([citation_list, ref_df], ignore_index=True)
        citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + "_references.csv")

# Display the resulting citation list
print(citation_list)

  result_df = pd.read_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + ".csv")
  citation_list.to_csv("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\Data\\"+ for_name_general_philo_of_science[i] + "_references.csv")


AttributeError: 'DataFrame' object has no attribute 'eid'

In [5]:
import pybliometrics
from pybliometrics.scopus import SubjectClassifications
pybliometrics.scopus.init()
sub = SubjectClassifications({'description': 'Biology'})


In [6]:

sub.results

[Subject(code='1300', description='Biochemistry, Genetics and Molecular Biology', detail='Biochemistry, Genetics and Molecular Biology (all)', abbrev='BIOC'),
 Subject(code='1301', description='Biochemistry, Genetics and Molecular Biology', detail='Biochemistry, Genetics and Molecular Biology (miscellaneous)', abbrev='BIOC'),
 Subject(code='1302', description='Biochemistry, Genetics and Molecular Biology', detail='Aging', abbrev='BIOC'),
 Subject(code='1303', description='Biochemistry, Genetics and Molecular Biology', detail='Biochemistry', abbrev='BIOC'),
 Subject(code='1304', description='Biochemistry, Genetics and Molecular Biology', detail='Biophysics', abbrev='BIOC'),
 Subject(code='1305', description='Biochemistry, Genetics and Molecular Biology', detail='Biotechnology', abbrev='BIOC'),
 Subject(code='1306', description='Biochemistry, Genetics and Molecular Biology', detail='Cancer Research', abbrev='BIOC'),
 Subject(code='1307', description='Biochemistry, Genetics and Molecular 

In [57]:
import pandas as pd
from pybliometrics.scopus import SerialTitle

final_df = pd.DataFrame()

for ISSN in cited_journals['ISSN'].dropna().unique():  # Ensure ISSN values are valid
    try:
        # Retrieve source title safely
        source_title = cited_journals.loc[cited_journals['ISSN'] == ISSN, 'sourcetitle']
        if not source_title.empty:
            source_title = source_title.iloc[0]  # Get first matching value
        else:
            source_title = None  # Handle missing source titles
        
        # Fetch citation metrics
        metrics = SerialTitle(ISSN, year=2023, view='CITESCORE')
        
        # Convert citescoreyearinfolist to DataFrame
        df = pd.DataFrame(metrics.citescoreyearinfolist)  
        
        # Ensure 'citescore' column exists before calculating statistics
        if 'citescore' in df.columns:
            mean_citescore = df['citescore'].mean()
            median_citescore = df['citescore'].median()
        else:
            mean_citescore = None
            median_citescore = None
        
        # Create a DataFrame with ISSN, mean, and median citescore
        info = pd.DataFrame({
            'sourcetitle': [source_title],  # Store single value as a list
            'ISSN': [ISSN],
            'median_citescore': [median_citescore],
            'mean_citescore': [mean_citescore]
        })

        # Concatenate the DataFrame
        final_df = pd.concat([final_df, info], ignore_index=True)
    
    except Exception as e:
        print(f"Error retrieving data for ISSN {ISSN}: {e}")

print(final_df)



Error retrieving data for ISSN 1693867: The resource specified cannot be found.


  final_df = pd.concat([final_df, info], ignore_index=True)
  final_df = pd.concat([final_df, info], ignore_index=True)


KeyboardInterrupt: 

In [56]:
import pybliometrics
import pandas as pd
from pybliometrics.scopus import SerialTitle
pybliometrics.scopus.init()
source = SerialTitle("00368075")

source_full = SerialTitle("01693867", view="CITESCORE")
info = pd.DataFrame(source_full.citescoreyearinfolist)
#info.citescore.mean()
#info.citescore.median()
info

Unnamed: 0,year,citescore,status,documentcount,citationcount,percentcited,rank
0,2024,3.9,In-Progress,194,754,66,"[(1211, 37, 95), (1207, 18, 92), (1100, 68, 70)]"
1,2023,4.1,Complete,214,870,68,"[(1211, 38, 95), (1207, 14, 93), (1100, 59, 73)]"
2,2022,3.2,Complete,228,740,67,"[(1211, 43, 94), (1207, 17, 92), (1100, 63, 70)]"
3,2021,2.7,Complete,212,571,67,"[(1211, 39, 94), (1207, 20, 89), (1100, 62, 70)]"
4,2020,2.5,Complete,221,542,63,"[(1211, 34, 94), (1207, 18, 89), (1100, 59, 72)]"
5,2019,2.6,Complete,208,537,63,"[(1211, 26, 95), (1207, 16, 89), (1100, 50, 75)]"
6,2018,2.7,Complete,194,525,62,"[(1211, 28, 95), (1207, 13, 91), (1100, 44, 77)]"
7,2017,2.8,Complete,177,499,73,"[(1211, 20, 96), (1207, 11, 91), (1100, 42, 77)]"
8,2016,2.8,Complete,194,549,69,"[(1211, 15, 96), (1207, 10, 92), (1100, 38, 78)]"
9,2015,2.5,Complete,193,480,73,"[(1211, 21, 95), (1207, 16, 87), (1100, 41, 75)]"


In [8]:
biology_journals = [
    r"\"MICROBIOLOGY AND MOLECULAR BIOLOGY REVIEWS\"",
    r"\"TRENDS IN BIOCHEMICAL SCIENCES\"",
]


for_name_biology_journals = [
    "MICROBIOLOGY_AND_MOLECULAR_BIOLOGY_REVIEWS",
    "TRENDS_IN_BIOCHEMICAL_SCIENCES"
]

In [16]:
import concurrent.futures
import os
from pybliometrics.scopus import ScopusSearch
import pandas as pd

# Initialize the Scopus API
pybliometrics.scopus.init()

def query_scopus_for_journal(title, filename):
    try:
        query = "EXACTSRCTITLE(" + title + ")" 
        print(f"Querying: {query}", flush=True)
        s = ScopusSearch(query, verbose=True, subscriber=True, view="COMPLETE")
        result = s.results
        result_df = pd.DataFrame(result)

        # Define output path and ensure directory exists
        output_path = os.path.join("C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\\Data", filename + ".csv")
        result_df.to_csv(output_path, index=False)
        print(f"Saved results to {output_path}", flush=True)
    except Exception as e:
        print(f"Error querying {title}: {e}", flush=True)

# Create a list of tasks to process concurrently
tasks = [(biology_journals[i], for_name_biology_journals[i]) for i in range(len(biology_journals))]

# Use ThreadPoolExecutor to parallelize the Scopus queries
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Pass the tasks to executor.map, and log the progress
    list(executor.map(lambda task: query_scopus_for_journal(*task), tasks))

    

Querying: EXACTSRCTITLE(\"MICROBIOLOGY AND MOLECULAR BIOLOGY REVIEWS\")
Querying: EXACTSRCTITLE(\"TRENDS IN BIOCHEMICAL SCIENCES\")
Downloading results for query "EXACTSRCTITLE(\"TRENDS IN BIOCHEMICAL SCIENCES\")":


  0%|          | 1/270 [00:00<?, ?it/s]

Downloading results for query "EXACTSRCTITLE(\"MICROBIOLOGY AND MOLECULAR BIOLOGY REVIEWS\")":


100%|██████████| 34/34 [00:32<00:00,  1.01it/s]]

Saved results to C:\Users\jacob\OneDrive - Université Laval\biophilo\Data\MICROBIOLOGY_AND_MOLECULAR_BIOLOGY_REVIEWS.csv



100%|██████████| 270/270 [03:04<00:00,  1.46it/s]


Saved results to C:\Users\jacob\OneDrive - Université Laval\biophilo\Data\TRENDS_IN_BIOCHEMICAL_SCIENCES.csv


KeyboardInterrupt: 

In [20]:
import os

PATH_TO_DATA = r'C:/Users/jacob/OneDrive - Université Laval/biophilo/Data/pybiblio'

for name in for_name_biology_journals:
    file_path = os.path.join(PATH_TO_DATA, f"{name}.csv")  # Assuming CSV format

    if os.path.exists(file_path):  # Check if file exists
        globals()[name] = pd.read_csv(file_path)  # Assign to a variable with the file name
    else:
        print(f"Warning: {file_path} not found!")

In [21]:
import pybliometrics
import concurrent.futures
import pandas as pd
import contextlib
import io
from tqdm import tqdm

# Initialize the list to store reference data
references_list = []

def parse_abstract(eid):
    with contextlib.redirect_stdout(io.StringIO()):  # Suppresses print output
        try:
            print(f"Processing EID: {eid}")
            s = AbstractRetrieval(eid, id_type="eid", view="FULL")

            if s.references:  # Ensure references exist
                df = pd.DataFrame(s.references)
                df['citing_eid'] = eid
                references_list.append(df)  # Collect results
            else:
                print(f"No references found for {eid}")

        except Exception as e:
            print(f"Error processing {eid}: {e}")

# Initialize tqdm for pandas
tqdm.pandas()

# Start pybliometrics
pybliometrics.scopus.init()

def process_journal(name):
    if name in globals():  # Ensure the DataFrame exists
        print(f"Processing journal: {name}")

        dfs = globals()[name]  # Fetch the actual DataFrame

        # Use ThreadPoolExecutor to parallelize the reference extraction process
        with concurrent.futures.ThreadPoolExecutor() as executor:
            list(tqdm(executor.map(parse_abstract, dfs['eid']), total=len(dfs)))

        # Ensure references_list has data before concatenating
        if references_list:
            references_df = pd.concat(references_list, ignore_index=True)
        else:
            references_df = pd.DataFrame(columns=['eid', 'citing_eid'])  # Placeholder for empty case

        # Save only the extracted references, not the original articles
        output_path = f"C:\\Users\\jacob\\OneDrive - Université Laval\\biophilo\\Data\\pybiblio\\{name}_refs_pyblio.csv"
        references_df.to_csv(output_path, index=False)

        print(f"Saved references: {output_path}")

        # Clear the list for the next journal to avoid mixing references
        references_list.clear()
    else:
        print(f"Warning: No DataFrame found for {name}")

# Run the processing for each journal in parallel
for name in for_name_biology_journals:
    process_journal(name)


  0%|          | 0/830 [00:00<?, ?it/s]

Processing journal: MICROBIOLOGY_AND_MOLECULAR_BIOLOGY_REVIEWS


100%|██████████| 830/830 [00:39<00:00, 21.20it/s]
100%|██████████| 6736/6736 [05:24<00:00, 20.79it/s]
