In [None]:
import requests
import time
import json

# --- Request configuration -------------------------------------------------
BASE_URL = ""  # URL passed to requests.get(); blank in this copy of the notebook
API_KEY = ""   # sent as the X-ELS-APIKey header; blank in this copy of the notebook

# Years for which a "<year>_ExistDoi.txt" file of already-collected DOIs is
# expected; papers with those DOIs are dropped by filter_papers_with_doi().
EXIST_DATA_YEAR = list(range(2018, 2024))

# --- Parameters of the single-run cell -------------------------------------
START_YEAR = 2018
END_YEAR = START_YEAR          # single-year window
NUM_PAPERS = 25                # number of papers to collect
RESULTS_PER_REQUEST = 25       # upper bound on the page size of one request
OPTION = 3                     # index into the query list in fetch_scopus_data()

# DOIs already on file for START_YEAR, one per line, surrounding whitespace removed.
doi_list = []
if START_YEAR in EXIST_DATA_YEAR:
    with open(f"{START_YEAR}_ExistDoi.txt", 'r') as doi_file:
        doi_list = [line.strip() for line in doi_file]

In [None]:
def filter_papers_with_doi(papers, year):
    """
    Keep only the papers that carry a DOI and are not already on file.

    Args:
        papers: Iterable of entry dicts; the DOI is read from the 'prism:doi' key.
        year: Publication year being processed. When it is listed in
            EXIST_DATA_YEAR, entries whose DOI appears in the module-level
            doi_list are dropped in addition to entries without a DOI.

    Returns:
        A new list with the surviving entries, in their original order.
    """
    if year not in EXIST_DATA_YEAR:
        return [paper for paper in papers if 'prism:doi' in paper]

    # Build the lookup once per call: a set gives O(1) membership tests, while
    # scanning the list for every paper costs O(len(doi_list)) each time.
    known_dois = set(doi_list)
    return [
        paper for paper in papers
        if 'prism:doi' in paper and paper['prism:doi'] not in known_dois
    ]

In [19]:
def fetch_scopus_data(api_key, start_year, end_year, count, optional_query, timeout=30):
    """
    Page through the search endpoint at BASE_URL until `count` usable papers are collected.

    A paper is "usable" when filter_papers_with_doi() keeps it for `start_year`.

    Args:
        api_key: Value sent in the X-ELS-APIKey request header.
        start_year: First publication year of the window (inclusive).
        end_year: Last publication year of the window (inclusive).
        count: Number of usable papers wanted.
        optional_query: Index selecting one of the four query variants below
            (DOCTYPE(cp) / SRCTYPE(p) combinations).
        timeout: Seconds passed to requests.get() so a stalled connection
            cannot block forever.

    Returns:
        A list of at most `count` entry dicts. The list is shorter when the
        endpoint runs out of entries or a request fails; in both cases a
        message is printed and whatever was collected so far is returned.
    """
    headers = {
        "X-ELS-APIKey": api_key,
    }

    year_range = f"PUBYEAR > {start_year - 1} AND PUBYEAR < {end_year + 1}"
    query = [
        f"{year_range} AND DOCTYPE(cp) AND SRCTYPE(p)",
        f"{year_range} AND DOCTYPE(cp) AND NOT SRCTYPE(p)",
        f"{year_range} AND SRCTYPE(p) AND NOT DOCTYPE(cp)",
        f"{year_range} AND NOT (DOCTYPE(cp) AND SRCTYPE(p))",
    ]

    # Fields requested for every entry, sent as one comma-separated string.
    fields = ",".join([
        "prism:url", "dc:identifier", "eid", "dc:title",
        "prism:aggregationType", "subtype", "subtypeDescription",
        "citedby-count", "prism:publicationName", "prism:isbn", "prism:issn",
        "prism:volume", "prism:issueIdentifier", "prism:pageRange",
        "prism:coverDate", "prism:coverDisplayDate", "prism:doi",
        "affiliation", "dc:creator", "openaccess", "openaccessFlag",
    ])

    papers_with_doi = []
    start = 0

    while len(papers_with_doi) < count:
        # Size the page by how many *usable* papers are still missing. Using the
        # raw number of fetched entries here drives the requested count to zero
        # or below as soon as any entry has been filtered out.
        fetch_count = min(RESULTS_PER_REQUEST, count - len(papers_with_doi))
        params = {
            "query": query[optional_query],
            "start": start,
            "count": fetch_count,
            "field": fields,
        }

        try:
            response = requests.get(BASE_URL, headers=headers, params=params, timeout=timeout)
            response.raise_for_status()
            data = response.json()

            search_results = data.get("search-results", {})

            # Debug: report the size of the result set once, on the first page
            if start == 0:
                total_results = int(search_results.get("opensearch:totalResults", 0))
                print(f"Total results available: {total_results}")

            entries = search_results.get("entry", [])
            if not entries:
                print(f"No more entries at start index {start}. Exiting.")
                break

            print(f"Fetched {len(entries)} papers starting from index {start}")

            # Filter only the new page; earlier pages were already filtered, so
            # re-filtering everything on each iteration would be quadratic.
            papers_with_doi.extend(filter_papers_with_doi(entries, start_year))

            # Advance past everything received, whether it was kept or not
            start += len(entries)
            time.sleep(1)  # Respect rate limits

        except requests.exceptions.RequestException as e:
            print(f"Error fetching data at start index {start}: {e}")
            break

    return papers_with_doi[:count]

In [7]:
def save_to_json(data, filename):
    """
    Write `data` to `filename` as UTF-8 JSON with a 4-space indent.

    Non-ASCII characters are written as-is instead of being escaped. Any
    failure is reported on stdout rather than raised; nothing is returned.
    """
    try:
        with open(filename, mode="w", encoding="utf-8") as out_file:
            json.dump(data, out_file, indent=4, ensure_ascii=False)
        print(f"Data successfully saved to {filename}")
    except Exception as err:
        print(f"Error saving data: {err}")

In [23]:
# Years handled by the batch cell: 2013-2017 and 2019-2023 (2018 is not listed).
YEAR = [*range(2013, 2018), *range(2019, 2024)]

# Query-option indices to fetch for each year.
MAP_YEAR = {
    **{year: [0, 1, 2, 3] for year in range(2013, 2018)},
    **{year: [0, 1, 2] for year in range(2019, 2024)},
}

# Number of papers to collect per year, indexed by query option.
MAP_YEAR_NUM = {
    **{year: [1250] * 4 for year in range(2013, 2018)},
    2019: [911, 1188, 1147],
    2020: [1032, 1168, 1145],
    2021: [1085, 1224, 1248],
    2022: [1073, 1227, 1239],
    2023: [1143, 1219, 1246],
}

In [22]:
if __name__ == "__main__":
    # Single run: fetch NUM_PAPERS papers for START_YEAR..END_YEAR with query OPTION.
    research_papers = fetch_scopus_data(API_KEY, START_YEAR, END_YEAR, NUM_PAPERS, OPTION)
    print(f"Total papers fetched: {len(research_papers)}")

    # Output file for each query option; an option without an entry saves nothing.
    output_files = {
        0: f"{START_YEAR}_conference_paper_proceeding.json",
        1: f"{START_YEAR}_conference_paper_not_proceeding.json",
        2: f"{START_YEAR}_not_conference_paper_proceeding.json",
        3: f"{START_YEAR}_not_conference_paper_not_proceeding.json",
    }

    if not research_papers:
        # An empty result is treated as a failed fetch.
        print("Failed to fetch research papers.")
    else:
        print(f"Fetched {len(research_papers)} papers successfully.")

        target = output_files.get(OPTION)
        if target is not None:
            save_to_json(research_papers, target)

Total results available: 2893725
Fetched 25 papers starting from index 0
Fetched 25 papers starting from index 25
Fetched 25 papers starting from index 50
Fetched 25 papers starting from index 75
Total papers fetched: 25
Fetched 25 papers successfully.
Data successfully saved to 2018_not_conference_paper_not_proceeding.json


In [24]:
# Batch run: for every year in YEAR, fetch each configured query option and
# save the result to its own JSON file.
for year in YEAR:
    # filter_papers_with_doi() excludes DOIs found in the module-level doi_list,
    # but the setup cell only loads that list for START_YEAR. Reload it for the
    # year actually being fetched, using the setup cell's "<year>_ExistDoi.txt"
    # naming, so that this year's already-collected papers are the ones skipped.
    # NOTE: doi_list is left holding the last processed year's DOIs afterwards.
    if year in EXIST_DATA_YEAR:
        try:
            with open(f"{year}_ExistDoi.txt", 'r') as doi_file:
                doi_list = [line.strip() for line in doi_file]
        except FileNotFoundError:
            print(f"No existing DOI file found for {year}; no DOIs will be excluded.")
            doi_list = []

    # Output file for each query option; an option without an entry saves nothing.
    output_files = {
        0: f"{year}_conference_paper_proceeding.json",
        1: f"{year}_conference_paper_not_proceeding.json",
        2: f"{year}_not_conference_paper_proceeding.json",
        3: f"{year}_not_conference_paper_not_proceeding.json",
    }

    for option in MAP_YEAR[year]:
        print(f"Fetching data for year {year} with option {option}")

        # Per-year, per-option target taken from the MAP_YEAR_NUM table
        num_papers = MAP_YEAR_NUM[year][option]
        research_papers = fetch_scopus_data(API_KEY, year, year, num_papers, option)
        print(f"Total papers fetched: {len(research_papers)}")

        # An empty result is treated as a failed fetch.
        if not research_papers:
            print("Failed to fetch research papers.")
            continue

        print(f"Fetched {len(research_papers)} papers successfully.")

        target = output_files.get(option)
        if target is not None:
            save_to_json(research_papers, target)

Fetching data for year 2013 with option 0
Total results available: 358534
Fetched 25 papers starting from index 0
Fetched 25 papers starting from index 25
Fetched 25 papers starting from index 50
Fetched 25 papers starting from index 75
Fetched 25 papers starting from index 100
Fetched 25 papers starting from index 125
Fetched 25 papers starting from index 150
Fetched 25 papers starting from index 175
Fetched 25 papers starting from index 200
Fetched 25 papers starting from index 225
Fetched 25 papers starting from index 250
Fetched 25 papers starting from index 275
Fetched 25 papers starting from index 300
Fetched 25 papers starting from index 325
Fetched 25 papers starting from index 350
Fetched 25 papers starting from index 375
Fetched 25 papers starting from index 400
Fetched 25 papers starting from index 425
Fetched 25 papers starting from index 450
Fetched 25 papers starting from index 475
Fetched 25 papers starting from index 500
Fetched 25 papers starting from index 525
Fetched