Workflow Breakdown

Index Range Selection

Define a range of indices (for example, start_index = 55000 to end_index = 60000; end_index itself is excluded) to process a subset of your dataset.
This approach allows batch processing, which is useful for managing large datasets and avoiding overloading the server.
Data Extraction Loop

For each index in the range:
Retrieve the DOI, Scopus ID, and title from your lists.
Clean the title by removing HTML-like tags (<inf>) and non-alphanumeric characters to standardize the text.
Use a function (asu_open_link_and_search) to query ASU’s library search with the DOI.
Parse the output by splitting on the "|||" delimiter to separate the result title from the final URL; if no URL is present, the URL field is left empty.
Clean the result title similarly to the input title.
Append a formatted string containing the Scopus ID, original title, result title, and URL to a list.
Output Handling

After processing the batch, write all collected link entries to a text file named according to the index range.
The file format is plain text, with each line containing comma-separated values for easy parsing later.

In [1]:
import os
import time
import requests
import pandas as pd
import subprocess
import time
import re

In [2]:
import subprocess

def _escape_applescript_string(text):
    """Escape *text* so it can sit inside a double-quoted AppleScript literal."""
    return text.replace("\\", "\\\\").replace('"', '\\"')


def asu_open_link_and_search(url, search_query):
    """Drive Safari through a library search and report what it found.

    The generated AppleScript, run via ``osascript``:

    1. opens ``url`` in Safari and waits for the page,
    2. puts ``search_query`` into the element with id ``searchBar`` and
       clicks the magnifying-glass icon,
    3. reads the text of the first ``h3.item-title`` element,
    4. tries to reach the full text (BrowZine direct-to-PDF link, else the
       ``.availability-status.fulltext`` / ``.fulltext_linktorsrc`` element),
    5. reads the URL of the front document and closes the tab/window.

    Each call spends at least 21 seconds in fixed ``delay`` statements.

    Args:
        url: Address of the search page to open.
        search_query: Text typed into the search bar (the caller passes a DOI).

    Returns:
        The script's standard output with surrounding whitespace removed:
        ``"<result title>|||<final URL>"`` normally, only the result title
        when Safari is still on ``search.lib.asu.edu`` after the navigation
        attempt, or ``""`` when ``osascript`` printed nothing (its stderr is
        captured but not inspected).
    """
    import json  # local import: only needed to build a safe JS literal

    # Both values are embedded in script source, so they must be escaped;
    # otherwise a quote or backslash in the input breaks (or injects into)
    # the AppleScript / JavaScript. json.dumps yields an ASCII-only,
    # double-quoted JS string literal, which is then escaped once more for
    # the surrounding AppleScript string.
    safe_url = _escape_applescript_string(str(url))
    js_query = _escape_applescript_string(json.dumps(str(search_query)))

    # NOTE(review): when the final URL contains "lib.asu.edu" the script
    # closes the current tab and then also closes the front window — confirm
    # that closing both is intended.
    apple_script = f'''
    set resultTitle to ""
    tell application "Safari"
        activate
        open location "{safe_url}"
        delay 3
    end tell

    tell application "System Events"
        delay 3
    end tell

    tell application "Safari"
        do JavaScript "
            (function() {{
                var input = document.getElementById('searchBar');
                if (input) {{
                    input.value = {js_query};
                    input.dispatchEvent(new Event('input', {{ bubbles: true }}));
                    var searchButton = document.querySelector('md-icon[md-svg-icon=\\"primo-ui:magnifying-glass\\"]');
                    if (searchButton) {{
                        searchButton.click();
                    }}
                }}
            }})();
        " in document 1
    end tell

    delay 5

    -- Get the title of the first search result
    tell application "Safari"
        set resultTitle to do JavaScript "
            (function() {{
                let titleElement = document.querySelector('h3.item-title');
                if (titleElement) {{
                    return titleElement.innerText;
                }}
                return '';
            }})();
        " in document 1
    end tell

    -- Continue the rest of the navigation logic
    tell application "Safari"
        do JavaScript "
            setTimeout(function() {{
                var linkEl = document.querySelector('a.browzine-direct-to-pdf-link');
                if (linkEl && linkEl.href) {{
                    window.location = linkEl.href;
                }} else {{
                    var fullTextAvailable = document.querySelector('.availability-status.fulltext');
                    if (fullTextAvailable) {{
                        fullTextAvailable.click();
                    }} else {{
                        var fullTextLinkToSrc = document.querySelector('.availability-status.fulltext_linktorsrc');
                        if (fullTextLinkToSrc) {{
                            fullTextLinkToSrc.click();
                        }}
                    }}
                }}
            }}, 3000);
        " in document 1
    end tell


    delay 10

    tell application "Safari"
        set finalUrl to URL of front document
    end tell

    if finalUrl contains "search.lib.asu.edu" then
        tell application "Safari"
            activate
            tell front window
                close current tab
            end tell
        end tell
        return resultTitle
    end if

    if finalUrl contains "lib.asu.edu" then
        tell application "Safari"
            do JavaScript "
                var item = document.querySelector('md-list-item');
                if (item) {{
                    item.click();
                }}
            " in document 1
        end tell

        delay 5

        tell application "Safari"
            set finalUrl to URL of front document
        end tell

        tell application "Safari"
            activate
            tell front window
                close current tab
            end tell
        end tell
    end if
    
    tell application "Safari"
        activate
        close front window
    end tell

    return resultTitle & "|||" & finalUrl
    '''
    result = subprocess.run(
        ["osascript", "-e", apple_script],
        text=True,
        capture_output=True
    )
    return result.stdout.strip()

In [4]:
# Load the Scopus export workbook into a DataFrame, then display its column labels.
scopus_xlsx_path = '/Users/jjvyas1/Downloads/Energy_Database_Scopus_03_07_2025.xlsx'
df = pd.read_excel(scopus_xlsx_path)
df.columns

Index(['prism:url', 'dc:identifier', 'prism:doi', 'eid', 'dc:title',
       'dc:creator', 'prism:publicationName', 'prism:issn', 'prism:eIssn',
       'prism:volume', 'prism:issueIdentifier', 'prism:coverDate',
       'prism:coverDisplayDate', 'dc:description', 'citedby-count',
       'prism:aggregationType', 'subtypeDescription', 'author', 'authkeywords',
       'article-number', 'source-id', 'openaccessFlag', 'Affliation-name',
       'fund-sponsor', 'link ref=self', 'link ref=scopus',
       'link ref=scopus-citedby'],
      dtype='object')

In [5]:
# Pull the three columns used downstream into plain lists; all three share
# the DataFrame's row order, so position i refers to the same record in each.
titles = df['dc:title'].tolist()
dois = df['prism:doi'].tolist()
# The identifier column is reduced to the digits captured by the regex.
scopus_ids = df['dc:identifier'].str.extract(r'(\d+)')[0].tolist()
dois, scopus_ids, titles

(['10.32479/ijeep.17135',
  '10.32479/ijeep.17350',
  '10.32479/ijeep.17456',
  '10.32479/ijeep.17566',
  '10.32479/ijeep.17308',
  '10.1038/s41598-025-90483-5',
  '10.1038/s41598-025-89817-0',
  '10.1038/s41598-025-91204-8',
  '10.1038/s41598-025-87819-6',
  '10.1038/s41598-025-90559-2',
  '10.1007/s43621-025-00896-5',
  '10.1038/s41598-024-83496-z',
  '10.1038/s41467-025-57249-z',
  '10.1038/s43247-025-02094-7',
  '10.1038/s41467-025-57139-4',
  '10.1007/s43621-025-00909-3',
  '10.1038/s41467-025-57093-1',
  '10.1186/s42162-024-00464-7',
  '10.1186/s42162-025-00477-w',
  '10.1007/s43926-025-00105-9',
  '10.1038/s41467-025-56767-0',
  '10.1007/s43939-025-00187-w',
  '10.1007/s40820-024-01582-3',
  '10.1057/s41599-025-04364-3',
  '10.1038/s44183-024-00101-6',
  '10.1007/s42773-024-00396-1',
  '10.1186/s13705-024-00508-6',
  '10.1007/s44196-025-00745-3',
  '10.1007/s40518-024-00248-3',
  '10.1057/s41599-025-04439-1',
  '10.1186/s13705-024-00503-x',
  '10.1038/s41598-024-83486-1',
  '10.

In [6]:
# Tally how many records have no DOI (pandas marks them as missing).
count = sum(1 for doi in dois if pd.isna(doi))
count

24800

In [7]:
# Bounds of the batch to process; the processing loop iterates
# range(start_index, end_index), so end_index itself is not visited.
start_index, end_index = 190000, 195000

In [None]:
links = []

# Compiled once up front instead of being rebuilt on every iteration.
_INF_TAG_RE = re.compile(r'</?inf>')
_NON_ALNUM_RE = re.compile(r'[^A-Za-z0-9 ]+')


def _clean_title(text):
    """Drop <inf> tags and every character that is not an ASCII letter,
    digit, or space. A missing value (NaN/None) becomes an empty string
    instead of raising TypeError inside the regex call."""
    if pd.isna(text):
        return ""
    return _NON_ALNUM_RE.sub('', _INF_TAG_RE.sub('', str(text)))


# NOTE(review): presumably a grace period before Safari automation starts — confirm.
time.sleep(10)

# Clamp the upper bound so a batch window that extends past the data ends
# cleanly instead of raising IndexError part-way through.
for i in range(start_index, min(end_index, len(dois))):
    if i % 100 == 0:
        print(i)  # coarse progress indicator
    doi = dois[i]
    if pd.isna(doi):
        continue  # nothing to search for without a DOI
    scopus_id = scopus_ids[i]
    title = _clean_title(titles[i])
    output = asu_open_link_and_search('https://search.lib.asu.edu/discovery/search?vid=01ASU_INST:01ASU&lang=en', doi)
    # The helper returns "title|||url", or only the title when no URL was obtained.
    if "|||" in output:
        result_title, final_url = output.split("|||", 1)
    else:
        result_title, final_url = output, ""
    result_title = _clean_title(result_title)
    links.append(f"{scopus_id}, {title}, {result_title}, {final_url}")

In [70]:
# Append this batch's entries, one per line, to a text file whose name
# carries the index range that was processed.
output_file = f"./Downloads/retrieved_links_{start_index}-{end_index}.txt"
with open(output_file, "a") as out:
    out.writelines(f"{entry}\n" for entry in links)