In [16]:
import os  # Import the os module for interacting with the operating system
import re  # Import re for regular expressions
import time  # Import the time module to use sleep for delays
from urllib.parse import quote_plus  # Import quote_plus to URL-encode the search query

import pandas as pd
from selenium import webdriver  # Import the webdriver from Selenium to control the browser
from selenium.common.exceptions import StaleElementReferenceException, ElementClickInterceptedException, NoSuchElementException  # Import exceptions for handling common errors
from selenium.webdriver.common.by import By  # Import By for locating elements
from selenium.webdriver.common.keys import Keys  # Import Keys for keyboard actions

In [17]:
# Function to find a DOI on a webpage
def find_doi_on_page(driver):
    """Return the first DOI-like string found in the visible text of the current page.

    Args:
        driver: Object exposing ``find_element(by, value)`` whose result has a
            ``text`` attribute (a Selenium WebDriver in this notebook).

    Returns:
        The first substring of the ``<body>`` text that matches the DOI
        pattern, or ``None`` when nothing matches or the page text cannot be
        read (the error is printed, not raised).

    Note:
        Only the *first* match in document order is returned. On article pages
        that mention other DOIs before their own (e.g. in a reference list or
        banner) this may be a cited DOI rather than the article's DOI.
    """
    # The dot after the "10" prefix is escaped so that it only matches a
    # literal "."; unescaped, strings such as "1012345/abc" were reported as DOIs.
    doi_pattern = r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b'

    try:
        page_text = driver.find_element(By.TAG_NAME, "body").text  # Text content of the entire page
    except Exception as e:
        print(f"Error finding DOI: {str(e)}")  # Best effort: report and treat as "no DOI"
        return None

    # Guard against a missing/None text value so re.search always gets a string
    doi_match = re.search(doi_pattern, page_text or "", re.IGNORECASE)
    return doi_match.group(0) if doi_match else None

# Helper: locate the link that leads from the image preview panel to the source page
def _find_visit_button(driver):
    """Return the first element matched by the known visit-link locators, or None.

    The locators are tried in order; the last one matches any anchor whose
    ``href`` contains ``http`` and therefore acts as a broad fallback.
    """
    attempts = [
        (By.CSS_SELECTOR, "a.Hnk30e.indIKd",
         "No visit button found with the first selector. Trying alternative."),
        (By.XPATH, "//a[h1[contains(@class, 'GW0XC') and contains(@class, 'cS4Vcb-pGL6qe-fwJd0c')]]",
         "No visit button found with the second selector. Trying alternative."),
        (By.XPATH, "//a[contains(@href, 'http')]", None),
    ]
    for by, value, miss_message in attempts:
        try:
            return driver.find_element(by, value)
        except NoSuchElementException:
            if miss_message:
                print(miss_message)
    return None


# Helper: open the source page in a new tab, search it for a DOI, then return to the results tab
def _find_doi_in_new_tab(driver, source_url):
    """Return the DOI found on ``source_url`` (or None), leaving the driver on the original tab."""
    results_handle = driver.current_window_handle
    known_handles = set(driver.window_handles)

    driver.execute_script("window.open(arguments[0], '_blank');", source_url)  # Open the source URL in a new tab
    new_handles = [handle for handle in driver.window_handles if handle not in known_handles]
    if not new_handles:
        return None  # No tab was opened, so there is nothing to inspect or close

    driver.switch_to.window(new_handles[0])  # Switch to the new tab
    try:
        time.sleep(5)  # Wait for 5 seconds to allow the source page to load
        return find_doi_on_page(driver)
    finally:
        # Always close the extra tab and go back, even if the lookup fails,
        # so the next thumbnail is processed on the results page.
        driver.close()
        driver.switch_to.window(results_handle)


# Function to perform a Google Image search and extract DOIs from the resulting pages
def google_image_search(query, download_path, num_images=10):
    """Search Google Images for ``query`` and collect a DOI for each result's source page.

    For up to ``num_images`` thumbnails the function clicks the thumbnail,
    reads the thumbnail ``src``, the preview image ``src`` and the source-page
    link, opens the source page in a new tab and looks for a DOI there.

    Args:
        query: Search text; it is URL-encoded before being placed in the URL.
        download_path: Directory in which the ``.xlsx`` result file is written.
        num_images: Maximum number of thumbnails to process; also the number
            of PAGE_DOWN key presses used to load more results.

    Returns:
        A ``pandas.DataFrame`` with columns ``thumbnail_url``, ``image_url``
        and ``DOI`` (one row per processed thumbnail), or ``None`` when no
        thumbnail matched any selector. Missing values use the sentinels
        ``'Not_found'``, ``'Not_found_visit'`` (no source link) and
        ``'Processing_problems'`` (an exception occurred for that thumbnail).

    Side effects:
        Starts and quits a Chrome browser, prompts on stdin for the output
        file name, and writes ``<name>.xlsx`` into ``download_path``.
    """
    options = webdriver.ChromeOptions()  # Create Chrome options
    # options.add_argument('--headless')  # Uncomment to run in headless mode
    driver = webdriver.Chrome(options=options)  # Initialize the Chrome driver with the specified options

    # One dict per thumbnail keeps the three columns aligned no matter where
    # processing of a thumbnail stops.
    records = []
    try:
        search_url = f"https://www.google.com/search?tbm=isch&q={quote_plus(query)}"  # Construct the search URL
        driver.get(search_url)  # Open the search URL
        print(f"Opened URL: {search_url}")  # Print the opened URL

        time.sleep(2)  # Wait for 2 seconds to allow the page to load

        for _ in range(num_images):  # One PAGE_DOWN per requested image to load more results
            try:
                body = driver.find_element(By.TAG_NAME, "body")  # Find the body element
                body.send_keys(Keys.PAGE_DOWN)  # Send the PAGE_DOWN key to scroll down
                time.sleep(0.3)  # Wait for 0.3 seconds between scrolls
            except StaleElementReferenceException:
                print("StaleElementReferenceException encountered. Retrying...")  # Body is looked up again next iteration
                continue

        selectors = [".rg_i.Q4LuWd", ".isv-r.PNCib.MSM1fd.BUooTd img", ".rg_i", ".H8Rx8c img"]  # CSS selectors for image thumbnails
        thumbnails = []

        for selector in selectors:  # Try each selector until one yields elements
            try:
                print(f"Trying selector '{selector}'")
                thumbnails = driver.find_elements(By.CSS_SELECTOR, selector)
                print(f"Tried selector '{selector}': found {len(thumbnails)} elements")
                if thumbnails:
                    break
            except Exception as e:
                print(f"Error with selector '{selector}': {str(e)}")

        if not thumbnails:
            print("No thumbnails found with the selectors.")
            return None  # The finally clause below still quits the driver

        for thumbnail in thumbnails[:num_images]:  # Process at most num_images thumbnails
            print("IMAGE PROCESSING")

            # Defaults describe a thumbnail whose processing failed; each field
            # is overwritten as soon as its real value is known.
            record = {'thumbnail_url': 'Not_found', 'image_url': 'Not_found', 'DOI': 'Processing_problems'}

            try:
                thumbnail.click()  # Click the thumbnail
                # NOTE(review): a fixed 1 s pause may be too short for the preview
                # panel to refresh, in which case image_url/DOI could belong to the
                # previously clicked thumbnail - confirm and consider an explicit wait.
                time.sleep(1)

                thumbnail_url = thumbnail.get_attribute("src")
                if thumbnail_url:
                    record['thumbnail_url'] = thumbnail_url
                else:
                    print("No thumbnail URL found")

                # Look the preview image up afresh for every thumbnail so a value
                # from an earlier iteration can never be reused.
                try:
                    image_url = driver.find_element(By.CSS_SELECTOR, ".sFlh5c.pT0Scc.iPVvYb").get_attribute("src")
                except NoSuchElementException:
                    image_url = None
                if image_url:
                    print("IMAGE URL", image_url)
                    record['image_url'] = image_url
                else:
                    print("No image URL found")

                visit_button = _find_visit_button(driver)
                source_url = visit_button.get_attribute("href") if visit_button else None

                if source_url:
                    print(f"Visiting source URL: {source_url}")
                    doi = _find_doi_in_new_tab(driver, source_url)
                    if doi:
                        record['DOI'] = doi
                        print(f"Found DOI: {doi}")
                    else:
                        record['DOI'] = 'Not_found'
                        print("DOI is Not_found!")
                else:
                    print("No visit button found.")
                    record['DOI'] = 'Not_found_visit'
                    print("DOI and button is Not_found!")

            except (StaleElementReferenceException, ElementClickInterceptedException, NoSuchElementException) as e:
                # Nothing is retried: the thumbnail is recorded with the failure sentinel
                print(f"Exception encountered: {str(e)}. Skipping this thumbnail.")
            except Exception as e:
                print(f"Error processing thumbnail: {str(e)}")

            records.append(record)
    finally:
        driver.quit()  # Quit the driver on every exit path, including errors and interrupts

    data_table = pd.DataFrame(records, columns=['thumbnail_url', 'image_url', 'DOI'])

    name_txt = input("Type name of .xlsx:")  # Prompt the user for the output file name (without extension)
    file_path = os.path.join(download_path, name_txt + ".xlsx")  # Construct the file path
    data_table.to_excel(file_path)
    print(f"Saved {len(data_table)} images to {file_path}")  # Number of processed thumbnails and the file path

    return data_table

Enter the search query: covid-19 and neurodegeneration
Opened URL: https://www.google.com/search?tbm=isch&q=covid-19 and neurodegeneration
Trying selector '.rg_i.Q4LuWd'
Tried selector '.rg_i.Q4LuWd': found 0 elements
Trying selector '.isv-r.PNCib.MSM1fd.BUooTd img'
Tried selector '.isv-r.PNCib.MSM1fd.BUooTd img': found 0 elements
Trying selector '.rg_i'
Tried selector '.rg_i': found 0 elements
Trying selector '.H8Rx8c img'
Tried selector '.H8Rx8c img': found 497 elements
IMAGE PROCESSING
IMAGE URL https://media.springernature.com/lw685/springer-static/image/art%3A10.1186%2Fs13062-020-00282-3/MediaObjects/13062_2020_282_Fig1_HTML.png
Visiting source URL: https://biologydirect.biomedcentral.com/articles/10.1186/s13062-020-00282-3
Found DOI: 10.2210/pdb4GZ9/pdb
IMAGE PROCESSING
IMAGE URL https://www.mdpi.com/cells/cells-11-01298/article_deploy/html/images/cells-11-01298-g005.png
Visiting source URL: https://biologydirect.biomedcentral.com/articles/10.1186/s13062-020-00282-3
Found DOI: 10

DOI is Not_found!
IMAGE PROCESSING
IMAGE URL https://www.lih.lu/wp-content/uploads/2023/05/Rejko-Article-Parkinsons-1024x536.jpg
Visiting source URL: https://translationalneurodegeneration.biomedcentral.com/articles/10.1186/s40035-022-00316-y
Found DOI: 10.1038/s41586-022-04569-5
IMAGE PROCESSING
IMAGE URL https://www.mdpi.com/diagnostics/diagnostics-13-01091/article_deploy/html/images/diagnostics-13-01091-g001.png
Visiting source URL: https://www.lih.lu/en/article/artificial-intelligence-to-assess-the-risk-of-neurodegenerative-disease-following-covid-19-infection/
DOI is Not_found!
IMAGE PROCESSING
IMAGE URL https://img.gelbe-liste.de/image/GetImageWithNamedSize?fileName=corona/coronavirus.webp
Visiting source URL: https://www.mdpi.com/2193046
Found DOI: 10.3390/diagnostics13061091
IMAGE PROCESSING
IMAGE URL https://www.mdpi.com/cells/cells-12-02601/article_deploy/html/images/cells-12-02601-g004.png
Visiting source URL: https://www.gelbe-liste.de/neurologie/biomarker-neurodegeneration

In [None]:
# Entry point: ask for a search term and run the scraper when this cell/script is executed directly
if __name__ == "__main__":
    # Search text typed by the user
    query = input("Enter the search query: ")
    # The result spreadsheet is written to the current working directory
    download_path = "./"
    # Process up to 100 thumbnails and keep the returned table for inspection in a later cell
    data_table_1 = google_image_search(query=query, download_path=download_path, num_images=100)

# Note: Uncomment the options.add_argument('--headless') line in google_image_search
# if you don't want to see the browser window.
# Change the num_images argument in the call above to process more or fewer images.

In [18]:
# Display the table returned by google_image_search in the cell above.
# It is None when no thumbnails matched any selector, in which case nothing is shown.
data_table_1

Unnamed: 0,thumbnail_url,image_url,DOI
0,"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQA...",https://media.springernature.com/lw685/springe...,10.2210/pdb4GZ9/pdb
1,"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQA...",https://www.mdpi.com/cells/cells-11-01298/arti...,10.2210/pdb4GZ9/pdb
2,"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQA...",https://www.researchgate.net/profile/Rahat-Ull...,10.3390/cells11081298
3,"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQA...",https://ars.els-cdn.com/content/image/1-s2.0-S...,Not_found
4,"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQA...",https://media.springernature.com/lw685/springe...,10.1016/j.csbj.2022.02.020
5,"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQA...",https://www.frontiersin.org/files/Articles/583...,10.1007/s00415-021-10517-6
6,"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQA...",https://ars.els-cdn.com/content/image/1-s2.0-S...,10.3389/fneur.2020.583459
7,"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQA...",https://media.springernature.com/lw1200/spring...,10.1016/j.ensci.2020.100290
8,"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQA...",https://www.frontiersin.org/files/Articles/568...,10.1002/mds.28084
9,"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...",https://www.mdpi.com/cells/cells-11-01298/arti...,10.3389/fneur.2020.01044
