# Data Extraction

## Author Data Extraction

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import time
import pandas as pd
import re

def get_author_profile_url(author_name, author_affiliation):
    """Find an author's Google Scholar profile and open it in a new Chrome session.

    A Scholar search is run for ``"<author_name> <author_affiliation>"``. The
    first link whose text contains the author's name and whose target contains
    ``citations?user=`` is treated as the author's profile.

    Args:
        author_name: Name to search for; also matched against link text.
        author_affiliation: Affiliation or keyword appended to the search query.

    Returns:
        ``(profile_url, driver)`` with the driver already navigated to the
        profile page, or ``(None, None)`` when a CAPTCHA/block page is
        detected, no profile link is found, or any error occurs. The browser
        is closed on every failure path; on success the caller owns the
        driver and is responsible for calling ``driver.quit()``.
    """
    # Imported locally so the block does not rely on changes to file-level imports.
    from urllib.parse import quote_plus, urljoin

    def _xpath_literal(text):
        """Quote *text* as an XPath 1.0 string literal, even if it contains quotes."""
        if "'" not in text:
            return "'" + text + "'"
        if '"' not in text:
            return '"' + text + '"'
        # XPath 1.0 has no escape character: stitch the pieces together with concat().
        pieces = ["'" + piece + "'" for piece in text.split("'")]
        return "concat(" + ", \"'\", ".join(pieces) + ")"

    options = Options()
    # Run in non-headless mode to reduce blocking
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-dev-shm-usage')
    # Set a user-agent to mimic a real browser
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36')
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    # Set to True only when the driver is returned to the caller; every other
    # exit path closes the browser in the ``finally`` clause below.
    handed_off = False
    try:
        # Percent-encode the query so characters such as '&', '#' or ',' in the
        # name/affiliation cannot corrupt the URL.
        query = quote_plus(f"{author_name} {author_affiliation}")
        url = f"https://scholar.google.com/scholar?q={query}&hl=en"
        print(f"Searching for author profile at URL: {url}")
        driver.get(url)
        time.sleep(8)  # Increased wait for the page to load

        # Scroll multiple times to ensure lazy-loaded content appears
        for _ in range(3):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

        # Check for CAPTCHA or block page
        page_source = driver.page_source.lower()
        if "recaptcha" in page_source or "unusual traffic" in page_source:
            print("CAPTCHA or block page detected. Please try from a different network, wait a few hours, or solve the CAPTCHA manually in the browser.")
            return None, None

        # Find the "User profiles for" link to locate the section
        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, "//a[contains(text(), 'User profiles for')]"))
            )
            print("Found 'User profiles for' section.")
        except TimeoutException:
            print("Timeout waiting for 'User profiles for' section.")
            print("Page source for debugging (first 2000 characters):")
            print(driver.page_source[:2000])
            return None, None

        # Primary lookup: a clickable link carrying the author's name and the
        # profile URL pattern. Only the wait itself is guarded by the timeout handler.
        name_literal = _xpath_literal(author_name)
        try:
            first_profile = WebDriverWait(driver, 30).until(
                EC.element_to_be_clickable((By.XPATH, f"//a[contains(text(), {name_literal}) and contains(@href, 'citations?user=')]"))
            )
            profile_link = first_profile.get_attribute("href")
            print(f"Found first profile: {first_profile.text} - URL: {profile_link}")
        except TimeoutException:
            print("Timeout waiting for the first user profile link. Attempting fallback method...")

            # Fallback: parse the page source and match the name case-insensitively.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            wanted_name = author_name.lower()
            profile_link = None
            for a_tag in soup.find_all('a', href=True):
                if "citations?user=" in a_tag['href'] and wanted_name in a_tag.text.lower():
                    # urljoin resolves relative hrefs and leaves absolute ones untouched.
                    profile_link = urljoin("https://scholar.google.com", a_tag['href'])
                    print(f"Fallback method found profile: {a_tag.text} - URL: {profile_link}")
                    break

            if not profile_link:
                print("Fallback method failed to find the profile link.")
                print("Page source for debugging (first 5000 characters):")
                print(driver.page_source[:5000])  # Increased to 5000 characters for better debugging
                return None, None

        driver.get(profile_link)  # Navigate to the profile page
        time.sleep(3)
        handed_off = True
        return profile_link, driver
    except Exception as e:
        print(f"Error finding author profile: {e}")
        # The browser session may already be unusable; do not let the debug
        # dump raise a second exception and hide the original error.
        try:
            debug_source = driver.page_source[:5000]
        except Exception:
            debug_source = None
        if debug_source is not None:
            print("Page source for debugging (first 5000 characters):")
            print(debug_source)
        return None, None
    finally:
        if not handed_off:
            driver.quit()

def scrape_publication_details(driver, publication_url):
    """Open a publication's detail page and collect its metadata.

    Args:
        driver: Selenium WebDriver used to load the page.
        publication_url: Absolute URL of the publication detail page.

    Returns:
        A dict with the keys ``title``, ``authors``, ``journal``,
        ``publication date``, ``total citation``, ``volume``, ``publisher``,
        ``description`` and ``citations_per_year``. Values that cannot be
        found are ``"N/A"`` (``"No title"`` for the title).
        ``citations_per_year`` is always an empty dict here. If anything
        raises, the error is printed and a record whose title is ``"Error"``
        and whose other fields are ``"N/A"`` is returned instead.
    """
    def _blank_record(title):
        """Return a record with every field at its placeholder value."""
        return {
            "title": title,
            "authors": "N/A",
            "journal": "N/A",
            "publication date": "N/A",
            "total citation": "N/A",
            "volume": "N/A",
            "publisher": "N/A",
            "description": "N/A",
            "citations_per_year": {}
        }

    # Metadata labels to look for; each label is also the key of the record
    # field it fills. Checked in this order, first match wins.
    metadata_keys = ("authors", "journal", "publication date", "volume", "publisher")

    try:
        driver.get(publication_url)
        time.sleep(3)  # Wait for the detail page to load

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        record = _blank_record("No title")

        # Extract the title. The original selector is tried first.
        # NOTE(review): the detail page appears to keep its title in
        # div#gsc_oci_title rather than a.gsc_a_at - confirm against live markup.
        title_elem = soup.find('a', class_='gsc_a_at')
        if title_elem is None:
            title_elem = soup.find('div', id='gsc_oci_title')
        if title_elem is not None:
            record["title"] = title_elem.text

        # Extract metadata from the table (if present)
        metadata_table = soup.find('div', id='gsc_oci_table')
        if metadata_table:
            for row in metadata_table.find_all('div', class_='gs_scl'):
                field_elem = row.find('div', class_='gsc_oci_field')
                value_elem = row.find('div', class_='gsc_oci_value')
                if field_elem is None or value_elem is None:
                    # A malformed row must not discard the whole publication.
                    continue
                field = field_elem.text.lower()
                for key in metadata_keys:
                    if key in field:
                        record[key] = value_elem.text
                        break

        # Extract description (abstract)
        description_elem = soup.find('div', id='gsc_oci_descr')
        if description_elem:
            record["description"] = description_elem.text.strip()

        # Extract citations
        # NOTE(review): confirm that 'gsc_oci_g_a' selects the total-citations
        # link and not a per-year bar of the citation graph.
        citation_elem = soup.find('a', class_='gsc_oci_g_a')
        if citation_elem:
            # Keep only the digits; fall back to "0" when there are none.
            record["total citation"] = re.sub(r'\D', '', citation_elem.text) or "0"

        return record
    except Exception as e:
        print(f"Error scraping publication details from {publication_url}: {e}")
        return _blank_record("Error")

def scrape_publications(driver, max_publications=290):
    """Scrape the publications listed on the author profile currently open in *driver*.

    The "Show more" button is clicked until at least ``max_publications`` rows
    are loaded, the button is disabled or missing, or a click stops adding
    rows. Each listed publication's detail page is then visited through
    ``scrape_publication_details``.

    Args:
        driver: Selenium WebDriver already showing the author's profile page.
        max_publications: Upper bound on the number of publications processed.

    Returns:
        ``(data, all_years)``: a list of per-publication dicts and the set of
        publication years (ints) read from the listing. ``([], set())`` is
        returned if the publications table never loads or another
        unexpected error occurs.
    """
    # Imported locally so the block does not rely on changes to file-level imports.
    from urllib.parse import urljoin

    try:
        # Wait for the publications table to load
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID, "gsc_a_t"))
        )
        print("Publications table loaded.")

        # Click "Show more" button repeatedly to load all publications
        rows_before_click = -1
        while True:
            try:
                loaded_rows = len(driver.find_elements(By.CLASS_NAME, "gsc_a_tr"))
                if loaded_rows >= max_publications:
                    print(f"Reached maximum publications limit ({max_publications}).")
                    break
                if loaded_rows == rows_before_click:
                    # The previous click added nothing; stop instead of looping forever.
                    print("'Show more' did not load additional publications. Stopping.")
                    break
                show_more_button = driver.find_element(By.ID, "gsc_bpf_more")
                if show_more_button.get_attribute("disabled"):
                    print("No more publications to load.")
                    break
                print("Clicking 'Show more' button...")
                rows_before_click = loaded_rows
                show_more_button.click()
                time.sleep(3)  # Wait for the next batch to load
            except NoSuchElementException:
                print("Show more button not found. All publications may already be loaded.")
                break
            except Exception as e:
                print(f"Error clicking 'Show more' button: {e}")
                break

        # Snapshot the listing once; the loop below works on this parsed copy,
        # so the live profile page is not needed while visiting detail pages.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        publications = soup.find_all('tr', class_='gsc_a_tr')[:max_publications]
        print(f"Total publications to process: {len(publications)}")

        # Remember the profile page so the browser can be returned to it afterwards
        profile_url = driver.current_url

        data = []
        all_years = set()

        for i, pub in enumerate(publications, 1):
            try:
                title_elem = pub.find('a', class_='gsc_a_at')
                if title_elem is None or not title_elem.get('href'):
                    print(f"Error processing publication {i}: title link not found in listing row")
                    continue
                title = title_elem.text
                # urljoin resolves relative hrefs and leaves absolute ones untouched.
                publication_url = urljoin("https://scholar.google.com", title_elem['href'])
                print(f"Processing publication {i}/{len(publications)}: {title}")

                # Scrape detailed information from the publication's page
                pub_data = scrape_publication_details(driver, publication_url)
                if pub_data.get("title") == "No title":
                    # The listing already knows the title; prefer it to the placeholder.
                    pub_data["title"] = title

                # Extract year for the year-wise citation columns
                year_elem = pub.find('td', class_='gsc_a_y')
                year = year_elem.text if year_elem else "N/A"
                if year.isdigit():
                    all_years.add(int(year))

                data.append(pub_data)
            except Exception as e:
                print(f"Error processing publication {i}: {e}")

        # Return to the profile page once, best effort, instead of reloading it
        # after every publication: that halves the number of page loads while
        # leaving the browser where the caller expects it.
        if publications:
            try:
                driver.get(profile_url)
            except Exception as e:
                print(f"Could not return to the profile page: {e}")

        return data, all_years
    except Exception as e:
        print(f"Error fetching publications: {e}")
        return [], set()

def save_to_excel(publication_data, all_years, filename="author_details_output.xlsx"):
    """Write the scraped publication records to an Excel workbook.

    Columns are laid out as: the fixed metadata fields, one column per entry
    of ``all_years`` (taken from each record's ``citations_per_year`` mapping,
    0 when absent), and finally the description.

    Args:
        publication_data: Iterable of per-publication dicts.
        all_years: Iterable of years; each becomes a column named ``str(year)``.
        filename: Destination path of the workbook.
    """
    leading_columns = (
        "title",
        "authors",
        "journal",
        "publication date",
        "total citation",
        "volume",
        "publisher",
    )

    records = []
    for publication in publication_data:
        # Fixed metadata first, in the order the columns should appear.
        record = {column: publication[column] for column in leading_columns}
        # Then the year-wise citation counts.
        record.update({
            str(year): publication.get("citations_per_year", {}).get(year, 0)
            for year in all_years
        })
        # The (long) description goes last.
        record["description"] = publication["description"]
        records.append(record)

    pd.DataFrame(records).to_excel(filename, index=False)
    print(f"\nData saved to {filename}")

if __name__ == "__main__":
    # Interactive entry point: ask for the author, locate the profile, scrape
    # every publication and export the result to Excel.
    author_name = input("Enter the author's name: ")
    author_affiliation = input("Enter the author's affiliation (or keyword): ")

    profile_url, driver = get_author_profile_url(author_name, author_affiliation)

    if not (profile_url and driver):
        print("Author not found. Check the name or affiliation keyword.")
    else:
        # From here on the browser belongs to this block; always close it.
        try:
            data, all_years = scrape_publications(driver)
            if not data:
                print("No publications found.")
            else:
                save_to_excel(data, sorted(all_years))
        finally:
            driver.quit()

Enter the author's name:  Rajendra Prasad
Enter the author's affiliation (or keyword):  Professor Dean, Amity University Haryana


Searching for author profile at URL: https://scholar.google.com/scholar?q=Rajendra+Prasad+Professor+Dean,+Amity+University+Haryana&hl=en
Found 'User profiles for' section.
Timeout waiting for the first user profile link. Attempting fallback method...
Fallback method found profile: Rajendra Prasad - URL: https://scholar.google.com/citations?user=Fx5-ag8AAAAJ&hl=en&oi=ao
Publications table loaded.
Clicking 'Show more' button...
Clicking 'Show more' button...
Clicking 'Show more' button...
Reached maximum publications limit (290).
Total publications to process: 290
Processing publication 1/290: Molecular cloning and characterization of a novel gene of Candida albicans, CDR1, conferring multiple resistance to drugs and antifungals
Processing publication 2/290: Yeast ATP-binding cassette transporters conferring multidrug resistance
Processing publication 3/290: Pathogenicity and drug resistance in   Candida albicans  and other yeast species
Processing publication 4/290: Multidrug resistance

## Article Data Extraction

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tkinter as tk
from tkinter import filedialog

def scrape_scholar_articles(query, num_pages):
    """Collect article titles, author lines and links from Google Scholar search results.

    Args:
        query: Search text; it is URL-encoded by ``requests``.
        num_pages: Number of result pages (10 results each) to request.

    Returns:
        A list of dicts with the keys ``"Title"``, ``"Authors"`` and
        ``"Link"``. ``"Authors"`` and ``"Link"`` are empty strings when a
        result has no author line or no link. Paging stops early when a
        request returns a non-success status or a page has no results.
    """
    articles = []
    for page in range(num_pages):
        # Passing the query through ``params`` lets requests percent-encode it,
        # so characters such as '&', '#' or '+' cannot corrupt the URL.
        response = requests.get(
            "https://scholar.google.com/scholar",
            params={"start": page * 10, "q": query, "hl": "en", "as_sdt": "0,5"},
            timeout=30,  # never hang indefinitely on a stalled connection
        )
        if not response.ok:
            print(f"Request for page {page + 1} failed with HTTP status {response.status_code}. Stopping.")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        results = soup.find_all("div", class_="gs_ri")
        if not results:
            # No result blocks on this page: nothing further to collect.
            break

        for result in results:
            title_tag = result.find("h3", class_="gs_rt")
            if title_tag is None:
                continue
            authors_tag = result.find("div", class_="gs_a")
            # Some results carry no hyperlink at all.
            link_tag = result.find("a", href=True)
            articles.append({
                "Title": title_tag.text,
                "Authors": authors_tag.text if authors_tag is not None else "",
                "Link": link_tag["href"] if link_tag is not None else "",
            })

    return articles

def save_to_excel(articles, filename):
    """Write *articles* to the Excel file *filename*, without the index column."""
    pd.DataFrame(articles).to_excel(filename, index=False)

def browse_folder():
    """Let the user pick an output folder and place its path in the folder entry.

    If the dialog is cancelled, the entry keeps its current content.
    """
    folder_path = filedialog.askdirectory()
    if not folder_path:
        # Dialog cancelled: do not wipe what the user already entered.
        return
    entry_folder.delete(0, tk.END)
    entry_folder.insert(tk.END, folder_path)

def scrape_articles():
    """Handle the "Extract Data" button: validate the form, scrape, save, report.

    Reads the query, page count and optional output folder from the entry
    widgets, scrapes the search results and writes them to
    ``scholar_articles.xlsx`` in the chosen folder (or the current directory).
    Every outcome, including invalid input and failures, is shown in the
    status label.
    """
    # Imported locally so the block does not rely on changes to file-level imports.
    import os

    query = entry_query.get().strip()
    if not query:
        label_status.config(text="Please enter an article title or keyword.")
        return

    try:
        num_pages = int(entry_pages.get())
    except ValueError:
        num_pages = 0
    if num_pages < 1:
        label_status.config(text="Number of pages must be a positive whole number.")
        return

    # os.path.join returns just the file name when the folder is empty.
    filename = os.path.join(entry_folder.get(), "scholar_articles.xlsx")

    # UI boundary: report any failure to the user rather than losing it in
    # the Tk callback machinery.
    try:
        articles = scrape_scholar_articles(query, num_pages)
        save_to_excel(articles, filename)
    except Exception as exc:
        label_status.config(text=f"Extraction failed: {exc}")
        return

    label_status.config(text=f"Extraction complete. Data saved to {filename}.")

# Build the scraper's Tk user interface. Widgets are packed top to bottom in
# the order they are created below, and the callbacks defined above read and
# update these module-level widgets by name.

# Create the main window
window = tk.Tk()
window.title("Google Scholar Scraper")
window.geometry("400x250")

# Create input fields and labels
# Search text, read by scrape_articles()
label_query = tk.Label(window, text="Article Title or Keyword:")
label_query.pack()
entry_query = tk.Entry(window, width=40)
entry_query.pack()

# Number of result pages to fetch; scrape_articles() converts it with int()
label_pages = tk.Label(window, text="Number of Pages:")
label_pages.pack()
entry_pages = tk.Entry(window, width=40)
entry_pages.pack()

# Destination folder; filled in by browse_folder() or typed by hand
label_folder = tk.Label(window, text="Output Folder (optional):")
label_folder.pack()
entry_folder = tk.Entry(window, width=40)
entry_folder.pack()

# Create browse button (opens the folder picker via browse_folder)
button_browse = tk.Button(window, text="Browse", command=browse_folder)
button_browse.pack()

# Create extract button (runs the scrape and export via scrape_articles)
button_extract = tk.Button(window, text="Extract Data", command=scrape_articles)
button_extract.pack()

# Create status label (initially empty; scrape_articles() sets its text)
label_status = tk.Label(window, text="")
label_status.pack()

# Run the main window loop
window.mainloop()