# In [None]:
import os
import csv
import time
import random
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from urllib.parse import urlparse
import undetected_chromedriver as uc

def _wait_for_results(driver, url, page, max_retries=30):
    """Poll the currently loaded page until result entries show up.

    Each attempt re-parses ``driver.page_source`` after a random 1-5 second
    pause, which also gives a human time to solve a CAPTCHA in the window.

    Args:
        driver: Browser driver that has already navigated to ``url``.
        url: URL of the page being polled (used only for log output).
        page: Zero-based page index (used only for log output).
        max_retries: Maximum number of polling attempts.

    Returns:
        The list of ``.gs_ri`` elements found, or an empty list if none
        appeared within ``max_retries`` attempts.
    """
    for _ in range(max_retries):
        print(f"[{page}] Waiting for valid results or CAPTCHA bypass. Try solving manually if needed...")
        driver.maximize_window()

        time.sleep(random.uniform(1, 5))  # Avoid being blocked

        soup = BeautifulSoup(driver.page_source, "html.parser")
        results = soup.select(".gs_ri")
        if results:
            print(f"[{page}] ✅ Scraping URL: {url}")
            return results

    print(f"[{page}] ❌ Failed to bypass CAPTCHA after {max_retries} tries.")
    return []


def _parse_result(result):
    """Extract title, link, snippet, authors and citation count from one result element.

    Missing pieces are replaced by the placeholders "No title", "No link",
    "No snippet", "No author info" and a citation count of "0".

    Returns:
        A ``(title, link, snippet, authors, citation_count)`` tuple of strings.
    """
    title_tag = result.select_one(".gs_rt")
    title = title_tag.get_text(strip=True) if title_tag else "No title"

    link_tag = title_tag.find("a") if title_tag else None
    link = link_tag["href"] if link_tag and link_tag.has_attr("href") else "No link"

    snippet_tag = result.select_one(".gs_rs")
    snippet = snippet_tag.get_text(strip=True) if snippet_tag else "No snippet"

    meta_tag = result.select_one(".gs_a")
    meta_text = meta_tag.get_text(strip=True) if meta_tag else ""
    # The author list is whatever precedes the first " - " separator.
    authors = meta_text.split(" - ")[0] if " - " in meta_text else "No author info"

    citation_count = "0"
    for anchor in result.select(".gs_fl a"):
        anchor_text = anchor.get_text()
        if "Cited by" in anchor_text:
            # The membership test guarantees index 1 exists, so no try/except is needed.
            citation_count = anchor_text.split("Cited by")[1].strip()
            break

    return title, link, snippet, authors, citation_count


def _write_results_csv(rows, output_csv):
    """Write the collected result rows to ``output_csv``, creating its folder if needed."""
    # os.path.dirname() is "" for a bare filename such as "results.csv", and
    # os.makedirs("") raises FileNotFoundError, so only create a real directory.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["Title", "Authors", "Citations", "Link", "Snippet", "Query", "Page No."], quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        writer.writerows(rows)


def scrape_google_scholar(query, filter_keywords=None, num_pages=0, output_csv="results.csv"):
    """Scrape Google Scholar result pages for ``query`` and save them as CSV.

    A visible (non-headless) Chrome window is driven through
    ``undetected_chromedriver`` so that a CAPTCHA can be solved manually.
    Each result becomes one CSV row with the columns Title, Authors,
    Citations, Link, Snippet, Query and Page No.; the Link and Query columns
    are written as spreadsheet ``=HYPERLINK(...)`` formulas.

    Args:
        query: Raw (not URL-encoded) search string. It is URL-encoded here
            before being placed in the request URL.
        filter_keywords: Optional iterable of strings. When non-empty, a
            result is kept only if at least one keyword occurs
            (case-insensitively) in its title or snippet.
        num_pages: Number of result pages to fetch (10 results per page).
            When zero or negative, no browser is started.
        output_csv: Destination CSV path. A bare filename is written to the
            current working directory.

    Returns:
        None. Progress is printed; the CSV is written only when at least one
        result was collected, otherwise 'Failed to scrape papers' is printed.
    """
    # Local import keeps this function self-contained without touching the
    # module's import block.
    from urllib.parse import quote_plus

    all_results = []

    if num_pages > 0:
        options = uc.ChromeOptions()
        # Headless mode is intentionally left off so a CAPTCHA can be solved
        # by hand in the visible browser window.
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--start-maximized")
        options.add_argument("--incognito")  # Optional: for privacy mode

        # NOTE(review): version_main pins a Chrome major version; presumably it
        # has to match the locally installed browser - confirm when upgrading.
        driver = uc.Chrome(options=options, version_main=137)

        # Encode once: characters such as '&', '#' or '+' in a keyword would
        # otherwise be interpreted as URL syntax and corrupt the search.
        encoded_query = quote_plus(query)
        lowered_keywords = [keyword.lower() for keyword in filter_keywords] if filter_keywords else []

        try:
            for page in range(num_pages):
                url = f"https://scholar.google.com/scholar?q={encoded_query}&start={page * 10}"
                driver.get(url)

                results = _wait_for_results(driver, url, page)
                if not results:
                    continue  # Nothing usable on this page; move on to the next one.

                for result in results:
                    title, link, snippet, authors, citation_count = _parse_result(result)

                    # Filter by keyword match in title or snippet (optional)
                    if lowered_keywords:
                        text_combined = (title + snippet).lower()
                        if not any(keyword in text_combined for keyword in lowered_keywords):
                            continue  # Skip if no filter keywords matched

                    domain = urlparse(link).netloc
                    all_results.append({
                        "Title": title,
                        "Authors": authors,
                        "Citations": citation_count,
                        "Link": f'=HYPERLINK("{link}", "{domain}")',
                        "Snippet": snippet,
                        "Query": f'=HYPERLINK("{url}", "{url}")',
                        "Page No.": page
                    })
        finally:
            # Always close the browser, even if navigation or parsing raised.
            driver.quit()

    if not all_results:
        print('Failed to scrape papers')
        return

    _write_results_csv(all_results, output_csv)

    abs_path = os.path.abspath(output_csv)
    print(f"\n✅ Scraped {len(all_results)} results and saved to:\n📂 {abs_path}")


def _split_keywords(raw):
    """Split a comma-separated string into stripped, non-empty keywords.

    Returns an empty list when ``raw`` is None or empty, so an unset
    environment variable does not raise.
    """
    if not raw:
        return []
    return [kw.strip() for kw in raw.split(',') if kw.strip()]


if __name__ == "__main__":
    # Script entry point: build the search query from environment variables
    # (optionally loaded from a .env file) and scrape 10 result pages.
    load_dotenv()

    fixed_keywords = _split_keywords(os.getenv("FIXED_KEYWORDS"))
    optional_keywords = _split_keywords(os.getenv("OPTIONAL_KEYWORDS"))
    # Blank entries are dropped: an empty filter keyword is a substring of
    # every text and would silently disable the filter.
    filter_keywords = _split_keywords(os.getenv("FILTER_KEYWORDS"))

    if not fixed_keywords and not optional_keywords:
        raise SystemExit("No search keywords configured: set FIXED_KEYWORDS and/or OPTIONAL_KEYWORDS.")

    # Only emit the clauses that actually have keywords, so the query never
    # contains a dangling "AND" or an empty "()" group.
    query_parts = []
    if fixed_keywords:
        query_parts.append(" AND ".join(fixed_keywords))
    if optional_keywords:
        query_parts.append(f'({" OR ".join(optional_keywords)})')
    main_query = " AND ".join(query_parts)

    # Fall back to an absolute path in the working directory so the output
    # location always has a non-empty directory component.
    output_path = os.getenv("OUTPUT_CSV_FILE_PATH") or os.path.abspath("results.csv")

    scrape_google_scholar(query=main_query, filter_keywords=filter_keywords, num_pages=10, output_csv=output_path)
