In [None]:
import os
import csv
import time
import random
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from urllib.parse import urlparse

def _excel_hyperlink(url, label):
    """Return a spreadsheet ``=HYPERLINK(url, label)`` formula string."""
    # Inside a formula string literal a double quote is written as two quotes;
    # without this, a quote in the URL or label would break the formula.
    safe_url = url.replace('"', '""')
    safe_label = label.replace('"', '""')
    return f'=HYPERLINK("{safe_url}", "{safe_label}")'


def _parse_result(result):
    """Pull title, link, snippet, authors and citation count out of one ``.gs_ri`` element."""
    title_tag = result.select_one(".gs_rt")
    title = title_tag.get_text(strip=True) if title_tag else "No title"

    link_tag = title_tag.find("a") if title_tag else None
    link = link_tag["href"] if link_tag and link_tag.has_attr("href") else "No link"

    snippet_tag = result.select_one(".gs_rs")
    snippet = snippet_tag.get_text(strip=True) if snippet_tag else "No snippet"

    # The ".gs_a" line is treated as "<authors> - <rest>"; keep the first part.
    meta_tag = result.select_one(".gs_a")
    meta_text = meta_tag.get_text(strip=True) if meta_tag else ""
    authors = meta_text.split(" - ")[0] if " - " in meta_text else "No author info"

    # The first footer link containing "Cited by" carries the citation count.
    citation_count = "0"
    for anchor in result.select(".gs_fl a"):
        _, marker, tail = anchor.get_text().partition("Cited by")
        if marker:
            citation_count = tail.strip()
            break

    return {
        "title": title,
        "link": link,
        "snippet": snippet,
        "authors": authors,
        "citations": citation_count,
    }


def scrape_google_scholar(query, filter_keywords=None, num_pages=0, output_csv="results.csv"):
    """Scrape Google Scholar result pages for *query* and save them to a CSV file.

    Each page is requested with a randomly chosen User-Agent, with a random
    1-3 second pause between consecutive requests. Pages that cannot be
    fetched, or that answer with a non-200 status (for example a 429
    rate-limit redirect), are reported and skipped so that results from the
    other pages are still kept.

    Args:
        query: Search string sent as the ``q`` parameter.
        filter_keywords: Optional iterable of keywords. When non-empty, a
            result is kept only if at least one keyword occurs
            (case-insensitively) in its title or snippet.
        num_pages: Number of result pages to request (10 results per page).
        output_csv: Path of the CSV file to write. Its parent directory is
            created if the path has one and it does not exist yet.

    Returns:
        None. When at least one result was collected, the rows are written to
        *output_csv* with the columns Title, Authors, Citations, Link, Snippet
        and Query; otherwise only a failure message is printed.
    """
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:112.0) Gecko/20100101 Firefox/112.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
        "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.88 Mobile Safari/537.36",
    ]
    fieldnames = ["Title", "Authors", "Citations", "Link", "Snippet", "Query"]

    base_url = "https://scholar.google.com/scholar"
    # Lower-case the keywords once instead of once per result.
    lowered_keywords = [keyword.lower() for keyword in (filter_keywords or [])]
    all_results = []

    for page in range(num_pages):
        if page > 0:
            time.sleep(random.uniform(1, 3))  # Avoid being blocked

        params = {
            "q": query,
            "start": page * 10,  # 10 results per page
        }
        headers = {"User-Agent": random.choice(USER_AGENTS)}

        try:
            # Without a timeout a stalled connection would hang the whole run.
            response = requests.get(base_url, params=params, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print(f"[{page}] Request failed: {exc}")
            continue

        print(f"[{page}] Status: {response.status_code}, Scraping URL: {response.url}")

        if response.status_code != 200:
            # A blocked request is redirected to a CAPTCHA page that contains
            # no results, so there is nothing to parse.
            print(f"[{page}] Skipping page: unexpected HTTP status {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        for result in soup.select(".gs_ri"):
            fields = _parse_result(result)

            # Filter by keyword match in title or snippet (optional).
            if lowered_keywords:
                # Join with a space so a keyword cannot match across the
                # title/snippet boundary.
                haystack = f'{fields["title"]} {fields["snippet"]}'.lower()
                if not any(keyword in haystack for keyword in lowered_keywords):
                    continue

            link = fields["link"]
            if link == "No link":
                # A hyperlink formula to a non-URL would render as a blank cell.
                link_cell = link
            else:
                domain = urlparse(link).netloc
                link_cell = _excel_hyperlink(link, domain or link)

            all_results.append({
                "Title": fields["title"],
                "Authors": fields["authors"],
                "Citations": fields["citations"],
                "Link": link_cell,
                "Snippet": fields["snippet"],
                "Query": _excel_hyperlink(response.url, response.url),
            })

    # Ensure output folder exists. A bare filename has an empty dirname, and
    # os.makedirs("") raises FileNotFoundError, so only create real directories.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if all_results:
        # Save to CSV
        with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames, quoting=csv.QUOTE_MINIMAL)
            writer.writeheader()
            writer.writerows(all_results)

        abs_path = os.path.abspath(output_csv)
        print(f"\n✅ Scraped {len(all_results)} results and saved to:\n📂 {abs_path}")
    else:
        print('Failed to scrape papers')


if __name__ == "__main__":
    # Script entry point: build the search query from environment variables
    # (optionally loaded from a .env file) and run the scraper for 10 pages.

    def _env_list(name):
        """Return the comma-separated environment variable *name* as a list of non-empty, stripped items."""
        raw_value = os.getenv(name) or ""
        return [item.strip() for item in raw_value.split(',') if item.strip()]

    load_dotenv()

    fixed_terms = _env_list("FIXED_KEYWORDS")
    optional_terms = _env_list("OPTIONAL_KEYWORDS")
    filter_keywords = _env_list("FILTER_KEYWORDS")
    output_path = os.getenv("OUTPUT_CSV_FILE_PATH")

    # Fail fast with a clear message instead of crashing on None later on
    # (for the output path, that would only happen after every page was fetched).
    if not fixed_terms:
        raise SystemExit("FIXED_KEYWORDS must be set to a comma-separated list of keywords")
    if not output_path:
        raise SystemExit("OUTPUT_CSV_FILE_PATH must be set to the CSV file to write")

    fixed_keywords = " AND ".join(fixed_terms)
    optional_keywords = " OR ".join(optional_terms)

    # Only append the optional group when there is something to put in it.
    if optional_keywords:
        main_query = f'{fixed_keywords} AND ({optional_keywords})'
    else:
        main_query = fixed_keywords

    scrape_google_scholar(query=main_query, filter_keywords=filter_keywords, num_pages=10, output_csv=output_path)


[0] Status: 429, Scraping URL: https://www.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fq%3DDigital%2BTwin%2BAND%2BEducation%2BAND%2B%2528University%2BOR%2BLab%2529%26start%3D0&q=EgRo6CVwGM7ukcMGIixO6mwTyCz_JgMkHvrzVHPLMKmCJWeoY-jpLQZWdGo8oujgnUFYpSpWWBw5gzICclJaAUM
[1] Status: 200, Scraping URL: https://scholar.google.com/scholar?q=Digital+Twin+AND+Education+AND+%28University+OR+Lab%29&start=10
[2] Status: 200, Scraping URL: https://scholar.google.com/scholar?q=Digital+Twin+AND+Education+AND+%28University+OR+Lab%29&start=20
[3] Status: 200, Scraping URL: https://scholar.google.com/scholar?q=Digital+Twin+AND+Education+AND+%28University+OR+Lab%29&start=30
[4] Status: 200, Scraping URL: https://scholar.google.com/scholar?q=Digital+Twin+AND+Education+AND+%28University+OR+Lab%29&start=40
[5] Status: 200, Scraping URL: https://scholar.google.com/scholar?q=Digital+Twin+AND+Education+AND+%28University+OR+Lab%29&start=50
[6] Status: 200, Scraping URL: https://scholar.go