In [2]:
pip install beautifulsoup4


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install requests

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import requests
from bs4 import BeautifulSoup
import time
import csv
import os

def _clean_text(tag, default=""):
    """Return the tag's text with whitespace collapsed, or `default` when `tag` is None."""
    if tag is None:
        return default
    # get_text(strip=True) joins the stripped fragments with no separator, so
    # markup such as "<b>Digital</b> <b>Twin</b>" would come out as
    # "DigitalTwin". Keep the document's own whitespace and collapse runs of it.
    return " ".join(tag.get_text().split())


def scrape_google_scholar(query, keywords=None, num_pages=1, output_csv="results.csv",
                          delay=1.0, timeout=10):
    """Scrape Google Scholar result pages for `query` and save them to a CSV file.

    Args:
        query: Search string sent as the `q` parameter.
        keywords: Optional iterable of terms (a single string is also accepted).
            When given, a result is kept only if at least one term occurs,
            case-insensitively, in its title or snippet.
        num_pages: Number of result pages to request (10 results per page).
        output_csv: Path of the CSV file to write. Missing parent directories
            are created; a bare file name is written to the current directory.
        delay: Seconds to wait between consecutive page requests.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        The list of result dicts (keys "Title", "Authors", "Link", "Snippet")
        that was written to `output_csv`.

    A failed request or a non-200 response stops the crawl early with a
    warning; whatever was collected up to that point is still saved.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    base_url = "https://scholar.google.com/scholar"
    all_results = []

    # A bare string would otherwise be iterated character by character.
    if isinstance(keywords, str):
        keywords = [keywords]
    lowered_keywords = [keyword.lower() for keyword in keywords] if keywords else []

    for page in range(num_pages):
        params = {
            "q": query,
            "start": page * 10  # 10 results per page
        }

        try:
            response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            print(f"⚠️ Request for page {page + 1} failed ({exc}); stopping early.")
            break

        # Anything other than 200 is not a result page, so parsing it would
        # silently yield zero results.
        if response.status_code != 200:
            print(f"⚠️ Page {page + 1} returned HTTP {response.status_code}; stopping early.")
            break

        soup = BeautifulSoup(response.text, "html.parser")

        for result in soup.select(".gs_ri"):
            title_tag = result.select_one(".gs_rt")
            title = _clean_text(title_tag, "No title")

            link_tag = title_tag.find("a") if title_tag is not None else None
            link = link_tag["href"] if link_tag is not None and link_tag.has_attr("href") else "No link"

            snippet = _clean_text(result.select_one(".gs_rs"), "No snippet")

            # The authors are whatever precedes the first " - " in the meta line.
            meta_text = _clean_text(result.select_one(".gs_a"))
            authors = meta_text.split(" - ")[0] if " - " in meta_text else "No author info"

            # Filter by keyword match in title or snippet (optional)
            if lowered_keywords:
                # The newline separator prevents a keyword from matching across
                # the title/snippet boundary.
                text_combined = f"{title}\n{snippet}".lower()
                if not any(keyword in text_combined for keyword in lowered_keywords):
                    continue  # Skip if no keywords matched

            all_results.append({
                "Title": title,
                "Authors": authors,
                "Link": link,
                "Snippet": snippet
            })

        # Pause between requests to avoid being blocked; no need after the last page.
        if page < num_pages - 1:
            time.sleep(delay)

    # Ensure output folder exists. dirname is "" for a bare file name, and
    # os.makedirs("") raises FileNotFoundError, so only create real directories.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save to CSV
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["Title", "Authors", "Link", "Snippet"])
        writer.writeheader()
        writer.writerows(all_results)

    abs_path = os.path.abspath(output_csv)
    print(f"\n✅ Scraped {len(all_results)} results and saved to:\n📂 {abs_path}")

    return all_results


if __name__ == "__main__":
    # Destination CSV inside D:\paper\paper1 (the scraper creates the folder if it is missing).
    output_path = r"D:\paper\paper1\digital_twin.csv"

    # Terms a result must contain in its title or snippet to be kept,
    # and the search phrase sent to Google Scholar.
    keyword_list = ["Digital Twin", "Industry"]
    main_query = "Digital Twin in Industry"

    # Crawl the first ten result pages.
    scrape_google_scholar(
        query=main_query,
        keywords=keyword_list,
        num_pages=10,
        output_csv=output_path,
    )



✅ Scraped 90 results and saved to:
📂 D:\paper\paper1\digital_twin.csv


In [2]:
pwd

'C:\\Users\\ASUS'