In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import time

# Setup WebDriver
service = Service(EdgeChromiumDriverManager().install())
options = Options()
options.add_argument("--start-maximized")  


# Tambahkan User-Agent agar tidak terdeteksi sebagai bot
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

driver = webdriver.Edge(service=service, options=options)

# Kata kunci pencarian (bisa kosong untuk melihat semua hasil)
query = "2024"
max_pages = 1000  # Jumlah halaman yang ingin diambil
titles_list = []

for page in range(max_pages):
    start = page * 10  # Google Scholar menampilkan 10 artikel per halaman
    search_url = f"https://scholar.google.com/scholar?q={query}&start={start}"
    
    driver.get(search_url)
    time.sleep(5)  # Tambahkan delay lebih lama untuk menghindari deteksi bot

    # Debug: Periksa apakah halaman termuat dengan benar
    page_source = driver.page_source
    if "did not match any articles" in page_source:
        print(f"Hasil pencarian kosong di halaman {page + 1}, hentikan scraping.")
        break

    try:
        # Tunggu elemen artikel muncul
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "h3.gs_rt")))

        # Ambil semua judul artikel
        titles = driver.find_elements(By.CSS_SELECTOR, "h3.gs_rt")
        if not titles:
            print(f"Tidak ada judul di halaman {page + 1}, mungkin Google memblokir akses.")
            break

        for title in titles:
            if title.text.strip():
                titles_list.append(title.text.strip())

        print(f"Halaman {page + 1} selesai, total artikel terkumpul: {len(titles_list)}")

    except Exception as e:
        print(f"Error di halaman {page + 1}: {e}")
        break

# Cetak hasilnya
print("\nJudul Artikel Ilmiah:")
for i, title in enumerate(titles_list, 1):
    print(f"{i}. {title}")

# Tutup browser
driver.quit()


Halaman 1 selesai, total artikel terkumpul: 10
Error di halaman 2: Message: 


Judul Artikel Ilmiah:
1. GenBank 2024 update
2. [PDF] 2024
3. [PDF] The State of the World's Mangroves 2024
4. The voiceprivacy 2024 challenge evaluation plan
5. [PDF] 2024
6. WikiPathways 2024: next generation pathway database
7. [PDF] 2024 IEEE International Conference on Robotics and Automation (ICRA)
8. <? mode longauthoraffil?> The Human Phenotype Ontology in 2024: phenotypes around the world
9. The UCSC genome browser database: 2024 update
10. KDIGO 2024 clinical practice guideline for the evaluation and management of chronic kidney disease


In [2]:
import pandas as pd

# Simpan data ke dalam DataFrame
df = pd.DataFrame({"Judul": titles_list})

# Simpan ke file CSV
df.to_csv("judul_scholar.csv", index=False, encoding="utf-8")

print("Data berhasil disimpan ke judul_scholar.csv")


Data berhasil disimpan ke judul_scholar.csv


In [3]:
df.head()

Unnamed: 0,Judul
0,GenBank 2024 update
1,[PDF] 2024
2,[PDF] The State of the World's Mangroves 2024
3,The voiceprivacy 2024 challenge evaluation plan
4,[PDF] 2024


In [None]:
import requests
from bs4 import BeautifulSoup
import time
import random
import pandas as pd

# Daftar User-Agent
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
]

# Fungsi untuk melakukan crawling
def crawl_google_scholar(query, max_results=1000):
    results = []
    count = 0

    def get_url_for_page(url, page_index):
        return url + f"&start={page_index}"

    NUM_OF_PAGES = 100
    page_index = 0

    for _ in range(NUM_OF_PAGES):
        page_url = get_url_for_page(query, page_index)

        # Header dengan User-Agent acak
        headers = {
            "User-Agent": random.choice(user_agents)
        }

        # Kirim request ke Google Scholar
        response = requests.get(page_url, headers=headers)
        if response.status_code != 200:
            print(f"Gagal mengambil data. Kode status: {response.status_code}")
            break

        # Parsing HTML dengan BeautifulSoup
        soup = BeautifulSoup(response.text, "html.parser")

        # Cari semua elemen yang mengandung hasil pencarian
        for item in soup.find_all("div", class_="gs_ri"):
            # Ekstrak judul
            title = item.find("h3", class_="gs_rt").text
            # Ekstrak link
            link = item.find("a")["href"]
            # Ekstrak abstrak (jika ada)
            abstract = item.find("div", class_="gs_rs").text.strip() if item.find("div", class_="gs_rs") else "Abstrak tidak tersedia"

            # Tambahkan ke hasil
            results.append({
                "title": title,
                "link": link,
                "abstract": abstract
            })
            count += 1

            # Berhenti jika sudah mencapai max_results
            if count >= max_results:
                break

        # Update parameter "start" untuk halaman berikutnya
        page_index += 10

        # Jeda waktu acak untuk menghindari blokir
        time.sleep(random.randint(10, 30))  # Jeda acak antara 10 hingga 30 detik

    return results

# Query pencarian
query = "https://scholar.google.com/scholar?q=universitas+brawijaya&hl=id&as_sdt=0,5"
max_results = 1000

# Jalankan crawling
data = crawl_google_scholar(query, max_results)

# Simpan ke CSV
df = pd.DataFrame.from_dict(data)
df.to_csv('data.csv', index=False)

# Tampilkan hasil
for i, result in enumerate(data, 1):
    print(f"Result {i}:")
    print(f"Judul: {result['title']}")
    print(f"Link: {result['link']}")
    print(f"Abstrak: {result['abstract']}")
    print("-" * 80)