In [None]:
import requests
from bs4 import BeautifulSoup
import time
import random
import pandas as pd

# Pool of browser User-Agent strings; one is picked at random for each request.
user_agents = [
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/14.0.3 Safari/605.1.15"
    ),
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) "
        "Gecko/20100101 Firefox/89.0"
    ),
]

# Crawl Google Scholar result pages for a given search URL.
def crawl_google_scholar(query, max_results=1000):
    """Collect title, link and abstract of Google Scholar search results.

    Pages are fetched one after another by appending ``&start=<offset>`` to
    ``query``, with the offset growing by 10 per page. Each request uses a
    randomly chosen entry of the module-level ``user_agents`` list, and a
    random 10-30 second pause is inserted between consecutive requests.

    Crawling stops as soon as any of the following happens:
    ``max_results`` entries were collected, 100 pages were requested, a
    request fails or returns a non-200 status, or a page contains no
    result entries. Whatever was collected up to that point is returned.

    Args:
        query: Full search URL that already contains a query string; the
            ``start`` parameter is appended to it for every page.
        max_results: Upper bound on the number of entries to return. A value
            of zero or less returns an empty list without any request.

    Returns:
        A list of dicts with the keys ``"title"``, ``"link"`` and
        ``"abstract"``, holding at most ``max_results`` entries.
    """
    results = []

    # Nothing was asked for: return before touching the network.
    if max_results <= 0:
        return results

    max_pages = 100  # hard cap on the number of page requests
    results_per_page = 10  # step of the "start" offset between pages
    request_timeout = 30  # seconds; without it a stalled connection hangs forever

    def get_url_for_page(url, page_index):
        return f"{url}&start={page_index}"

    for page_number in range(max_pages):
        # Pause only *between* requests, so there is no pointless wait
        # before the first page or after the last one.
        if page_number:
            time.sleep(random.randint(10, 30))

        page_url = get_url_for_page(query, page_number * results_per_page)
        headers = {"User-Agent": random.choice(user_agents)}

        try:
            response = requests.get(
                page_url, headers=headers, timeout=request_timeout
            )
        except requests.RequestException as exc:
            # Keep the results gathered so far instead of losing them to a
            # network error late in a long crawl.
            print(f"Gagal mengambil data: {exc}")
            break

        if response.status_code != 200:
            print(f"Gagal mengambil data. Kode status: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        items = soup.find_all("div", class_="gs_ri")

        # A page without result entries means there is nothing further to
        # fetch; requesting the remaining pages would only waste time.
        if not items:
            break

        for item in items:
            title_tag = item.find("h3", class_="gs_rt")
            link_tag = item.find("a")
            abstract_tag = item.find("div", class_="gs_rs")

            # Any of the three elements may be absent from an entry, so fall
            # back to a placeholder instead of raising on None.
            results.append({
                "title": (
                    title_tag.text
                    if title_tag is not None
                    else "Judul tidak tersedia"
                ),
                "link": (
                    link_tag.get("href", "Link tidak tersedia")
                    if link_tag is not None
                    else "Link tidak tersedia"
                ),
                "abstract": (
                    abstract_tag.text.strip()
                    if abstract_tag is not None
                    else "Abstrak tidak tersedia"
                ),
            })

            # Returning here ends the page loop as well, so the limit is
            # actually enforced and no further pages are requested.
            if len(results) >= max_results:
                return results

    return results

# Cap on collected results and the search URL handed to the crawler
max_results = 1000
query = "https://scholar.google.com/scholar?q=universitas+brawijaya&hl=id&as_sdt=0,5"

# Run the crawl
data = crawl_google_scholar(query, max_results=max_results)

# Persist the collected results to CSV (row index omitted)
df = pd.DataFrame.from_dict(data)
df.to_csv("data.csv", index=False)

# Echo every result to stdout, numbered from 1 and followed by a separator line
for i, result in enumerate(data, start=1):
    print(
        f"Result {i}:",
        f"Judul: {result['title']}",
        f"Link: {result['link']}",
        f"Abstrak: {result['abstract']}",
        "-" * 80,
        sep="\n",
    )