구글 스칼라 인용수 정렬

- 내가 직접 만든다

In [34]:
import requests
import random
import time
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from tqdm import tqdm

In [11]:
headers = {
    "authority": "scholar.google.com",
    "method": "GET",
    "path": "/scholar?hl=en&as_sdt=0%2C5&q=%22black%22%2C+%22litterman%22&btnG=",
    "scheme": "https",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-encoding": "gzip, deflate, br, zstd",
    "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "cache-control": "no-cache",
    "cookie": "",
    "pragma": "no-cache",
    "priority": "u=0, i",
    "referer": "https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=black+litterman&btnG=",
    "sec-ch-ua": '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"',
    "sec-ch-ua-arch": "x86",
    "sec-ch-ua-bitness": "64",
    "sec-ch-ua-full-version-list": '"Not)A;Brand";v="8.0.0.0", "Chromium";v="138.0.7204.51", "Google Chrome";v="138.0.7204.51"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-model": "",
    "sec-ch-ua-platform": "Windows",
    "sec-ch-ua-platform-version": "19.0.0",
    "sec-ch-ua-wow64": "?0",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
    "x-browser-channel": "stable",
    "x-browser-copyright": "Copyright 2025 Google LLC. All rights reserved.",
    "x-browser-validation": "6h3XF8YcD8syi2FF2BbuE2KllQo=",
    "x-browser-year": "2025",
    "x-client-data": "CIu2yQEIpLbJAQipncoBCLHuygEIk6HLAQijo8sBCIWgzQEI/qXOAQji8M4BCKTyzgEIk/bOAQir+c4BGND6zgE=",
}

In [35]:
def search_scholar_papers(query, num_pages=1, headers=None):
    """
    Google Scholar에서 논문 정보를 여러 페이지에 걸쳐 수집하는 함수입니다.
    각 페이지는 10개의 논문을 포함합니다.

    Args:
        query (str): 검색할 쿼리 (예: "mean+variance+optimization")
        num_pages (int): 가져올 페이지 수 (기본값: 1)
        headers (dict): requests에 사용할 헤더 (기본값: None)

    Returns:
        list: 논문 정보가 담긴 딕셔너리 리스트
    """
    papers = []
    for page in tqdm(range(num_pages)):
        time.sleep(random.uniform(1.5, 5.5))
        start = page * 10
        url = f"https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q={query}&start={start}&btnG="
        req = requests.get(url, headers=headers)
        soup = BeautifulSoup(req.text, "html.parser")

        for item in soup.find_all("div", class_="gs_ri"):
            # 논문 제목 추출
            title_tag = item.find("h3", class_="gs_rt")
            if title_tag and title_tag.a:
                title = title_tag.a.get_text()
                paper_url = title_tag.a.get("href", "")
            else:
                title = title_tag.get_text() if title_tag else ""
                paper_url = ""
            
            # 저자, 저널, 연도, 웹사이트 분리 추출 (\xa0 기준 분리)
            journal_info = item.find("div", class_="gs_a")
            author = ""
            journal = ""
            year = ""
            website = ""
            if journal_info:
                journal_raw = journal_info.get_text()
                # \xa0로 분리
                parts = journal_raw.split('\xa0- ')
                if len(parts) >= 2:
                    author = parts[0].strip()
                    # 저널, 연도, 웹사이트는 두 번째 파트에 있음
                    rest = parts[1]
                    # 쉼표로 분리해서 연도 찾기
                    rest_parts = rest.split(',')
                    if len(rest_parts) >= 2:
                        # 마지막 쉼표 뒤에 연도와 웹사이트가 있을 수 있음
                        journal = ','.join(rest_parts[:-1]).strip()
                        last_part = rest_parts[-1].strip()
                        # 마지막 파트에서 연도와 웹사이트 분리
                        if '-' in last_part:
                            year_website = last_part.split('-')
                            year = year_website[0].strip()
                            website = year_website[1].strip() if len(year_website) > 1 else ""
                        else:
                            year = last_part
                    else:
                        # 쉼표가 없으면 그냥 전체를 journal로
                        journal = rest.strip()
                else:
                    # \xa0- 가 없으면 전체를 journal로
                    journal = journal_raw.strip()
            
            # 인용 횟수 추출
            cited_tag = item.find("a", string=lambda s: s and "Cited by" in s)
            if cited_tag:
                try:
                    cited_count = int(cited_tag.get_text().split("Cited by")[-1].strip())
                except:
                    cited_count = 0
            else:
                cited_count = 0

            papers.append({
                "title": title,
                "author": author,
                "journal": journal,
                "year": year,
                "website": website,
                "url": paper_url,
                "cited_count": cited_count
            })
    return papers

# 사용 예시:
# papers = search_scholar_papers("mean+variance+optimization", num_pages=3, headers=headers)
# papers 리스트에 논문 제목, 저자, 저널/학회명, 연도, 웹사이트, 논문 URL, 인용 횟수가 담깁니다.

In [36]:
papers = search_scholar_papers("mean+variance+optimization", num_pages=10, headers=headers)

100%|██████████| 10/10 [00:46<00:00,  4.62s/it]


In [37]:
df = pd.DataFrame(papers)

In [None]:
df.sort_values(by="cited_count", ascending=False).drop("website", axis=1).head(40)

Unnamed: 0,title,author,journal,year,url,cited_count
55,Continuous-time mean-variance portfolio select...,"XY Zhou, D Li",Applied Mathematics and Optimization,2000.0,https://link.springer.com/article/10.1007/s002...,1345
89,A mean/variance analysis of tracking error,,R Roll - 1992 - anderson.ucla.edu,,https://www.anderson.ucla.edu/documents/areas/...,1162
60,Dynamic mean-variance asset allocation,"S Basak, G Chabakauri",The Review of Financial Studies,2010.0,https://academic.oup.com/rfs/article-abstract/...,700
7,Mean–variance portfolio optimization with stat...,"T Björk, A Murgoci, XY Zhou",Mathematical Finance: An …,2014.0,https://onlinelibrary.wiley.com/doi/abs/10.111...,591
17,In search of the exchange risk premium: A six-...,JA Frankel,Journal of international Money and Finance,1982.0,https://www.sciencedirect.com/science/article/...,498
35,Mean–variance approximations to expected utility,H Markowitz,European Journal of Operational Research,2014.0,https://www.sciencedirect.com/science/article/...,384
87,Mean–variance portfolio optimization using mac...,"W Chen, H Zhang, MK Mehlawat, L Jia",Applied soft computing,2021.0,https://www.sciencedirect.com/science/article/...,335
49,A mean-variance-skewness portfolio optimizatio...,"H Konno, K Suzuki",Journal of the Operations Research Society of …,1995.0,https://www.jstage.jst.go.jp/article/jorsj/38/...,327
6,The mean-variance approach to portfolio optimi...,A Yoshimoto,Journal of the Operations Research Society of ...,1996.0,https://www.jstage.jst.go.jp/article/jorsj/39/...,318
51,The mean–variance cardinality constrained port...,"KP Anagnostopoulos, G Mamanis",Expert Systems with Applications,2011.0,https://www.sciencedirect.com/science/article/...,272
