In [63]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

def get_first_paper_link(driver, title):
    """Search the Cochrane Library for *title* and return the top hit's URL.

    Args:
        driver: Selenium WebDriver used to load the search page and type
            the query.
        title: Text typed into the search box.

    Returns:
        The ``href`` attribute of the first element matching the
        result-title link selector.
    """
    search_box_locator = (By.ID, "searchText")
    first_hit_locator = (By.CSS_SELECTOR, ".search-results-item .result-title a")

    # Always start from a fresh search page.
    driver.get("https://www.cochranelibrary.com/search")
    wait = WebDriverWait(driver, 10)

    # Type the title into the search box, then submit it with a newline.
    query_input = wait.until(EC.presence_of_element_located(search_box_locator))
    query_input.clear()
    query_input.send_keys(title)
    query_input.send_keys("\n")

    # Wait for the first result link to appear and hand back its target.
    top_hit = wait.until(EC.presence_of_element_located(first_hit_locator))
    return top_hit.get_attribute('href')

def extract_pico(url):
    """Scrape the PICO terms from a review page.

    A dedicated headless Chrome session is started for this call and is
    always shut down before returning, including when loading the page or
    waiting for the PICO section fails.

    Args:
        url: Address of the page to open.

    Returns:
        dict mapping each PICO column's type (the first CSS class on the
        column ``div`` other than ``pico-column``) to the list of stripped
        link texts found in that column. The dict is empty when the page has
        no ``<section class="pico-section" id="pico">``.

    Raises:
        selenium.common.exceptions.TimeoutException: if no element with id
            ``pico`` is present within 10 seconds.
    """
    # Selenium setup: run Chrome without a GUI.
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    service = Service('/Users/myo/development/chromedriver')  # path to the chromedriver binary
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.get(url)

        # Wait until the PICO section has been loaded.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "pico"))
        )

        # Scroll to the bottom, then pause so content loaded on scroll can arrive.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        page_source = driver.page_source
    finally:
        # Quit even on failure; otherwise every failed paper leaves an
        # orphaned browser process behind.
        driver.quit()

    pico = {}
    soup = BeautifulSoup(page_source, 'html.parser')
    pico_section = soup.find('section', class_='pico-section', id='pico')
    if pico_section is None:
        return pico

    for column in pico_section.find_all('div', class_='pico-column'):
        # The column type (Population, Intervention, Comparison, Outcome) is
        # the class that accompanies 'pico-column'. Columns without one are
        # skipped instead of raising IndexError.
        type_classes = [
            name for name in column.get('class', []) if name != 'pico-column'
        ]
        if not type_classes:
            continue
        pico[type_classes[0]] = [a.text.strip() for a in column.find_all('a')]

    return pico

def process_papers(start_index, end_index, titles_df, output_file):
    """Look up PICO data for a slice of papers and append it to a CSV file.

    For each row of ``titles_df`` in ``[start_index, end_index)`` (clamped
    to the length of the frame), the title is searched with
    ``get_first_paper_link`` and the resulting page is scraped with
    ``extract_pico``. If either step raises, the row is still recorded with
    empty PICO fields. The browser used for searching is always closed,
    even when an unexpected error aborts the batch.

    Args:
        start_index: Position of the first row to process (0-based).
        end_index: Position one past the last row to process.
        titles_df: DataFrame with ``Title``, ``Year`` and ``Issue`` columns.
        output_file: CSV path to append to; a header row is written only
            when the file does not exist yet.
    """
    import os  # function-scope import: only needed for the header check

    # Selenium setup for the search browser.
    chrome_options = Options()
    service = Service('/Users/myo/development/chromedriver')  # path to the chromedriver binary
    driver = webdriver.Chrome(service=service, options=chrome_options)

    pico_data = []
    # Never run past the end of the frame.
    stop_index = min(end_index, len(titles_df))

    try:
        for index in range(start_index, stop_index):
            row = titles_df.iloc[index]
            title = row['Title']
            year_issue = f"{row['Year']}-{row['Issue']}"
            print(f"Processing {index+1}/{end_index}: {title}")

            try:
                # Resolve the title to the first search result's link.
                paper_link = get_first_paper_link(driver, title)
                print(f"Found paper link: {paper_link}")

                # Scrape the PICO terms and attach the bookkeeping fields.
                pico_info = extract_pico(paper_link)
                pico_info["No."] = index + 1
                pico_info["Year-Issue"] = year_issue
                pico_info["Title"] = title
                pico_data.append(pico_info)
                print(f"PICO data for {title}: {pico_info}")

            except Exception as e:
                # Best effort: keep the row with blank PICO fields so the
                # numbering in the output stays aligned with the input list.
                print(f"Error processing title: {title}, error: {e}")
                pico_data.append({"No.": index + 1, "Year-Issue": year_issue, "Title": title, "Population": "", "Intervention": "", "Comparison": "", "Outcome": ""})
    finally:
        # Close the browser even if something outside the per-paper handler
        # fails (e.g. a missing column or a keyboard interrupt).
        driver.quit()

    # Convert the collected rows to a DataFrame with a fixed column order.
    df = pd.DataFrame(pico_data, columns=["No.", "Year-Issue", "Title", "Population", "Intervention", "Comparison", "Outcome"])

    # Append to the CSV; emit the header only when creating the file.
    try:
        output_exists = os.path.exists(output_file)
    except (TypeError, ValueError):
        # Non-path targets (e.g. file-like objects) count as "not existing".
        output_exists = False
    df.to_csv(output_file, mode='a', index=False, header=not output_exists)
    print(f"Saved PICO data to {output_file}")

def main():
    """Read the paper list and scrape it in batches of 100 rows.

    Each batch is handed to ``process_papers``, which appends its results to
    the output CSV.
    """
    # Output CSV that every batch appends to.
    output_file = '/Users/myo/Desktop/Kangs/picos_good.csv'

    # Load the paper titles from the input CSV.
    titles_df = pd.read_csv('/Users/myo/Desktop/Kangs/interv_sr_list.csv')

    # Walk the frame 100 rows at a time (0-99, 100-199, ...).
    batch_size = 100
    total_rows = len(titles_df)
    batch_start = 0
    while batch_start < total_rows:
        process_papers(batch_start, batch_start + batch_size, titles_df, output_file)
        batch_start += batch_size

# Start the scraping run only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Processing 1/5: Transarterial (chemo)embolisation versus systemic chemotherapy for colorectal cancer liver metastases
Found paper link: https://www.cochranelibrary.com/cdsr/doi/10.1002/14651858.CD009498.pub4/full?highlightAbstract=colourect%7Cchemo%7Csystemic%7Cembolisation%7Cchem%7Cembolization%7Cliver%7Cfor%7Cmetastas%7Cchemotherapy%7Cversus%7Cembolis%7Cmetastases%7Ctransarterial%7Csystem%7Ctransarteri%7Ccolorectal%7Cembol%7Cfour%7Ccolorect%7Ccancer%7Cchemotherapi%7Ccolourectal
PICO data for Transarterial (chemo)embolisation versus systemic chemotherapy for colorectal cancer liver metastases: {'Population': ['Child, Preschool 2-5 years', 'Aged 80 and over 80+ years', 'Adult 19-44 years', 'Child 6-12 years', 'Middle Aged 45-64 years', 'Infant 1 to 23 mo', 'Young Adult 19-24 years', 'Birth to 1 mo', 'Aged 65-79 years', 'Metastasis to liver', 'Adolescent 13-18 years'], 'Intervention': ['Transarterial Embolization', 'Chemoembolization Of Liver Metastases'], 'Comparison': [], 'Outcome': [