In [3]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
from concurrent.futures import ThreadPoolExecutor, as_completed
from enum import Enum
import requests
import time
import pandas as pd

class CitationStyle(Enum):
    """Citation formats that can be requested when exporting an article's citation.

    Each member's value is the string that get_info() hands to
    Select.select_by_value() on the "selectCitationStyle" dropdown, and
    scrape() uses CHICAGO as its default style.
    """
    # NOTE(review): the values presumably mirror the <option value="..."> attributes
    # of the site's citation-style dropdown -- confirm against the live page before
    # adding or renaming members.
    AMA = "american-medical-association"
    APSA = "american-political-science-association"
    APA = "apa"
    CHICAGO = "chicagob"
    HARVARD = "harvard"
    IEEE = "ieee"
    MHRA = "modern-humanities-research-association"
    MLA = "mla7"
    VANCOUVER = "vancouver"

# Click helper used for the "Show author details" link and the cite/export button.
# The original author found that plain clicks were unreliable without the
# wait + short pause + retry sequence below.
def click_element(driver, by, value):
    """Wait until an element is clickable and click it, retrying if it goes stale.

    Args:
        driver: Selenium WebDriver instance that owns the page.
        by: Locator strategy (e.g. ``By.LINK_TEXT``).
        value: Locator value matched with ``by``.

    Returns:
        None once the click has succeeded.

    Raises:
        StaleElementReferenceException: if the element is still stale on the
            last of the three attempts. Previously this case was swallowed and
            the caller continued as though the click had happened.
        Exception: anything raised by ``WebDriverWait.until`` (for example on
            timeout) propagates unchanged and is not retried.
    """
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            element = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((by, value))
            )
            # Short settle time between "clickable" and the actual click.
            time.sleep(0.5)
            element.click()
            return
        except StaleElementReferenceException:
            if attempt == max_attempts:
                # Out of retries: surface the failure instead of returning silently.
                raise
            time.sleep(0.5)  # Backoff before looking the element up again


# Get the abstract, the formatted citation and the author affiliations from an article link
def get_info(link, citation_style):
    """Scrape one article page with a headless Chrome session.

    Args:
        link: URL of the article page to open.
        citation_style: CitationStyle member; its ``.value`` is selected in the
            "selectCitationStyle" dropdown of the export-citation dialog.

    Returns:
        Tuple ``(abstract, citation, authors)`` where ``abstract`` is the text
        of the ``div.abstract`` element, ``citation`` is the text of the
        "citationText" element after the style was selected, and ``authors`` is
        a list holding, for every ``div.row.author`` element, the text that
        follows ' Affiliation: '.

    Raises:
        Exception: any scraping or Selenium error is re-raised after a short
            diagnostic is printed. The browser is shut down in every case.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Enable headless mode
    chrome_options.add_argument("--no-sandbox")  # For environments where sandboxing is not available
    chrome_options.add_argument("--log-level=3")
    driver = webdriver.Chrome(options=chrome_options)

    # try/finally guarantees the browser is released even when scraping fails;
    # callers retry on failure, so a leak here would pile up Chrome processes.
    try:
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        try:
            abstract = soup.find('div', class_='abstract').text
        except Exception:
            print("error at abstract")
            raise

        click_element(driver, By.LINK_TEXT, "Show author details")
        # NOTE(review): `soup` was parsed before this click, so the author rows
        # are read from the initial page source -- confirm that is intended.
        authors = []
        try:
            for row in soup.find_all('div', class_='row author'):
                authors.append(row.text.strip().split(' Affiliation: ')[1])
        except Exception:
            print("error at authors")
            raise

        click_element(driver, By.CLASS_NAME, "export-citation-product")

        select_element = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.ID, "selectCitationStyle"))
        )

        # Must be looked up only after selectCitationStyle has become visible.
        citation_element = driver.find_element(By.ID, "citationText")
        initial_text = citation_element.text

        Select(select_element).select_by_value(citation_style.value)

        try:
            # The citation text is rewritten after the style changes; wait for that.
            WebDriverWait(driver, 10).until(
                lambda d: citation_element.text != initial_text
            )
            citation = citation_element.text
        except Exception:
            print("error at waiting for citation to change")
            raise

        return abstract, citation, authors
    finally:
        # quit() ends the whole WebDriver session, not just the current window.
        driver.quit()

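# NOTE: get_links(), which collects the article links from one listing page, is defined in a later cell; run that cell before calling scrape().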


def parallel_get_info(link, citation_style, max_retries=5):
    """Fetch one article's details as a row dict, retrying a bounded number of times.

    Args:
        link: URL of the article page, forwarded to get_info().
        citation_style: CitationStyle member, forwarded to get_info().
        max_retries: Maximum number of get_info() attempts. Pass None to keep
            retrying indefinitely (the previous behaviour).

    Returns:
        A dict with the keys 'Chicago Citation', 'Abstract',
        'First Author Institution' and 'Other Author Institutions', or None
        when every attempt failed. The citation key keeps its historical name
        regardless of ``citation_style``.
    """
    attempt = 0
    while max_retries is None or attempt < max_retries:
        attempt += 1
        try:
            abstract, citation, authors = get_info(link, citation_style)
        except Exception as e:
            # Report what went wrong so permanently failing links can be diagnosed.
            print(f"Error occurred for {link} (attempt {attempt}): {e!r}")
            continue

        # An article without affiliation rows yields empty institution fields
        # instead of an IndexError.
        first_institution = authors[0] if authors else ''
        return {
            'Chicago Citation': citation,
            'Abstract': abstract,
            'First Author Institution': first_institution,
            'Other Author Institutions': ' / '.join(authors[1:]),
        }

    print(f"Giving up on {link} after {attempt} attempts")
    return None

def get_pages(url):
    """Return the URLs of every page of an article listing.

    The page count is read from the trailing number of the first <p> inside
    the ``div.pagination-centered`` element. Further page URLs are produced by
    replacing "pageNum=1" in ``url``, so ``url`` is expected to contain it.

    Args:
        url: URL of the first listing page.

    Returns:
        List of page URLs starting with ``url`` itself. When the page has no
        pagination element, only ``[url]`` is returned.

    Raises:
        ValueError: if the pagination text does not end in a number.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, "html.parser")

    pagination = soup.find('div', class_="pagination-centered")
    summary = pagination.find('p') if pagination is not None else None
    if summary is None:
        # No pagination block: treat the listing as a single page.
        return [url]

    text = summary.text.strip()
    # Take the whole trailing run of digits, not just the last character, so
    # listings with 10 or more pages are counted correctly.
    trailing_digits = text[len(text.rstrip('0123456789')):]
    if not trailing_digits:
        raise ValueError(f"Could not read page count from pagination text: {text!r}")
    page_count = int(trailing_digits)

    return [url] + [
        url.replace("pageNum=1", "pageNum=" + str(i))
        for i in range(2, page_count + 1)
    ]
    

def scrape(link, max_workers=10, output_file='output', csv=False, excel=True, citation_style=CitationStyle.CHICAGO):
    """Scrape every article of a listing and write the results to disk.

    Args:
        link (str): URL of the first listing page (passed to get_pages()).
        max_workers (int): Thread count used for both the listing pages and
            the article pages.
        output_file (str): Output path without extension.
        csv (bool): When true, write ``{output_file}.csv``.
        excel (bool): When true, write ``{output_file}.xlsx``.
        citation_style (CitationStyle): Style forwarded to parallel_get_info().

    Returns:
        None. The collected rows are written to the requested file(s).
    """
    page_links = get_pages(link)

    # executor.map yields results in submission order, so articles keep the
    # order in which they appear on the listing pages.
    all_links = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for page_result in executor.map(get_links, page_links):
            if page_result:  # Skip pages that produced no links
                all_links.extend(page_result)

    for article_link in all_links:
        print(article_link)

    # Collect plain dicts and build the DataFrame once; concatenating inside
    # the loop copies the whole frame for every article.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        records = [
            record
            for record in executor.map(
                lambda article: parallel_get_info(article, citation_style),
                all_links,
            )
            if record  # Drop articles for which nothing was returned
        ]

    df = pd.DataFrame(records)

    # Save the final DataFrame in the requested format(s)
    if excel:
        df.to_excel(f'{output_file}.xlsx', index=False)
    if csv:
        df.to_csv(f'{output_file}.csv', index=False)


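# USAGE: set `link` to the URL of the issue's article listing (it must contain "pageNum=1"), then call scrape(link).
# The optional parameters are listed in scrape()'s signature (max_workers, output_file, csv, excel, citation_style); pass them as keywords, e.g. scrape(link, csv=True), to change the options.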


In [10]:
def get_links(url):
    """Collect absolute article URLs from one listing page.

    Starting at the first ``h4.journal-article-listing-type`` heading, the
    heading's following siblings are walked in order. ``h4`` elements act as
    section headings: everything under a heading whose text is exactly
    "Book Reviews" is skipped. ``h3`` elements and entries whose text mentions
    "front matter", "back matter" or "book review" are skipped as well, as are
    entries without a link.

    Args:
        url: URL of the listing page.

    Returns:
        List of article URLs prefixed with the Cambridge domain; empty when
        the page has no listing heading.
    """
    domain = "https://www.cambridge.org"
    # A timeout keeps a stalled connection from blocking a worker thread forever.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, "html.parser")

    first_heading = soup.find('h4', class_="journal-article-listing-type")
    if first_heading is None:
        return []

    skip_markers = ("front matter", "back matter", "book review")
    links = []
    in_reviews = False
    for sibling in [first_heading] + first_heading.find_next_siblings():
        if sibling.name == "h4":
            # A new section starts; remember whether it is the reviews section.
            in_reviews = sibling.text == "Book Reviews"
            continue
        # NOTE(review): the purpose of the h3 elements is unknown (they look
        # like a name of something); they are skipped as before.
        if in_reviews or sibling.name == "h3":
            continue
        text = sibling.text.lower()
        if any(marker in text for marker in skip_markers):
            continue

        anchor = sibling.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # Entry without a usable link: nothing to scrape.
            continue
        print(anchor.text)
        links.append(domain + href)
    return links

In [12]:
# Listing URL of the issue to scrape. It must contain "pageNum=1", because the
# other page URLs are derived from it by replacing that fragment.
link = "https://www.cambridge.org/core/journals/american-political-science-review/issue/AA64DAD7A735A935540A462898BF0876?sort=canonical.position%3Aasc&pageNum=1&searchWithinIds=AA64DAD7A735A935540A462898BF0876&productType=JOURNAL_ARTICLE&template=cambridge-core%2Fjournal%2Farticle-listings%2Flistings-wrapper&hideArticleJournalMetaData=true&displayNasaAds=false"

# Smoke test on page 2 of the same listing: get_links() returns a list, so the
# identity check against None prints False.
print(get_links(link.replace("pageNum=1", "pageNum=2")) is None)

False
