In [1]:
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup



In [2]:
def _adb_meta_content(soup, name, default='N/A'):
    """Return the ``content`` of the first ``<meta name=...>`` tag, or ``default`` if absent."""
    tag = soup.find("meta", {"name": name})
    # .get() also covers a matching tag that has no content attribute.
    return tag.get('content', default) if tag else default


def _adb_parse_publication(soup, page_url):
    """Build one output row (dict keyed by column name) from a parsed publication page."""
    # Start from 'N/A' for every publication so a page without a PDF link can
    # neither raise NameError nor inherit the previous publication's link.
    pdf_link = 'N/A'
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if href.endswith(".pdf"):
            # The last matching anchor wins, as in the original loop.
            # urljoin leaves absolute hrefs intact and resolves relative ones.
            pdf_link = urljoin(page_url, href)

    # NOTE(review): the original looked up <meta name="  ">, which can never
    # match. "citation_author" is assumed from the sibling citation_* tags;
    # confirm against a live publication page.
    author_names = [
        tag['content']
        for tag in soup.find_all("meta", {"name": "citation_author"})
        if tag.get('content')
    ]

    return {
        'Title': _adb_meta_content(soup, "citation_title"),
        'Source': 'Asian Development Bank',
        'Date': _adb_meta_content(soup, "citation_publication_date"),
        'Research Area': 'Climate Change',
        'Keywords': _adb_meta_content(soup, "keywords"),
        'Authors': '; '.join(author_names) if author_names else 'N/A',
        'Abstract': _adb_meta_content(soup, "description"),
        'External Link': page_url,
        'PDF Link': pdf_link,
    }


def scrape_adb_reports(num_pages=54, delay=1.0, timeout=30.0, session=None):
    """Scrape climate-change publications from the ADB search listing.

    Walks ``num_pages`` pages of the ADB search results, follows every
    ``/publications/...`` link found on each page, and extracts metadata from
    the ``<meta>`` tags of the publication page.

    Args:
        num_pages: Number of listing pages to walk, starting at page 0.
        delay: Seconds to sleep after each publication request.
        timeout: Per-request timeout in seconds.
        session: Optional object with a ``requests.Session``-compatible
            ``get``; a new session (``max_redirects = 20``) is created when
            omitted.

    Returns:
        A DataFrame with columns Title, Source, Date, Research Area, Keywords,
        Authors, Abstract, External Link and PDF Link. Fields that cannot be
        found are ``'N/A'``; fully identical rows are dropped.

    Errors (network failures, non-2xx responses) are printed and the affected
    listing page or publication is skipped; they are not raised.
    """
    columns = ['Title', 'Source', 'Date', 'Research Area', 'Keywords', 'Authors', 'Abstract', 'External Link', 'PDF Link']
    base_url = "https://www.adb.org"
    search_url = base_url + "/search0/language/en/subject/climate-change/type/institutional_document/type/publication?page="
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    if session is None:
        session = requests.Session()
        session.max_redirects = 20

    records = []
    for page in range(num_pages):
        try:
            listing_response = session.get(search_url + str(page), headers=headers, timeout=timeout)
            listing_response.raise_for_status()
            listing = BeautifulSoup(listing_response.text, 'html.parser')
        except Exception as e:
            print(f"An error occurred: {e}")
            continue

        # href=True skips anchors without an href (previously an AttributeError
        # that discarded the whole page); dict.fromkeys de-duplicates while
        # keeping the order in which links appear.
        publication_urls = list(dict.fromkeys(
            urljoin(base_url, anchor['href'])
            for anchor in listing.find_all('a', href=True)
            if anchor['href'].startswith("/publications/")
        ))

        for publication_url in publication_urls:
            # Per-publication handling: one bad page no longer aborts the rest
            # of the listing page.
            try:
                publication_response = session.get(publication_url, headers=headers, timeout=timeout)
                publication_response.raise_for_status()
                publication = BeautifulSoup(publication_response.text, 'html.parser')
                print(publication_url)
                records.append(_adb_parse_publication(publication, publication_url))
            except Exception as e:
                print(f"An error occurred: {e}")
            time.sleep(delay)

    # Build the frame once instead of growing it row by row.
    df = pd.DataFrame(records, columns=columns)
    return df.drop_duplicates(keep='first')

In [16]:
def scrape_iea_reports(num_pages=64, delay=1.0, timeout=30.0, session=None):
    """Scrape report metadata from the IEA analysis listing.

    First collects every ``/reports/...`` link from ``num_pages`` pages of
    https://www.iea.org/analysis?type=report, then visits each report page and
    extracts its title, publication date, summary and PDF link.

    Args:
        num_pages: Number of listing pages to walk, starting at page 0.
        delay: Seconds to sleep after each report request.
        timeout: Per-request timeout in seconds.
        session: Optional object with a ``requests.Session``-compatible
            ``get``; a new session is created when omitted.

    Returns:
        A DataFrame with columns title, date, summary, external_link, pdf_link
        and report_type. ``report_type`` is never populated by this function
        and is therefore all-NaN. A field whose page element is missing is
        ``None``.

    Errors (network failures, non-2xx responses) are printed and the affected
    listing page or report is skipped; they are not raised.
    """
    columns = ['title', 'date', 'summary', 'external_link', 'pdf_link', 'report_type']
    base_url = 'https://www.iea.org'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    if session is None:
        session = requests.Session()

    # A dict is used as an ordered set: the same report listed on several
    # pages is fetched once, in first-seen order.
    report_paths = {}
    for page in range(num_pages):
        try:
            url = base_url + '/analysis?type=report&page=' + str(page)
            response = session.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            for anchor in soup.find_all('a', href=True):
                if anchor['href'].startswith('/reports/'):
                    report_paths.setdefault(anchor['href'], None)
        except Exception as e:
            print(f"Error occurred while fetching links from page {page}: {e}")

    records = []
    for report_url in report_paths:
        try:
            url = base_url + report_url
            response = session.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # title
            title_tag = soup.find('title')
            title = title_tag.text if title_tag else None

            # date: the value span is the sibling following the "Published" label
            # (string= is the current spelling of the deprecated text= argument)
            published_label = soup.find('span', class_='m-meta-infos__item-label', string='Published')
            date_value = (
                published_label.find_next_sibling('span', class_='m-meta-infos__item-value')
                if published_label else None
            )
            date = date_value.text if date_value else None

            # summary: all paragraphs of the abstract block, space-joined
            abstract_div = soup.find('div', class_='m-report-abstract__desc f-rte')
            summary = (
                ' '.join(paragraph.text for paragraph in abstract_div.find_all('p'))
                if abstract_div else None
            )

            # pdf_link: first anchor ending in .pdf, resolved to an absolute URL
            pdf_link_tag = soup.find('a', href=lambda x: x and x.endswith('.pdf'))
            pdf_link = urljoin(url, pdf_link_tag['href']) if pdf_link_tag else None

            print(url)
            # 'report_type' is deliberately left out so it becomes NaN when the
            # frame is built, matching the original output.
            records.append({
                'title': title,
                'date': date,
                'summary': summary,
                'external_link': url,
                'pdf_link': pdf_link,
            })
        except Exception as e:
            print(f"Error occurred while fetching data from report {report_url}: {e}")
        time.sleep(delay)

    # Build the frame once; pd.concat inside the loop is quadratic.
    return pd.DataFrame(records, columns=columns)