# Install Selenium and BeautifulSoup

In [1]:
!pip install pandas selenium requests beautifulsoup4



# Import libraries

In [2]:
import pandas as pd
from selenium import webdriver
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pickle

cache_path = os.path.join(os.getcwd(),'cache')
if not os.path.exists(cache_path) :
    os.makedirs(cache_path)

# Accessing Nature Website

In [3]:
driver = webdriver.Chrome()

driver.get("https://chula.idm.oclc.org/login?url=https://www.nature.com")

username = driver.find_element(By.CSS_SELECTOR, 'input[name="user"]')
password = driver.find_element(By.CSS_SELECTOR, 'input[name="pass"]')

username.clear()
password.clear()

# Change Username and Password to Your CU NET ACCOUNT
USERNAME = "Student_ID"
PASSWORD = "Password"

username.send_keys(USERNAME)
password.send_keys(PASSWORD)

agreeterm = driver.find_element(By.ID,"checky")
agreeterm.click()

getin = driver.find_element(By.CSS_SELECTOR,'input[type = "submit"]')

getin.click()

# Web Scraping "Deep Learning"

In [4]:
cookies = driver.get_cookies()

session = requests.Session()  # Create a requests session
for cookie in cookies:
    session.cookies.set(cookie['name'], cookie['value'])

search = driver.find_element(By.CSS_SELECTOR, 'li[class = "c-header__item c-header__item--padding c-header__item--pipe"]')
search.click()

search_text = driver.find_element(By.ID, 'keywords')
search_text.clear()
search_text.send_keys('Deep Learning')

search_text.send_keys(u'\ue007')

# Year Range
startingyear = 2020
endingyear = 2024

def parse_time(time_element) :
    datetime_value = time_element['datetime']
    parsed_datetime = datetime.strptime(datetime_value, '%Y-%m-%d')
    parsed_date = parsed_datetime.date()
    return parsed_date
    
def fetch_data(url):
    filename = article_no(url)+'.pkl'
    cache_filename = os.path.join(cache_path,filename)
    if os.path.exists(cache_filename):
        with open(cache_filename, 'rb') as f:
            data = pickle.load(f)
        print("Data loaded from cache.")
    else:
        response = session.get(url)
        soup = BeautifulSoup(response.text, "lxml")
        with open(cache_filename, 'wb') as f:
            pickle.dump(soup, f)
        with open(cache_filename, 'rb') as f:
            data = pickle.load(f)
        print("Data fetched from the web and cached.")
    return data

def article_no(url):
    # Split the URL by '/'
    parts = url.split('/')
    
    # Return the last element
    return parts[-1]

while(startingyear <= endingyear) :

    date = driver.find_element(By.CSS_SELECTOR, 'button[data-track-action = "date filter"]')
    date.click()

    date_range = driver.find_element(By.CSS_SELECTOR, 'a[data-test = "advance-search-link-date"]')
    date_range.click()

    start_year = driver.find_element(By.CSS_SELECTOR, 'select[name = "start_year"]')
    select = Select(start_year)
    select.select_by_visible_text(str(startingyear))

    end_year = driver.find_element(By.CSS_SELECTOR, 'select[name = "end_year"]')
    select = Select(end_year)
    select.select_by_visible_text(str(startingyear))

    dismiss_cookie = driver.find_elements(By.CSS_SELECTOR, 'button[class = "c-cookie-banner__dismiss"]')
    if(len(dismiss_cookie) != 0) : 
        dismiss_cookie[0].click()

    search_button = driver.find_element(By.CSS_SELECTOR, 'button[class = "c-search__button c-search__button--width-auto"]')
    search_button.click()

    article_type = driver.find_element(By.CSS_SELECTOR, 'button[data-track-action = "article type filter"]')
    article_type.click()

    research_type = driver.find_element(By.ID, 'article-type-research')
    research_type.click()

    apply_filter = driver.find_element(By.CSS_SELECTOR, 'button.c-facet__submit')
    apply_filter.submit()

    count = 1

    data = []

    while(True) :
        all_link = driver.find_elements(By.CSS_SELECTOR, 'a[data-track-action = "view article"]')

        if len(all_link) == 0 :
            search_return = driver.find_element(By.CSS_SELECTOR, 'button[class = "c-search__button"]')
            search_return.click()

            current_directory = os.getcwd()
            # Specify the file name
            file_name = 'data'+str(startingyear)+'.csv'

            # Combine the directory and file name to create the file path
            file_path = os.path.join(current_directory, file_name)

            df = pd.DataFrame(data)

            # Convert DataFrame to CSV and save the file
            df.to_csv(file_path, index=False)

            break

        for link in all_link :
            print(count,"scraping",link.get_attribute('href'))
            URL = link.get_attribute('href')
            soup = fetch_data(URL)
            title = soup.find('h1', class_='c-article-title')
            subjects = [subject.get_text(strip=True) for subject in soup.find_all('li', class_='c-article-subject-list__subject')]
            authors = [author.get_text(strip=True) for author in soup.find_all('a', attrs={'data-test': 'author-name'})]
            publish_date = parse_time(soup.find('time'))
            
            if title is not None and len(subjects) != 0 and len(authors) != 0 :
                print("Add data for Web :",count)
                data.append({ 'title' : title.text.strip(), 'subjects' : subjects, 'authors' : authors, 'publish-date' : publish_date})
            else :
                print("Skipping Web :",count,"Since Missing Data")

            print(title.text.strip())
            print(subjects)
            print(authors)
            print(publish_date)

            count += 1
        
        try:
            next_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'li[data-page="next"]')))
            next_button.click()
        except:
            print("End the search Deep Learning in year :",startingyear)
            search_return = driver.find_element(By.CSS_SELECTOR, 'button[class = "c-search__button"]')
            search_return.click()

            current_directory = os.getcwd()
            # Specify the file name
            file_name = 'data'+str(startingyear)+'.csv'

            # Combine the directory and file name to create the file path
            file_path = os.path.join(current_directory, file_name)

            df = pd.DataFrame(data)

            # Convert DataFrame to CSV and save the file
            df.to_csv(file_path, index=False)

            break
        
    startingyear += 1

driver.close()

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"li[class = "c-header__item c-header__item--padding c-header__item--pipe"]"}
  (Session info: chrome=123.0.6312.124); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF716067032+63090]
	(No symbol) [0x00007FF715FD2C82]
	(No symbol) [0x00007FF715E6EC65]
	(No symbol) [0x00007FF715EB499D]
	(No symbol) [0x00007FF715EB4ADC]
	(No symbol) [0x00007FF715EF5B37]
	(No symbol) [0x00007FF715ED701F]
	(No symbol) [0x00007FF715EF3412]
	(No symbol) [0x00007FF715ED6D83]
	(No symbol) [0x00007FF715EA83A8]
	(No symbol) [0x00007FF715EA9441]
	GetHandleVerifier [0x00007FF7164625AD+4238317]
	GetHandleVerifier [0x00007FF71649F70D+4488525]
	GetHandleVerifier [0x00007FF7164979EF+4456495]
	GetHandleVerifier [0x00007FF716140576+953270]
	(No symbol) [0x00007FF715FDE54F]
	(No symbol) [0x00007FF715FD9224]
	(No symbol) [0x00007FF715FD935B]
	(No symbol) [0x00007FF715FC9B94]
	BaseThreadInitThunk [0x00007FFA58467344+20]
	RtlUserThreadStart [0x00007FFA59F226B1+33]
