In [1]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.3 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
import os
import time

# initalize the chrome webdriver
driver = webdriver.Chrome()
driver.get('https://www.sciencedirect.com/journal/information-systems/vol/3/issue/1')

# Wait for the page to fully load
driver.implicitly_wait(20)

OUT_FILE = 'Information_Systems_Issues.csv'
data = []

def is_not_found_page():
    title = (driver.title or "").lower()
    source = (driver.page_source or "").lower()
    return ("page not found" in source) or ("404" in title) or ("not found" in title)

def write_to_csv(rows):
    file_exists = os.path.exists(OUT_FILE) and os.path.getsize(OUT_FILE) > 0
    with open(OUT_FILE, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if not file_exists:
            writer.writerow(["Title", "URL", "Volume Issue", "Vol Issue Year"])
        writer.writerows(rows)

def getVolumedata(url):
    try:
        if not url:
            return 0

        driver.get(url)
        driver.implicitly_wait(20)
        time.sleep(1.5)

        # NEW: skip "not found" pages and continue
        if is_not_found_page():
            try:
                prev_ele = driver.find_element(By.CSS_SELECTOR, 'nav.issue-navigation div.navigation-pre a.text-m')
                prev_ele_url = prev_ele.get_attribute('href')
                return getVolumedata(prev_ele_url)
            except:
                return 0

        articles = driver.find_elements(By.CSS_SELECTOR, 'li.js-article-list-item dl.js-article')

        # NEW: if metadata is missing, skip and continue
        try:
            vol_issue = driver.find_element(By.CLASS_NAME, 'js-vol-issue').text.strip()
            vol_iss_year = driver.find_element(By.CLASS_NAME, 'js-issue-status').text.split('(')[-1].strip(') ')
        except:
            try:
                prev_ele = driver.find_element(By.CSS_SELECTOR, 'nav.issue-navigation div.navigation-pre a.text-m')
                prev_ele_url = prev_ele.get_attribute('href')
                return getVolumedata(prev_ele_url)
            except:
                return 0

        # NEW: if no articles, skip and continue
        if not articles:
            try:
                prev_ele = driver.find_element(By.CSS_SELECTOR, 'nav.issue-navigation div.navigation-pre a.text-m')
                prev_ele_url = prev_ele.get_attribute('href')
                return getVolumedata(prev_ele_url)
            except:
                return 0

        rows = []
        for article in articles:
            try:
                article_url_element = article.find_element(By.CSS_SELECTOR, 'a.article-content-title')
                article_url = article_url_element.get_attribute('href')
                article_title = article.find_element(By.CLASS_NAME, 'js-article-title').text.strip()

                data.append([article_title, article_url, vol_issue, vol_iss_year])
                rows.append([article_title, article_url, vol_issue, vol_iss_year])
            except:
                continue

        # NEW: append only new rows (no overwrite)
        write_to_csv(rows)

        prev_ele = driver.find_element(By.CSS_SELECTOR, 'nav.issue-navigation div.navigation-pre a.text-m')
        prev_ele_url = prev_ele.get_attribute('href')
        return getVolumedata(prev_ele_url)

    except Exception as e:
        print("Exception", e)
        # NEW: on error, try moving to previous issue instead of stopping
        try:
            prev_ele = driver.find_element(By.CSS_SELECTOR, 'nav.issue-navigation div.navigation-pre a.text-m')
            prev_ele_url = prev_ele.get_attribute('href')
            return getVolumedata(prev_ele_url)
        except:
            return 0

try:
    time.sleep(1.5)

    # NEW: if starting page not found, jump to previous
    if is_not_found_page():
        prev_ele = driver.find_element(By.CSS_SELECTOR, 'nav.issue-navigation div.navigation-pre a.text-m')
        prev_ele_url = prev_ele.get_attribute('href')
        getVolumedata(prev_ele_url)

    else:
        articles = driver.find_elements(By.CSS_SELECTOR, 'li.js-article-list-item dl.js-article')
        vol_issue = driver.find_element(By.CLASS_NAME, 'js-vol-issue').text.strip()
        vol_iss_year = driver.find_element(By.CLASS_NAME, 'js-issue-status').text.split('(')[-1].strip(') ')

        rows = []
        for article in articles:
            try:
                article_url_element = article.find_element(By.CSS_SELECTOR, 'a.article-content-title')
                article_url = article_url_element.get_attribute('href')
                article_title = article.find_element(By.CLASS_NAME, 'js-article-title').text.strip()

                data.append([article_title, article_url, vol_issue, vol_iss_year])
                rows.append([article_title, article_url, vol_issue, vol_iss_year])
            except:
                continue

        # NEW: append (header only once)
        write_to_csv(rows)

        prev_ele = driver.find_element(By.CSS_SELECTOR, 'nav.issue-navigation div.navigation-pre a.text-m')
        prev_ele_url = prev_ele.get_attribute('href')
        getVolumedata(prev_ele_url)

except Exception as e:
    print("Exception", e)

driver.quit()

In [8]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
import os

START_INDEX = 1320
END_INDEX =  1400

journals_data = pd.read_csv('Information_Systems_Issues.csv')
OUT_FILE = 'Information_Systems_article_data.csv'

def getAuthorsData(authors, driver):
    authdata = []
    for auth in authors:
        name, desc = '', ''
        email = None
        auth_data = None
        try:
            auth.click()
            time.sleep(2)
            auth_data = driver.find_element(By.CSS_SELECTOR, 'div.side-panel-content')
            auth_data_desc = auth_data.find_element(By.CSS_SELECTOR, 'div.affiliation')
            auth_name = auth_data.find_element(By.CSS_SELECTOR, 'div.author')
            name = auth_name.text.strip()
            desc = auth_data_desc.text.strip()
        except Exception as e:
            print("error at author data", e)
        try:
            if auth_data:
                auth_email = auth_data.find_element(By.CSS_SELECTOR, 'div.e-address a.anchor')
                email = auth_email.text.strip()
        except Exception:
            email = None
        authdata.append([name, email, desc])
    return authdata

if not os.path.exists(OUT_FILE) or os.path.getsize(OUT_FILE) == 0:
    with open(OUT_FILE, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['URL','Journal_Title','Article_Title','Volume_Issue','Month_Year','Abstract','Keywords','Author_name','Author_email','Author_Address'])

for index, row in journals_data.iloc[START_INDEX:END_INDEX].iterrows():
    driver = webdriver.Chrome()
    final_data = []

    url = str(row.get('URL', '')).strip()
    article_date = row.get('Vol Issue Year', None)

    if not url.startswith('http'):
        driver.quit()
        continue

    title = "N/A"
    article_journal = "N/A"
    article_vol = "N/A"
    abstract = None
    keyword_list = []

    try:
        driver.get(url)
        driver.implicitly_wait(10)

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.Head'))
        )

        title = driver.find_element(By.CSS_SELECTOR, 'h1.Head').text.strip()
        article_details = driver.find_element(By.CSS_SELECTOR, 'div.Publication')
        article_journal = article_details.find_element(By.CSS_SELECTOR, 'h2.publication-title').text.strip()
        article_vol = article_details.find_element(By.CSS_SELECTOR, 'div.text-xs a.anchor').text.strip()
    except Exception as e:
        print(f"Error is {e} in this {url}")

    try:
        abstract = driver.find_element(By.ID, 'div.sp0050').text.strip()
    except Exception:
        abstract = None

    try:
        keywords = driver.find_elements(By.CSS_SELECTOR, 'div.keyword')
        keyword_list = []
        for key in keywords:
            keyword_list.append(key.text.strip())
    except Exception:
        keyword_list = []

    final_data = [url, article_journal, title, article_vol, article_date, abstract, keyword_list]

    try:
        author_group = driver.find_element(By.CSS_SELECTOR, 'div.author-group')
        authors = author_group.find_elements(By.CSS_SELECTOR, 'button.button-link')
        auth_data = getAuthorsData(authors, driver)
        if auth_data:
            for i in auth_data:
                with open(OUT_FILE, mode='a', newline='', encoding='utf-8') as file:
                    writer = csv.writer(file)
                    writer.writerow(final_data + i)
                    file.flush()
        else:
            with open(OUT_FILE, mode='a', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                writer.writerow(final_data + ["N/A", "N/A", "N/A"])
                file.flush()
    except Exception as e:
        with open(OUT_FILE, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(final_data + ["N/A", "N/A", "N/A"])
            file.flush()
        print(f"Error processing author data on {url}: {e}")

    driver.quit()
    time.sleep(3)


error at author data Message: element click intercepted: Element <button class="button-link react-xocs-icon-only-link button-link-secondary button-link-underline" type="button" aria-haspopup="dialog" data-sd-ui-side-panel-opener="true" data-xocs-content-type="author" data-xocs-content-id="aep-author-id10">...</button> is not clickable at point (655, 492). Other element would receive the click: <div id="side-panel" role="dialog" aria-modal="true" aria-label="Author panel" class="side-panel side-panel-is-expanded side-panel-border">...</div>
  (Session info: chrome=145.0.7632.46); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#elementclickinterceptedexception
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff7b0a143e5
	0x7ff7b0a14440
	0x7ff7b07ad49d
	0x7ff7b0810845
	0x7ff7b080df5a
	0x7ff7b080ade5
	0x7ff7b0809c97
	0x7ff7b07fb1c6
	0x7ff7b08317ca
	0x7ff7b07faa46
	0x7ff7b0855728
	0x7ff7b07f8e38
	0x7f

KeyboardInterrupt: 