In [None]:
pip install selenium pandas

In [1]:
# ========== STEP 2: Scrape detailed data for each article ==========
# Reads MISQ_Issues.csv, visits each URL, extracts title/abstract/keywords/authors.
# Writes MISQ_article_data.csv with ONE ROW PER AUTHOR (same article info, author name changes).
# Run Step 1 first to create MISQ_Issues.csv.

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import csv
import time
import os
import random
import re

START_INDEX = 0
END_INDEX = 5  # Process in batches; use len(journals_data) for all

OUT_DIR = os.getcwd()
CSV_PATH = os.path.join(OUT_DIR, 'MISQ_Issues.csv')
OUT_FILE = os.path.join(OUT_DIR, 'MISQ_article_data.csv')

print("Step 2: Scraping article details (one row per author)...")
print(f"Reading from: {CSV_PATH}")
print(f"Writing to: {OUT_FILE}\n")

journals_data = pd.read_csv(CSV_PATH)
n_total = len(journals_data)
print(f"Total articles: {n_total}")
print(f"Processing range: {START_INDEX} to {min(END_INDEX, n_total)}\n")

if not os.path.exists(OUT_FILE) or os.path.getsize(OUT_FILE) == 0:
    with open(OUT_FILE, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(['URL','Journal_Title','Article_Title','Volume_Issue','Month_Year','Abstract','Keywords','Author_name','Author_email','Author_Address'])

chrome_options = Options()
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

driver = webdriver.Chrome(options=chrome_options)
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

def is_bot_page(driver):
    title = (driver.title or '').lower()
    url = (driver.current_url or '').lower()
    return 'validate' in title or 'bot' in title or 'captcha' in title or 'crawlprevention' in url

def get_authors(driver):
    """One row per author. Splits combined text (e.g. 'A1\\nA2\\nA3') into individual names."""
    authdata = []
    seen = set()
    skip = {'n/a', 'author', 'authors', 'author & article information', 'author information', 'article information'}
    try:
        selectors = ['.author-name', '.author .name', '[itemprop="author"]', '.byline .author', '.contributor-name',
                     '.author, .contributor, [class*="author"], .byline span']
        all_parts = []
        for sel in selectors:
            try:
                for el in driver.find_elements(By.CSS_SELECTOR, sel):
                    text = (el.text or '').strip()
                    if not text or len(text) < 3:
                        continue
                    for part in re.split(r'[\n;]+', text):
                        part = part.strip()
                        if part and 3 <= len(part) <= 80:
                            all_parts.append(part)
                if all_parts:
                    break
            except:
                continue
        for name in all_parts:
            name = ' '.join(name.split()).replace(';', '').strip()
            if not name or len(name) < 3 or len(name) > 80 or name.lower() in skip or name.startswith('Author') or name.startswith('Article'):
                continue
            key = name.lower()
            if key not in seen:
                seen.add(key)
                authdata.append([name, "N/A", "N/A"])
    except Exception as e:
        print(f"Error extracting authors: {e}")
    return authdata if authdata else [["N/A", "N/A", "N/A"]]

try:
    for index, row in journals_data.iloc[START_INDEX:END_INDEX].iterrows():
        url = str(row.get('URL', '')).strip()
        article_date = row.get('Vol Issue Year', None)
        article_vol = row.get('Volume Issue', 'N/A')
        if not url or not url.startswith('http'):
            continue

        title, abstract, keyword_list = "N/A", None, []

        try:
            time.sleep(random.uniform(3, 7))
            driver.get(url)
            driver.implicitly_wait(10)
            time.sleep(random.uniform(1, 3))

            if is_bot_page(driver):
                print(f"[{index-START_INDEX+1}] Bot detection - waiting 30s...")
                time.sleep(30)
                driver.get(url)
                time.sleep(3)
                if is_bot_page(driver):
                    print("  Still blocked. Skipping.")
                    continue

            for sel in ['h1', '.article-title', '.title', 'article h1']:
                try:
                    title = driver.find_element(By.CSS_SELECTOR, sel).text.strip()
                    if title and len(title) > 3:
                        break
                except:
                    continue

            for sel in ['.abstract', '#abstract', '.article-abstract', 'section.abstract', '[class*="abstract"]']:
                try:
                    abstract = driver.find_element(By.CSS_SELECTOR, sel).text.strip()
                    if abstract and len(abstract) > 20:
                        break
                except:
                    continue

            try:
                keyword_list = [k.text.strip() for k in driver.find_elements(By.CSS_SELECTOR, '.keyword, .keywords span, .tag, [class*="keyword"] span') if k.text.strip()]
            except:
                pass

            auth_data = get_authors(driver)
            final_data = [url, "MIS Quarterly", title, article_vol, article_date, abstract, keyword_list]

            with open(OUT_FILE, 'a', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                for auth in auth_data:
                    writer.writerow(final_data + auth)
                f.flush()

            print(f"[{index-START_INDEX+1}/{min(END_INDEX, n_total)-START_INDEX}] {title[:50]}... ({len(auth_data)} author(s))")

        except Exception as e:
            print(f"[{index-START_INDEX+1}] Error: {e}")
            with open(OUT_FILE, 'a', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow([url, "MIS Quarterly", "N/A", article_vol, article_date, None, [], "N/A", "N/A", "N/A"])
                f.flush()

finally:
    driver.quit()
    print(f"\nStep 2 complete. Data saved to {OUT_FILE}")

Step 2: Scraping article details (one row per author)...
Reading from: /Users/keerthisagi/Documents/Journals/MIS_Quarterly/MISQ_Issues.csv
Writing to: /Users/keerthisagi/Documents/Journals/MIS_Quarterly/MISQ_article_data.csv

Total articles: 1273
Processing range: 0 to 5

[1] Bot detection - waiting 30s...
[1/5] Information Systems Innovation for Environmental S... (1 author(s))
[2/5] Information Systems and Environmentally Sustainabl... (3 author(s))
[3/5] An Empirical Analysis of the Impact of Information... (3 author(s))
[4/5] Chasing the Hottest IT: Effects of Information Tec... (1 author(s))
[5/5] Toward Agile: An Integrated Analysis of Quantitati... (2 author(s))

Step 2 complete. Data saved to /Users/keerthisagi/Documents/Journals/MIS_Quarterly/MISQ_article_data.csv


In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
import os
import random
import re

START_INDEX = 0
END_INDEX = 200  # set to len(journals_data) for all

journals_data = pd.read_csv('MISQ_Issues.csv')
OUT_FILE = 'MISQ_article_data.csv'

WAIT_SEC = 15

def is_bot_page(driver):
    title = (driver.title or '').lower()
    url = (driver.current_url or '').lower()
    src = (driver.page_source or '').lower()
    signals = ['validate', 'bot', 'captcha', 'crawlprevention', 'verification', 'are you human', 'cloudflare']
    return any(s in title for s in signals) or any(s in url for s in signals) or any(s in src for s in signals)

def getAuthorsData(driver):
    authdata = []
    seen = set()
    skip = {
        'n/a', 'author', 'authors', 'author & article information',
        'author information', 'article information'
    }

    selectors = [
        '.author-name',
        '.author .name',
        '[itemprop="author"]',
        '.byline .author',
        '.contributor-name',
        '.author, .contributor, [class*="author"], .byline span'
    ]

    all_parts = []
    for sel in selectors:
        try:
            els = driver.find_elements(By.CSS_SELECTOR, sel)
            for el in els:
                text = (el.text or '').strip()
                if not text or len(text) < 3:
                    continue
                for part in re.split(r'[\n;]+', text):
                    part = part.strip()
                    if part and 3 <= len(part) <= 80:
                        all_parts.append(part)
            if all_parts:
                break
        except:
            continue

    for name in all_parts:
        name = ' '.join(name.split()).replace(';', '').strip()
        if not name or len(name) < 3 or len(name) > 80:
            continue
        low = name.lower()
        if low in skip:
            continue
        if name.startswith('Author') or name.startswith('Article'):
            continue
        if low not in seen:
            seen.add(low)
            authdata.append([name, "N/A", "N/A"])

    return authdata if authdata else [["N/A", "N/A", "N/A"]]

# Write header once
if not os.path.exists(OUT_FILE) or os.path.getsize(OUT_FILE) == 0:
    with open(OUT_FILE, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([
            'URL','Journal_Title','Article_Title','Volume_Issue','Month_Year',
            'Abstract','Keywords','Author_name','Author_email','Author_Address'
        ])

# Chrome options: faster loads (block images/fonts/styles)
opts = Options()
prefs = {
    "profile.managed_default_content_settings.images": 2,
    "profile.managed_default_content_settings.stylesheets": 2,
    "profile.managed_default_content_settings.fonts": 2,
}
opts.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome(options=opts)
wait = WebDriverWait(driver, WAIT_SEC)

# Optional: batch writes
buffer_rows = []
BUFFER_SIZE = 50

def flush_buffer():
    global buffer_rows
    if not buffer_rows:
        return
    with open(OUT_FILE, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(buffer_rows)
        file.flush()
    buffer_rows = []

try:
    for idx, row in journals_data.iloc[START_INDEX:END_INDEX].iterrows():
        url = str(row.get('URL', '')).strip()
        article_date = row.get('Vol Issue Year', None)
        article_vol = row.get('Volume Issue', 'N/A')

        if not url.startswith('http'):
            continue

        title = "N/A"
        abstract = None
        keyword_list = []

        # Smaller jitter (faster than 3–7 seconds)
        time.sleep(random.uniform(0.8, 1.8))

        driver.get(url)

        # Bot handling (manual)
        if is_bot_page(driver):
            print(f"⚠ Bot page hit. Solve in browser, then press ENTER. URL: {url}")
            input()
            driver.get(url)

        # Wait for page to load enough (title)
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "body")))
        except:
            pass

        # Title
        for sel in ['h1', '.article-title', '.title', 'article h1']:
            try:
                title = driver.find_element(By.CSS_SELECTOR, sel).text.strip()
                if title and len(title) > 3:
                    break
            except:
                continue

        # Abstract
        for sel in ['.abstract', '#abstract', '.article-abstract', 'section.abstract', '[class*="abstract"]']:
            try:
                abstract = driver.find_element(By.CSS_SELECTOR, sel).text.strip()
                if abstract and len(abstract) > 20:
                    break
            except:
                continue

        # Keywords
        try:
            keyword_list = [
                k.text.strip()
                for k in driver.find_elements(By.CSS_SELECTOR, '.keyword, .keywords span, .tag, [class*="keyword"] span')
                if k.text.strip()
            ]
        except:
            keyword_list = []

        final_data = [url, "MIS Quarterly", title, article_vol, article_date, abstract, keyword_list]

        auth_data = getAuthorsData(driver)
        for arow in auth_data:
            buffer_rows.append(final_data + arow)

        if len(buffer_rows) >= BUFFER_SIZE:
            flush_buffer()

        print(f"✓ {title[:60]}... ({len(auth_data)} author(s))")

finally:
    flush_buffer()
    driver.quit()
    print("DONE:", OUT_FILE)

⚠ Bot page hit. Solve in browser, then press ENTER. URL: https://misq.umn.edu/misq/article/34/1/1/488/Information-Systems-Innovation-for-Environmental
✓ N/A... (1 author(s))
DONE: MISQ_article_data.csv


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=145.0.7632.109)
Stacktrace:
0   chromedriver                        0x0000000102623d84 cxxbridge1$str$ptr + 3127864
1   chromedriver                        0x000000010261c174 cxxbridge1$str$ptr + 3096104
2   chromedriver                        0x00000001020f99f4 _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 75356
3   chromedriver                        0x00000001020d28c4 chromedriver + 157892
4   chromedriver                        0x000000010216a994 _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 538108
5   chromedriver                        0x00000001021806b4 _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 627484
6   chromedriver                        0x00000001021362bc _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 323364
7   chromedriver                        0x00000001025e28a8 cxxbridge1$str$ptr + 2860380
8   chromedriver                        0x00000001025e5fe8 cxxbridge1$str$ptr + 2874524
9   chromedriver                        0x00000001025c7cc4 cxxbridge1$str$ptr + 2750840
10  chromedriver                        0x00000001025e686c cxxbridge1$str$ptr + 2876704
11  chromedriver                        0x00000001025b82cc cxxbridge1$str$ptr + 2686848
12  chromedriver                        0x000000010260af68 cxxbridge1$str$ptr + 3025948
13  chromedriver                        0x000000010260b0e4 cxxbridge1$str$ptr + 3026328
14  chromedriver                        0x000000010261bdcc cxxbridge1$str$ptr + 3095168
15  libsystem_pthread.dylib             0x0000000196503bc8 _pthread_start + 136
16  libsystem_pthread.dylib             0x00000001964feb80 thread_start + 8
