In [None]:
pip install selenium pandas

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import os
import time
import re

# Start a Chrome session; every scraping function below uses this module-level
# `driver` directly instead of receiving it as a parameter.
driver = webdriver.Chrome()

# Entry page that lists the journal's issues grouped by year.
START_URL = 'https://misq.umn.edu/misq/issue/browse-by-year'

# Inclusive range of publication years to scrape.
START_YEAR = 2010
END_YEAR = 2025

# The CSV is written to the current working directory of the kernel (printed
# below), which is not necessarily the folder that contains the notebook.
OUT_FILE = os.path.join(os.getcwd(), 'MISQ_Issues.csv')
print(f"CSV file will be saved to: {OUT_FILE}")
print(f"Current working directory: {os.getcwd()}\n")
# NOTE(review): `data` is not referenced anywhere in the visible code — confirm
# whether another cell still needs it.
data = []

def write_to_csv(rows):
    """Append article rows to the CSV at OUT_FILE.

    The header line is emitted only when the file is missing or empty, so
    repeated calls keep extending the same file.

    Args:
        rows: List of rows, each ordered as
            [Title, URL, Volume Issue, Vol Issue Year].
    """
    needs_header = not (os.path.exists(OUT_FILE) and os.path.getsize(OUT_FILE) > 0)
    with open(OUT_FILE, mode='a', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        if needs_header:
            out.writerow(["Title", "URL", "Volume Issue", "Vol Issue Year"])
            print(f"Created CSV file: {OUT_FILE}")
        out.writerows(rows)
        handle.flush()  # push buffered rows to disk before the handle closes
    print(f"  ✓ Saved {len(rows)} articles to {OUT_FILE}")

def _is_article_href(href):
    """Return True when *href* looks like an article URL rather than an issue listing."""
    if not href:
        return False
    return '/article' in href or ('/misq/vol' in href and '/issue' not in href)


def _resolve_link_title(link, min_len=6):
    """Return a usable title for an article link, or "N/A" when none is found.

    The link's own text is preferred. When it is shorter than *min_len*, the
    parent element's text is tried, then any neighbouring h3/h4 sibling. Each
    fallback is attempted whenever the previous one produced no usable text.
    """
    title = link.text.strip()
    if len(title) >= min_len:
        return title

    fallback_xpaths = (
        './..',
        './preceding-sibling::h3 | ./preceding-sibling::h4 | ./following-sibling::h3 | ./following-sibling::h4',
    )
    for xpath in fallback_xpaths:
        try:
            nearby = link.find_elements(By.XPATH, xpath)
        except Exception:
            continue
        for element in nearby:
            candidate = element.text.strip()
            if len(candidate) >= min_len:
                return candidate
    return "N/A"


def scrape_issue_page(issue_url, vol_issue, year):
    """Scrape the article links of a single issue page and append them to the CSV.

    Args:
        issue_url: URL of the issue's table-of-contents page.
        vol_issue: Label stored in the "Volume Issue" column for every article.
        year: Value stored in the "Vol Issue Year" column for every article.

    Returns:
        Number of article rows written; 0 when nothing was found or on error.
        Errors are printed with a traceback and never propagate to the caller.
    """
    try:
        driver.get(issue_url)
        driver.implicitly_wait(15)
        time.sleep(2)

        print(f"  Page loaded: {driver.title}")

        # Candidate selectors for article links, most specific first.
        selectors = [
            'a[href*="/misq/vol"]',
            'a[href*="/article"]',
            '.article-title a',
            '.article a',
            'h3 a',
            'h4 a',
            '.title a',
            'article a',
            '.entry-title a',
            'li a[href*="/article"]',
            'div.article a',
            'table a[href*="/article"]'
        ]

        rows = []
        try:
            print("  Searching for articles...")
            candidates = []  # (element, href) pairs; href is read once per element
            for position, selector in enumerate(selectors):
                try:
                    found = driver.find_elements(By.CSS_SELECTOR, selector)
                except Exception as e:
                    print(f"    Selector '{selector}': Error - {e}")
                    found = []
                if position == 0:
                    # Only the first probe keeps the 15 s implicit wait (to let
                    # late content render). With the wait left on, every
                    # selector that matches nothing would block for the full
                    # timeout before returning an empty list.
                    driver.implicitly_wait(0)
                if not found:
                    continue
                print(f"    Selector '{selector}': Found {len(found)} links")
                matches = []
                for element in found:
                    try:
                        href = element.get_attribute('href')
                    except Exception:
                        continue  # element went stale between lookup and read
                    if _is_article_href(href):
                        matches.append((element, href))
                if matches:
                    print(f"      → {len(matches)} are article links")
                    candidates.extend(matches)

            # Remove duplicates, keeping the first element seen for each URL.
            seen_urls = set()
            unique_articles = []
            for element, href in candidates:
                if href not in seen_urls:
                    seen_urls.add(href)
                    unique_articles.append((element, href))

            if not unique_articles:
                # Fallback: inspect every anchor on the page.
                print("  Trying fallback: checking all links...")
                for link in driver.find_elements(By.TAG_NAME, 'a'):
                    try:
                        href = link.get_attribute('href') or ''
                    except Exception:
                        continue
                    if _is_article_href(href) and href not in seen_urls:
                        seen_urls.add(href)
                        unique_articles.append((link, href))

            print(f"  Total unique articles found: {len(unique_articles)}")

            for element, article_url in unique_articles:
                try:
                    # Make sure URL is absolute
                    if article_url.startswith('/'):
                        article_url = 'https://misq.umn.edu' + article_url
                    if not article_url.startswith('http'):
                        continue

                    article_title = _resolve_link_title(element)
                    if article_title != "N/A":
                        rows.append([article_title, article_url, vol_issue, year])
                        print(f"    ✓ {article_title[:60]}...")
                except Exception as e:
                    print(f"    Error extracting article: {e}")
                    continue
        finally:
            # Leave the shared driver with the wait the rest of the script expects.
            driver.implicitly_wait(15)

        if rows:
            write_to_csv(rows)
            return len(rows)

        print(f"  ⚠ No articles found on this page")
        # Debug aid: show the start of the page text.
        try:
            page_text = driver.find_element(By.TAG_NAME, 'body').text[:300]
            print(f"  Page content preview: {page_text[:200]}...")
        except Exception:
            pass
        return 0

    except Exception as e:
        print(f"  ✗ Error scraping issue page: {e}")
        import traceback
        traceback.print_exc()
        return 0

def scrape_year_page(year_url, year):
    """Find every issue linked from a year page and scrape each one.

    Args:
        year_url: URL of the page listing the issues for *year*.
        year: Publication year; passed on (as a string) to scrape_issue_page
            and used as a last-resort issue label.

    Returns:
        Total number of article rows written for the year; 0 when no issue
        link is found or on error. Errors are printed with a traceback and
        never propagate to the caller.
    """
    try:
        print(f"\n{'='*60}")
        print(f"Navigating to year page: {year_url}")
        driver.get(year_url)
        driver.implicitly_wait(15)
        time.sleep(3)  # Give page more time to load

        print(f"Page title: {driver.title}")
        print(f"Current URL: {driver.current_url}")

        # Debug aid: print the start of the page text.
        try:
            page_text = driver.find_element(By.TAG_NAME, 'body').text[:500]
            print(f"Page content preview: {page_text}...")
        except Exception:
            pass

        # Candidate selectors for issue links, most specific first.
        selectors = [
            'a[href*="/misq/vol"]',
            'a[href*="/vol"]',
            '.issue-link a',
            '.issue a',
            'h2 a',
            'h3 a',
            'h4 a',
            'li a',
            '.volume a',
            'article a',
            'div a[href*="/vol"]',
            'table a[href*="/vol"]'
        ]

        # Everything needed from the page is copied into this dict before any
        # navigation happens, because elements go stale once the driver leaves.
        unique_issues = {}
        try:
            print("\nTrying to find issue links...")
            issue_links = []  # (element, href) pairs; href is read once per element
            for position, selector in enumerate(selectors):
                try:
                    links = driver.find_elements(By.CSS_SELECTOR, selector)
                except Exception as e:
                    print(f"  Selector '{selector}': Error - {e}")
                    links = []
                if position == 0:
                    # Only the first probe keeps the 15 s implicit wait. With
                    # the wait left on, every selector that matches nothing
                    # would block for the full timeout.
                    driver.implicitly_wait(0)
                if not links:
                    continue
                print(f"  Selector '{selector}': Found {len(links)} links")
                filtered = []
                for link in links:
                    try:
                        href = link.get_attribute('href')
                    except Exception:
                        continue
                    # '/vol' also covers the '/misq/vol' form.
                    if href and '/vol' in href:
                        filtered.append((link, href))
                if filtered:
                    print(f"    → {len(filtered)} are issue links")
                    issue_links.extend(filtered)

            # Remove duplicates, keeping the first element seen for each URL.
            seen_urls = set()
            unique_issue_links = []
            for link, href in issue_links:
                if href not in seen_urls:
                    seen_urls.add(href)
                    unique_issue_links.append((link, href))

            print(f"\nTotal unique issue links found: {len(unique_issue_links)}")

            if not unique_issue_links:
                # Fallback: inspect every anchor. No cap on the number of links,
                # since navigation menus can easily fill the first few dozen.
                print("Trying fallback: checking all links on page...")
                all_links = driver.find_elements(By.TAG_NAME, 'a')
                print(f"Total links on page: {len(all_links)}")

                for link in all_links:
                    try:
                        href = link.get_attribute('href') or ''
                        text = link.text.strip()
                    except Exception:
                        continue
                    if href and '/vol' in href and href not in seen_urls:
                        print(f"  Found issue link: {text[:50]} -> {href}")
                        seen_urls.add(href)
                        unique_issue_links.append((link, href))

            # Build the {issue URL: label} mapping.
            for link, url in unique_issue_links:
                try:
                    # Make sure URL is absolute
                    if url.startswith('/'):
                        url = 'https://misq.umn.edu' + url

                    link_text = link.text.strip()

                    # Prefer a label parsed from the URL, then the link text.
                    match = re.search(r'vol[^\d]*(\d+)[^\d]*issue[^\d]*(\d+)', url, re.I)
                    if match:
                        vol_issue = f"Vol {match.group(1)}, Issue {match.group(2)}"
                    elif link_text and len(link_text) > 3:
                        vol_issue = link_text
                    else:
                        # Last path segment; the trailing slash is dropped first
                        # so the label cannot come out empty.
                        tail = url.rstrip('/').rsplit('/', 1)[-1]
                        vol_issue = tail or f"Vol {year}"

                    unique_issues[url] = vol_issue
                    print(f"  Issue: {vol_issue} -> {url}")
                except Exception as e:
                    print(f"  Error processing link: {e}")
                    continue
        finally:
            # Leave the shared driver with the wait the rest of the script expects.
            driver.implicitly_wait(15)

        print(f"\n{'='*60}")
        print(f"Processing {len(unique_issues)} issues for year {year}...")
        print(f"{'='*60}")

        if not unique_issues:
            print(f"WARNING: No issues found for year {year}!")
            print("Page HTML snippet:")
            try:
                html_snippet = driver.page_source[:1000]
                print(html_snippet)
            except Exception:
                pass
            return 0

        total_articles = 0
        for issue_url, vol_issue in unique_issues.items():
            print(f"\n{'─'*60}")
            print(f"Scraping: {vol_issue}")
            print(f"URL: {issue_url}")
            count = scrape_issue_page(issue_url, vol_issue, str(year))
            total_articles += count
            print(f"  → Found {count} articles")
            time.sleep(1)  # Be respectful

        return total_articles

    except Exception as e:
        print(f"Error scraping year page {year_url}: {e}")
        import traceback
        traceback.print_exc()
        return 0

# Main scraping logic: discover one URL per year on the browse page (or guess
# it from known URL patterns), then scrape every year in ascending order.
try:
    driver.get(START_URL)
    driver.implicitly_wait(15)
    time.sleep(2)

    print("Starting MIS Quarterly scraper (2010-2025)...")
    print(f"Browse page: {START_URL}\n")

    # Maps year -> URL of that year's page.
    year_links = {}

    print("Searching for year links on browse-by-year page...")
    print(f"Page title: {driver.title}")
    print(f"Current URL: {driver.current_url}\n")

    all_links = driver.find_elements(By.TAG_NAME, 'a')
    print(f"Total links found on page: {len(all_links)}")

    # A year counts only as a standalone number. A plain substring test would
    # also accept "2010" inside a longer digit run such as an article ID.
    year_patterns = {
        year: re.compile(rf'(?<!\d){year}(?!\d)')
        for year in range(START_YEAR, END_YEAR + 1)
    }

    # First pass: look for links with a year in the URL or the link text.
    for link in all_links:
        try:
            url = link.get_attribute('href') or ''
            text = link.text.strip()
        except Exception:
            continue  # element went stale or was detached

        # Make URL absolute if relative
        if url.startswith('/'):
            url = 'https://misq.umn.edu' + url
        if not url.startswith('http'):
            continue

        for year, pattern in year_patterns.items():
            if year in year_links:
                continue  # first link found for a year wins
            if pattern.search(url) or pattern.search(text):
                year_links[year] = url
                print(f"  ✓ Found year {year}: {text[:40]} -> {url}")

    print(f"\nFound {len(year_links)} year links directly from page")

    # If fewer than half of the years were found, probe likely URL shapes.
    if len(year_links) < (END_YEAR - START_YEAR + 1) / 2:
        print("\nTrying to construct year URLs...")
        base_patterns = [
            'https://misq.umn.edu/misq/issue/browse-by-year/{}',
            'https://misq.umn.edu/misq/issue/{}',
            'https://misq.umn.edu/misq/vol/{}'
        ]

        for year in range(START_YEAR, END_YEAR + 1):
            if year in year_links:
                continue  # Skip if already found

            for pattern in base_patterns:
                test_url = pattern.format(year)
                try:
                    driver.get(test_url)
                    time.sleep(1)
                    page_title = driver.title.lower()
                except Exception:
                    continue  # navigation failed; try the next pattern
                # The page title is the only "missing page" signal checked here.
                if '404' not in page_title and 'not found' not in page_title:
                    year_links[year] = test_url
                    print(f"  ✓ Constructed year {year}: {test_url}")
                    break

    # Go back to browse page
    driver.get(START_URL)
    time.sleep(2)

    print(f"\n{'='*60}")
    print(f"Total year links found: {len(year_links)}")
    print(f"Years: {sorted(year_links.keys())}")
    print(f"{'='*60}\n")

    # Scrape each year. Every key was generated from the configured range, so
    # no further range check is needed here.
    total_articles_scraped = 0
    for year in sorted(year_links):
        print(f"\n{'='*60}")
        print(f"Scraping year {year}")
        print(f"{'='*60}")
        count = scrape_year_page(year_links[year], year)
        total_articles_scraped += count
        print(f"Year {year}: {count} articles scraped")
        time.sleep(2)

    print(f"\n{'='*60}")
    print(f"Scraping complete!")
    print(f"Total articles scraped: {total_articles_scraped}")
    print(f"{'='*60}")

except Exception as e:
    print(f"Exception: {e}")
    import traceback
    traceback.print_exc()

finally:
    # Always close the browser, including after an interrupt.
    driver.quit()

In [None]:
# ========== STEP 2: Scrape detailed data for each article ==========
# Reads MISQ_Issues.csv, visits each URL, extracts title/abstract/keywords/authors.
# Writes MISQ_article_data.csv with ONE ROW PER AUTHOR.
# JUPYTER-SAFE: NO input() prompts. If verification shows up, solve it in Chrome and code auto-continues.

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
import os
import random
import re

# -----------------------
# CONFIG
# -----------------------
# Half-open slice [START_INDEX, END_INDEX) of input rows to process.
START_INDEX = 0
END_INDEX = 5  # any value >= the input row count processes everything (clamped after loading)
WAIT_SEC = 20  # timeout handed to WebDriverWait below

# Random delay range (seconds) slept before each article request.
# pacing (faster -> more bot; slower -> fewer bot)
SLEEP_MIN = 2.5
SLEEP_MAX = 5.5

# bot wait loop (you solve manually in browser; code waits/polls)
BOT_MAX_WAIT_SEC = 15 * 60   # 15 minutes
BOT_POLL_SEC = 5

# Input (written by step 1) and output paths, both resolved against the
# current working directory.
CSV_PATH = os.path.join(os.getcwd(), "MISQ_Issues.csv")
OUT_FILE = os.path.join(os.getcwd(), "MISQ_article_data.csv")

# -----------------------
# LOAD INPUT
# -----------------------
journals_data = pd.read_csv(CSV_PATH)
n_total = len(journals_data)
# Clamp the upper bound so the slice never runs past the available rows.
END_INDEX = min(END_INDEX, n_total)

print("Step 2: Scraping MISQ article details (ONE ROW PER AUTHOR)")
print(f"Reading from: {CSV_PATH}")
print(f"Writing to: {OUT_FILE}")
print(f"Total input rows: {n_total}")
print(f"Processing: {START_INDEX} -> {END_INDEX}\n")

# -----------------------
# OUTPUT HEADER
# -----------------------
# Write the header only when the output file is missing or empty; an existing
# non-empty file is left untouched and later rows are appended to it.
if not os.path.exists(OUT_FILE) or os.path.getsize(OUT_FILE) == 0:
    with open(OUT_FILE, mode="w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow([
            "URL","Journal_Title","Article_Title","Volume_Issue","Month_Year",
            "Abstract","Keywords","Author_name","Author_email","Author_Address"
        ])

# -----------------------
# CHROME SETUP (single session + persistent profile)
# -----------------------
chrome_options = Options()
# Fixed profile directory so browser state is reused between runs.
# NOTE(review): "/tmp" is a Unix-style path — confirm before running elsewhere.
chrome_options.add_argument("--user-data-dir=/tmp/misq_profile")
chrome_options.add_argument("--profile-directory=Default")

# Lighter loads
# NOTE(review): the value 2 is presumably Chrome's "block" setting for each
# content type — confirm against the Chrome preference documentation.
prefs = {
    "profile.managed_default_content_settings.images": 2,
    "profile.managed_default_content_settings.stylesheets": 2,
    "profile.managed_default_content_settings.fonts": 2,
}
chrome_options.add_experimental_option("prefs", prefs)

# One browser session and one explicit-wait helper shared by the whole cell.
driver = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(driver, WAIT_SEC)

# -----------------------
# HELPERS
# -----------------------
def is_bot_page(driver) -> bool:
    """Report whether the current page looks like a bot/verification challenge.

    Only the title, the URL and, as a last step, a very short body are
    inspected. The full page source is deliberately not searched: normal
    article pages often contain words like 'verify' or 'cloudflare' in
    scripts and footers.

    Signals come in two tiers:
      * unambiguous phrases ("captcha", "access denied", ...) in the title or
        URL are enough on their own;
      * generic words ("blocked", "challenge") also occur in real article
        titles and URL slugs, so they count only when the page body is
        minimal (under 500 characters) or cannot be read at all.

    Args:
        driver: WebDriver whose current page is inspected.

    Returns:
        True when the page is judged to be a challenge page.
    """
    title = (driver.title or "").lower()
    url = (driver.current_url or "").lower()

    strong_signals = (
        "captcha",
        "are you human",
        "checking your browser",
        "access denied",
        "security check",
        "please enable cookies",
        "crawlprevention",
    )
    if any(s in title or s in url for s in strong_signals):
        return True

    ambiguous_signals = ("blocked", "challenge")
    has_ambiguous = any(s in title or s in url for s in ambiguous_signals)

    try:
        body_text = (driver.find_element(By.TAG_NAME, "body").text or "").lower()
    except Exception:
        # Body unreadable: the title/URL evidence is all there is to go on.
        return has_ambiguous

    if len(body_text) >= 500:
        # A full-length page is treated as real content.
        return False
    body_words = ("verify", "human", "security check")
    return has_ambiguous or any(w in body_text for w in body_words)

def wait_until_unblocked(driver, max_wait_sec=BOT_MAX_WAIT_SEC, poll_sec=BOT_POLL_SEC) -> bool:
    """Block until the verification page is gone or the time budget is spent.

    The challenge is expected to be solved by hand in the open Chrome window;
    this function only polls is_bot_page() and prints a notice the first time
    a challenge is seen.

    Args:
        driver: WebDriver showing the page to watch.
        max_wait_sec: Give up once more than this many seconds have elapsed.
        poll_sec: Seconds slept between two checks.

    Returns:
        True when the page is no longer a challenge, False on timeout.
    """
    began = time.time()
    announced = False
    while True:
        if not is_bot_page(driver):
            return True
        if not announced:
            print("⚠ Verification detected. Please complete it in the Chrome window.")
            print("   Waiting automatically (no ENTER needed)...")
            announced = True
        elapsed = time.time() - began
        if elapsed > max_wait_sec:
            return False
        time.sleep(poll_sec)

def first_text(driver, selectors, min_len=3, default="N/A"):
    """Return the text of the first selector that yields long-enough text.

    For each CSS selector, in order, only the first matching element is
    examined. A selector that matches nothing, or whose text is shorter than
    *min_len*, is skipped.

    Args:
        driver: WebDriver (or element) to search in.
        selectors: CSS selectors to try, in priority order.
        min_len: Minimum stripped text length to accept.
        default: Value returned when no selector qualifies.

    Returns:
        The stripped text, or *default*.
    """
    for sel in selectors:
        try:
            el = driver.find_element(By.CSS_SELECTOR, sel)
            txt = (el.text or "").strip()
        except Exception:
            # Missing or stale element: try the next selector. Catching
            # Exception rather than everything keeps Ctrl-C working.
            continue
        if txt and len(txt) >= min_len:
            return txt
    return default

def extract_keywords_str(driver):
    """Collect keyword texts from the page and join them with "; ".

    Every selector is tried and the results are merged. Duplicates are
    dropped case-insensitively while keeping first-seen order and spelling.

    Args:
        driver: WebDriver showing the article page.

    Returns:
        The joined keywords, or "N/A" when none were found.
    """
    selectors = [
        ".keyword",
        ".keywords span",
        ".keywords a",
        ".tag",
        "[class*='keyword'] span",
        "[class*='keyword'] a"
    ]
    kws = []
    seen = set()
    for sel in selectors:
        try:
            for el in driver.find_elements(By.CSS_SELECTOR, sel):
                t = (el.text or "").strip()
                if not t:
                    continue
                k = t.lower()
                if k not in seen:
                    seen.add(k)
                    kws.append(t)
        except Exception:
            # A failing selector (e.g. a stale element) must not abort the
            # others. Catching Exception rather than everything keeps Ctrl-C
            # working.
            continue
    return "; ".join(kws) if kws else "N/A"

def _split_author_block(text):
    """Split a byline like 'Name1, Name2, and Name3' into individual author names."""
    if not text or len(text) < 3:
        return []

    # Normalize whitespace
    text = re.sub(r"\s+", " ", str(text).strip())

    # Strip common label prefixes
    text = re.sub(r"^(authors?|by|by:)\s*:?\s*", "", text, flags=re.I)

    # Split on common separators: comma, and, newline-ish bullets
    parts = re.split(r",\s*|\s+and\s+|\s*[•·\|/]\s*|\s*;\s*", text, flags=re.I)

    names = []
    for p in parts:
        p = " ".join(p.split()).strip()
        if not p:
            continue
        # Remove trailing footnote/affiliation markers like 1, *, †, ‡
        p = re.sub(r"[\s\d\*†‡]+$", "", p).strip()
        if 2 <= len(p) <= 150 and not p.lower().startswith(("author", "authors", "article", "additional details")):
            names.append(p)

    # Dedupe while preserving order
    out = []
    seen_local = set()
    for n in names:
        low = n.lower()
        if low not in seen_local:
            seen_local.add(low)
            out.append(n)

    return out

def get_authors_per_author(driver):
    """
    ONE ROW PER AUTHOR.
    Returns list of [Author_name, Author_email, Author_Address].

    Three strategies are tried in order; the first that yields anything wins:
      A. per-author container elements, so name, email and affiliation are
         read from the same element;
      B. name-only selectors (bylines etc.), falling back to a table row whose
         first cell mentions "author"; every name found this way is paired
         with the same page-level email/affiliation.
    When no name is found at all, a single ["N/A", "N/A", "N/A"] row is returned.

    Notes for MISQ:
    - Emails are often NOT published; we can only capture them if they appear as mailto links or visible text.
    - "Address" is treated as affiliation/institution text when available.
    """

    # Case-insensitive pattern for an email address inside free text.
    email_re = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.I)

    def clean_name(s: str) -> str:
        """Collapse whitespace and strip trailing digits and footnote marks (*, †, ‡)."""
        s = re.sub(r"\s+", " ", (s or "").strip())
        s = re.sub(r"[\s\d\*†‡]+$", "", s).strip()
        return s

    def extract_email(scope_el=None) -> str:
        """Return the first email found in scope_el (or the whole page), else "N/A".

        mailto: links are preferred; visible text is searched second.
        """
        try:
            root = scope_el if scope_el is not None else driver
            for a in root.find_elements(By.CSS_SELECTOR, "a[href^='mailto:']"):
                href = (a.get_attribute("href") or "").strip()
                if href.lower().startswith("mailto:"):
                    # Drop the scheme and any "?subject=..." query part.
                    addr = href.split("mailto:", 1)[1].split("?", 1)[0].strip()
                    if addr and email_re.search(addr):
                        return addr
        except Exception:
            pass
        try:
            txt = (scope_el.text if scope_el is not None else driver.find_element(By.TAG_NAME, "body").text) or ""
            m = email_re.search(txt)
            return m.group(0) if m else "N/A"
        except Exception:
            return "N/A"

    def extract_affiliation(scope_el=None) -> str:
        """Return all distinct affiliation texts in scope joined by " | ", else "N/A"."""
        sels = [
            ".affiliation",
            "[class*='affil']",
            "[itemprop='affiliation']",
            "[class*='institution']",
            ".author-affiliation",
            ".contributor-affiliation",
            ".hlFld-affiliation",
        ]
        texts = []
        seen_local = set()
        try:
            root = scope_el if scope_el is not None else driver
            for sel in sels:
                for el in root.find_elements(By.CSS_SELECTOR, sel):
                    t = clean_name(el.text)
                    if t and len(t) >= 3:
                        # The selectors overlap, so de-duplicate case-insensitively.
                        low = t.lower()
                        if low not in seen_local:
                            seen_local.add(low)
                            texts.append(t)
        except Exception:
            pass
        return " | ".join(texts) if texts else "N/A"

    # Lower-cased texts that are section labels, not author names.
    skip = {
        "n/a", "author", "authors", "author & article information",
        "author information", "article information", "view article", "abstract", "pdf",
        "additional details", "published", "received", "accepted"
    }

    # Strategy A: author containers (best chance to map name ↔ email ↔ affiliation)
    container_selectors = [
        ".loa-author",
        "li.loa-author",
        "li.author",
        ".author",
        ".contributor",
        "[itemprop='author']",
        "[class*='author'] li",
        "[class*='authors'] li",
    ]
    author_rows = []
    seen_names = set()
    for sel in container_selectors:
        try:
            containers = driver.find_elements(By.CSS_SELECTOR, sel)
            for c in containers:
                # try to get name from child first
                # (the last entry, "a", matches any anchor inside the container)
                name = ""
                for nsel in [".author-name", ".contributor-name", "[itemprop='name']", "a[rel='author']", "a"]:
                    try:
                        t = clean_name(c.find_element(By.CSS_SELECTOR, nsel).text)
                        if 2 <= len(t) <= 150:
                            name = t
                            break
                    except Exception:
                        continue
                if not name:
                    # fallback: take first plausible name from container text
                    candidates = _split_author_block(c.text)
                    name = clean_name(candidates[0]) if candidates else ""

                low = name.lower()
                if not name or low in skip or low in seen_names:
                    continue

                email = extract_email(c)
                affil = extract_affiliation(c)

                seen_names.add(low)
                author_rows.append([name, email if email else "N/A", affil if affil else "N/A"])

            # Stop at the first container selector that has produced rows.
            if author_rows:
                return author_rows
        except Exception:
            # Rows collected before the failure are kept; the next selector is tried.
            continue

    # Strategy B: names first (byline/table), then best-effort global email/affiliation (may be N/A)
    seen = set()

    one_per_author_selectors = [
        ".author-name",
        ".contributor-name",
        "[itemprop='author']",
        ".byline a",
        ".authors a",
        ".author a",
        ".contributor a",
        "a[rel='author']",
        ".citation-author",
        ".loa-author span",
        ".author-list span",
        ".hlFld-author",
    ]

    names = []
    for sel in one_per_author_selectors:
        try:
            els = driver.find_elements(By.CSS_SELECTOR, sel)
            for el in els:
                txt = clean_name(el.text)
                if not txt or len(txt) < 2 or len(txt) > 500:
                    continue

                # An element may hold a whole byline; split it when it yields
                # more than one name.
                split_names = _split_author_block(txt)
                if len(split_names) > 1:
                    for n in split_names:
                        n = clean_name(n)
                        low = n.lower()
                        if not n or low in skip or low in seen:
                            continue
                        seen.add(low)
                        names.append(n)
                    continue

                # Otherwise the element text itself is used as the name.
                low = txt.lower()
                if low in skip or low in seen:
                    continue
                seen.add(low)
                names.append(txt)

            # Stop at the first selector that has produced names.
            if names:
                break
        except Exception:
            continue

    if not names:
        # table 'Author(s)' row
        try:
            for row in driver.find_elements(By.CSS_SELECTOR, "tr"):
                cells = row.find_elements(By.TAG_NAME, "td")
                if len(cells) < 2:
                    continue
                # First cell is the label, second cell the value.
                label = clean_name(cells[0].text).lower()
                if "author" not in label:
                    continue
                value = clean_name(cells[1].text)
                for n in _split_author_block(value):
                    n = clean_name(n)
                    low = n.lower()
                    if not n or low in skip or low in seen:
                        continue
                    seen.add(low)
                    names.append(n)
                if names:
                    break
        except Exception:
            pass

    if not names:
        return [["N/A", "N/A", "N/A"]]

    # Best-effort: capture page-level emails/affiliations (mapping per author usually not available)
    # The same page-level values are repeated on every author's row.
    page_email = extract_email(None)
    page_affil = extract_affiliation(None)
    return [[n, page_email, page_affil] for n in names]

def _safe_get(row, *keys):
    """Get first existing column value (handles CSV header variations)."""
    for k in keys:
        if k in row.index:
            val = row.get(k, "N/A")
            return "N/A" if pd.isna(val) else str(val).strip()
    return "N/A"

def _write_article_rows(out_path, url, journal, title, vol_issue, month_year, abstract, keywords_str, authors):
    """Write one row per author to CSV; always at least one row."""
    final_data = [url, journal, title, vol_issue, month_year, abstract, keywords_str]
    if not authors:
        authors = [["N/A", "N/A", "N/A"]]
    with open(out_path, mode="a", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        for arow in authors:
            w.writerow(final_data + arow)
        f.flush()

# -----------------------
# MAIN LOOP
# -----------------------
def _write_placeholder_row(url, volume_issue, month_year):
    """Record an all-"N/A" row so a URL that could not be scraped stays visible in the output."""
    _write_article_rows(
        OUT_FILE, url, "MIS Quarterly", "N/A", volume_issue, month_year,
        "N/A", "N/A", [["N/A", "N/A", "N/A"]],
    )


# Visit each selected input row, scrape its article page and append one output
# row per author. Failures never stop the run: they produce a placeholder row.
try:
    for pos, (_, row) in enumerate(journals_data.iloc[START_INDEX:END_INDEX].iterrows(), start=1):

        url = str(row.get("URL", "") or "").strip()
        month_year = _safe_get(row, "Vol Issue Year", "Year")
        volume_issue = _safe_get(row, "Volume Issue")

        if not url or not url.startswith("http"):
            print(f"[{pos}] Skipping row (no URL)")
            continue

        # polite pacing (helps bot + avoids hammering)
        time.sleep(random.uniform(SLEEP_MIN, SLEEP_MAX))

        try:
            driver.get(url)

            # Basic load: a timeout here is tolerated, the checks below decide.
            # Exception (not a bare except) is caught so Ctrl-C still stops the run.
            try:
                wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            except Exception:
                pass
            time.sleep(1)

            # if bot, wait until you solve (only when title/URL clearly indicate challenge)
            if is_bot_page(driver):
                if not wait_until_unblocked(driver):
                    print(f"[{pos}] Timed out on verification. Writing placeholder row: {url}")
                    _write_placeholder_row(url, volume_issue, month_year)
                    continue

                # Reload the article now that the challenge has been passed.
                driver.get(url)
                time.sleep(2)

                if is_bot_page(driver):
                    print(f"[{pos}] Still blocked. Writing placeholder row: {url}")
                    _write_placeholder_row(url, volume_issue, month_year)
                    continue

            # Wait for something that looks like article content; on timeout
            # the extraction below simply falls back to "N/A" values.
            try:
                wait.until(
                    EC.any_of(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "h1")),
                        EC.presence_of_element_located((By.CSS_SELECTOR, ".article-title")),
                        EC.presence_of_element_located((By.CSS_SELECTOR, "article h1"))
                    )
                )
            except Exception:
                pass

            title = first_text(driver, ["h1", ".article-title", "article h1", ".title", ".citation-title", "h1.citation-title"], min_len=3, default="N/A")

            abstract = first_text(
                driver,
                [".abstract", "#abstract", ".article-abstract", "section.abstract", "[class*='abstract']"],
                min_len=20,
                default="N/A"
            )

            keywords_str = extract_keywords_str(driver)
            authors = get_authors_per_author(driver)

            _write_article_rows(OUT_FILE, url, "MIS Quarterly", title, volume_issue, month_year, abstract, keywords_str, authors)

            print(f"[{pos}/{END_INDEX-START_INDEX}] ✓ {title[:60]}... ({len(authors)} author(s))")

        except Exception as e:
            print(f"[{pos}] ✗ Error on {url}: {e}")
            _write_placeholder_row(url, volume_issue, month_year)

finally:
    # Always close the browser, including after an interrupt.
    driver.quit()
    print(f"\nDONE. Saved to: {OUT_FILE}")

Step 2: Scraping MISQ article details (ONE ROW PER AUTHOR)
Reading from: /Users/keerthisagi/Documents/Journals/MIS_Quarterly/MISQ_Issues.csv
Writing to: /Users/keerthisagi/Documents/Journals/MIS_Quarterly/MISQ_article_data.csv
Total input rows: 1273
Processing: 0 -> 5

[1/5] ✓ Information Systems Innovation for Environmental Sustainabil... (1 author(s))
[2/5] ✓ Information Systems and Environmentally Sustainable Developm... (1 author(s))
[3/5] ✓ An Empirical Analysis of the Impact of Information Capabilit... (1 author(s))
[4/5] ✓ Chasing the Hottest IT: Effects of Information Technology Fa... (1 author(s))
[5/5] ✓ Toward Agile: An Integrated Analysis of Quantitative and Qua... (1 author(s))

DONE. Saved to: /Users/keerthisagi/Documents/Journals/MIS_Quarterly/MISQ_article_data.csv
