In [2]:
pip install selenium pandas

Note: you may need to restart the kernel to use updated packages.


# MIS Quarterly Scraper - 2-Step Pipeline

Follows the same two-step pattern as the Information Systems scraper:
- **Step 1**: Collect all article URLs only → `MISQ_Issues.csv` (Title, URL, Volume Issue, Vol Issue Year)
- **Step 2**: Scrape detailed data for each URL → `MISQ_article_data.csv` (abstract, keywords, authors)

Uses direct issue URLs (`contents-{vol}-{issue}`) for Vol 34–49 (2010–2025).  
If you see "Validate User", open https://misq.umn.edu in a browser and complete validation first.

In [None]:
# ========== STEP 1: Collect all article URLs only ==========
# Output: MISQ_Issues.csv (Title, URL, Volume Issue, Vol Issue Year)
# Uses direct issue URLs: misq.umn.edu/contents-{vol}-{issue}

from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
import os
import time

# One Chrome session is shared by every issue page visited in Step 1.
driver = webdriver.Chrome()
driver.implicitly_wait(15)  # seconds Selenium keeps retrying element lookups

# Volume range to crawl: Vol 34 is 2010 and Vol 49 is 2025 (four issues a year).
START_VOL, END_VOL = 34, 49

# The CSV is written next to the notebook (the current working directory).
OUT_DIR = os.getcwd()
OUT_FILE = os.path.join(OUT_DIR, 'MISQ_Issues.csv')
print("Step 1: Collecting article URLs...")
print(f"Output: {OUT_FILE}\n")

def write_to_csv(rows, out_file=None):
    """Append article rows to the Step 1 CSV, writing the header on first use.

    Args:
        rows: Iterable of [title, url, volume_issue, year] lists.
        out_file: Optional path of the CSV to append to. When omitted, the
            notebook-level OUT_FILE is used, so existing ``write_to_csv(rows)``
            calls behave exactly as before.
    """
    target = OUT_FILE if out_file is None else out_file
    # A missing or zero-byte file has no header row yet.
    needs_header = not os.path.exists(target) or os.path.getsize(target) == 0
    with open(target, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(["Title", "URL", "Volume Issue", "Vol Issue Year"])
        writer.writerows(rows)
    # Leaving the with-block flushes and closes the file; no explicit flush needed.

def scrape_issue(vol, issue):
    """Collect article links from one MISQ issue page.

    Loads ``https://misq.umn.edu/contents-{vol}-{issue}`` with the
    notebook-level ``driver`` and keeps every anchor whose href contains
    ``/article/`` on misq.umn.edu.

    Args:
        vol: Volume number (e.g. 34).
        issue: Issue number within the volume.

    Returns:
        A list of ``[title, url, vol_issue, year]`` rows, one per distinct
        article URL in page order. The list is empty when the page is missing,
        when the site shows its "Validate User" page, or when loading fails.
    """
    url = f"https://misq.umn.edu/contents-{vol}-{issue}"
    vol_issue = f"Vol {vol}, Issue {issue}"
    year = 1976 + vol  # approximate year (Vol 34 -> 2010)

    try:
        driver.get(url)
        time.sleep(2)

        page_title = driver.title.lower()
        # Skip if page not found
        if "page not found" in page_title or "404" in page_title:
            return []
        # The crawl-prevention page has no article links; say so instead of
        # silently reporting an empty issue.
        if "validate user" in page_title:
            print(f"  Blocked {vol}-{issue}: 'Validate User' page shown; complete validation and re-run.")
            return []

        # href -> best title seen so far. A dict keeps first-seen page order,
        # and lets a later text link replace the "N/A" left by an earlier
        # image/icon link that points at the same article.
        titles_by_href = {}
        for link in driver.find_elements(By.TAG_NAME, 'a'):
            try:
                href = link.get_attribute('href') or ''
                # Article URLs contain /article/
                if '/article/' not in href or 'misq.umn.edu' not in href:
                    continue
                text = link.text.strip()
            except Exception:
                # Element went stale or could not be read; skip just this link.
                continue

            title = text if len(text) >= 5 else "N/A"
            if titles_by_href.get(href, "N/A") == "N/A":
                titles_by_href[href] = title

        return [[title, href, vol_issue, str(year)]
                for href, title in titles_by_href.items()]
    except Exception as e:
        print(f"  Error {vol}-{issue}: {e}")
        return []

# Scrape all volumes and issues.
# try/finally guarantees the Chrome session is closed even if the crawl is
# interrupted or an unexpected error escapes scrape_issue/write_to_csv.
total = 0
try:
    for vol in range(START_VOL, END_VOL + 1):
        year = 1976 + vol
        print(f"Vol {vol} ({year}):")
        for issue in range(1, 5):  # 4 issues per year
            rows = scrape_issue(vol, issue)
            if rows:
                # Persist after every issue so a crash keeps earlier results.
                write_to_csv(rows)
                total += len(rows)
                print(f"  Issue {issue}: {len(rows)} articles")
            time.sleep(1)  # brief pause between issue pages
        print()

    print(f"Step 1 complete. Total: {total} articles → {OUT_FILE}")
finally:
    driver.quit()

CSV file will be saved to: /Users/keerthisagi/Documents/Journals/MIS_Quarterly/MISQ_Issues.csv
Current working directory: /Users/keerthisagi/Documents/Journals/MIS_Quarterly

Starting MIS Quarterly scraper (2010-2025)...
Browse page: https://misq.umn.edu/misq/issue/browse-by-year

Searching for year links on browse-by-year page...
Page title: Validate User
Current URL: https://misq.umn.edu/crawlprevention/governor?content=%2fmisq%2fissue%2fbrowse-by-year

Total links found on page: 138
  ✓ Found year 2010: 2010 -> https://misq.umn.edu/misq/issue/browse-by-year/2010
  ✓ Found year 2011: 2011 -> https://misq.umn.edu/misq/issue/browse-by-year/2011
  ✓ Found year 2012: 2012 -> https://misq.umn.edu/misq/issue/browse-by-year/2012
  ✓ Found year 2013: 2013 -> https://misq.umn.edu/misq/issue/browse-by-year/2013
  ✓ Found year 2014: 2014 -> https://misq.umn.edu/misq/issue/browse-by-year/2014
  ✓ Found year 2015: 2015 -> https://misq.umn.edu/misq/issue/browse-by-year/2015
  ✓ Found year 2016: 20

## Step 2: Scrape detailed data for each article

Reads `MISQ_Issues.csv` from Step 1, visits each URL, and extracts the title, abstract, keywords, and authors.  
Appends the results to `MISQ_article_data.csv`, one row per author. Set `START_INDEX` (inclusive) and `END_INDEX` (exclusive) to process the articles in batches.

In [None]:
# ========== STEP 2: Scrape detailed data for each article ==========
# Output: MISQ_article_data.csv
# Run Step 1 first to create MISQ_Issues.csv

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
import os

# Batch window over the rows of MISQ_Issues.csv (used as iloc[START_INDEX:END_INDEX]).
START_INDEX = 0
END_INDEX = 100  # Exclusive; any value >= the row count processes every remaining row

# Paths - run from MIS_Quarterly folder (or set CSV_PATH manually)
OUT_DIR = os.getcwd()
CSV_PATH = os.path.join(OUT_DIR, 'MISQ_Issues.csv')
OUT_FILE = os.path.join(OUT_DIR, 'MISQ_article_data.csv')

print("Step 2: Scraping article details...")
print(f"Reading from: {CSV_PATH}")
print(f"Writing to: {OUT_FILE}\n")

# Fail early with an actionable message when Step 1 has not produced its CSV.
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(
        f"{CSV_PATH} not found. Run Step 1 first to create MISQ_Issues.csv, "
        "or point CSV_PATH at an existing file."
    )

journals_data = pd.read_csv(CSV_PATH)
n_total = len(journals_data)
print(f"Total articles: {n_total}")
print(f"Processing range: {START_INDEX} to {min(END_INDEX, n_total)}\n")

# Create output file with headers (only when it is missing or empty, so
# successive batches keep appending to the same file).
if not os.path.exists(OUT_FILE) or os.path.getsize(OUT_FILE) == 0:
    with open(OUT_FILE, mode='a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(['URL','Journal_Title','Article_Title','Volume_Issue','Month_Year','Abstract','Keywords','Author_name','Author_email','Author_Address'])

def get_authors(driver):
    """Extract author names from the article page currently loaded in ``driver``.

    Args:
        driver: Selenium WebDriver positioned on an article page.

    Returns:
        A list of ``[name, email, address]`` lists, one per distinct non-empty
        author name in page order. Email and address are always "N/A" (they
        are not scraped). If no name is found, or the lookup fails, the result
        is the single placeholder ``[["N/A", "N/A", "N/A"]]``.
    """
    names = []
    try:
        elements = driver.find_elements(
            By.CSS_SELECTOR,
            '.author, .contributor, [class*="author"], .byline span',
        )
        for element in elements:
            name = element.text.strip()
            # The selectors overlap (e.g. a wrapper and its child can both
            # match), so drop blank elements and repeated names.
            if name and name not in names:
                names.append(name)
    except Exception:
        # Best-effort: keep whatever was collected before the failure.
        pass

    if not names:
        return [["N/A", "N/A", "N/A"]]
    return [[name, "N/A", "N/A"] for name in names]

# Visit every article URL in the selected batch and append one CSV row per
# author to OUT_FILE. Each article gets its own Chrome session, which is
# always closed in the finally-block, even on Ctrl-C.
for index, row in journals_data.iloc[START_INDEX:END_INDEX].iterrows():
    url = str(row.get('URL', '')).strip()
    article_date = row.get('Vol Issue Year', None)
    article_vol = row.get('Volume Issue', 'N/A')

    # Validate the URL before launching a browser so bad rows cost nothing.
    if not url or not url.startswith('http'):
        continue

    title = "N/A"
    abstract = None
    keyword_list = []

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        driver.implicitly_wait(10)
        time.sleep(1)

        # Title: the first selector yielding more than 3 characters wins.
        # Rejected candidates never overwrite the "N/A" default.
        for sel in ['h1', '.article-title', '.title', 'article h1']:
            try:
                candidate = driver.find_element(By.CSS_SELECTOR, sel).text.strip()
            except Exception:
                continue
            if len(candidate) > 3:
                title = candidate
                break

        # Abstract: the first selector yielding more than 20 characters wins.
        for sel in ['.abstract', '#abstract', '.article-abstract', 'section.abstract', '[class*="abstract"]']:
            try:
                candidate = driver.find_element(By.CSS_SELECTOR, sel).text.strip()
            except Exception:
                continue
            if len(candidate) > 20:
                abstract = candidate
                break

        # Keywords (best-effort; an empty list is written when none are found)
        try:
            keywords = driver.find_elements(By.CSS_SELECTOR, '.keyword, .keywords span, .tag, [class*="keyword"] span')
            keyword_list = [k.text.strip() for k in keywords if k.text.strip()]
        except Exception:
            pass

        auth_data = get_authors(driver)
        final_data = [url, "MIS Quarterly", title, article_vol, article_date, abstract, keyword_list]

        # The article-level columns are repeated on each author's row.
        with open(OUT_FILE, mode='a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for auth in auth_data:
                writer.writerow(final_data + auth)

        print(f"[{index-START_INDEX+1}] {title[:50]}...")
    except Exception as e:
        print(f"Error {url}: {e}")
        # Record a placeholder row so the failed URL is visible in the output.
        with open(OUT_FILE, mode='a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow([url, "MIS Quarterly", "N/A", article_vol, article_date, None, [], "N/A", "N/A", "N/A"])
    finally:
        driver.quit()

    time.sleep(2)  # pause between articles

print(f"\nStep 2 complete. Data saved to {OUT_FILE}")

Reading articles from: /Users/keerthisagi/Documents/Journals/MIS_Quarterly/MISQ_Issues.csv


FileNotFoundError: [Errno 2] No such file or directory: '/Users/keerthisagi/Documents/Journals/MIS_Quarterly/MISQ_Issues.csv'