In [None]:
pip install selenium pandas

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import os
import time
import re

# Launch a Chrome session; the scraping functions and the main loop below all
# drive this single module-level `driver`.
driver = webdriver.Chrome()

# Entry page of the crawl: the journal's browse-by-year listing.
START_URL = 'https://misq.umn.edu/misq/issue/browse-by-year'

# Inclusive range of publication years to scrape (2010 to 2025).
START_YEAR = 2010
END_YEAR = 2025

# The CSV is written to the current working directory (os.getcwd()), which is
# only the notebook's folder if the kernel was started there.
OUT_FILE = os.path.join(os.getcwd(), 'MISQ_Issues.csv')
print(f"CSV file will be saved to: {OUT_FILE}")
print(f"Current working directory: {os.getcwd()}\n")
# NOTE(review): `data` is not referenced anywhere else in the visible code.
data = []

def write_to_csv(rows):
    """Append article rows to the CSV file at ``OUT_FILE``.

    The column header is written first whenever the target file is missing
    or empty, so repeated calls keep extending the same file.

    Args:
        rows: Sequence of row sequences to append; its length is reported in
            the confirmation message.
    """
    has_content = os.path.exists(OUT_FILE) and os.path.getsize(OUT_FILE) > 0
    header = ["Title", "URL", "Volume Issue", "Vol Issue Year"]

    with open(OUT_FILE, mode='a', newline='', encoding='utf-8') as handle:
        sink = csv.writer(handle)
        if not has_content:
            sink.writerow(header)
            print(f"Created CSV file: {OUT_FILE}")
        for record in rows:
            sink.writerow(record)
        handle.flush()  # push buffered rows out before the handle closes

    print(f"  ✓ Saved {len(rows)} articles to {OUT_FILE}")

# Selectors probed for article anchors. They are OR-ed into one CSS selector
# group so the page is queried once: with the 15 s implicit wait in effect,
# probing them one by one blocks for the full timeout on every selector that
# matches nothing.
_ARTICLE_LINK_SELECTORS = (
    'a[href*="/misq/vol"]',
    'a[href*="/article"]',
    '.article-title a',
    '.article a',
    'h3 a',
    'h4 a',
    '.title a',
    'article a',
    '.entry-title a',
    'li a[href*="/article"]',
    'div.article a',
    'table a[href*="/article"]',
)

# Text of this length or shorter is considered too short to be a title.
_MIN_TITLE_LENGTH = 5


def _is_article_href(href):
    """Return True when ``href`` looks like an article link, not an issue link."""
    if not href:
        return False
    return '/article' in href or ('/misq/vol' in href and '/issue' not in href)


def _collect_article_links(elements, seen_urls):
    """Return ``(href, element)`` pairs for article links not yet in ``seen_urls``.

    Each element's href is read exactly once; ``seen_urls`` is updated in
    place so a later fallback pass does not re-add the same URLs.
    """
    pairs = []
    for element in elements:
        try:
            href = element.get_attribute('href')
        except Exception:
            # The element could not be read (e.g. it went stale); skip it.
            continue
        if _is_article_href(href) and href not in seen_urls:
            seen_urls.add(href)
            pairs.append((href, element))
    return pairs


def _pick_article_title(link):
    """Return a usable title for ``link``, or ``''`` when none is long enough.

    The anchor text is preferred; when it is too short the parent element's
    text is used, and a neighbouring h3/h4 only if the parent lookup fails.
    """
    title = link.text.strip()
    if len(title) <= _MIN_TITLE_LENGTH:
        try:
            title = link.find_element(By.XPATH, './..').text.strip()
        except Exception:
            try:
                heading = link.find_element(
                    By.XPATH,
                    './preceding-sibling::h3 | ./preceding-sibling::h4 | ./following-sibling::h3 | ./following-sibling::h4',
                )
                title = heading.text.strip()
            except Exception:
                title = ''
    return title if len(title) > _MIN_TITLE_LENGTH else ''


def scrape_issue_page(issue_url, vol_issue, year):
    """Scrape the article links of one issue page and append them to the CSV.

    Args:
        issue_url: URL of the issue page to load in the shared ``driver``.
        vol_issue: Volume/issue label stored with every article row.
        year: Year label stored with every article row.

    Returns:
        The number of article rows written; 0 when nothing was found or when
        any error occurred (errors are printed, never raised).
    """
    try:
        driver.get(issue_url)
        driver.implicitly_wait(15)
        time.sleep(2)

        print(f"  Page loaded: {driver.title}")

        print("  Searching for articles...")
        seen_urls = set()
        try:
            candidates = driver.find_elements(
                By.CSS_SELECTOR, ', '.join(_ARTICLE_LINK_SELECTORS)
            )
        except Exception as e:
            print(f"    Selector lookup failed: {e}")
            candidates = []
        print(f"    Selectors matched {len(candidates)} links")
        article_links = _collect_article_links(candidates, seen_urls)

        if not article_links:
            # Fallback: find all links and filter
            print("  Trying fallback: checking all links...")
            article_links = _collect_article_links(
                driver.find_elements(By.TAG_NAME, 'a'), seen_urls
            )

        print(f"  Total unique articles found: {len(article_links)}")

        rows = []
        for article_url, link in article_links:
            try:
                # Make sure URL is absolute
                if article_url.startswith('/'):
                    article_url = 'https://misq.umn.edu' + article_url
                if not article_url.startswith('http'):
                    continue

                article_title = _pick_article_title(link)
                if article_title:
                    rows.append([article_title, article_url, vol_issue, year])
                    print(f"    ✓ {article_title[:60]}...")
            except Exception as e:
                print(f"    Error extracting article: {e}")
                continue

        if rows:
            write_to_csv(rows)
            return len(rows)

        print(f"  ⚠ No articles found on this page")
        # Debug aid: show the start of the page text to help adjust selectors.
        try:
            page_text = driver.find_element(By.TAG_NAME, 'body').text[:300]
            print(f"  Page content preview: {page_text[:200]}...")
        except Exception:
            pass  # the preview is diagnostic only
        return 0

    except Exception as e:
        print(f"  ✗ Error scraping issue page: {e}")
        import traceback
        traceback.print_exc()
        return 0

# Selectors probed for issue anchors. They are OR-ed into one CSS selector
# group so the page is queried once: with the 15 s implicit wait in effect,
# probing them one by one blocks for the full timeout on every selector that
# matches nothing.
_ISSUE_LINK_SELECTORS = (
    'a[href*="/misq/vol"]',
    'a[href*="/vol"]',
    '.issue-link a',
    '.issue a',
    'h2 a',
    'h3 a',
    'h4 a',
    'li a',
    '.volume a',
    'article a',
    'div a[href*="/vol"]',
    'table a[href*="/vol"]',
)

# Pulls "<volume>" and "<issue>" numbers out of an issue URL.
_VOL_ISSUE_PATTERN = re.compile(r'vol[^\d]*(\d+)[^\d]*issue[^\d]*(\d+)', re.I)


def _collect_issue_links(elements, seen_urls, announce=False):
    """Return ``(href, element)`` pairs for issue links not yet in ``seen_urls``.

    A link counts as an issue link when its href contains ``/vol`` (which
    also covers ``/misq/vol``). ``seen_urls`` is updated in place.
    """
    pairs = []
    for element in elements:
        try:
            href = element.get_attribute('href') or ''
            if '/vol' not in href or href in seen_urls:
                continue
            if announce:
                print(f"  Found issue link: {element.text.strip()[:50]} -> {href}")
        except Exception:
            # The element could not be read (e.g. it went stale); skip it.
            continue
        seen_urls.add(href)
        pairs.append((href, element))
    return pairs


def _label_issue(url, link_text, year):
    """Build a human-readable volume/issue label for an issue link."""
    match = _VOL_ISSUE_PATTERN.search(url)
    if match:
        return f"Vol {match.group(1)}, Issue {match.group(2)}"
    if link_text and len(link_text) > 3:
        return link_text
    # Last path segment; ignore a trailing slash so the label is never empty.
    return url.rstrip('/').rsplit('/', 1)[-1] or f"Vol {year}"


def scrape_year_page(year_url, year):
    """Scrape every issue linked from a year page.

    Args:
        year_url: URL of the year listing to load in the shared ``driver``.
        year: Year being scraped; passed on (as a string) to each issue scrape.

    Returns:
        Total number of article rows written across the year's issues; 0 when
        no issue links were found or when any error occurred (errors are
        printed, never raised).
    """
    try:
        print(f"\n{'='*60}")
        print(f"Navigating to year page: {year_url}")
        driver.get(year_url)
        driver.implicitly_wait(15)
        time.sleep(3)  # Give page more time to load

        print(f"Page title: {driver.title}")
        print(f"Current URL: {driver.current_url}")

        # Debug aid: print some page content to understand its structure.
        try:
            page_text = driver.find_element(By.TAG_NAME, 'body').text[:500]
            print(f"Page content preview: {page_text}...")
        except Exception:
            pass  # the preview is diagnostic only

        print("\nTrying to find issue links...")
        seen_urls = set()
        try:
            candidates = driver.find_elements(
                By.CSS_SELECTOR, ', '.join(_ISSUE_LINK_SELECTORS)
            )
        except Exception as e:
            print(f"  Selector lookup: Error - {e}")
            candidates = []
        print(f"  Selectors matched {len(candidates)} links")
        issue_links = _collect_issue_links(candidates, seen_urls)

        print(f"\nTotal unique issue links found: {len(issue_links)}")

        if not issue_links:
            # Fallback: inspect every link on the page, not just a prefix, so
            # issue links that appear late in the document are not missed.
            print("Trying fallback: checking all links on page...")
            all_links = driver.find_elements(By.TAG_NAME, 'a')
            print(f"Total links on page: {len(all_links)}")
            issue_links = _collect_issue_links(all_links, seen_urls, announce=True)

        # Resolve URL -> label before navigating away; the link elements are
        # not used again once issue pages start loading.
        unique_issues = {}
        for url, link in issue_links:
            try:
                # Make sure URL is absolute
                if url.startswith('/'):
                    url = 'https://misq.umn.edu' + url

                vol_issue = _label_issue(url, link.text.strip(), year)
                unique_issues[url] = vol_issue
                print(f"  Issue: {vol_issue} -> {url}")
            except Exception as e:
                print(f"  Error processing link: {e}")
                continue

        print(f"\n{'='*60}")
        print(f"Processing {len(unique_issues)} issues for year {year}...")
        print(f"{'='*60}")

        if not unique_issues:
            print(f"WARNING: No issues found for year {year}!")
            print("Page HTML snippet:")
            try:
                print(driver.page_source[:1000])
            except Exception:
                pass  # the snippet is diagnostic only
            return 0

        total_articles = 0
        for issue_url, vol_issue in unique_issues.items():
            print(f"\n{'─'*60}")
            print(f"Scraping: {vol_issue}")
            print(f"URL: {issue_url}")
            count = scrape_issue_page(issue_url, vol_issue, str(year))
            total_articles += count
            print(f"  → Found {count} articles")
            time.sleep(1)  # Be respectful

        return total_articles

    except Exception as e:
        print(f"Error scraping year page {year_url}: {e}")
        import traceback
        traceback.print_exc()
        return 0

# Main scraping logic: discover one URL per year on the browse page, then
# scrape each year in ascending order. The browser is always closed at the end.
try:
    driver.get(START_URL)
    driver.implicitly_wait(15)
    time.sleep(2)

    print("Starting MIS Quarterly scraper (2010-2025)...")
    print(f"Browse page: {START_URL}\n")

    # Maps year -> first URL found for that year.
    year_links = {}

    print("Searching for year links on browse-by-year page...")
    print(f"Page title: {driver.title}")
    print(f"Current URL: {driver.current_url}\n")

    # Get all links and filter by year
    all_links = driver.find_elements(By.TAG_NAME, 'a')
    print(f"Total links found on page: {len(all_links)}")

    # A year is a standalone 4-digit run. Plain substring matching would also
    # accept longer numbers that merely contain the year (e.g. an id 120155).
    year_token = re.compile(r'(?<!\d)(\d{4})(?!\d)')

    # First pass: look for links with year in URL or text
    for link in all_links:
        try:
            url = link.get_attribute('href') or ''
            text = link.text.strip()
        except Exception:
            # The element could not be read (e.g. it went stale); skip it.
            continue

        # Make URL absolute if relative
        if url.startswith('/'):
            url = 'https://misq.umn.edu' + url
        if not url.startswith('http'):
            continue

        mentioned_years = {int(token) for token in year_token.findall(f"{url} {text}")}
        for year in sorted(mentioned_years):
            if START_YEAR <= year <= END_YEAR and year not in year_links:
                year_links[year] = url
                print(f"  ✓ Found year {year}: {text[:40]} -> {url}")

    print(f"\nFound {len(year_links)} year links directly from page")

    # If we didn't find enough year links, try to construct URLs
    if len(year_links) < (END_YEAR - START_YEAR + 1) / 2:  # If less than half found
        print("\nTrying to construct year URLs...")
        # Common patterns for year pages
        base_patterns = [
            'https://misq.umn.edu/misq/issue/browse-by-year/{}',
            'https://misq.umn.edu/misq/issue/{}',
            'https://misq.umn.edu/misq/vol/{}',
        ]

        for year in range(START_YEAR, END_YEAR + 1):
            if year in year_links:
                continue  # Skip if already found

            for pattern in base_patterns:
                test_url = pattern.format(year)
                try:
                    driver.get(test_url)
                    time.sleep(1)
                    page_title = driver.title.lower()
                    # Heuristic: a page whose title mentions neither "404" nor
                    # "not found" is accepted as an existing year page.
                    if '404' not in page_title and 'not found' not in page_title:
                        year_links[year] = test_url
                        print(f"  ✓ Constructed year {year}: {test_url}")
                        break
                except Exception:
                    continue

    # Go back to browse page
    driver.get(START_URL)
    time.sleep(2)

    print(f"\n{'='*60}")
    print(f"Total year links found: {len(year_links)}")
    print(f"Years: {sorted(year_links.keys())}")
    print(f"{'='*60}\n")

    # Scrape each year
    total_articles_scraped = 0
    for year in sorted(year_links):
        if not START_YEAR <= year <= END_YEAR:
            continue

        # scrape_year_page() swallows its own errors and returns 0, so a dead
        # browser session would otherwise be reported as "0 articles" for
        # every remaining year. Probe the session and stop as soon as it is
        # unreachable.
        try:
            _ = driver.current_url
        except Exception as e:
            print(f"Browser session is no longer reachable; stopping before year {year}: {e}")
            break

        print(f"\n{'='*60}")
        print(f"Scraping year {year}")
        print(f"{'='*60}")
        count = scrape_year_page(year_links[year], year)
        total_articles_scraped += count
        print(f"Year {year}: {count} articles scraped")
        time.sleep(2)

    print(f"\n{'='*60}")
    print(f"Scraping complete!")
    print(f"Total articles scraped: {total_articles_scraped}")
    print(f"{'='*60}")

except Exception as e:
    print(f"Exception: {e}")
    import traceback
    traceback.print_exc()

finally:
    driver.quit()


Scraping year 2015
Error scraping year page https://misq.umn.edu/misq/issue/browse-by-year/2015: HTTPConnectionPool(host='localhost', port=51528): Max retries exceeded with url: /session/e568a06f3a293aa3dd6922c7a9992c9e/url (Caused by NewConnectionError("HTTPConnection(host='localhost', port=51528): Failed to establish a new connection: [Errno 61] Connection refused"))
Year 2015: 0 articles scraped

Scraping year 2016
Error scraping year page https://misq.umn.edu/misq/issue/browse-by-year/2016: HTTPConnectionPool(host='localhost', port=51528): Max retries exceeded with url: /session/e568a06f3a293aa3dd6922c7a9992c9e/url (Caused by NewConnectionError("HTTPConnection(host='localhost', port=51528): Failed to establish a new connection: [Errno 61] Connection refused"))
Year 2016: 0 articles scraped


In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
import os

# Positional slice of the issue CSV to process: rows START_INDEX (inclusive)
# up to END_INDEX (exclusive), as used by the .iloc slice further down.
START_INDEX = 0
END_INDEX = 100  # Adjust based on your needs

# Read the article list that the first cell wrote to MISQ_Issues.csv in the
# current working directory.
csv_path = os.path.join(os.getcwd(), 'MISQ_Issues.csv')
print(f"Reading articles from: {csv_path}")
journals_data = pd.read_csv(csv_path)

# Detailed per-author rows go to a second CSV in the same working directory.
# NOTE(review): this rebinds the OUT_FILE name that write_to_csv() in the first
# cell reads at call time, so re-running the first cell's functions after this
# cell (without re-running its setup) would append to this file instead.
OUT_FILE = os.path.join(os.getcwd(), 'MISQ_article_data.csv')
print(f"Detailed article data will be saved to: {OUT_FILE}\n")

def getAuthorsData(authors, driver):
    """Extract name, email and affiliation for each author element.

    The selectors are generic guesses and may need adjusting to the actual
    page structure. Every lookup is best-effort: a failed lookup leaves the
    corresponding field at its default instead of raising.

    Args:
        authors: Iterable of author elements (each must offer ``text``,
            ``click()``, ``find_element()``).
        driver: The page-level search context, used as a fallback when an
            author element does not contain its own affiliation.

    Returns:
        A list with one ``[name, email, affiliation]`` entry per author, in
        input order. ``name`` and ``affiliation`` default to ``''`` and
        ``email`` to ``None``.
    """
    name_selector = '.author-name, .name, strong'
    affiliation_selector = '.affiliation, .author-affiliation, .institution'
    email_selector = 'a[href^="mailto:"], .email'

    authdata = []
    for auth in authors:
        name, desc = '', ''
        email = None
        try:
            # Try to click the author in case details are only revealed on click.
            try:
                auth.click()
                time.sleep(1)
            except Exception:
                pass  # not clickable; carry on with what is visible

            # Prefer a dedicated name child; otherwise use the element's text.
            name = auth.text.strip()
            try:
                name = auth.find_element(By.CSS_SELECTOR, name_selector).text.strip()
            except Exception:
                pass

            # Look inside the author element first so each author gets their
            # own affiliation; a page-wide lookup alone returns the first
            # affiliation on the page for every author.
            for scope in (auth, driver):
                try:
                    desc = scope.find_element(By.CSS_SELECTOR, affiliation_selector).text.strip()
                except Exception:
                    continue
                if desc:
                    break

            try:
                email_elem = auth.find_element(By.CSS_SELECTOR, email_selector)
                href = email_elem.get_attribute('href') or ''
                if href.startswith('mailto:'):
                    # Drop the scheme and any "?subject=..." style query.
                    email = href[len('mailto:'):].split('?', 1)[0]
                if not email:
                    email = email_elem.text.strip()
            except Exception:
                email = None

        except Exception as e:
            print(f"Error extracting author data: {e}")

        authdata.append([name, email, desc])
    return authdata

# Column order of the detailed article CSV; one row is written per author.
ARTICLE_COLUMNS = ['URL','Journal_Title','Article_Title','Volume_Issue','Month_Year','Abstract','Keywords','Author_name','Author_email','Author_Address']

# Author columns used when no author information could be extracted.
NO_AUTHOR_ROW = ["N/A", "N/A", "N/A"]

# Innermost <div> whose text contains "Abstract". CSS has no text-matching
# pseudo-class (":contains" is a jQuery extension), so this fallback must be
# expressed in XPath.
ABSTRACT_FALLBACK_XPATH = '//div[contains(., "Abstract") and not(.//div[contains(., "Abstract")])]'


def _append_article_rows(rows):
    """Append ``rows`` to the detailed article CSV at ``OUT_FILE``."""
    with open(OUT_FILE, mode='a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerows(rows)


# Create CSV file with headers if it doesn't exist
if not os.path.exists(OUT_FILE) or os.path.getsize(OUT_FILE) == 0:
    _append_article_rows([ARTICLE_COLUMNS])

for index, row in journals_data.iloc[START_INDEX:END_INDEX].iterrows():
    url = str(row.get('URL', '')).strip()
    if not url or not url.startswith('http'):
        continue  # nothing to load; skip before launching a browser

    article_date = row.get('Vol Issue Year', None)
    title = "N/A"
    article_journal = "MIS Quarterly"
    article_vol = row.get('Volume Issue', 'N/A')
    abstract = None
    keyword_list = []

    # A fresh browser is used per article; try/finally guarantees it is
    # closed even if something unexpected is raised mid-article.
    driver = webdriver.Chrome()
    try:
        try:
            driver.get(url)
            driver.implicitly_wait(10)

            # Wait for page to load - adjust selector based on actual structure
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'h1, .article-title, .title, article'))
            )

            # Extract title - adjust selector
            try:
                title = driver.find_element(By.CSS_SELECTOR, 'h1, .article-title, .title, article h1').text.strip()
            except Exception:
                title = "N/A"

            # Prefer volume/issue info from the article page when present;
            # otherwise keep the value carried over from the issue CSV.
            try:
                article_vol = driver.find_element(By.CSS_SELECTOR, '.volume-info, .issue-info, .publication-info').text.strip()
            except Exception:
                pass

        except Exception as e:
            print(f"Error loading page {url}: {e}")

        # Extract abstract - adjust selector
        try:
            abstract = driver.find_element(By.CSS_SELECTOR, '.abstract, #abstract, .article-abstract, section.abstract').text.strip()
        except Exception:
            try:
                abstract = driver.find_element(By.XPATH, ABSTRACT_FALLBACK_XPATH).text.strip()
            except Exception:
                abstract = None

        # Extract keywords - adjust selector
        try:
            keywords = driver.find_elements(By.CSS_SELECTOR, '.keyword, .keywords span, .tag, .article-keywords span')
            keyword_list = [key.text.strip() for key in keywords]
        except Exception:
            keyword_list = []

        # NOTE: keyword_list is written to the CSV as the string form of a
        # Python list.
        final_data = [url, article_journal, title, article_vol, article_date, abstract, keyword_list]

        # Extract author information - adjust selector
        author_rows = [NO_AUTHOR_ROW]
        try:
            try:
                author_group = driver.find_element(By.CSS_SELECTOR, '.authors, .author-list, .article-authors, .contributors')
                authors = author_group.find_elements(By.CSS_SELECTOR, '.author, .contributor, li, span.author')
            except Exception:
                authors = []

            if not authors:
                # Also try the page-wide selector when the author container
                # is missing, not only when it exists but is empty.
                authors = driver.find_elements(By.CSS_SELECTOR, '.author, .contributor, [class*="author"]')

            if authors:
                auth_data = getAuthorsData(authors, driver)
                if auth_data:
                    author_rows = auth_data
        except Exception as e:
            print(f"Error processing author data on {url}: {e}")

        _append_article_rows([final_data + author for author in author_rows])
    finally:
        driver.quit()

    time.sleep(2)  # Be respectful with requests