In [8]:
import asyncio
from playwright.async_api import async_playwright
import pandas as pd

URL = "https://business.columbia.edu/faculty/faculty-by-areas-of-expertise"

async def _joined_inner_texts(card, selector):
    """Return the stripped inner text of every `selector` match under `card`, joined by "; "."""
    elements = await card.query_selector_all(selector)
    texts = [(await el.inner_text()).strip() for el in elements]
    return "; ".join(texts)


async def scrape_columbia():
    """Scrape faculty cards from the page at the module-level ``URL``.

    Launches a headed Chromium instance, loads the page, scrolls to the bottom
    once, waits three seconds, and then reads every element matched by the
    best-matching card selector.

    Returns:
        list[dict]: one ``{"name", "title", "division"}`` dict per card that
        has a non-empty ``h3`` text. Multiple titles/divisions on a card are
        joined with ``"; "``. Cards without a name are skipped.

    Side effects:
        Prints the number of matched card elements ("Found cards: N").

    The browser is closed in a ``finally`` block so it is also shut down when
    navigation or extraction raises.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,  # Columbia requires real browser
            args=[
                "--disable-blink-features=AutomationControlled",
                "--disable-dev-shm-usage"
            ]
        )

        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                viewport={"width": 1400, "height": 900}
            )

            page = await context.new_page()
            # NOTE: timeout=0 disables Playwright's navigation timeout, so this
            # call waits for as long as the page takes to reach DOMContentLoaded.
            await page.goto(URL, wait_until="domcontentloaded", timeout=0)

            # Columbia requires scroll to trigger content render
            await page.evaluate("""() => {
            window.scrollBy(0, document.body.scrollHeight);
        }""")
            # Fixed pause to give the page time to render after the scroll.
            await page.wait_for_timeout(3000)

            # Try all possible card selectors
            selectors = [
                "div.m-listing-faculty",       # Expected
                "div.views-row",               # Drupal views fallback
                "div.field--item",             # Sometimes used
                "article"                      # Very fallback
            ]

            # Keep whichever selector matches the most elements; on a tie the
            # earlier selector in the list wins because the comparison is strict.
            cards = []
            for s in selectors:
                found = await page.query_selector_all(s)
                if len(found) > len(cards):
                    cards = found

            print("Found cards:", len(cards))

            results = []

            for card in cards:
                name_el = await card.query_selector("h3")
                name = (await name_el.inner_text()).strip() if name_el else ""

                title = await _joined_inner_texts(card, "dt.m-detail-meta__item-title")
                division = await _joined_inner_texts(card, "dd.m-detail-meta__item-content")

                # Elements without a name are not faculty cards (or are empty); skip them.
                if name:
                    results.append({
                        "name": name,
                        "title": title,
                        "division": division
                    })

            return results
        finally:
            # Always release the browser, even if scraping failed part-way.
            await browser.close()


In [9]:
# Run the scraper and keep the returned list of records in `data`.
# Uses a top-level `await`, so this cell relies on the notebook kernel's
# support for awaiting at cell level (it is not valid in a plain script).
data = await scrape_columbia()
# Show only the first five records as the cell output.
data[:5]  


Found cards: 576


[{'name': 'Alex Mills',
  'title': 'Visiting Professor of Business',
  'division': 'Decision, Risk, and Operations Division'},
 {'name': 'Harry Mamaysky',
  'title': 'Professor of Professional Practice in the Faculty of Business; Faculty Director',
  'division': 'Finance Division; Program for Financial Studies'},
 {'name': 'Mattan Griffel',
  'title': 'Assistant Professor of Professional Practice in the Faculty of Business',
  'division': 'Decision, Risk, and Operations Division'},
 {'name': 'Kamel Jedidi',
  'title': 'Jerome A. Chazen Professor of Global Business',
  'division': 'Marketing Division'},
 {'name': 'Ciamac Moallemi',
  'title': 'William von Mueffling Professor of Business',
  'division': 'Decision, Risk, and Operations Division'}]

In [10]:
# Build a DataFrame from the scraped records in `data` (one row per record).
df = pd.DataFrame(data)
# Write the table to an Excel file at a path relative to the current working
# directory; index=False leaves the row index out of the sheet.
df.to_excel("columbia_faculty_by_expertise.xlsx", index=False)
# Display the first rows as the cell output.
df.head()


Unnamed: 0,name,title,division
0,Alex Mills,Visiting Professor of Business,"Decision, Risk, and Operations Division"
1,Harry Mamaysky,Professor of Professional Practice in the Facu...,Finance Division; Program for Financial Studies
2,Mattan Griffel,Assistant Professor of Professional Practice i...,"Decision, Risk, and Operations Division"
3,Kamel Jedidi,Jerome A. Chazen Professor of Global Business,Marketing Division
4,Ciamac Moallemi,William von Mueffling Professor of Business,"Decision, Risk, and Operations Division"
