In [3]:
import asyncio
from playwright.async_api import async_playwright
import pandas as pd
import time

URL = "https://www.kellogg.northwestern.edu/academics-research/faculty-directory/?facultyType=Clinical;Tenure-line"

def parse_department(title: str) -> str:    
    if not title:
        return ""
    parts = title.rsplit(" of ", 1)
    return parts[1].strip() if len(parts) == 2 else ""

async def scrape_kellogg():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(URL, wait_until="networkidle")

        await page.wait_for_selector("div.faculty-directory-listing__faculty-member")

        
        while True:
            try:
                load_more = await page.query_selector("button[data-loadmore='true'], button:has-text('More Faculty')")
                if load_more is None:
                    break
                await load_more.scroll_into_view_if_needed()
                await load_more.click()
                await asyncio.sleep(1.5)
            except:
                break

        cards = await page.query_selector_all("div.faculty-directory-listing__faculty-member")
        print("Total faculty:", len(cards))

        results = []
        for c in cards:
            name_el = await c.query_selector("a.faculty-directory-listing__name, span.faculty-directory-listing__name")
            title_el = await c.query_selector("div.faculty-directory-listing__title")

            name = (await name_el.inner_text()).strip() if name_el else ""
            title = (await title_el.inner_text()).strip() if title_el else ""
            department = parse_department(title)

            results.append({
                "name": name,
                "title": title,
                "department": department
            })

        await browser.close()
        return results



In [4]:
data = await scrape_kellogg()
data[:5]



Total faculty: 192


[{'name': 'Tarek Abdallah\ufeff',
  'title': 'Associate Professor of Operations',
  'department': 'Operations'},
 {'name': 'Chethana Achar\ufeff',
  'title': 'Associate Professor of Marketing',
  'department': 'Marketing'},
 {'name': 'Nabil Al-Najjar\ufeff',
  'title': 'John L. and Helen Kellogg Professor of Managerial Economics & Decision Sciences',
  'department': 'Managerial Economics & Decision Sciences'},
 {'name': 'Matthew Allen\ufeff',
  'title': 'John L. Ward Clinical Professor of Family Enterprises and Executive Director of the Ward Center for Family Enterprises',
  'department': 'the Ward Center for Family Enterprises'},
 {'name': 'Torben Gustav Andersen\ufeff',
  'title': 'Nathan S. and Mary P. Sharp Professor of Finance',
  'department': 'Finance'}]

In [5]:
df = pd.DataFrame(data)
df.to_excel("kellogg_faculty.xlsx", index=False)
df.head()


Unnamed: 0,name,title,department
0,Tarek Abdallah﻿,Associate Professor of Operations,Operations
1,Chethana Achar﻿,Associate Professor of Marketing,Marketing
2,Nabil Al-Najjar﻿,John L. and Helen Kellogg Professor of Manager...,Managerial Economics & Decision Sciences
3,Matthew Allen﻿,John L. Ward Clinical Professor of Family Ente...,the Ward Center for Family Enterprises
4,Torben Gustav Andersen﻿,Nathan S. and Mary P. Sharp Professor of Finance,Finance
