In [1]:
"""
Cell 1: Scrape Faculty Basic Information
Collects: Name, Title, Department, Profile URL from all 27 pages
"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from tqdm import tqdm

def setup_driver():
    """Create the headless Chrome WebDriver used by the scraper.

    The first attempt launches Chrome with a driver binary installed through
    ``ChromeDriverManager``. If that raises, a second attempt constructs
    ``webdriver.Chrome`` without an explicit service. If the second attempt
    raises as well, troubleshooting hints are printed and that second
    exception is re-raised to the caller.

    Returns:
        The started ``webdriver.Chrome`` instance.
    """
    # Command-line switches, applied to the options object in this exact order.
    browser_flags = (
        '--headless',  # Run in background
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--disable-gpu',
        '--window-size=1920,1080',
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        '--disable-extensions',
        '--disable-infobars',
        '--remote-debugging-port=9222',
    )

    chrome_options = Options()
    for flag in browser_flags:
        chrome_options.add_argument(flag)

    try:
        # Method 1: driver binary resolved by webdriver-manager.
        managed_service = Service(ChromeDriverManager().install())
        return webdriver.Chrome(service=managed_service, options=chrome_options)
    except Exception as managed_error:
        print(f"Method 1 failed: {managed_error}")
        print("Trying method 2: using system Chrome...")
        try:
            # Method 2: no explicit service is passed to webdriver.Chrome.
            return webdriver.Chrome(options=chrome_options)
        except Exception as fallback_error:
            print(f"Method 2 also failed: {fallback_error}")
            for hint in (
                "\nPlease try:",
                "1. Update Chrome to latest version",
                "2. Run: pip install --upgrade selenium webdriver-manager",
            ):
                print(hint)
            # Bare raise keeps the fallback error (and its traceback) intact.
            raise

def wait_for_faculty_cards(driver, timeout=20, settle_seconds=3):
    """Wait until the faculty card container is present on the current page.

    Args:
        driver: WebDriver whose currently loaded page is polled.
        timeout: Maximum number of seconds to wait for an element with class
            ``de_loop_custom-template`` to be present in the DOM.
        settle_seconds: Extra pause, in seconds, after the container has been
            found and before returning. Defaults to 3, the pause this function
            has always used (presumably to let the cards finish rendering).

    Returns:
        True if the container was found, False if the wait raised an
        exception (for example because the timeout elapsed).
    """
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "de_loop_custom-template"))
        )
    except Exception as exc:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit (e.g. interrupting the kernel) still
        # stop the scrape instead of being reported as a page timeout.
        print(f"  Wait for faculty cards failed: {type(exc).__name__}")
        return False

    # Kept outside the try block so an interrupt during the pause propagates.
    time.sleep(settle_seconds)
    return True

def _card_text(card, css_selector, default=""):
    """Return the stripped text of the first match for css_selector in card.

    Returns ``default`` when the lookup or text read raises an ordinary
    exception (e.g. the element is missing). KeyboardInterrupt / SystemExit
    are deliberately not caught.
    """
    try:
        return card.find_element(By.CSS_SELECTOR, css_selector).text.strip()
    except Exception:
        return default


def scrape_faculty_page(driver, page_num):
    """Scrape faculty information from a single directory page.

    Loads the faculty directory URL for ``page_num``, waits for the card
    container via ``wait_for_faculty_cards``, and extracts one record per
    card that contains a faculty profile link.

    Args:
        driver: WebDriver used to load and query the page.
        page_num: Page number substituted into the ``page`` query parameter.

    Returns:
        A list of dicts with keys ``name``, ``title``, ``department`` and
        ``profile_url``. The list is empty when the page does not load in
        time or when locating the cards fails. Cards that raise while being
        processed (e.g. no profile link) are reported and skipped.
    """
    faculty_list = []

    url = f"https://www.mccombs.utexas.edu/faculty-and-research/faculty-directory?filter=true&page={page_num}"

    print(f"\nVisiting page {page_num}...")
    driver.get(url)

    if not wait_for_faculty_cards(driver):
        print(f"  ⚠️ Page {page_num} loading timeout")
        return faculty_list

    try:
        cards = driver.find_elements(By.CSS_SELECTOR, ".de_loop_custom-template article")

        # Fall back to every <article> on the page when the scoped selector
        # matches nothing.
        if not cards:
            cards = driver.find_elements(By.TAG_NAME, "article")

        print(f"  Found {len(cards)} faculty cards")

        for card in cards:
            try:
                # The profile link is mandatory: a card without one raises
                # here and is skipped by the handler below.
                link_elem = card.find_element(By.CSS_SELECTOR, "a[href*='mccombs.utexas.edu/faculty']")
                profile_url = link_elem.get_attribute('href')

                # None marks "name element lookup failed", which is distinct
                # from "name element found but empty"; only the former falls
                # back to the first line of the link text.
                name = _card_text(card, ".utm-faculty-card__name", default=None)
                if name is None:
                    name = link_elem.text.strip().split('\n')[0]

                if not name:
                    name = "Unknown"

                # Title and department are optional and default to "".
                title = _card_text(card, ".utm-faculty-card__title")
                department = _card_text(card, ".utm-faculty-card__department")

                faculty_list.append({
                    'name': name,
                    'title': title,
                    'department': department,
                    'profile_url': profile_url
                })

            except Exception as e:
                print(f"    Error processing card: {e}")
                continue

    except Exception as e:
        print(f"  Error finding cards: {e}")

    return faculty_list

# Main execution for Cell 1: start the browser, visit every directory page,
# and write the collected rows to faculty_list.csv.
print(
    "=" * 70,
    "McCombs Faculty Scraper - Part 1: Faculty Information",
    "=" * 70,
    "\nInitializing browser...",
    sep="\n",
)

driver = setup_driver()
all_faculty = []

try:
    print(
        "\nStep 1: Scraping faculty basic information...",
        "=" * 70,
        sep="\n",
    )

    # Number of directory pages to visit (pages are numbered from 1).
    total_pages = 27

    for page in tqdm(range(1, total_pages + 1), desc="Scraping pages"):
        faculty_on_page = scrape_faculty_page(driver, page)
        all_faculty += faculty_on_page
        # Two-second pause after each page before requesting the next one.
        time.sleep(2)

    print(f"\n✅ Found {len(all_faculty)} faculty members")

    if all_faculty:
        # Save faculty list
        df_faculty = pd.DataFrame(all_faculty)
        df_faculty.to_csv('faculty_list.csv', index=False, encoding='utf-8-sig')
        print(
            f"\n✅ Faculty list saved to: faculty_list.csv",
            f"\nPreview (first 5):",
            sep="\n",
        )
        print(df_faculty.head())
    else:
        print(
            "\n⚠️ No faculty found. Please check:",
            "1. Website is accessible",
            "2. Browser window display",
            sep="\n",
        )

finally:
    # Always release the browser, even when scraping raised.
    print("\nClosing browser...")
    driver.quit()

print(
    "\n" + "=" * 70,
    "Cell 1 Complete! Proceed to Cell 2 to scrape publications.",
    "=" * 70,
    sep="\n",
)

McCombs Faculty Scraper - Part 1: Faculty Information

Initializing browser...

Step 1: Scraping faculty basic information...


Scraping pages:   0%|          | 0/27 [00:00<?, ?it/s]


Visiting page 1...
  Found 15 faculty cards


Scraping pages:   4%|▎         | 1/27 [00:09<04:13,  9.73s/it]


Visiting page 2...
  Found 15 faculty cards


Scraping pages:   7%|▋         | 2/27 [00:21<04:26, 10.65s/it]


Visiting page 3...
  Found 15 faculty cards


Scraping pages:  11%|█         | 3/27 [00:29<03:53,  9.71s/it]


Visiting page 4...
  Found 15 faculty cards


Scraping pages:  15%|█▍        | 4/27 [00:38<03:35,  9.37s/it]


Visiting page 5...
  Found 15 faculty cards


Scraping pages:  19%|█▊        | 5/27 [00:46<03:15,  8.90s/it]


Visiting page 6...
  Found 15 faculty cards


Scraping pages:  22%|██▏       | 6/27 [00:54<02:58,  8.49s/it]


Visiting page 7...
  Found 15 faculty cards


Scraping pages:  26%|██▌       | 7/27 [01:02<02:48,  8.41s/it]


Visiting page 8...
  Found 15 faculty cards


Scraping pages:  30%|██▉       | 8/27 [01:10<02:38,  8.32s/it]


Visiting page 9...
  Found 15 faculty cards


Scraping pages:  33%|███▎      | 9/27 [01:19<02:30,  8.36s/it]


Visiting page 10...
  Found 15 faculty cards


Scraping pages:  37%|███▋      | 10/27 [01:26<02:18,  8.15s/it]


Visiting page 11...
  Found 15 faculty cards


Scraping pages:  41%|████      | 11/27 [01:34<02:09,  8.08s/it]


Visiting page 12...
  Found 15 faculty cards


Scraping pages:  44%|████▍     | 12/27 [01:42<01:59,  7.98s/it]


Visiting page 13...
  Found 15 faculty cards


Scraping pages:  48%|████▊     | 13/27 [01:50<01:50,  7.92s/it]


Visiting page 14...
  Found 15 faculty cards


Scraping pages:  52%|█████▏    | 14/27 [01:58<01:46,  8.18s/it]


Visiting page 15...
  Found 15 faculty cards


Scraping pages:  56%|█████▌    | 15/27 [02:06<01:37,  8.13s/it]


Visiting page 16...
  Found 15 faculty cards


Scraping pages:  59%|█████▉    | 16/27 [02:14<01:27,  7.92s/it]


Visiting page 17...
  Found 15 faculty cards


Scraping pages:  63%|██████▎   | 17/27 [02:22<01:19,  7.91s/it]


Visiting page 18...
  Found 15 faculty cards


Scraping pages:  67%|██████▋   | 18/27 [02:30<01:10,  7.88s/it]


Visiting page 19...
  Found 15 faculty cards


Scraping pages:  70%|███████   | 19/27 [02:37<01:01,  7.74s/it]


Visiting page 20...
  Found 15 faculty cards


Scraping pages:  74%|███████▍  | 20/27 [02:44<00:53,  7.61s/it]


Visiting page 21...
  Found 15 faculty cards


Scraping pages:  78%|███████▊  | 21/27 [02:52<00:45,  7.55s/it]


Visiting page 22...
  Found 15 faculty cards


Scraping pages:  81%|████████▏ | 22/27 [03:00<00:38,  7.61s/it]


Visiting page 23...
  Found 15 faculty cards


Scraping pages:  85%|████████▌ | 23/27 [03:07<00:30,  7.55s/it]


Visiting page 24...
  Found 15 faculty cards


Scraping pages:  89%|████████▉ | 24/27 [03:14<00:22,  7.56s/it]


Visiting page 25...
  Found 15 faculty cards


Scraping pages:  93%|█████████▎| 25/27 [03:22<00:15,  7.58s/it]


Visiting page 26...
  Found 15 faculty cards


Scraping pages:  96%|█████████▋| 26/27 [03:30<00:07,  7.61s/it]


Visiting page 27...
  Found 2 faculty cards


Scraping pages: 100%|██████████| 27/27 [03:37<00:00,  8.07s/it]



✅ Found 392 faculty members

✅ Faculty list saved to: faculty_list.csv

Preview (first 5):
                 name                               title  \
0       Soren Aandahl                            Lecturer   
1  Christopher Aarons  Assistant Professor of Instruction   
2      Ashish Agarwal                           Professor   
3       Shiva Agarwal                 Assistant Professor   
4    Joshua Alexander  Assistant Professor of Instruction   

                                     department  \
0                                    Accounting   
1                                     Marketing   
2  Information, Risk, and Operations Management   
3                                    Management   
4                                       Finance   

                                         profile_url  
0  https://www.mccombs.utexas.edu/faculty-and-res...  
1  https://www.mccombs.utexas.edu/faculty-and-res...  
2  https://www.mccombs.utexas.edu/faculty-and-res...  
3  https://www

In [3]:
import pandas as pd

# Load the faculty list back from faculty_list.csv.
df = pd.read_csv("faculty_list.csv")

# Print the first 10 rows as plain text.
print(df.iloc[:10])

                 name                               title  \
0       Soren Aandahl                            Lecturer   
1  Christopher Aarons  Assistant Professor of Instruction   
2      Ashish Agarwal                           Professor   
3       Shiva Agarwal                 Assistant Professor   
4    Joshua Alexander  Assistant Professor of Instruction   
5         Megan Allen  Assistant Professor of Instruction   
6      Andres Almazan                           Professor   
7         Mark Alpert                  Professor Emeritus   
8        Aydogan Alti                 Associate Professor   
9       Richard Amato                            Lecturer   

                                     department  \
0                                    Accounting   
1                                     Marketing   
2  Information, Risk, and Operations Management   
3                                    Management   
4                                       Finance   
5                     