In [None]:
# ===============================================================
# Wisconsin Business School Faculty List Scraper (Revised Version)
# Handles pagination and filters
# ===============================================================

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time

# Banner so the start of a run is easy to spot in the cell output.
print("üöÄ Wisconsin Business School Faculty List Scraper (Revised Version)")
print("=" * 70)

# Chrome launch arguments, applied in the order listed.
# Add '--headless' to this tuple to run without a visible browser window.
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--no-sandbox',
    '--disable-dev-shm-usage',
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
):
    options.add_argument(chrome_flag)

# ChromeDriverManager().install() provides the value handed to Service();
# the resulting service and the options above configure the Chrome session.
chromedriver_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chromedriver_service, options=options)

try:
    # =============== Step 1: Visit page and set filter ===============
    print("\nüìå Step 1: Visiting Directory page...")
    
    directory_url = "https://business.wisc.edu/directory/"
    driver.get(directory_url)
    
    print("‚è≥ Waiting for page to load...")
    time.sleep(5)
    
    # Find and select "Faculty" filter
    print("\nüîç Looking for 'Type' filter...")
    
    try:
        # Find dropdown with class='facetwp-dropdown'
        type_select = None
        
        selects = driver.find_elements(By.CLASS_NAME, "facetwp-dropdown")
        
        if selects:
            # Usually the first one is the Type filter
            type_select = selects[0]
            print(f"‚úì Found filter (class='facetwp-dropdown')")
            
            select_obj = Select(type_select)
            
            # Show all options
            print("\nAvailable options:")
            for idx, option in enumerate(select_obj.options, 1):
                print(f"  {idx}. {option.text}")
            
            # Select "Faculty"
            try:
                select_obj.select_by_value("faculty")
                print("\n‚úÖ Selected 'Faculty' (220 members)")
                time.sleep(5)  # Wait for page update and load
            except Exception as e:
                print(f"\n‚ö†Ô∏è  Selection failed: {e}")
        else:
            print("\n‚ö†Ô∏è  Filter not found")
    
    except Exception as e:
        print(f"\n‚ö†Ô∏è  Filter operation failed: {e}")
        print("Continuing to scrape current displayed content...")
    
    # =============== Step 2: Get all Faculty from all pages ===============
    # Walks the paginated listing, collecting every profile link into a set,
    # and advances by clicking the pagination control for the next page number.
    # Stops when no control for the next page can be clicked or after
    # `max_pages` pages, whichever comes first.
    print("\nüìå Step 2: Traversing all pages...")

    def _scroll_and_click(element):
        """Centre `element` in the viewport, pause briefly, then click it via JavaScript."""
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
        time.sleep(0.5)
        driver.execute_script("arguments[0].click();", element)

    all_profile_urls = set()
    current_page = 1
    max_pages = 20  # Upper bound on pages visited so the loop always terminates

    while current_page <= max_pages:
        print(f"\n{'='*70}")
        print(f"üìÑ Page {current_page}")
        print(f"{'='*70}")

        # Wait for page to load
        time.sleep(3)

        # Scroll to the bottom and back to the top before reading the links
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(1)

        # Collect every profile link on the current page
        page_links = driver.find_elements(By.TAG_NAME, "a")
        page_profile_count = 0

        for link in page_links:
            try:
                href = link.get_attribute('href')
            except Exception:
                # Skip anchors that can no longer be read (for example after the
                # listing re-rendered). Catching Exception rather than using a
                # bare except lets Ctrl-C reach the outer KeyboardInterrupt handler.
                continue

            if href and '/profile/' in href and 'business.wisc.edu' in href:
                if href not in all_profile_urls:
                    all_profile_urls.add(href)
                    page_profile_count += 1

        print(f"  ‚úì Added {page_profile_count} Faculty on this page")
        print(f"  ‚úì Total collected {len(all_profile_urls)} Faculty")

        # Try each click strategy in turn until one succeeds
        next_clicked = False

        try:
            next_page_num = current_page + 1

            # Scroll to bottom of page to ensure pagination buttons are visible
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)

            # Method 1: find the .facetwp-page element whose text is the next
            # page number and click it entirely inside the browser
            try:
                js_click = f"""
                var pages = document.querySelectorAll('.facetwp-page');
                for (var i = 0; i < pages.length; i++) {{
                    if (pages[i].textContent.trim() === '{next_page_num}') {{
                        pages[i].click();
                        return true;
                    }}
                }}
                return false;
                """
                result = driver.execute_script(js_click)
                if result:
                    print(f"  üîò Clicked page {next_page_num} (JavaScript)")
                    next_clicked = True
                    time.sleep(4)
            except Exception as e:
                print(f"  Method 1 failed: {e}")

            # Method 2: locate the numbered link by XPath and click it
            if not next_clicked:
                try:
                    page_buttons = driver.find_elements(By.XPATH,
                        f"//a[contains(@class, 'facetwp-page') and text()='{next_page_num}']")

                    if page_buttons:
                        _scroll_and_click(page_buttons[0])
                        print(f"  üîò Clicked page {next_page_num}")
                        next_clicked = True
                        time.sleep(4)
                except Exception as e:
                    print(f"  Method 2 failed: {e}")

            # Method 3: fall back to a "Next"-style link
            if not next_clicked:
                try:
                    next_buttons = driver.find_elements(By.XPATH,
                        "//a[contains(@class, 'facetwp-page') and (contains(text(), '‚Ä∫') or contains(text(), 'Next'))]")

                    if next_buttons:
                        _scroll_and_click(next_buttons[0])
                        print(f"  üîò Clicked 'Next' button")
                        next_clicked = True
                        time.sleep(4)
                except Exception as e:
                    print(f"  Method 3 failed: {e}")

            # Method 4: scan every pagination link for the next page number
            if not next_clicked:
                try:
                    all_page_links = driver.find_elements(By.CSS_SELECTOR, "a.facetwp-page, .pagination a")
                    print(f"  Found {len(all_page_links)} pagination links")

                    for candidate in all_page_links:
                        if candidate.text.strip() == str(next_page_num):
                            _scroll_and_click(candidate)
                            print(f"  üîò Clicked page {next_page_num}")
                            next_clicked = True
                            time.sleep(4)
                            break
                except Exception as e:
                    print(f"  Method 4 failed: {e}")

            if not next_clicked:
                print(f"  ‚ö†Ô∏è  Next page button not found, may have reached last page")
                break

            current_page += 1

        except Exception as e:
            print(f"  ‚úó Pagination operation failed: {e}")
            import traceback
            traceback.print_exc()
            break
    
    # Sort the collected URLs so the run order is stable between executions
    profile_urls = sorted(all_profile_urls)

    print(f"\n{'='*70}")
    print(f"‚úÖ Total found {len(profile_urls)} Faculty")
    print(f"{'='*70}")

    if len(profile_urls) == 0:
        # Nothing matched: keep the rendered HTML so the selectors can be debugged
        print("‚ùå No Faculty found")
        with open("directory_error.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print("Saved page to directory_error.html")
        # Raise SystemExit instead of calling exit(): inside a Jupyter kernel
        # exit() only requests a kernel shutdown and the cell keeps running.
        # The enclosing `finally` closes the browser, so it is not quit here.
        raise SystemExit(0)

    # Preview the first 20 names derived from the URLs.
    # The URLs end with '/', so it is stripped before formatting the name.
    print("\nüìã First 20 Faculty:")
    for i, url in enumerate(profile_urls[:20], 1):
        name = url.split('/profile/')[-1].strip('/').replace('-', ' ').title()
        print(f"  {i}. {name}")
    
    # =============== Step 3: Collect basic information for each Faculty ===============
    print(f"\nüìå Step 3: Collecting detailed information for {len(profile_urls)} Faculty")
    print("="*70)

    # Ask how much to scrape:
    #   'y'      -> visit every profile
    #   <number> -> visit only the first N profiles
    #   'n'      -> save the links only and stop
    #   other    -> cancel
    # Surrounding whitespace is stripped so an answer such as "10 " is not
    # mistaken for a cancel.
    user_input = input(f"\nThere are {len(profile_urls)} Faculty, estimated time {len(profile_urls)*2//60} minutes.\nEnter 'y' to continue, or enter a number to collect only first N, or 'n' to save links only: ").strip()

    if user_input.lower() == 'n':
        # Only save links, don't visit detail pages.
        # The URLs end with '/', so it is stripped before formatting the name.
        df_urls = pd.DataFrame({
            'Profile_URL': profile_urls,
            'Name_from_URL': [url.split('/profile/')[-1].strip('/').replace('-', ' ').title() for url in profile_urls]
        })
        df_urls.to_csv("wisc_faculty_urls_only.csv", index=False, encoding='utf-8-sig')
        print(f"‚úÖ Saved {len(profile_urls)} links to wisc_faculty_urls_only.csv")
        # Raise SystemExit instead of calling exit(): inside a Jupyter kernel
        # exit() only requests a kernel shutdown and the cell keeps running.
        # The enclosing `finally` closes the browser.
        raise SystemExit(0)

    elif user_input.isdecimal():
        # isdecimal() accepts only characters that int() can parse
        profile_urls = profile_urls[:int(user_input)]
        print(f"‚úì Will collect first {len(profile_urls)} Faculty")
    elif user_input.lower() != 'y':
        print("Cancelled")
        raise SystemExit(0)
    
    # Visit each profile page and build one record per person.
    # A failure on one profile is counted and reported, then the loop moves on.
    faculty_list = []
    failed_count = 0

    for idx, profile_url in enumerate(profile_urls, 1):
        try:
            # Readable fallback name from the URL; the URLs end with '/',
            # so it is stripped before formatting.
            name_from_url = profile_url.split('/profile/')[-1].strip('/').replace('-', ' ').title()

            print(f"[{idx}/{len(profile_urls)}] {name_from_url}")

            driver.get(profile_url)
            time.sleep(2)

            # Parse the rendered page
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Record skeleton. 'Office' has no extraction step below and stays empty.
            info = {
                'Name': '',
                'First_Name': '',
                'Last_Name': '',
                'Title': '',
                'Department': '',
                'Email': '',
                'Phone': '',
                'Office': '',
                'Profile_URL': profile_url,
                'Google_Scholar_URL': ''
            }

            # Name: text of the first <h1>, falling back to the URL-derived name
            # when the heading is missing or empty
            name_tag = soup.find('h1')
            full_name = name_tag.get_text(strip=True) if name_tag else ''
            if not full_name:
                full_name = name_from_url
            info['Name'] = full_name

            # First/last name are only filled when there are at least two words
            name_parts = full_name.split()
            if len(name_parts) >= 2:
                info['First_Name'] = name_parts[0]
                info['Last_Name'] = name_parts[-1]

            # Title: first short text fragment that mentions an academic rank keyword
            for tag in soup.find_all(['p', 'div', 'span', 'h2', 'h3']):
                text = tag.get_text(strip=True)
                if text and len(text) < 200:
                    if any(word in text.lower() for word in ['professor', 'lecturer', 'instructor', 'assistant', 'associate']):
                        info['Title'] = text
                        break

            # Department: first element whose class attribute mentions 'department'
            dept_elem = soup.find(['p', 'div', 'span'], class_=lambda x: x and 'department' in str(x).lower())
            if dept_elem:
                info['Department'] = dept_elem.get_text(strip=True)

            # Email: read the address from the mailto: href rather than the link
            # text, because the visible text may be a label instead of the address.
            # Any '?subject=...' style suffix is dropped.
            email_tag = soup.find('a', href=lambda x: x and x.strip().lower().startswith('mailto:'))
            if email_tag:
                address = email_tag['href'].strip()[len('mailto:'):].split('?', 1)[0].strip()
                info['Email'] = address or email_tag.get_text(strip=True)

            # Phone: visible text of the first tel: link
            phone_tag = soup.find('a', href=lambda x: x and 'tel:' in x)
            if phone_tag:
                info['Phone'] = phone_tag.get_text(strip=True)

            # Google Scholar: href of the first link pointing at scholar.google
            scholar_link = soup.find('a', href=lambda x: x and 'scholar.google' in str(x))
            if scholar_link:
                info['Google_Scholar_URL'] = scholar_link.get('href')

            faculty_list.append(info)

            print(f"  ‚úì {info['Name']}")
            print(f"    {info['Title'][:50] if info['Title'] else 'N/A'}")

            # Checkpoint every 20 profiles visited so a crash loses little work
            if idx % 20 == 0:
                backup_df = pd.DataFrame(faculty_list)
                backup_df.to_csv("wisc_faculty_backup.csv", index=False, encoding='utf-8-sig')
                print(f"\n  üíæ Backed up {idx} members\n")

        except Exception as e:
            print(f"  ‚úó Failed: {e}")
            failed_count += 1
    
    # =============== Step 4: Save data ===============
    print("\n" + "="*70)
    print("üìå Step 4: Saving data...")

    df = pd.DataFrame(faculty_list)

    # Full information
    df.to_csv("wisc_faculty_list_full.csv", index=False, encoding='utf-8-sig')
    print(f"‚úÖ wisc_faculty_list_full.csv ({len(df)} members)")

    # When every profile failed, `df` has no columns at all, so the per-column
    # counts are guarded instead of indexing df['Email'] directly.
    email_count = int((df['Email'] != '').sum()) if 'Email' in df.columns else 0
    scholar_count = int((df['Google_Scholar_URL'] != '').sum()) if 'Google_Scholar_URL' in df.columns else 0

    # Statistics: total is every profile attempted (scraped plus failed)
    print("\nüìä Statistics:")
    print("="*70)
    print(f"Total Faculty: {len(df) + failed_count}")
    print(f"Success: {len(df)}")
    print(f"Failed: {failed_count}")
    print(f"With Email: {email_count}")
    print(f"With Google Scholar: {scholar_count}")

    print("\n‚úÖ All completed!")

except KeyboardInterrupt:
    print("\n‚ö†Ô∏è  User interrupted")
    if 'faculty_list' in locals() and faculty_list:
        pd.DataFrame(faculty_list).to_csv("wisc_faculty_interrupted.csv", index=False, encoding='utf-8-sig')
        print(f"üíæ Saved {len(faculty_list)} members")

except Exception as e:
    print(f"\n‚ùå Error: {e}")
    import traceback
    traceback.print_exc()

finally:
    try:
        driver.quit()
        print("\nüîí Browser closed")
    except:
        pass

In [None]:
import pandas as pd

# Load the full results file written by the scraper cell
df = pd.read_csv("wisc_faculty_list_full.csv", encoding='utf-8-sig')

# Project down to the two columns wanted in the spreadsheet
wanted_columns = ['Name', 'Profile_URL']
df_filtered = df.loc[:, wanted_columns]

# Write the projection to an Excel workbook without the index column
df_filtered.to_excel("wisc_faculty_name_url.xlsx", index=False, engine='openpyxl')

print("‚úÖ Saved to wisc_faculty_name_url.xlsx")
print(f"Total {len(df_filtered)} rows")

In [11]:
import pandas as pd

# Load the workbook produced by the export cell
df = pd.read_excel("wisc_faculty_name_url.xlsx")

# Print the first 10 rows as a plain-text table without the index
preview = df.head(10)
print(preview.to_string(index=False))

                  Name                                                        Profile_URL
         Aaron Thielen         https://business.wisc.edu/directory/profile/aaron-thielen/
        Abdullah Yavas        https://business.wisc.edu/directory/profile/abdullah-yavas/
           Adam J Bock           https://business.wisc.edu/directory/profile/adam-j-bock/
        Adam R Smedema        https://business.wisc.edu/directory/profile/adam-r-smedema/
          Alan Stoffer          https://business.wisc.edu/directory/profile/alan-stoffer/
Alexander D. Stajkovic https://business.wisc.edu/directory/profile/alexander-d-stajkovic/
         Alina Arefeva         https://business.wisc.edu/directory/profile/alina-arefeva/
              Allen Li              https://business.wisc.edu/directory/profile/allen-li/
   Alyssa Gosbee Stang   https://business.wisc.edu/directory/profile/alyssa-gosbee-stang/
          Amanda Kenny          https://business.wisc.edu/directory/profile/amanda-kenny/
