In [None]:
"""
Cell 2: Scrape Publications
Reads the faculty list produced by Cell 1 and scrapes the publications of each faculty member
"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from tqdm import tqdm

def setup_driver():
    """Create and return a headless Chrome WebDriver.

    Two start-up strategies are attempted in order:

    1. Chrome with a driver binary installed through ``ChromeDriverManager``.
    2. Chrome started with only the options (no explicit ``Service``).

    A message is printed for each strategy that fails; if both fail, the
    exception raised by the second attempt is re-raised.
    """
    launch_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--disable-gpu',
        '--window-size=1920,1080',
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        '--disable-extensions',
        '--disable-infobars',
        '--remote-debugging-port=9222',
    )

    opts = Options()
    for flag in launch_flags:
        opts.add_argument(flag)

    try:
        # Strategy 1: driver binary provided by webdriver_manager.
        managed_service = Service(ChromeDriverManager().install())
        return webdriver.Chrome(service=managed_service, options=opts)
    except Exception as first_error:
        print(f"Method 1 failed: {first_error}")
        try:
            # Strategy 2: start Chrome without an explicit Service.
            return webdriver.Chrome(options=opts)
        except Exception as second_error:
            print(f"Method 2 also failed: {second_error}")
            raise

def scrape_publications(driver, profile_url):
    """Scrape publication entries from a faculty profile page.

    Loads ``profile_url``, tries to activate a "Publications" tab, then
    collects the text of visible list items matched by a series of XPath
    selectors, stopping after the first selector that yields any entries.

    Scraping is best-effort: errors raised while locating or reading
    elements are skipped, and a failure to load the page is reported with a
    printed warning. Only ``Exception`` subclasses are handled, so
    ``KeyboardInterrupt`` and ``SystemExit`` propagate and a long run can
    still be interrupted.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        profile_url: URL of the faculty profile page.

    Returns:
        List of unique publication strings in the order they were found;
        empty if nothing was found or the page could not be processed.
    """
    publications = []

    try:
        driver.get(profile_url)
        time.sleep(3)  # fixed wait after navigation before querying the page

        # Try each candidate locator for the "Publications" tab until one
        # can be found and clicked. Not finding a tab is not an error.
        pub_tab_selectors = [
            "//a[contains(text(), 'Publications')]",
            "//button[contains(text(), 'Publications')]",
            "//*[@role='tab' and contains(text(), 'Publications')]",
        ]
        for selector in pub_tab_selectors:
            try:
                pub_tab = driver.find_element(By.XPATH, selector)
                driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", pub_tab)
                time.sleep(0.5)

                try:
                    pub_tab.click()
                except Exception:
                    # Fall back to a JavaScript click if the native click raises.
                    driver.execute_script("arguments[0].click();", pub_tab)

                time.sleep(2)  # fixed wait after clicking the tab
                break
            except Exception:
                continue

        # Candidate locators for publication list items, tried in order.
        pub_selectors = [
            "//li[@class='dm-profile-activity']",
            "//ol[@class='dm-profile-activities']//li",
            "//section[@data-hash='#Publications']//li",
            "//div[@class='dm-profile-contents']//li",
        ]
        for selector in pub_selectors:
            try:
                pub_elements = driver.find_elements(By.XPATH, selector)
                visible_pubs = [pub for pub in pub_elements if pub.is_displayed()]

                for pub in visible_pubs:
                    text = pub.text.strip()
                    # Entries of 20 characters or fewer are discarded.
                    if len(text) > 20:
                        publications.append(text)
                if publications:
                    break
            except Exception:
                continue

    except Exception as e:
        # Keep the run going, but make the failure visible instead of
        # silently reporting this profile as having no publications.
        print(f"\n  Warning: could not scrape {profile_url}: {e}")

    # Remove duplicates while preserving first-seen order.
    return list(dict.fromkeys(publications))

# Main execution for Cell 2: load the faculty list, scrape each profile,
# then summarise and save one row per (faculty, publication) pair.
rule = "=" * 70


def _publication_record(faculty, publication):
    """Build one output row for a faculty member and a publication string."""
    return {
        'Name': faculty['name'],
        'Title': faculty['title'],
        'Department': faculty['department'],
        'Profile URL': faculty['profile_url'],
        'Publication': publication,
    }


print(rule)
print("McCombs Faculty Scraper - Part 2: Publications")
print(rule)

# The faculty list is expected to have been written by Cell 1.
try:
    df_faculty = pd.read_csv('faculty_list.csv')
    print(f"\n✅ Loaded {len(df_faculty)} faculty members from faculty_list.csv")
except FileNotFoundError:
    print("\n❌ Error: faculty_list.csv not found!")
    print("Please run Cell 1 first to generate the faculty list.")
    raise

print("\nInitializing browser...")
driver = setup_driver()

try:
    print("\nStep 2: Scraping publications...")
    print(rule)

    all_publications = []

    faculty_rows = tqdm(df_faculty.iterrows(), total=len(df_faculty), desc="Scraping publications")
    for i, (_, faculty) in enumerate(faculty_rows, 1):
        try:
            publications = scrape_publications(driver, faculty['profile_url'])

            # One row per publication; a single blank-publication row when
            # nothing was found, so every faculty member stays in the output.
            for pub in (publications or [""]):
                all_publications.append(_publication_record(faculty, pub))

            # Progress update every 10 faculty
            if i % 10 == 0:
                print(f"\n  Processed {i}/{len(df_faculty)} faculty, {len(all_publications)} total records")

            time.sleep(1)

        except Exception as e:
            # Report the failure and keep a placeholder row for this person.
            print(f"\n  Error processing {faculty['name']}: {e}")
            all_publications.append(_publication_record(faculty, ""))
            continue

    # Assemble the collected rows into a table
    df = pd.DataFrame(all_publications)

    # Summary statistics
    print("\n" + rule)
    print("Data Statistics:")
    print(rule)
    print(f"Total records (rows): {len(df)}")
    print(f"Total faculty: {df['Name'].nunique()}")
    print(f"Total publications: {df['Publication'].astype(bool).sum()}")
    has_publication = df['Publication'] != ''
    print(f"Faculty with publications: {df.loc[has_publication, 'Name'].nunique()}")

    # Preview of the first rows
    print("\nFirst 10 records preview:")
    preview_columns = ['Name', 'Title', 'Publication']
    print(df[preview_columns].head(10).to_string(index=False))

    # Write the results to CSV
    output_file = 'mccombs_faculty_publications.csv'
    df.to_csv(output_file, index=False, encoding='utf-8-sig')

    print(f"\n✅ Data saved to: {output_file}")

    # Faculty with the most publication rows
    print("\nTop 5 faculty by publication count:")
    rows_per_faculty = df[has_publication].groupby('Name').size()
    pub_counts = rows_per_faculty.sort_values(ascending=False).head()
    for name, count in pub_counts.items():
        print(f"  {name}: {count} publications")

finally:
    # Always shut the browser down, even if scraping or saving failed.
    print("\nClosing browser...")
    driver.quit()

print("\n" + rule)
print("Cell 2 Complete! Publications scraped successfully.")
print(rule)

In [3]:
import pandas as pd

# Load the publications CSV from disk
publications_csv = "mccombs_faculty_publications.csv"
df = pd.read_csv(publications_csv)

# Show the first 10 rows
first_rows = df.head(10)
print(first_rows)


                 Name                               Title  \
0       Soren Aandahl                            Lecturer   
1       Soren Aandahl                            Lecturer   
2  Christopher Aarons  Assistant Professor of Instruction   
3  Christopher Aarons  Assistant Professor of Instruction   
4  Christopher Aarons  Assistant Professor of Instruction   
5  Christopher Aarons  Assistant Professor of Instruction   
6      Ashish Agarwal                           Professor   
7      Ashish Agarwal                           Professor   
8      Ashish Agarwal                           Professor   
9      Ashish Agarwal                           Professor   

                                     Department  \
0                                    Accounting   
1                                    Accounting   
2                                     Marketing   
3                                     Marketing   
4                                     Marketing   
5                     