In [None]:
"""
Cell 2: Scrape Publications
Reads the faculty list produced by Cell 1 and scrapes the publications of each faculty member
"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from tqdm import tqdm

def setup_driver():
    """Create and return a headless Chrome WebDriver.

    The driver binary is first resolved through ``ChromeDriverManager``; if
    that attempt raises, a plain ``webdriver.Chrome`` is tried with the same
    options. When both attempts fail, the second exception is re-raised.
    """
    launch_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--disable-gpu',
        '--window-size=1920,1080',
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        '--disable-extensions',
        '--disable-infobars',
        '--remote-debugging-port=9222',
    )

    chrome_options = Options()
    for flag in launch_flags:
        chrome_options.add_argument(flag)

    try:
        # Attempt 1: driver binary located via webdriver_manager
        return webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )
    except Exception as e:
        print(f"Method 1 failed: {e}")
        try:
            # Attempt 2: let Selenium locate the driver itself
            return webdriver.Chrome(options=chrome_options)
        except Exception as e2:
            print(f"Method 2 also failed: {e2}")
            raise

def scrape_publications(driver, profile_url):
    """Scrape publication entries from a faculty profile page.

    Loads ``profile_url``, makes a best-effort attempt to activate the
    "Publications" tab, then collects the text of every visible
    ``li.dm-profile-activity`` element inside the section whose ``data-hash``
    is ``#Publications``.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        profile_url: URL of the faculty profile page.

    Returns:
        A list of unique publication strings in page order. The list is empty
        (or partial) if the page could not be processed; in that case a
        warning is printed instead of raising, so a long scraping run is not
        aborted by a single bad page.
    """
    # Entries containing any of these words are treated as headers/navigation.
    # NOTE(review): this also drops genuine citations that contain e.g.
    # "Education" or "Publications" (journal/publisher names) - confirm intent.
    excluded_markers = ('About', 'Education', 'Awards', 'Honors', 'Publications', 'tabindex')
    publications = []

    try:
        driver.get(profile_url)
        time.sleep(3)

        # Click Publications tab - target the specific section.
        # This step is best-effort: if no tab can be found or clicked we still
        # try to read the section below. ``except Exception`` (not a bare
        # ``except``) keeps Ctrl-C / SystemExit working during long runs.
        try:
            # Find the Publications section link by data-hash
            pub_tab_selector = "//section[@data-hash='#Publications']//ancestor::div//a[contains(text(), 'Publications')] | //a[@href='#Publications'] | //*[@data-hash='#Publications']//preceding::a[contains(text(), 'Publications')][1]"

            try:
                pub_tab = driver.find_element(By.XPATH, pub_tab_selector)
                driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", pub_tab)
                time.sleep(0.5)

                try:
                    pub_tab.click()
                except Exception:
                    # Native click failed; fall back to a JavaScript click
                    driver.execute_script("arguments[0].click();", pub_tab)

                time.sleep(2)
            except Exception:
                # Fallback: try generic Publications link
                pub_tab = driver.find_element(By.XPATH, "//a[contains(text(), 'Publications')]")
                driver.execute_script("arguments[0].click();", pub_tab)
                time.sleep(2)
        except Exception:
            pass

        # CRITICAL: Only get publications from the Publications section
        # Use very specific selector to avoid other sections
        pub_selector = "//section[@data-hash='#Publications']//li[@class='dm-profile-activity']"

        for pub in driver.find_elements(By.XPATH, pub_selector):
            try:
                # Only get visible elements
                if not pub.is_displayed():
                    continue
                text = pub.text.strip()
            except Exception:
                # A single unreadable element must not discard the others
                continue

            # Must be substantial text and must not look like a header/navigation
            if len(text) > 30 and not any(marker in text for marker in excluded_markers):
                publications.append(text)

    except Exception as e:
        # Surface the failure so it is not mistaken for "no publications"
        print(f"\n  Warning: could not scrape {profile_url}: {e}")

    # Remove duplicates while preserving order
    return list(dict.fromkeys(publications))

# Main execution for Cell 2
# Reads faculty_list.csv, scrapes every profile with one shared browser and
# writes one row per (faculty member, publication) to
# mccombs_faculty_publications.csv. Faculty without publications keep a single
# row with an empty Publication value.
print("=" * 70)
print("McCombs Faculty Scraper - Part 2: Publications")
print("=" * 70)

# Load faculty list from Cell 1
try:
    df_faculty = pd.read_csv('faculty_list.csv')
    print(f"\n✅ Loaded {len(df_faculty)} faculty members from faculty_list.csv")
except FileNotFoundError:
    print("\n❌ Error: faculty_list.csv not found!")
    print("Please run Cell 1 first to generate the faculty list.")
    raise

print("\nInitializing browser...")
driver = setup_driver()

# Explicit column list so the DataFrame has the expected columns even when the
# faculty list is empty (otherwise the statistics below raise KeyError).
output_columns = ['Name', 'Title', 'Department', 'Profile URL', 'Publication']

try:
    print("\nStep 2: Scraping publications...")
    print("=" * 70)

    all_publications = []

    progress = tqdm(df_faculty.iterrows(), total=len(df_faculty), desc="Scraping publications")
    for i, (_, faculty) in enumerate(progress, 1):
        # Fields shared by every record of this faculty member
        faculty_info = {
            'Name': faculty['name'],
            'Title': faculty['title'],
            'Department': faculty['department'],
            'Profile URL': faculty['profile_url'],
        }

        try:
            publications = scrape_publications(driver, faculty['profile_url'])
        except Exception as e:
            print(f"\n  Error processing {faculty['name']}: {e}")
            publications = []

        # One record per publication; keep one empty record if there are none
        for pub in publications or [""]:
            all_publications.append({**faculty_info, 'Publication': pub})

        # Progress update every 10 faculty
        if i % 10 == 0:
            print(f"\n  Processed {i}/{len(df_faculty)} faculty, {len(all_publications)} total records")

        # Pause between requests, also after a failure
        time.sleep(1)

    # Convert to DataFrame
    df = pd.DataFrame(all_publications, columns=output_columns)
    has_publication = df['Publication'] != ''

    # Display statistics
    print("\n" + "=" * 70)
    print("Data Statistics:")
    print("=" * 70)
    print(f"Total records (rows): {len(df)}")
    print(f"Total faculty: {df['Name'].nunique()}")
    print(f"Total publications: {has_publication.sum()}")
    print(f"Faculty with publications: {df[has_publication]['Name'].nunique()}")

    # Preview
    print("\nFirst 10 records preview:")
    print(df[['Name', 'Title', 'Publication']].head(10).to_string(index=False))

    # Save to CSV
    output_file = 'mccombs_faculty_publications.csv'
    df.to_csv(output_file, index=False, encoding='utf-8-sig')

    print(f"\n✅ Data saved to: {output_file}")

    # Top 5 faculty by publication count
    print("\nTop 5 faculty by publication count:")
    pub_counts = df[has_publication].groupby('Name').size().sort_values(ascending=False).head()
    for name, count in pub_counts.items():
        print(f"  {name}: {count} publications")

finally:
    # Always release the browser, even if scraping or saving failed
    print("\nClosing browser...")
    driver.quit()

print("\n" + "=" * 70)
print("Cell 2 Complete! Publications scraped successfully.")
print("=" * 70)

In [None]:
"""
Remove Empty Publications
Remove rows where Publication field is empty or whitespace only
"""

import pandas as pd

# Read the data
df = pd.read_csv('mccombs_faculty_publications.csv')

print("=" * 70)
print("Remove Empty Publications")
print("=" * 70)

print(f"\nOriginal data:")
print(f"  Total rows: {len(df)}")
print(f"  Total faculty: {df['Name'].nunique()}")

# Flag rows whose Publication is missing or whitespace-only.
# fillna('') + astype(str) comes first because read_csv loads a column whose
# cells are all empty as float NaN, on which the .str accessor would raise.
is_empty = df['Publication'].fillna('').astype(str).str.strip() == ''

# Count empty publications
empty_count = is_empty.sum()
print(f"  Empty publications: {empty_count}")

# Remove empty publications
df_cleaned = df[~is_empty].copy()

print(f"\nAfter removing empty publications:")
print(f"  Total rows: {len(df_cleaned)}")
print(f"  Total faculty: {df_cleaned['Name'].nunique()}")
print(f"  Removed: {empty_count} rows")

# Preview
print(f"\nFirst 5 rows preview:")
print(df_cleaned[['Name', 'Publication']].head().to_string(index=False))

# Save cleaned data
output_file = 'mccombs_faculty_publications_cleaned.csv'
df_cleaned.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"\n✅ Cleaned data saved to: {output_file}")
print("\n" + "=" * 70)

In [None]:
"""
Extract Title from Publication
Add a 'title' column extracted from Publication field
Note: Name and Year already exist in the dataset
"""

import pandas as pd
import re

# Load the CSV written by the empty-row removal cell
df = pd.read_csv('mccombs_faculty_publications_cleaned.csv')

# Banner for this step
print("=" * 70, "Extract Title from Publication", "=" * 70, sep="\n")

# Summarise the input: row count and the columns already present
existing_columns = ', '.join(df.columns)
print("\nInput data:")
print(f"  Total rows: {len(df)}")
print(f"  Existing columns: {existing_columns}")

# Quoted title: opens with a straight (") or left typographic (U+201C) quote
# and closes with a straight or right typographic (U+201D) quote.
_QUOTED_TITLE_RE = re.compile(r'["\u201c]([^"\u201c\u201d]+)["\u201d]')
_YEAR_RE = re.compile(r'\b(19|20)\d{2}\b')
# Tried in order; the first marker that yields a long-enough title wins.
_TITLE_END_RES = tuple(re.compile(pattern) for pattern in (
    r'\.\s*<i>',           # Period before italic (journal name)
    r'\?\.',               # Question mark + period
    r'\.\s*[A-Z][a-z]',    # Period before capitalized word (might be journal)
    r'\.\s*\d',            # Period before number (volume/issue)
))
_FIRST_SENTENCE_RE = re.compile(r'[^.?!]+')


def extract_title(text):
    """Extract the paper title from a free-text publication citation.

    Strategies, tried in order:

    1. Text inside double quotes, straight (``"Title"``) or typographic
       (U+201C ... U+201D). Punctuation that citation styles place inside the
       closing quote (``"Title,"``) is stripped.
    2. Text following the first four-digit year (19xx/20xx), cut at the first
       end marker (see ``_TITLE_END_RES``) or, failing that, at the end of the
       first sentence. Candidates of 10 characters or fewer are rejected.
    3. Fallback: the first 100 characters of the citation.

    Args:
        text: Publication citation; may be ``None``/NaN.

    Returns:
        The extracted title, or ``""`` when ``text`` is missing.
    """
    if pd.isna(text):
        return ""

    text = str(text).strip()

    # Method 1: quoted title
    quoted = _QUOTED_TITLE_RE.search(text)
    if quoted:
        title = quoted.group(1).strip().rstrip('., ')
        if title:
            return title

    # Method 2: text after the year, up to an end marker
    year_match = _YEAR_RE.search(text)
    if year_match:
        # Drop the punctuation that separates the year from the title
        after_year = text[year_match.end():].strip().lstrip('., ')

        for end_re in _TITLE_END_RES:
            end_match = end_re.search(after_year)
            if end_match:
                title = after_year[:end_match.start()].strip().rstrip('.,?! ')
                if len(title) > 10:  # Reasonable title length
                    return title

        # If no marker produced a title, take the first sentence
        sentence_match = _FIRST_SENTENCE_RE.match(after_year)
        if sentence_match:
            title = sentence_match.group(0).strip().rstrip('.,?! ')
            if len(title) > 10:
                return title

    # Method 3: fallback to the first 100 characters
    return text[:100].strip()

# Extract title
print("\nExtracting titles...")
# astype(str) keeps the column string-typed so the .str accessor below also
# works when the input frame has no rows.
df['title'] = df['Publication'].apply(extract_title).astype(str)

# Statistics
title_lengths = df['title'].str.len()
successful_extractions = (title_lengths > 10).sum()
# Guard against an empty input file (would otherwise divide by zero)
success_pct = successful_extractions / len(df) * 100 if len(df) else 0.0
print(f"\n✅ Successfully extracted {successful_extractions} titles ({success_pct:.1f}%)")

# Check for potential issues
# NOTE: titles of exactly 10 characters fall in neither bucket (> 10 above, < 10 here).
short_titles = df[title_lengths < 10]
if not short_titles.empty:
    print(f"\n⚠️ Warning: {len(short_titles)} titles are very short (<10 chars)")
    print("First 3 examples:")
    for n, (_, row) in enumerate(short_titles.head(3).iterrows(), 1):
        print(f"  {n}. Title: '{row['title']}'")
        print(f"     Original: {str(row['Publication'])[:100]}...")

# Display examples
print("\n" + "=" * 70)
print("Examples of extracted titles:")
print("=" * 70)

for n, (_, row) in enumerate(df.head(5).iterrows(), 1):
    print(f"\nExample {n}:")
    print(f"  Title: {row['title']}")
    print(f"  Original: {str(row['Publication'])[:150]}...")

# Reorder columns: when a Year column exists, place title right after it;
# otherwise title stays where it was appended (at the end).
if 'Year' in df.columns:
    cols = df.columns.tolist()
    cols.remove('title')
    cols.insert(cols.index('Year') + 1, 'title')
    df = df[cols]

# Preview final structure
print("\n" + "=" * 70)
print("Final data structure:")
print("=" * 70)
print(f"Columns: {', '.join(df.columns)}")
print(f"\nFirst 5 rows preview:")
# Name and title first, then the remaining columns (Publication is too wide
# to preview), limited to five columns in total
preview_cols = ['Name', 'title'] + [col for col in df.columns if col not in ['Name', 'title', 'Publication']]
preview_cols = preview_cols[:5]
print(df[preview_cols].head().to_string(index=False))

# Save with title column
output_file = 'mccombs_faculty_publications_with_title.csv'
df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"\n✅ Data with title column saved to: {output_file}")
print("\n" + "=" * 70)
print("Complete! You now have:")
print("- Name (already existed)")
print("- title (newly extracted) ⭐")
# Only claim a Year column when the data actually has one
if 'Year' in df.columns:
    print("- Year (already existed)")
print("- Publication (original)")
print("=" * 70)

In [4]:
# Sanity check: reload the CSV that includes the title column and show its first ten rows
final_csv = "mccombs_faculty_publications_with_title.csv"
df = pd.read_csv(final_csv)
print(df.head(n=10))


                 Name                                              Title  \
0  Christopher Aarons                                  The Digital Helix   
1  Christopher Aarons  Digital Transformation: Delivering on the Promise   
2  Christopher Aarons  Aarons, C. “Why People Matter Far More Than Di...   
3      Ashish Agarwal  Follow Your Heart or Listen to Users? The Case...   
4      Ashish Agarwal  Demand-Side Effects of Open Innovation: The Ca...   
5      Ashish Agarwal  The Effect of Popularity Cues and Peer Endorse...   
6      Ashish Agarwal  Liu, Y., Agarwal, A., Lai, G. and Zhou, W. On-...   
7      Ashish Agarwal  Yang, C., Agarwal, A. and Konana, P. General B...   
8      Ashish Agarwal  Artificial Intelligence, Collusion and Ad Auct...   
9      Ashish Agarwal  Promotional Inventory Displays: An Empirical A...   

                                     Department    Year  \
0                                     Marketing     NaN   
1                                     Marketi