In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import time

# Excel workbook read by main(); it must contain the columns "Name" and
# "Profile_URL".
INPUT_FILE = "wisc_faculty_name_url.xlsx"
# CSV written by main(): one row per scraped publication entry.
OUTPUT_FILE = "wisconsin_publications_from_web.csv"

# HTTP headers sent with every profile request made by scrape_publications();
# the User-Agent value is a desktop Chrome browser string.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0 Safari/537.36"
}

def scrape_publications(url):
    """Scrape journal publications from a faculty profile page.

    Downloads ``url``, locates the element whose id is ``profile-Research``
    and collects the text of every ``<p class="journalEntry">`` inside it.

    Args:
        url: Address of the faculty profile page to fetch.

    Returns:
        A list of publication strings. Entries of five characters or fewer
        are discarded. The list is empty when the request fails, the server
        does not answer with HTTP 200, or the page has no research section.
        Failures are reported on stdout instead of being raised so that one
        bad profile does not abort a whole crawl.
    """
    results = []

    try:
        resp = requests.get(url, headers=HEADERS, timeout=15)
        if resp.status_code != 200:
            # Report the failure: otherwise a blocked or missing page is
            # indistinguishable from a profile with no publications.
            print(f"Skipping {url}: HTTP {resp.status_code}")
            return results

        soup = BeautifulSoup(resp.text, "html.parser")

        # Locate the Research section
        research_block = soup.find(id="profile-Research")
        if not research_block:
            return results

        # Extract all publication entries
        for entry in research_block.find_all("p", class_="journalEntry"):
            text = entry.get_text(strip=True)
            # Very short fragments are treated as empty/placeholder entries.
            if len(text) > 5:
                results.append(text)

    except Exception as e:
        # Deliberately broad: this is the per-page boundary of a best-effort
        # crawl, so any failure is logged and whatever was collected so far
        # is still returned.
        print(f"Error scraping {url}: {e}")

    return results


def main():
    """Scrape every faculty profile listed in INPUT_FILE into OUTPUT_FILE.

    Reads the Excel workbook, fetches each row's ``Profile_URL`` with
    ``scrape_publications`` and writes one CSV row per publication with the
    columns ``Faculty Name``, ``Publication Text`` and ``Source``.

    Rows whose URL is missing or does not start with ``http`` are skipped.

    Raises:
        ValueError: If the workbook lacks the ``Name`` or ``Profile_URL``
            column.
    """
    df = pd.read_excel(INPUT_FILE)

    if "Name" not in df.columns or "Profile_URL" not in df.columns:
        raise ValueError("Excel must contain columns 'Name' and 'Profile_URL'")

    all_rows = []

    # Only two columns are needed, so iterate them directly rather than
    # materialising a Series per row with iterrows().
    faculty = zip(df["Name"], df["Profile_URL"])
    for name, url in tqdm(faculty, total=len(df), desc="Scraping faculty"):
        if pd.isna(url) or not str(url).startswith("http"):
            continue

        all_rows.extend(
            {
                "Faculty Name": name,
                "Publication Text": pub,
                "Source": url,
            }
            for pub in scrape_publications(url)
        )

        time.sleep(1)  # polite delay

    # Name the columns explicitly so the CSV still gets its header row when
    # nothing was scraped; downstream code looks up "Publication Text".
    final_df = pd.DataFrame(
        all_rows, columns=["Faculty Name", "Publication Text", "Source"]
    )
    final_df.to_csv(OUTPUT_FILE, index=False)

    print(f"\n✅ Completed! Extracted {len(final_df)} publications, saved to {OUTPUT_FILE}")


# Run the scrape only when executed as a script, not on import.
if __name__ == "__main__":
    main()


In [3]:
import pandas as pd
import re

# ============================
# Config
# ============================
# CSV loaded below with pd.read_csv; the parsing loop reads its
# "Publication Text" column.
INPUT_CSV = "wisconsin_publications_from_web.csv"
# Excel workbook that receives the original columns plus the parsed
# authors / year / title / venue columns.
OUTPUT_XLSX = "wisconsin_publications_parsed.xlsx"

# ============================
# Publication Text Parser
# ============================
def parse_publication_text(text: str):
    """
    Break one Wisconsin School of Business publication string into the
    tuple (authors, year, title, venue).

    The first parenthesised four-digit number is taken as the year. Whatever
    precedes it becomes the author list; whatever follows is cut at its
    first period into title (before) and venue (after). Without a year the
    whole string is returned as authors. Empty or non-string input yields
    four empty strings.
    """
    if not (text and isinstance(text, str)):
        return "", "", "", ""

    # Collapse every run of whitespace to a single space
    normalized = " ".join(text.split())

    # Locate "(YYYY)" and cut the string around that exact match
    found = re.search(r"\((\d{4})\)", normalized)
    if found is None:
        year, head, tail = "", normalized, ""
    else:
        year = found.group(1)
        head = normalized[:found.start()]
        tail = normalized[found.end():]

    # Author list: trim surrounding separators
    authors = head.strip(" ,.;")

    # Drop punctuation left over right after the year
    remainder = tail.lstrip("). ").strip()

    # The first period separates title from venue; no period means the
    # whole remainder is the title
    first, dot, rest = remainder.partition(".")
    if dot:
        return authors, year, first.strip(), rest.strip()
    return authors, year, remainder.strip(), ""


# ============================
# Load CSV and Parse It
# ============================
df = pd.read_csv(INPUT_CSV)

# Parse every publication string in one pass; each result is an
# (authors, year, title, venue) tuple.
parsed_columns = ["authors", "year", "title", "venue"]
parsed = pd.DataFrame(
    [parse_publication_text(raw) for raw in df["Publication Text"]],
    columns=parsed_columns,
    index=df.index,
)

# Attach the parsed fields as new columns, in the same order as above
for column in parsed_columns:
    df[column] = parsed[column]

# ============================
# Save as Excel
# ============================
df.to_excel(OUTPUT_XLSX, index=False)
print(f"✅ Done! Parsed file saved to: {OUTPUT_XLSX}")


✅ Done! Parsed file saved to: wisconsin_publications_parsed.xlsx


In [5]:
import pandas as pd

# Reload the parsed workbook from disk
df = pd.read_excel("wisconsin_publications_parsed.xlsx")

# Show the first 10 rows as a quick sanity check
print(df.head(n=10))


     Faculty Name                                   Publication Text  \
0  Abdullah Yavas  Allen, M. & Rutherford, J. & Rutherford, R. & ...   
1  Abdullah Yavas  Akin, Z. & Yavas, A. (2025).Behavior in Long-R...   
2  Abdullah Yavas  Yavas, A. & Zhu, S. (2024).Misreporting of Sec...   
3  Abdullah Yavas  Fan, Y. & Yavas, A. (2023).Price Dynamics in P...   
4  Abdullah Yavas  Lu, F. & Li, L. & Yavas, A. (2023).The Impact ...   
5  Abdullah Yavas  Li, L. & Yavas, A. & Zhu, B. (2023).Externalit...   
6  Abdullah Yavas  Higgins, E. & Yavas, A. & Zhu, S. (2022).Priva...   
7  Abdullah Yavas  Li, L. & Ucar, E. & Yavas, A. (2022).Social Ca...   
8  Abdullah Yavas  Emmerling, T. & Yavas, A. & Yildirim, Y. (2021...   
9  Abdullah Yavas  Fan, Y. & Yavas, A. (2020).How Does Mortgage D...   

                                              Source  \
0  https://business.wisc.edu/directory/profile/ab...   
1  https://business.wisc.edu/directory/profile/ab...   
2  https://business.wisc.edu/directory/