# Assignment 1
> **Github repository**: [02467_Assignment1](https://github.com/JulWin24/02467_Assignment1)
>
> **Group members**:
> - Rune Harlyk (s234814)
> - Joseph Nguyen (s)
> - Julius Winkel (s234862)

In [16]:
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode
from fuzzywuzzy import fuzz
from collections import defaultdict
from tqdm import tqdm
import pandas as pd
import json
import re
import os

## Part 1: Web-scraping

### Fetch program

In [7]:
# Program page of the IC2S2 2023 conference; the scraping helpers read from `soup`.
url = "https://ic2s2-2023.org/program"

# requests waits indefinitely by default; a timeout keeps the cell from hanging
# forever on an unresponsive server.
req = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page into `soup`.
req.raise_for_status()
soup = BeautifulSoup(req.text, "html.parser")

### Get names

In [8]:
# Shared accumulator: each get_*_names helper adds the names it finds to this set.
names = set()

def get_plenary_names(names, soup):
    """Collect the names listed inside the program's ``nav_list`` blocks.

    Looks at every ``<i>`` element inside each ``<ul class="nav_list">``,
    splits its text on commas and treats each non-empty piece as one name.

    Args:
        names: Set the found names are added to (mutated in place).
        soup: Parsed program page; only ``find_all`` on the page/list nodes
            and ``get_text`` on the ``<i>`` nodes are used.

    Prints how many distinct names were found. Returns nothing.
    """
    new_names = set()
    for nav_list in soup.find_all("ul", class_="nav_list"):
        for italic in nav_list.find_all("i"):
            for piece in italic.get_text(strip=True).split(","):
                candidate = piece.strip()
                # An empty <i> or a trailing/double comma yields "" after the
                # split; that is not a name and must not reach the result.
                if candidate:
                    new_names.add(candidate)
    print(f"Found: {len(new_names)} plenary names")
    names.update(new_names)

def get_keynotes_names(names, soup):
    """Collect keynote speaker names from links pointing at ``/keynotes#...``.

    The link text is used as the name, with every "Keynote - " label removed.

    Args:
        names: Set the found names are added to (mutated in place).
        soup: Parsed program page.

    Prints how many distinct names were found. Returns nothing.
    """
    def _is_keynote_link(href):
        # href is None for anchors without the attribute, hence the guard.
        return href and href.startswith("/keynotes#")

    new_names = set()
    for anchor in soup.find_all("a", href=_is_keynote_link):
        label = anchor.get_text(strip=True)
        new_names.add(label.replace("Keynote - ", ""))

    print(f"Found: {len(new_names)} keynotes names")
    names.update(new_names)
    
def get_chair_names(names, soup):
    """Collect session-chair names from ``<i>`` elements starting with "Chair:".

    Args:
        names: Set the found names are added to (mutated in place).
        soup: Parsed program page.

    Prints how many distinct names were found. Returns nothing.
    """
    prefix = "Chair:"
    new_names = set()
    for italic in soup.find_all("i"):
        text = italic.get_text(strip=True)
        if not text.startswith(prefix):
            continue
        # Cut the label off by position and strip afterwards. The filter only
        # guarantees "Chair:" (no trailing space), so replacing "Chair: " would
        # leave the label on "Chair:Name" and turn a bare "Chair:" into a name.
        chair = text[len(prefix):].strip()
        if chair:
            new_names.add(chair)
    print(f"Found: {len(new_names)} chair names")
    names.update(new_names)

# Run every scraper against the parsed program page; each one adds to `names`.
for collect in (get_plenary_names, get_keynotes_names, get_chair_names):
    collect(names, soup)

print(f"Found: {len(names)} names in total")

Found: 1475 plenary names
Found: 10 keynotes names
Found: 49 chair names
Found: 1491 names in total


### Clean names

In [9]:
def clean_name(name):
    """Normalise one name to ASCII with single spaces between words.

    Args:
        name: Name string, possibly containing non-ASCII characters.

    Returns:
        The transliterated name with leading/trailing whitespace removed and
        every internal whitespace run (spaces, tabs, newlines) reduced to a
        single space.
    """
    ascii_name = unidecode(name)
    # Text scraped from HTML can carry line breaks or repeated spaces inside a
    # name; left alone they create spurious distinct spellings and break the
    # one-name-per-line output file. split() with no argument splits on any
    # whitespace run and drops empty edge pieces.
    return " ".join(ascii_name.split())

def clean_names(names):
    """Apply ``clean_name`` to every entry and return the results as a set."""
    return set(map(clean_name, names))

def fuzz_names(names, threshold=90, scorer=None):
    """Merge near-duplicate spellings of the same name.

    Names are sorted and bucketed by their first character, so only names
    sharing an initial are ever compared. Inside a bucket, a name that scores
    at least ``threshold`` against an alphabetically earlier name is replaced
    by that earlier name's canonical spelling. Merging is transitive: if B was
    merged into A and C is similar to B, C is merged into A as well.

    Args:
        names: Iterable of name strings.
        threshold: Minimum similarity score for two names to be merged.
        scorer: Optional ``scorer(a, b)`` similarity function returning a
            number on the same scale as ``threshold``. Defaults to
            ``fuzz.ratio``.

    Returns:
        Set of canonical names.
    """
    if scorer is None:
        scorer = fuzz.ratio

    names_list = sorted(names)

    # Bucket by first character to avoid scoring every pair of names.
    name_groups = defaultdict(list)
    for name in names_list:
        first_letter = name[0] if name else ""
        name_groups[first_letter].append(name)

    merge_map = {}
    for group in name_groups.values():
        for i, name in enumerate(group):
            # If `name` was itself merged away, hand its matches to the name it
            # was merged into. Mapping them to `name` would bring a discarded
            # spelling back into the result.
            canonical = merge_map.get(name, name)
            for match_name in group[i + 1:]:
                # First match wins: an already-merged name keeps its (earliest)
                # canonical spelling and needs no further scoring.
                if match_name in merge_map:
                    continue
                if scorer(name, match_name) >= threshold:
                    merge_map[match_name] = canonical

    return {merge_map.get(name, name) for name in names_list}

# Transliterate every scraped name to ASCII first, so the similarity scores
# below are computed on the normalised spellings.
names = clean_names(names)
print(f"After cleaning: {len(names)} names")

# Collapse near-duplicate spellings (default threshold of 90) into one entry.
names = fuzz_names(names)
print(f"After fuzzing: {len(names)} names")

After cleaning: 1486 names
After fuzzing: 1460 names


### Save to file

In [10]:
# Persist the names in alphabetical order, one per line.
with open('author_names_2023.txt', 'w', encoding="utf8") as out:
    out.writelines(f"{name}\n" for name in sorted(names))

## Part 2: Ready Made vs Custom Made Data

## Part 3: Gathering Research Articles using the OpenAlex API

### Loading researchers 2024

In [None]:
# Input file: one author name per line.
names_file = "author_names_2024.txt"
# Not used in this cell; presumably the output path for the collected author
# data -- confirm against the cells that follow.
data_file = "author_data.csv"

with open(names_file, 'r', encoding="utf8") as f:
    # Blank or whitespace-only lines are not names; dropping them here keeps ""
    # out of the cleaned, fuzzed and sorted list built below.
    names = [line.strip() for line in f.read().splitlines() if line.strip()]

print(f"Loaded names: {len(names)}")
names = clean_names(names)

names = fuzz_names(names)
print(f"After fuzzing: {len(names)} names")

# fuzz_names returns a set; sorted() turns it into a list with a deterministic order.
names = sorted(names)

## Part 4: The Network of Computational Social Scientists