# Assignment 1
> **Github repository**: [02467_Assignment1](https://github.com/JulWin24/02467_Assignment1)
>
> **Group members**:
> - Rune Harlyk (s234814)
> - Joseph Nguyen (s234826)
> - Julius Winkel (s234862)

In [1]:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from joblib import Parallel, delayed 
from bs4 import BeautifulSoup
from unidecode import unidecode
from fuzzywuzzy import fuzz
from collections import defaultdict
from time import sleep
from tqdm import tqdm
from ast import literal_eval 
from collections import Counter
from itertools import combinations
import matplotlib.pyplot as plt
import networkx as nx
import netwulf as nw
import pandas as pd
import numpy as np
import json
import re
import os

## Part 1: Web-scraping

### Fetch program

In [2]:
url = "https://ic2s2-2023.org/program"

req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")

### Get names

In [3]:
names = set()

def get_plenary_names(names, soup): 
    new_names = {name.strip() for nav_list in soup.find_all("ul", class_="nav_list") 
        for i in nav_list.find_all("i") 
        for name in i.get_text(strip=True).split(",")}
    print(f"Found: {len(new_names)} plenary names")
    names.update(new_names)

def get_keynotes_names(names, soup):
    new_names = {a.get_text(strip=True).replace("Keynote - ", "") 
        for a in soup.find_all("a", href=lambda x: x and x.startswith("/keynotes#"))}
    print(f"Found: {len(new_names)} keynotes names")
    names.update(new_names)
    
def get_chair_names(names, soup):
    new_names = {i.get_text(strip=True).replace("Chair: ", "") 
          for i in soup.find_all("i") if i.get_text(strip=True).startswith("Chair:")}
    print(f"Found: {len(new_names)} chair names")
    names.update(new_names)

get_plenary_names(names, soup)
get_keynotes_names(names, soup)
get_chair_names(names, soup)

print(f"Found: {len(names)} names in total" )

Found: 1475 plenary names
Found: 10 keynotes names
Found: 49 chair names
Found: 1491 names in total


### Clean names

In [4]:
def clean_name(name):
    name = unidecode(name)
    return name

def clean_names(names):
    names = {clean_name(name) for name in names}
    return names

def fuzz_names(names, threshold=90):
    names_list = sorted(names)
    name_groups = defaultdict(list)

    for name in names_list:
        first_letter = name[0] if name else ""
        name_groups[first_letter].append(name)

    merge_map = {}
    for letter, group in name_groups.items():
        for i, name in enumerate(group):
            for j in range(i + 1, len(group)):
                match_name = group[j]
                score = fuzz.ratio(name, match_name)
                if score >= threshold:
                    merge_map[match_name] = name

    merged_names = set()
    for name in names_list:
        standardized_name = merge_map.get(name, name)
        merged_names.add(standardized_name)

    return merged_names

names = clean_names(names)
print(f"After cleaning: {len(names)} names")

names = fuzz_names(names)
print(f"After fuzzing: {len(names)} names")

After cleaning: 1486 names
After fuzzing: 1460 names


### Save to file

In [5]:
with open('author_names_2023.txt', 'w', encoding="utf8") as f:
    for name in sorted(names):
        f.write(f"{name}\n")

## Part 2: Ready Made vs Custom Made Data

## Part 3: Gathering Research Articles using the OpenAlex API

### Loading researches 2024

In [None]:
names_file = "author_names_2024.txt"
data_file = "author_data.csv"

with open(names_file, 'r', encoding="utf8") as f:
    names = f.read().splitlines()

print(f"Loaded names: {len(names)}")
names = clean_names(names)

names = fuzz_names(names)
print(f"After fuzzing: {len(names)} names")

# TODO
# 1 - Remove (Santa Fe Institute) from names
# 2 - Remove Pensylvania State University from names

names = sorted(names)

Loaded names: 1206
After fuzzing: 1202 names


### Fetching author data

In [7]:
def flatten_json(nested_data, parent_key='', sep='_', keep_path=False):
    flat_dict = {}
    for k, v in nested_data.items():
        new_key = k if not keep_path else f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            flat_dict.update(flatten_json(v, new_key, sep))
        elif isinstance(v, list):
            for i, item in enumerate(v):
                if isinstance(item, dict):
                    flat_dict.update(flatten_json(item, f"{new_key}{sep}{i}", sep))
        else:
            flat_dict[new_key] = v
    return flat_dict

def filter_json(data, template):
    if isinstance(template, dict):
        return {k: filter_json(data.get(k, None), v) for k, v in template.items() if k in data}
    elif isinstance(template, list) and isinstance(data, list):
        return [filter_json(item, template[0]) for item in data if isinstance(item, dict)]
    return data

def extract_allowed(data, allowed_keys):
    result = {}
    for k, v in data.items():
        if k in allowed_keys:
            result[k] = v
        elif isinstance(v, dict):
            result.update(extract_allowed(v, allowed_keys))
    return result

def load_existing_data():
    if os.path.exists(data_file):
        return pd.read_csv(data_file).to_dict(orient="records")
    return []

In [None]:
session = requests.Session()

template = {
    "id": None,
    "display_name": None,
    "works_count": None,
    "summary_stats": {"h_index": None},
    "affiliations": [{"institution": {"country_code": None}}],
    "works_api_url": None,
}

def get_author_data(name):
    try:
        response = session.get(f"https://api.openalex.org/authors?filter=display_name.search:{name}", timeout=5)
        sleep(0.05)
        if not response.ok:
            print(response.status_code)
            return name
        json_data = response.json()
        if not json_data['results']:
            return name
        return flatten_json(filter_json(json_data['results'][0], template))
    except Exception as ex:
        return name

existing_data = load_existing_data()
existing_names = {entry['display_name'] for entry in existing_data if 'display_name' in entry}
names_to_process = list(set(names) - existing_names)
print(f"Already have {len(existing_names)}, missing {len(names_to_process)}")
author_data = existing_data
bad_names = []

with ThreadPoolExecutor(max_workers=1) as executor:
    futures = {executor.submit(get_author_data, name): name for name in names_to_process}
    for future in tqdm(as_completed(futures), total=len(futures)):
        result = future.result()
        if isinstance(result, dict):
            author_data.append(result)
        else:
            bad_names.append(result)

pd.DataFrame(author_data).to_csv(data_file, index=False)
print(f"Got data for: {len(author_data)}, missing {len(bad_names)}")

Already have 132, missing 1091


100%|██████████| 1091/1091 [04:08<00:00,  4.40it/s]

Got data for: 1115, missing 108





### Load data again and filter between 5-5000 works

In [2]:
df = pd.read_csv('author_data.csv')

print(len(df))
df = df[(df["works_count"] >= 5) & (df["works_count"] <= 5000)]
print(len(df))

1115
941


### Define filters

In [6]:
social_science_fields = ['Political science', 'Economics', 'Psychology', 'Sociology']
quantitative_fields = ['Mathematics', 'Physics', 'Computer science']
min_cited_by = 10
max_authors = 10


def get_concept_ids():
    concepts_url = "https://api.openalex.org/concepts?filter=level:0&per-page=200"
    response_concepts = requests.get(concepts_url)


    if response_concepts.ok:
        concepts = response_concepts.json()['results']
        
        social_science_ids = [i['id'] for i in concepts if i['display_name'] in social_science_fields]
        quantitative_ids = [i['id'] for i in concepts if i['display_name'] in quantitative_fields]

    return social_science_ids, quantitative_ids

def create_concept_filter():
    social_science_ids, quantitative_ids = get_concept_ids() 
    social_science_filter = '|'.join(social_science_ids)
    quantitative_filter = '|'.join(quantitative_ids)

    concept_filter = f"concepts.id:{social_science_filter},concepts.id:{quantitative_filter}"
    return concept_filter

def create_authors_filter(ids):
    return f"authorships.author.id:{'|'.join(ids)}"

concept_filter = create_concept_filter()
cited_by_filter = f"cited_by_count:>{min_cited_by}"
author_count_filter = f"authors_count:<{max_authors}"

def get_query_filter(ids):
    return ",".join((concept_filter, cited_by_filter, author_count_filter, create_authors_filter(ids)))

### Fetching works

In [4]:
# Constants
WORKS_URL = "https://api.openalex.org/works"
BATCH_SIZE = 25
MAX_REQUESTS_PER_SECOND = 10
NUM_CORES = 10

select_data = "id,title,publication_year,abstract_inverted_index,authorships,cited_by_count,concepts"

def fetch_works(batch):
    """Fetch works for a batch of authors, handling pagination."""
    batch_papers = []
    batch_abstracts = []
    cursor = "*"  # Start with '*' to get the first page

    while cursor:  # Continue fetching until there's no cursor (no more pages)
        query_url = (
            f"{WORKS_URL}?filter={get_query_filter(batch)}"
            f"&select={select_data}"
            f"&per_page=200"  # Fetch max results per request
            f"&cursor={cursor}"  # Use cursor for pagination
        )

        while True:  # Retry fetching if rate limited
            response = requests.get(query_url)
            if response.status_code == 429:
                print(f"Got rate limited, waiting for 0.5 second")
                sleep(0.5)
                continue
            break

        if not response.ok:
            print(f"Error fetching batch: {response.status_code}")
            continue

        json_data = response.json()

        # Store retrieved works
        for work in json_data.get("results", []):
            batch_papers.append({
                "id": work["id"],
                "publication_year": work.get("publication_year"),
                "cited_by_count": work.get("cited_by_count", 0),
                "author_ids": [auth["author"]["id"] for auth in work.get("authorships", [])]
            })

            batch_abstracts.append({
                "id": work["id"],
                "title": work.get("title"),
                "abstract_inverted_index": work.get("abstract_inverted_index")
            })

        cursor = json_data.get("meta", {}).get("next_cursor")

        sleep(1 / MAX_REQUESTS_PER_SECOND)

    return batch_papers, batch_abstracts

### Getting works from authors

In [None]:
# Process authors in parallel batches
author_ids = df["id"].tolist()
author_batches = [author_ids[i: i + BATCH_SIZE] for i in range(0, len(author_ids), BATCH_SIZE)]

results = Parallel(n_jobs=NUM_CORES)(
    delayed(fetch_works)(batch) for batch in tqdm(author_batches, desc="Fetching works in parallel", unit="batch")
)

# Flatten results
all_papers = [paper for batch_papers, _ in results for paper in batch_papers]
all_abstracts = [abstract for _, batch_abstracts in results for abstract in batch_abstracts]

# Convert to DataFrame
papers_df = pd.DataFrame(all_papers)
abstracts_df = pd.DataFrame(all_abstracts)

# Drop
papers_df = papers_df.drop_duplicates(subset='id', keep='first')
abstracts_df = abstracts_df.drop_duplicates(subset='id', keep='first')

# Save to CSV
papers_df.to_csv("ic2s2_papers.csv", index=False)
abstracts_df.to_csv("ic2s2_abstract.csv", index=False)

print(f"Got {len(papers_df)} papers")
print(f"Got {len(abstracts_df)} abstracts")


[A
[A
[A
Fetching works in parallel: 100%|██████████| 38/38 [00:10<00:00,  3.68batch/s]


Got 10173 papers
Got 10173 abstracts


## Part 4: The Network of Computational Social Scientists

### Getting final dataset with authors and coauthors

In [3]:
papers_df = pd.read_csv("ic2s2_papers.csv", converters={'author_ids': literal_eval})
all_author_ids = papers_df.explode('author_ids')["author_ids"].unique().tolist()
len(set(all_author_ids))

15361

In [7]:
all_author_batches_ids = [all_author_ids[i: i + BATCH_SIZE] for i in range(0, len(all_author_ids), BATCH_SIZE)]

results = Parallel(n_jobs=NUM_CORES)(
    delayed(fetch_works)(batch) for batch in tqdm(all_author_batches_ids, desc="Fetching works in parallel", unit="batch")
)

all_papers = [paper for batch_papers, _ in results for paper in batch_papers]
# all_abstracts = [abstract for _, batch_abstracts in results for abstract in batch_abstracts]

# Convert to DataFrame
papers_df = pd.DataFrame(all_papers)
# abstracts_df = pd.DataFrame(all_abstracts)

# Drop
papers_df = papers_df.drop_duplicates(subset='id', keep='first')
# abstracts_df = abstracts_df.drop_duplicates(subset='id', keep='first')

Fetching works in parallel: 100%|██████████| 615/615 [04:39<00:00,  2.20batch/s]


In [8]:
papers_df.to_csv("ic2s2_coauthors_papers.csv", index=False)
# abstracts_df.to_csv("ic2s2_coauthors_abstracts.csv", index=False)

In [9]:
papers_coauthor_df = pd.read_csv("ic2s2_coauthors_papers.csv", converters={'author_ids': literal_eval})
# abstracts_coauthor_df = pd.read_csv("ic2s2_coauthors_abstracts.csv")

print(f"Got {len(papers_coauthor_df)} number of papers")
# print(f"Got {len(abstracts_coauthor_df)} number of abstracts")

Got 184455 number of papers


## Part 1: Network Construction

### Getting author pairs

In [41]:
edges = Counter()

valid_authors = set(df["id"])

filtered_papers = papers_coauthor_df[
    papers_coauthor_df["author_ids"].apply(lambda authors: all(a in valid_authors for a in authors))
].reset_index(drop=True)

for author_list in filtered_papers["author_ids"]:
    for pair in combinations(author_list, 2):
        edges[pair] += 1

edgelist = [(a, b, count) for (a, b), count in edges.items()]
len(edgelist)

123

### Graph construction

In [30]:
def save_graph(graph_file, G):
    data = nx.readwrite.json_graph.node_link_data(G)
    with open(graph_file, "w") as f:
        json.dump(data, f)

def load_graph(graph_file):
    with open(graph_file, "r") as f:
        data = json.load(f)
    return nx.readwrite.json_graph.node_link_graph(data)

In [42]:
print(len(df))
df = df.drop_duplicates(subset='id', keep='first')
print(len(df))

df_exploded = papers_coauthor_df.explode("author_ids")

author_stats = df_exploded.groupby("author_ids").agg(
    first_publication_year=("publication_year", "min"),
    cited_by_count=("cited_by_count", "sum")
).reset_index()

df_merged = df.merge(author_stats, left_on="id", right_on="author_ids", how="inner")
df_merged.drop(columns=["author_ids"], inplace=True)
attr_dict = df_merged[["id", "display_name", "country_code", "first_publication_year", "cited_by_count"]].set_index("id").to_dict("index")

924
924


In [43]:
graph_file = "ic2s2_coauthors_graph.json"
G = nx.Graph()
G.add_weighted_edges_from(edgelist)
nx.set_node_attributes(G, attr_dict)

In [31]:
save_graph(graph_file, G)

## Part 2: Preliminary Network Analysis

### Network Metrics:

In [44]:
# Network Stats
num_links = len(edgelist)
num_nodes = len(set(papers_coauthor_df.explode('author_ids')["author_ids"].unique().tolist()))
print(f"Got {num_links} links between {num_nodes} nodes")

# Density Stats
print(f'Network density is: {nx.density(G)}')

# Number of connected components
num_isolated = len(list(nx.isolates(G)))
print("Is fully connected: ", nx.is_connected(G))
print("Number of connected components: ", nx.number_connected_components(G))
print("Number of isolated nodes: ", num_isolated)

Got 123 links between 225626 nodes
Network density is: 0.020664869721473494
Is fully connected:  False
Number of connected components:  28
Number of isolated nodes:  0


### Degree analysis

In [45]:
degrees = [d for _, d in G.degree()]
strengths = [s for _, s in G.degree(weight="weight")]

degree_stats = {
    "avg": np.mean(degrees),
    "median": np.median(degrees),
    "mode": Counter(degrees).most_common(1)[0][0],
    "min": np.min(degrees),
    "max": np.max(degrees)
}

strength_stats = {
    "avg": np.mean(strengths),
    "median": np.median(strengths),
    "mode": Counter(strengths).most_common(1)[0][0],
    "min": np.min(strengths),
    "max": np.max(strengths)
}

print(degree_stats)
print(strength_stats)

{'avg': 2.169811320754717, 'median': 2.0, 'mode': 1, 'min': 1, 'max': 10}
{'avg': 3.830188679245283, 'median': 2.0, 'mode': 2, 'min': 1, 'max': 48}


#### Top authors

In [46]:
def top_nodes_by_degree(G, top_n=5):
    return sorted(G.degree, key=lambda x: x[1], reverse=True)[:top_n]

top_5 = top_nodes_by_degree(G)
print(top_5)

[('https://openalex.org/A5056499434', 10), ('https://openalex.org/A5000679279', 8), ('https://openalex.org/A5082698243', 7), ('https://openalex.org/A5026949484', 6), ('https://openalex.org/A5020533147', 6)]


## Visualize

In [None]:
config = {
    "zoom": 0.6,
    "scale_node_size_by_strength": True,
    "node_size_variation": 1,
    "node_size": 30,
    "node_gravity": 0.45,
}

id_to_name = pd.Series(df.display_name.values, index=df.id).to_dict()

G_named = nx.relabel_nodes(G, id_to_name)

network, config = nw.visualize(G_named, config=config)

# fig, ax = nw.draw_netwulf(network, figsize=(10,10))
plt.show()
# plt.savefig("myfigure.pdf")