# Citation data

Note:

- The resulting data can be found in the folder `data`. 

To do:

- For some reason, I could not `dill.dump` the data. We should run the notebook once to ensure data in the folder `data` matches the outcome of the notebook.

Clean up:

1. `create_author_network`: Should we delete the lines of code that were commented out?
2. Perceptron: The code for the perceptron is still in here; should we delete it?


In [2]:
import dill
import copy
import json
import numpy as np
import pandas as pd
import networkx as nx
import pickle

from itertools import chain, chain#, batched
from tqdm.auto import tqdm

from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders, config

# Settings on pyalex's shared `config` object: contact e-mail and maximum number of retries.
config.email = "h.w.a.duijf@uu.nl"
config.max_retries = 5

## Methods

### Prune citation data: keep only journal articles and remove works without a bibliography

In [3]:
def get_works_with_references(works: list) -> list:
    """Return the works that carry a non-empty reference list.

    Parameters
    ----------
    works : list
        Work records (dict-like). Each record is expected to have a
        "referenced_works" entry.

    Returns
    -------
    list
        The records, in their original order, whose "referenced_works" entry
        is present and non-empty. Records without that entry, or with an
        empty or ``None`` value, are dropped.
    """
    works_pruned: list = []
    for work in works:
        # Explicit lookup instead of `assert` inside a bare `except`: asserts are
        # stripped under `python -O` (which would disable the filter entirely)
        # and a bare `except` also hides unrelated errors.
        try:
            references = work["referenced_works"]
        except (LookupError, TypeError):
            continue
        if references:
            works_pruned.append(work)
    return works_pruned

In [4]:
def get_articles(works: list) -> list:
    """Return the works that are articles published in a journal.

    Parameters
    ----------
    works : list
        Work records (dict-like).

    Returns
    -------
    list
        The records, in their original order, for which
        ``work["primary_location"]["source"]["type"] == "journal"`` and
        ``work["type"] == "article"``. Records where any part of that path is
        missing or ``None`` are dropped.
    """
    articles: list = []
    for work in works:
        # Explicit lookups with a narrow exception list instead of `assert` in a
        # bare `except`: asserts vanish under `python -O`, and a bare `except`
        # would also swallow unrelated errors.
        try:
            source_type = work["primary_location"]["source"]["type"]
            work_type = work["type"]
        except (LookupError, TypeError):
            # A key is missing, or an intermediate value is None.
            continue
        if source_type == "journal" and work_type == "article":
            articles.append(work)
    return articles

### Create author network from dataframe of records

In [5]:
def create_author_network(works: list) -> nx.DiGraph:
    """Build a directed author-level citation network from work records.

    Every author of a work becomes a node, except the placeholder author id
    "A9999999999" when it appears as a *citing* author. For every reference
    that points to another record in ``works``, an edge is added from each
    author of the cited work to each author of the citing work, i.e. edges go
    FROM cited TO citing. References to works that are not part of ``works``
    are ignored. An author citing their own work produces a self-loop.

    Parameters
    ----------
    works : list
        Work records (dict-like) with the entries "id", "authorships" (each
        item providing ``["author"]["id"]``) and "referenced_works".

    Returns
    -------
    nx.DiGraph
        The author network, without node or edge attributes.
    """
    placeholder_id = "A9999999999"  # stands in for a missing author

    # Index the authors of every work once, so that resolving a reference is a
    # dictionary lookup instead of a scan over all records. When an id occurs
    # more than once, the first record wins.
    author_ids_by_work: dict = {}
    for work in works:
        author_ids_by_work.setdefault(
            work.get("id"),
            [authorship["author"]["id"] for authorship in work["authorships"]],
        )

    G: nx.DiGraph = nx.DiGraph()

    for work in tqdm(works, total=len(works)):
        # Authors of all referenced works that are present in `works`. This does
        # not depend on the citing author, so it is computed once per work.
        # NOTE(review): the placeholder id is not filtered out on the cited side;
        # this matches the previous behaviour - confirm whether that is intended.
        cited_authors = [
            cited_author
            for cited_work_id in (work.get("referenced_works") or ())
            for cited_author in author_ids_by_work.get(cited_work_id, ())
        ]

        for authorship in work["authorships"]:
            this_author = authorship["author"]["id"]

            # Ignore the placeholder author used for missing values.
            if this_author.split("/")[-1] == placeholder_id:
                continue

            # add_node / add_edge leave existing nodes and edges untouched, and
            # add_edge creates a missing endpoint, so no membership checks are
            # required.
            G.add_node(this_author)
            for cited_author in cited_authors:
                G.add_edge(cited_author, this_author)  # FROM cited TO citing

    return G

### Pruning by removing ‘twins’ (aka, strong co-authors)

In [6]:
def generate_twins_dict(net: nx.DiGraph, records: list) -> dict:
    """Map every author in ``net`` to their 'twins'.

    A twin of an author is a co-author who appears on *every* record in
    ``records`` that lists this author.

    Parameters
    ----------
    net : nx.DiGraph
        Author network; its nodes are author ids.
    records : list
        Work records (dict-like) whose "authorships" items provide
        ``["author"]["id"]``.

    Returns
    -------
    dict
        ``{author_id: set of twin ids}``, in the node order of ``net``, holding
        only authors with at least one twin. Authors that appear in no record
        have no twins and are omitted.
    """
    # One pass over the records: for each author, collect the co-author set of
    # every record they appear on. This replaces a scan over all records for
    # every node of the network.
    coauthor_sets_by_author: dict = {}
    for work in records:
        author_ids = {authorship["author"]["id"] for authorship in work["authorships"]}
        for author_id in author_ids:
            coauthor_sets_by_author.setdefault(author_id, []).append(
                author_ids - {author_id}
            )

    authors_twins_dict: dict = {}
    for author_id in tqdm(net.nodes()):
        coauthor_sets = coauthor_sets_by_author.get(author_id)
        if not coauthor_sets:
            # No record lists this author. Skipping explicitly avoids reusing the
            # twins of the previously processed author (or a NameError when this
            # happens for the very first node).
            continue
        twins = set.intersection(*coauthor_sets)
        if twins:
            authors_twins_dict[author_id] = twins
    return authors_twins_dict

In [7]:
def prune_network(net: nx.DiGraph, authors_twins_dict: dict) -> nx.DiGraph:
    """Return a deep copy of ``net`` with authors removed who still have a twin in it.

    The authors in ``authors_twins_dict`` are visited in dictionary order. An
    author is removed when at least one of their twins is a node of the copy at
    that moment, so the outcome depends on the iteration order: a twin that
    was removed earlier no longer counts. ``net`` itself is not modified.
    """
    network_pruned = copy.deepcopy(net)
    for author_id, twins in tqdm(authors_twins_dict.items()):
        # Membership is tested against the partially pruned copy, not `net`.
        has_twin_left = any(twin in network_pruned.nodes() for twin in twins)
        if has_twin_left:
            network_pruned.remove_node(author_id)
    return network_pruned

### Pruning by taking the largest weakly connected component

In [8]:
def produce_lcc(net: nx.DiGraph) -> nx.DiGraph:
    """Return the largest weakly connected component of ``net`` as a new graph.

    Parameters
    ----------
    net : nx.DiGraph
        The network to reduce. It is not modified.

    Returns
    -------
    nx.DiGraph
        A standalone graph holding the nodes and edges of the largest weakly
        connected component. For an empty ``net`` an empty graph is returned.
        Attribute dictionaries are copied, attribute values are shared with
        ``net``.
    """
    # `default` covers a graph without nodes, where there is no component at all.
    largest_cc = max(nx.weakly_connected_components(net), key=len, default=set())
    # `subgraph` only returns a view onto `net`. Deep-copying that view would
    # duplicate the whole parent graph and still yield a view, so materialise
    # the component with `.copy()` instead.
    lcc = net.subgraph(largest_cc).copy()
    return lcc

In [9]:
def remove_self_loops(net: nx.DiGraph) -> nx.DiGraph:
    """Return a copy of ``net`` in which no node has an edge to itself.

    ``net`` is left unchanged; all nodes and all other edges are kept.
    """
    # Deep copy followed by `.copy()`, as before; presumably the second copy
    # turns a graph view into a plain, editable graph - TODO confirm.
    network_pruned = copy.deepcopy(net).copy()
    nodes_with_loop = [node for node in net.nodes() if net.has_edge(node, node)]
    for node in nodes_with_loop:
        network_pruned.remove_edge(node, node)
    return network_pruned

## Citation data from OpenAlex

In [10]:
def get_works_from_OA(text: str, year: str) -> list:
    """Fetch all works from OpenAlex that match a search text and a publication year filter.

    Parameters
    ----------
    text : str
        Search string passed to ``Works().search``.
    year : str
        Value for the ``publication_year`` filter (the notebook passes ranges
        such as "1900-1978").

    Returns
    -------
    list
        All works of all result pages. The number of works is printed as a
        side effect.
    """
    pages = (
        Works()
        .search(text)
        .filter(publication_year=year)
        .paginate(per_page=200, n_max=None)
    )
    # Each page is a list of works; flatten the pages into one list.
    works: list = list(chain.from_iterable(pages))
    print(f"{len(works)=:,}")
    return works

### Peptic ulcer disease

Get the records

In [11]:
# Download every OpenAlex work matching "peptic ulcer disease" with publication_year "1900-1978".
works_pud = get_works_from_OA(text="peptic ulcer disease", year="1900-1978") 

len(works)=28,822


In [12]:
# Drop the works that have no reference list.
works_pud_pruned = get_works_with_references(works_pud)
print(f"{len(works_pud_pruned)=:,}")

len(works_pud_pruned)=14,942


In [13]:
# Keep only works of type "article" whose primary source is a journal.
articles_pud = get_articles(works_pud_pruned)
print(f"{len(articles_pud)=:,}")

len(articles_pud)=13,754


Create author-based network

In [14]:
# Build the author network from the journal articles (edges go from cited author to citing author).
network_pud_original = create_author_network(articles_pud) 
print(f"{network_pud_original.number_of_nodes()=:,}")
print(f"{network_pud_original.number_of_edges()=:,}")

100%|██████████| 13754/13754 [14:19<00:00, 15.99it/s] 

network_pud_original.number_of_nodes()=19,378
network_pud_original.number_of_edges()=68,310





Prune author-based network

In [15]:
# Step 1: remove authors that still have a 'twin' (a co-author present on all of their records).
# NOTE(review): twins are computed from `works_pud` (all downloaded works) here, whereas the
# perceptron section passes the filtered articles (`articles_perceptron`) - confirm which is intended.
authors_twins_dict = generate_twins_dict(network_pud_original, works_pud)
network_pud_pruned = prune_network(network_pud_original, authors_twins_dict)
print(f"{network_pud_pruned.number_of_nodes()=:,}")
print(f"{network_pud_pruned.number_of_edges()=:,}")

# Step 2: keep only the largest weakly connected component.
network_pud_pruned_lcc = produce_lcc(network_pud_pruned)
print(f"{network_pud_pruned_lcc.number_of_nodes()=:,}")
print(f"{network_pud_pruned_lcc.number_of_edges()=:,}")

# Step 3: remove edges from an author to themselves.
network_pud_final = remove_self_loops(network_pud_pruned_lcc)
print(f"{network_pud_final.number_of_nodes()=:,}")
print(f"{network_pud_final.number_of_edges()=:,}")

100%|██████████| 19378/19378 [15:56<00:00, 20.26it/s]
100%|██████████| 12111/12111 [00:00<00:00, 203368.64it/s]


network_pud_pruned.number_of_nodes()=9,450
network_pud_pruned.number_of_edges()=23,149
network_pud_pruned_lcc.number_of_nodes()=5,493
network_pud_pruned_lcc.number_of_edges()=22,891
network_pud_final.number_of_nodes()=5,493
network_pud_final.number_of_edges()=22,159


In [16]:
# Summary of the peptic-ulcer-disease pipeline: one row per stage, from the raw
# download to the final author network.
pud_record_lists = [works_pud, works_pud_pruned, articles_pud]
pud_networks = [
    network_pud_original,
    network_pud_pruned,
    network_pud_pruned_lcc,
    network_pud_final,
]

info_dict = {
    "data_type": [
        "works",
        "works with refs",
        "articles",
        "author network",
        "author network pruned",
        "author network pruned lcc",
        "author network final",
    ],
    # For the record lists, the "node" count is the number of records.
    "number_of_nodes": (
        [len(records) for records in pud_record_lists]
        + [network.number_of_nodes() for network in pud_networks]
    ),
    # Record lists do not have edges, hence NaN for those rows.
    "number_of_edges": (
        [np.nan] * len(pud_record_lists)
        + [network.number_of_edges() for network in pud_networks]
    ),
}
df_info = pd.DataFrame(info_dict)
# Nullable integer dtype, so that the rows without edges show as missing values.
df_info.astype({"number_of_edges": "Int64"})

Unnamed: 0,data_type,number_of_nodes,number_of_edges
0,works,28822,
1,works with refs,14942,
2,articles,13754,
3,author network,19378,68310.0
4,author network pruned,9450,23149.0
5,author network pruned lcc,5493,22891.0
6,author network final,5493,22159.0


Save networks

In [17]:
# Disabled: saving the raw works and the unpruned network.
# NOTE: dill and pickle write bytes, so the file has to be opened in binary mode ('wb').
# with open('data/pud_works.pkl', 'wb') as f:
#     dill.dump(works_pud, f)

# with open('data/pud_original.pkl', 'wb') as f:
#     dill.dump(network_pud_original, f)

# NOTE(review): the two files below are written to the current working directory, not to `data/`.
# Save the final network with dill
with open('pud_final_dill.pkl', 'wb') as f:
    dill.dump(network_pud_final, f)

# Save the same network with the standard-library pickle
with open('pud_final.pkl', 'wb') as f:
    pickle.dump(network_pud_final, f)

### Perceptron

In [18]:
# Download every OpenAlex work matching "perceptron" with publication_year "1900-2000".
works_perceptron = get_works_from_OA(text="perceptron", year="1900-2000") 

len(works)=13,636


In [19]:
# Drop the works that have no reference list.
works_perceptron_pruned = get_works_with_references(works_perceptron)
print(f"{len(works_perceptron_pruned)=:,}")

len(works_perceptron_pruned)=9,841


In [20]:
# Keep only works of type "article" whose primary source is a journal.
articles_perceptron = get_articles(works_perceptron_pruned)
print(f"{len(articles_perceptron)=:,}")

len(articles_perceptron)=7,668


Create author-based network

In [21]:
# Build the author network from the journal articles (edges go from cited author to citing author).
network_perceptron_original = create_author_network(articles_perceptron) 
print(f"{network_perceptron_original.number_of_nodes()=:,}")
print(f"{network_perceptron_original.number_of_edges()=:,}")

100%|██████████| 7668/7668 [09:18<00:00, 13.73it/s]  

network_perceptron_original.number_of_nodes()=12,671
network_perceptron_original.number_of_edges()=69,724





Prune author-based network

In [22]:
# Step 1: remove authors that still have a 'twin' (a co-author present on all of their records).
# NOTE(review): twins are computed from `articles_perceptron` here, whereas the peptic ulcer
# section passes all downloaded works (`works_pud`) - confirm which is intended.
# `authors_twins_dict` is reused, so the dictionary of the peptic ulcer section is overwritten.
authors_twins_dict = generate_twins_dict(network_perceptron_original, articles_perceptron)
network_perceptron_pruned = prune_network(network_perceptron_original, authors_twins_dict)
print(f"{network_perceptron_pruned.number_of_nodes()=:,}")
print(f"{network_perceptron_pruned.number_of_edges()=:,}")

# Step 2: keep only the largest weakly connected component.
network_perceptron_pruned_lcc = produce_lcc(network_perceptron_pruned)
print(f"{network_perceptron_pruned_lcc.number_of_nodes()=:,}")
print(f"{network_perceptron_pruned_lcc.number_of_edges()=:,}")

# Step 3: remove edges from an author to themselves.
network_perceptron_final = remove_self_loops(network_perceptron_pruned_lcc)
print(f"{network_perceptron_final.number_of_nodes()=:,}")
print(f"{network_perceptron_final.number_of_edges()=:,}")

100%|██████████| 12671/12671 [03:40<00:00, 57.36it/s]
100%|██████████| 10321/10321 [00:00<00:00, 189535.82it/s]


network_perceptron_pruned.number_of_nodes()=4,304
network_perceptron_pruned.number_of_edges()=19,919
network_perceptron_pruned_lcc.number_of_nodes()=3,173
network_perceptron_pruned_lcc.number_of_edges()=19,846
network_perceptron_final.number_of_nodes()=3,173
network_perceptron_final.number_of_edges()=18,902


Save networks

In [23]:
# Disabled: saving the raw works and the unpruned network.
# with open('data/perceptron_works.pkl', 'wb') as f:
#     dill.dump(works_perceptron, f)

# with open('data/perceptron_original.pkl', 'wb') as f:
#     dill.dump(network_perceptron_original, f)
    
# NOTE(review): the two files below are written to the current working directory, not to `data/`.
# Save the final network with dill
with open('perceptron_final_dill.pkl', 'wb') as f:
    dill.dump(network_perceptron_final, f)

# Save the same network with the standard-library pickle
with open('perceptron_final.pkl', 'wb') as f:
    pickle.dump(network_perceptron_final, f)