# Citation data

Note:

- The resulting data can be found in the folder `data`. 

To do:

- For some reason, I could not `dill.dump` the data. We should run the notebook once to ensure that the data in the folder `data` match the outcome of the notebook.

Clean up:

1. `create_author_network`: Should we delete the lines of code that were commented out?
2. Perceptron: The code for the perceptron is still in here; should we delete it?


In [2]:
import dill
import copy
import json
import numpy as np
import pandas as pd
import networkx as nx
import pickle

from itertools import chain, chain#, batched
from tqdm.auto import tqdm

from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders, config

# Contact e-mail passed to PyAlex (presumably used for OpenAlex's "polite pool" — confirm in the PyAlex docs).
config.email = "h.w.a.duijf@uu.nl"
# Allow up to 5 retries per request (see the PyAlex `config` documentation for the exact semantics).
config.max_retries = 5

## Methods

### Prune citation data: keep only journal articles, and remove works without a bibliography

In [2]:
def get_works_with_references(works: list) -> list:
    """Return the works that have a non-empty bibliography.

    Arguments
    ---------
    works : list[Work]
        A list of works (dict-like records, see PyAlex).

    Returns
    -------
    works_pruned : list
        The works, in their original order, whose ``"referenced_works"``
        entry is present and non-empty. Records that lack the field, are not
        dict-like, or carry ``None`` / an empty list are dropped.
    """
    works_pruned: list = []
    for work in works:
        try:
            references = work["referenced_works"]
        except (KeyError, TypeError):
            # Best effort: a record without the field (or one that is not
            # subscriptable) has no usable bibliography, so skip it.
            continue
        # Explicit check instead of `assert`: asserts are stripped when Python
        # runs with -O, which would silently disable the filter.
        if references:
            works_pruned.append(work)
    return works_pruned

In [3]:
def get_articles(works: list) -> list:
    """Return the works that are journal articles.

    Arguments
    ---------
    works : list[Work]
        A list of works (dict-like records, see PyAlex).

    Returns
    -------
    articles : list
        The works, in their original order, whose ``"type"`` is ``"article"``
        and whose ``work["primary_location"]["source"]["type"]`` is
        ``"journal"``. Records where any of these fields is missing or
        ``None`` are dropped.
    """
    articles: list = []
    for work in works:
        try:
            source_type = work["primary_location"]["source"]["type"]
            work_type = work["type"]
        except (KeyError, TypeError):
            # Best effort: a missing key, or a None primary location/source,
            # means the record cannot be classified as a journal article.
            continue
        # Explicit check instead of `assert`: asserts are stripped when Python
        # runs with -O, which would silently disable the filter.
        if source_type == "journal" and work_type == "article":
            articles.append(work)
    return articles

### Create author network from a list of works

In [4]:
def create_author_network(works: list, add_cited_authors: bool=True) -> nx.DiGraph:
    """Create a directed author citation network from a list of works.

    Every author of a work in `works` becomes a node whose ``n_works``
    attribute counts that author's authorship entries in `works`. For every
    reference that points to another work in `works`, an edge is added from
    each author of the cited work to each author of the citing work.

    References to works that are not in `works` are silently ignored, and an
    author who cites their own work produces a self-loop.

    Arguments
    ---------
    works : list[Work]
        A list of works (see PyAlex). Each work is read for its ``id``,
        ``authorships`` and ``referenced_works`` entries.
    add_cited_authors : bool, optional
        Whether to add cited authors as nodes (with ``n_works = 0``) when they
        are not yet in the graph. Defaults to True.
        NOTE(review): cited works are only resolved within `works`, so every
        resolvable cited author has already been added in the first pass and
        this flag appears to have no effect — confirm.
    Returns
    -------
    G : nx.DiGraph
        A directed graph where nodes are authors and edges represent citations
        from cited authors to citing authors."""
    # Create a directed graph and dataframe of works
    df = pd.DataFrame(works)
    G: nx.DiGraph = nx.DiGraph()

    # Add nodes
    for _, row in df.iterrows():
        for author in row['authorships']:
            this_author = author['author']['id']

            # ignore the author with the id "A9999999999" as it is a placeholder for missing values
            if (this_author is None) or (this_author.split("/")[-1] == "A9999999999"):
                continue

            # add the author if not already present in the network
            if this_author not in G:
                G.add_node(this_author, n_works=1)
            else:
                G.nodes[this_author]['n_works'] += 1

    # Index the authorships by work id once, instead of scanning the whole
    # dataframe for every reference. `setdefault` keeps the first record when
    # an id occurs more than once.
    authorships_by_work: dict = {}
    if 'id' in df.columns and 'authorships' in df.columns:
        for work_id, authorships in zip(df['id'], df['authorships']):
            authorships_by_work.setdefault(work_id, authorships)

    # Add edges
    for _, row in tqdm(df.iterrows(), total=len(works)):
        for author in row['authorships']:
            this_author = author['author']['id']
            # skips authors that were not added as nodes (missing or placeholder id)
            if this_author not in G:
                continue

            for cited_work_id in row["referenced_works"]:
                # References to works outside `works` resolve to nothing and are skipped.
                for cited_authorship in authorships_by_work.get(cited_work_id, ()):
                    cited_author = cited_authorship['author']['id']

                    if add_cited_authors and cited_author not in G:
                        if (cited_author is not None) and (cited_author.split("/")[-1] != "A9999999999"):
                            G.add_node(cited_author, n_works=0)

                    # edges go FROM cited TO citing
                    if cited_author in G and not G.has_edge(cited_author, this_author):
                        G.add_edge(cited_author, this_author)

    return G

### Pruning by removing ‘twins’ (a.k.a. strong co-authors: co-authors who appear on every one of an author's works)

In [5]:
def generate_twins_dict(net: nx.DiGraph, records: list) -> dict:
    """Map each author in the network to their 'twins'.

    A twin of an author is a co-author who appears on every record in
    `records` that the author appears on.

    Arguments
    ---------
    net : nx.DiGraph
        Author network; only its nodes (author ids) are read.
    records : list[Work]
        A list of works (see PyAlex); each is read for the author ids in its
        ``authorships`` entry.

    Returns
    -------
    authors_twins_dict : dict
        Maps an author id to the set of that author's twins. Authors without
        any record in `records`, or without any twin, are left out.
    """
    # Single pass over the records: for every author, collect the co-author
    # set of each record they appear on. This avoids rescanning all records
    # for every node in the network.
    coauthor_sets_by_author: dict = {}
    for record in records:
        author_ids = {entry["author"]["id"] for entry in record["authorships"]}
        for author_id in author_ids:
            coauthor_sets_by_author.setdefault(author_id, []).append(author_ids - {author_id})

    authors_twins_dict: dict = {}
    for author_id in tqdm(net.nodes()):
        coauthor_sets = coauthor_sets_by_author.get(author_id)
        if not coauthor_sets:
            continue
        # Twins are the co-authors shared by all of the author's records.
        twins = set.intersection(*coauthor_sets)
        if twins:
            authors_twins_dict[author_id] = twins
    return authors_twins_dict

In [6]:
def prune_network(net: nx.DiGraph, authors_twins_dict: dict) -> nx.DiGraph:
    """Return a deep copy of `net` without authors that still have a twin in it.

    Authors are visited in the iteration order of `authors_twins_dict`. An
    author is removed when at least one of their twins is still present in the
    pruned copy at that moment, so the outcome depends on that order. The
    input network is not modified.
    """
    pruned = copy.deepcopy(net)
    for author, twin_ids in tqdm(authors_twins_dict.items()):
        has_surviving_twin = any(twin in pruned.nodes() for twin in twin_ids)
        if not has_surviving_twin:
            continue
        pruned.remove_node(author)
    return pruned

### Pruning by taking the largest weakly connected component

In [7]:
def produce_lcc(net: nx.DiGraph) -> nx.DiGraph:
    """Return the largest weakly connected component of `net` as a new graph.

    The result is a fresh ``nx.DiGraph`` holding the nodes of the largest
    component (with their attributes), the edges of `net` between those nodes
    (with their attributes), and the graph-level attributes of `net`. When
    several components share the largest size, the first one produced by
    ``nx.weakly_connected_components`` is used. An empty `net` yields an empty
    graph.
    """
    # Extract largest component; `default` avoids a ValueError on an empty graph.
    largest_cc = max(nx.weakly_connected_components(net), key=len, default=set())

    # Build an independent graph rather than a view on `net`.
    lcc = nx.DiGraph()
    lcc.add_nodes_from((n, net.nodes[n]) for n in largest_cc)
    lcc.add_edges_from((n, nbr, d)
        for n, nbrs in net.adj.items() if n in largest_cc
        for nbr, d in nbrs.items() if nbr in largest_cc)
    lcc.graph.update(net.graph)
    return lcc

In [8]:
def remove_self_loops(net: nx.DiGraph) -> nx.DiGraph:
    """Return a deep copy of `net` with every self-loop edge removed.

    The input network is not modified; nodes and all other edges are kept.
    """
    # A single deep copy is enough; the extra `.copy()` was redundant.
    network_pruned = copy.deepcopy(net)
    # Collect the self-loops from the untouched input, then drop them from the copy.
    network_pruned.remove_edges_from(list(nx.selfloop_edges(net)))
    return network_pruned

## Citation data from OpenAlex

In [9]:
def set_version(self, v):
    """Add the ``data-version`` query parameter to this query and return the query.

    `v` is converted to a string before it is stored. Returning `self` lets
    the call be chained with other query methods.
    """
    # NOTE(review): relies on the private PyAlex method `_add_params`; may break on PyAlex upgrades.
    self._add_params("data-version", str(v))
    return self

# Attach the helper to `Works` so queries can call `.version(...)`.
Works.version = set_version

# results = Works().filter(publication_year=2020).version(2).get()

def OA_full_text_search(text: str, year: str, version: int=1) -> list:
    """Fetch all OpenAlex works matching an exact-phrase search for `text`.

    Note: version=1 takes the old OA, version=2 takes the new OA ("Walden").

    Arguments
    ---------
    text : str
        Phrase to search for; it is wrapped in double quotes in the query.
    year : str
        Value for the ``publication_year`` filter (e.g. ``"1900-1978"``).
    version : int, optional
        Value passed to ``.version(...)`` (the ``data-version`` parameter).

    Returns
    -------
    works : list
        All works from all result pages. The number of works is printed.
    """
    query = Works().search(f'"{text}"').filter(publication_year=year).version(version)
    # `paginate` yields pages of works; flatten them lazily into one list.
    works: list = list(chain.from_iterable(query.paginate(per_page=200, n_max=None)))
    print(f"{len(works)=:,}")
    return works

def OA_title_abstract_search(text: str, year: str, version: int=1) -> list:
    """Fetch all OpenAlex works whose title or abstract matches the phrase `text`.

    Note: version=1 takes the old OA, version=2 takes the new OA ("Walden").

    Arguments
    ---------
    text : str
        Phrase to search for; it is wrapped in double quotes and passed as
        the ``title_and_abstract`` search filter.
    year : str
        Value for the ``publication_year`` filter (e.g. ``"1900-1978"``).
    version : int, optional
        Value passed to ``.version(...)`` (the ``data-version`` parameter).

    Returns
    -------
    works : list
        All works from all result pages. The number of works is printed.
    """
    query = Works().search_filter(title_and_abstract=f'"{text}"').filter(publication_year=year).version(version)
    # `paginate` yields pages of works; flatten them lazily into one list.
    works: list = list(chain.from_iterable(query.paginate(per_page=200, n_max=None)))
    print(f"{len(works)=:,}")
    return works

### Peptic ulcer disease

Get the records

In [None]:
# Title/abstract search with the default data version (version=1), for comparison with version=2 below.
w1 = OA_title_abstract_search(text="peptic ulcer disease", year="1900-1978")
w1_articles = get_articles(w1)
print(len(w1_articles))

len(works)=383
267


In [None]:
# Same title/abstract search, but against data version 2.
w2 = OA_title_abstract_search(text="peptic ulcer disease", year="1900-1978", version=2)
w2_articles = get_articles(w2)
print(len(w2_articles))

len(works)=490
355


In [None]:
# Full-text search against data version 2. NOTE: the next cell re-assigns `works_pud` using the default version.
string = "peptic ulcer disease"
works_pud = OA_full_text_search(text=string, year="1900-1978", version=2) 

len(works)=685


In [10]:
# Full-text search with the default data version (version=1); this is the `works_pud` used in the rest of the section.
string = "peptic ulcer disease"
works_pud = OA_full_text_search(text=string, year="1900-1978")

len(works)=2,146


In [11]:
# Cache the raw search results in the working directory so the API query need not be repeated.
with open('pud_works.pkl', 'wb') as f:
    dill.dump(works_pud, f)

In [11]:
# Reload the cached search results. Only load pickle/dill files you created yourself: loading can execute arbitrary code.
with open('pud_works.pkl', 'rb') as f:
    works_pud = dill.load(f)

In [12]:
# Keep only the works that list at least one reference.
works_pud_pruned = get_works_with_references(works_pud)
print(f"{len(works_pud_pruned)=:,}")

len(works_pud_pruned)=1,464


In [13]:
# Of those, keep only the journal articles.
articles_pud = get_articles(works_pud_pruned)
print(f"{len(articles_pud)=:,}")

len(articles_pud)=1,276


Create author-based network

In [14]:
# Build the author citation network from the journal articles (edges run from cited to citing author).
network_pud_original = create_author_network(articles_pud) 
print(f"{network_pud_original.number_of_nodes()=:,}")
print(f"{network_pud_original.number_of_edges()=:,}")

  0%|          | 0/1276 [00:00<?, ?it/s]

network_pud_original.number_of_nodes()=2,601
network_pud_original.number_of_edges()=3,257


Prune author-based network

In [16]:
# Step 1: remove authors that still have a 'twin' (a co-author on all of their works) in the network.
# NOTE(review): twins are computed from `works_pud` (all works), although the network was built from
# `articles_pud`; the perceptron section passes the articles instead — confirm which is intended.
authors_twins_dict = generate_twins_dict(network_pud_original, works_pud)
network_pud_pruned = prune_network(network_pud_original, authors_twins_dict)
print(f"{network_pud_pruned.number_of_nodes()=:,}")
print(f"{network_pud_pruned.number_of_edges()=:,}")

# Step 2: keep only the largest weakly connected component.
network_pud_pruned_lcc = produce_lcc(network_pud_pruned)
print(f"{network_pud_pruned_lcc.number_of_nodes()=:,}")
print(f"{network_pud_pruned_lcc.number_of_edges()=:,}")

# Step 3: drop self-loops (edges from an author to themselves).
network_pud_final = remove_self_loops(network_pud_pruned_lcc)
print(f"{network_pud_final.number_of_nodes()=:,}")
print(f"{network_pud_final.number_of_edges()=:,}")

  0%|          | 0/2601 [00:00<?, ?it/s]

  0%|          | 0/2001 [00:00<?, ?it/s]

network_pud_pruned.number_of_nodes()=1,035
network_pud_pruned.number_of_edges()=739
network_pud_pruned_lcc.number_of_nodes()=312
network_pud_pruned_lcc.number_of_edges()=628
network_pud_final.number_of_nodes()=312
network_pud_final.number_of_edges()=583


In [17]:
# Summary table of every pipeline stage. All values are pre-formatted strings.
# For the first three rows, "number_of_nodes" holds the number of records (works), not graph nodes.
info_dict = {
    "data_type": 
        ["works", 
        "works with refs", 
        "articles", 
        "author network",
        "author network pruned",
        "author network pruned lcc",
        "author network final"],
    "number_of_nodes": [
        f"{len(works_pud):,.0f}", 
        f"{len(works_pud_pruned):,.0f}", 
        f"{len(articles_pud):,.0f}", 
        f"{network_pud_original.number_of_nodes():,.0f}", 
        f"{network_pud_pruned.number_of_nodes():,.0f}", 
        f"{network_pud_pruned_lcc.number_of_nodes():,.0f}", 
        f"{network_pud_final.number_of_nodes():,.0f}"
    ],
    "number_of_edges": [
        "N/A",  # works do not have edges
        "N/A",  # works with refs do not have edges
        "N/A",  # articles do not have edges
        f"{network_pud_original.number_of_edges():,.0f}", 
        f"{network_pud_pruned.number_of_edges():,.0f}", 
        f"{network_pud_pruned_lcc.number_of_edges():,.0f}", 
        f"{network_pud_final.number_of_edges():,.0f}"
    ]
}
df_info = pd.DataFrame(info_dict)
# df_info.astype({"number_of_edges": "Int64"})
# Last expression of the cell: displays the table in the notebook.
df_info

Unnamed: 0,data_type,number_of_nodes,number_of_edges
0,works,2146,
1,works with refs,1464,
2,articles,1276,
3,author network,2601,3257.0
4,author network pruned,1035,739.0
5,author network pruned lcc,312,628.0
6,author network final,312,583.0


Save networks

In [None]:
# NOTE(review): the disabled dump below opens the file in text mode ('w'); `dill.dump` writes bytes
# and needs 'wb' — possibly the reason dumping failed earlier; confirm before re-enabling.
# with open('data/pud_works.pkl', 'w') as f:
#     dill.dump(works_pud, f)

# with open('data/pud_original.pkl', 'wb') as f:
#     dill.dump(network_pud_original, f)

# NOTE(review): this writes to the working directory, not to the `data` folder mentioned in the notes.
with open('pud_final.pkl', 'wb') as f:
    dill.dump(network_pud_final, f)


Loading the network from file

In [3]:
# Reload the final network. Only load pickle/dill files you created yourself: loading can execute arbitrary code.
with open('pud_final.pkl', 'rb') as f:
    network = dill.load(f)

### Perceptron

In [40]:
# Full-text search for "perceptron" with the default data version (version=1).
string = "perceptron"
works_perceptron = OA_full_text_search(text=string, year="1900-2000") 

len(works)=13,636


In [41]:
# Keep only the works that list at least one reference.
works_perceptron_pruned = get_works_with_references(works_perceptron)
print(f"{len(works_perceptron_pruned)=:,}")

len(works_perceptron_pruned)=9,840


In [42]:
# Of those, keep only the journal articles.
articles_perceptron = get_articles(works_perceptron_pruned)
print(f"{len(articles_perceptron)=:,}")

len(articles_perceptron)=7,668


Create author-based network

In [43]:
# Build the author citation network from the journal articles (edges run from cited to citing author).
network_perceptron_original = create_author_network(articles_perceptron) 
print(f"{network_perceptron_original.number_of_nodes()=:,}")
print(f"{network_perceptron_original.number_of_edges()=:,}")

  0%|          | 0/7668 [00:00<?, ?it/s]

network_perceptron_original.number_of_nodes()=12,673
network_perceptron_original.number_of_edges()=69,734


Prune author-based network

In [44]:
# Step 1: remove authors that still have a 'twin' (a co-author on all of their works) in the network.
# NOTE(review): twins are computed from the articles here, whereas the peptic-ulcer section passes
# all works (`works_pud`) — confirm which is intended. This also overwrites `authors_twins_dict`.
authors_twins_dict = generate_twins_dict(network_perceptron_original, articles_perceptron)
network_perceptron_pruned = prune_network(network_perceptron_original, authors_twins_dict)
print(f"{network_perceptron_pruned.number_of_nodes()=:,}")
print(f"{network_perceptron_pruned.number_of_edges()=:,}")

# Step 2: keep only the largest weakly connected component.
network_perceptron_pruned_lcc = produce_lcc(network_perceptron_pruned)
print(f"{network_perceptron_pruned_lcc.number_of_nodes()=:,}")
print(f"{network_perceptron_pruned_lcc.number_of_edges()=:,}")

# Step 3: drop self-loops (edges from an author to themselves).
network_perceptron_final = remove_self_loops(network_perceptron_pruned_lcc)
print(f"{network_perceptron_final.number_of_nodes()=:,}")
print(f"{network_perceptron_final.number_of_edges()=:,}")

  0%|          | 0/12673 [00:00<?, ?it/s]

  0%|          | 0/10321 [00:00<?, ?it/s]

network_perceptron_pruned.number_of_nodes()=4,306
network_perceptron_pruned.number_of_edges()=19,932
network_perceptron_pruned_lcc.number_of_nodes()=3,176
network_perceptron_pruned_lcc.number_of_edges()=19,859
network_perceptron_final.number_of_nodes()=3,176
network_perceptron_final.number_of_edges()=18,914


Save networks

In [45]:
# with open('data/perceptron_works.pkl', 'wb') as f:
#     dill.dump(works_perceptron, f)

# with open('data/perceptron_original.pkl', 'wb') as f:
#     dill.dump(network_perceptron_original, f)
    
# The final network is saved twice: once with dill and once with the standard pickle module.
# NOTE(review): both files go to the working directory, not to the `data` folder mentioned in the notes.
with open('perceptron_final_dill.pkl', 'wb') as f:
    dill.dump(network_perceptron_final, f)

# Save the object to a file
with open('perceptron_final.pkl', 'wb') as f:
    pickle.dump(network_perceptron_final, f)

In [46]:
# Summary table of every pipeline stage. All values are pre-formatted strings.
# For the first three rows, "number_of_nodes" holds the number of records (works), not graph nodes.
info_dict = {
    "data_type": 
        ["works", 
        "works with refs", 
        "articles", 
        "author network",
        "author network pruned",
        "author network pruned lcc",
        "author network final"],
    "number_of_nodes": [
        f"{len(works_perceptron):,.0f}", 
        f"{len(works_perceptron_pruned):,.0f}", 
        f"{len(articles_perceptron):,.0f}", 
        f"{network_perceptron_original.number_of_nodes():,.0f}", 
        f"{network_perceptron_pruned.number_of_nodes():,.0f}", 
        f"{network_perceptron_pruned_lcc.number_of_nodes():,.0f}", 
        f"{network_perceptron_final.number_of_nodes():,.0f}"
    ],
    "number_of_edges": [
        "N/A",  # works do not have edges
        "N/A",  # works with refs do not have edges
        "N/A",  # articles do not have edges
        f"{network_perceptron_original.number_of_edges():,.0f}", 
        f"{network_perceptron_pruned.number_of_edges():,.0f}", 
        f"{network_perceptron_pruned_lcc.number_of_edges():,.0f}", 
        f"{network_perceptron_final.number_of_edges():,.0f}"
    ]
}
df_info = pd.DataFrame(info_dict)
# df_info.astype({"number_of_edges": "Int64"})
# Last expression of the cell: displays the table in the notebook.
df_info

Unnamed: 0,data_type,number_of_nodes,number_of_edges
0,works,13636,
1,works with refs,9840,
2,articles,7668,
3,author network,12673,69734.0
4,author network pruned,4306,19932.0
5,author network pruned lcc,3176,19859.0
6,author network final,3176,18914.0
