In [1]:
from itertools import combinations
from pyvis.network import Network
import networkx as nx
import numpy as np
import pandas as pd
import json
import copy
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize



In [2]:
def _load_json(path):
    """Return the parsed contents of the JSON file at `path`."""
    # Open with an explicit encoding so the result does not depend on the
    # platform's default text encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


# Load the three entity collections used throughout the notebook.
datasets = _load_json('./data/datasets.json')
publications = _load_json('./data/publications.json')
tools = _load_json('./data/tools.json')

len(datasets), len(publications), len(tools)

(1059, 3199, 243)

In [9]:
# Read the Gemini API key from the environment instead of pasting it into the
# notebook, so the secret is never saved alongside the code or its outputs.
import os
import warnings

API_KEY = os.environ.get("GEMINI_API_KEY")
if not API_KEY:
    # Only the Gemini cells need the key; warn rather than abort the notebook.
    warnings.warn("GEMINI_API_KEY is not set; the Gemini cells will not work until it is exported.")

In [12]:
# Collect the summary title of every dataset, in dataset order.
titles = [
    dataset['metadata']['summary']['title']
    for dataset in datasets
]
titles[:5]

['Cystic Fibrosis Patient Microbiology Cultures',
 'Cystic Fibrosis Patient Liver Enzyme',
 'Cystic Fibrosis Patient Annual Review Encounters',
 'Cystic Fibrosis Patient Transplants',
 'Cystic Fibrosis Patient Sweat Tests']

In [6]:
# Collect the summary description of every dataset, in dataset order.
descriptions = [
    dataset['metadata']['summary']['description']
    for dataset in datasets
]
descriptions[:5]

["The UK CF Registry is a centralised database of all 60 CF centres across the UK. Data are manually entered in calendar years by CF clinical teams for the 99% of people with a diagnosis of CF who consent to their data being donated to the Registry. Data are entered onto a secure web-portal. For more information please see www.cysticfibrosis.org.uk/registry and 'Data Resource Profile: The UK CF Registry' published in the International Journal of Epidemiology (2018 Feb 1;47(1)9-10e).",
 "The UK CF Registry is a centralised database of all 60 CF centres across the UK. Data are manually entered in calendar years by CF clinical teams for the 99% of people with a diagnosis of CF who consent to their data being donated to the Registry. Data are entered onto a secure web-portal. For more information please see www.cysticfibrosis.org.uk/registry and 'Data Resource Profile: The UK CF Registry' published in the International Journal of Epidemiology (2018 Feb 1;47(1)9-10e).",
 "The UK CF Registry

In [15]:
# Collect the abstract of every publication (the text contains HTML markup).
paper_abstracts = [
    publication['abstract']
    for publication in publications
]
paper_abstracts[1]

'<h4>Introduction</h4>This study will evaluate the effectiveness of home adaptations, both in preventing hospital admissions due to falls for older people, and improving timely discharge. Results will provide evidence for services at the interface between health and social care, informing policies seeking to promote healthy ageing through prudent healthcare and fall prevention.<h4>Methods and analysis</h4>All individuals living in Wales, UK, aged 60 years and over, will be included in the study using anonymised linked data from the Secure Anonymised Information Linkage Databank. We will use a national database of home modifications implemented by the charity organisation Care & Repair Cymru (C&R) from 2009 to 2017 to define an intervention cohort. We will use the electronic Frailty Index to assign individual levels of frailty (fit, mild, moderate or severe) and use these to create a comparator group (non-C&R) of people who have not received a C&R intervention. Coprimary outcomes will b

In [47]:
# Create the Gemini client used by `can_you_find_a_dataset`; it is configured
# with the `API_KEY` defined in an earlier cell.
from google import genai

client = genai.Client(api_key=API_KEY)

def can_you_find_a_dataset(description, model="gemini-2.0-flash", api_client=None):
    """Ask a Gemini model which dataset names are mentioned in an abstract.

    Parameters
    ----------
    description : str
        Text of a paper abstract to search for dataset names.
    model : str, optional
        Model identifier passed to `generate_content`.
    api_client : optional
        Object exposing `models.generate_content(model=..., contents=...)`.
        Defaults to the notebook-level `client`.

    Returns
    -------
    str
        The reply text (the prompt asks for a comma separated list of names),
        or an empty string when the reply carries no text.
    """
    if api_client is None:
        api_client = client

    prompt = (
        "Can you find a name of any dataset(s) in the following description from a paper abstract, "
        "give me a comma separated list of names of possible dataset names, "
        "if you dont think there is one, dont make up any and return no response:"
        f"\n\n {description}"
    )
    response = api_client.models.generate_content(model=model, contents=prompt)
    # The prompt explicitly allows "no response", so the reply may have no text;
    # normalise that to "" so callers can always call str methods on the result.
    return response.text or ""

In [48]:
# Render one abstract as HTML so its embedded markup (e.g. <h4> headings) is
# displayed rather than shown as raw tags.
abstract = paper_abstracts[1001]
# Import from the public `IPython.display` module; importing these names from
# `IPython.core.display` triggers a deprecation warning.
from IPython.display import HTML, display
display(HTML(abstract))

  from IPython.core.display import HTML, display


In [49]:
# Ask the model which dataset names it can find in the abstract selected above.
print (can_you_find_a_dataset(abstract))

IQVIA Medical Research Data



In [50]:
# Pick a second abstract and render it as HTML (uses `HTML`/`display` imported
# in an earlier cell).
abstract = paper_abstracts[2005]
display(HTML(abstract))

In [55]:
# Split the model's comma separated reply into individual dataset names.
# `or ""` guards against a reply with no text; stripping and filtering removes
# the padding after each comma and avoids a spurious [''] for an empty reply.
raw_response = can_you_find_a_dataset(abstract) or ""
candidate_datasets = [
    name.strip()
    for name in raw_response.split(",")
    if name.strip()
]
print (candidate_datasets)

['OnCovid Registry']


In [52]:
# Preview a slice rather than dumping every dataset title into the output.
titles[:25]

['Cystic Fibrosis Patient Microbiology Cultures',
 'Cystic Fibrosis Patient Liver Enzyme',
 'Cystic Fibrosis Patient Annual Review Encounters',
 'Cystic Fibrosis Patient Transplants',
 'Cystic Fibrosis Patient Sweat Tests',
 'Cystic Fibrosis Patient NTM culture',
 'Cystic Fibrosis Patient CFQ-R',
 'Cystic Fibrosis Patient Tendon Rupture',
 'Cystic Fibrosis Patient Demographics',
 'Cystic Fibrosis Patient Chronic Medication',
 'COVID antigen testing - Pillar 1',
 'Patient Medical Card Registration (NI)',
 'SARS-CoV-2 viral sequencing data (COG-UK data) - Lineage/Variant Data - NI',
 'Mortality (Death registration)',
 'COVID antigen testing - Pillar 2',
 'Cystic Fibrosis Patient Microbiology Cultures',
 'Cystic Fibrosis Patient Liver Enzyme',
 'Cystic Fibrosis Patient Annual Review Encounters',
 'Cystic Fibrosis Patient Transplants',
 'Cystic Fibrosis Patient Sweat Tests',
 'Cystic Fibrosis Patient NTM culture',
 'Cystic Fibrosis Patient CFQ-R',
 'Cystic Fibrosis Patient Tendon Rupture',

In [3]:
# Pretty-print the first publication record to show which fields are available.
print (json.dumps(publications[0],indent=6))

{
      "publication_type": "Research articles",
      "paper_title": "The role of health and social factors in education outcome: A record-linked electronic birth cohort analysis.",
      "id": 1,
      "year_of_publication": "2019",
      "journal_name": "PloS one",
      "authors": "Evans A, Dunstan F, Fone DL, Bandyopadhyay A, Schofield B, Demmler JC, Rahman MA, Lyons RA, Paranjothy S.",
      "paper_doi": "https://doi.org/10.1371/journal.pone.0220771",
      "abstract": "<h4>Background and objective</h4>Health status in childhood is correlated with educational outcomes. Emergency hospital admissions during childhood are common but it is not known how these unplanned breaks from schooling impact on education outcomes. We hypothesised that children who had emergency hospital admissions had an increased risk of lower educational attainment, in addition to the increased risks associated with other health, social and school factors.<h4>Methods</h4>This record-linked electronic birth co

#### Example Tool

In [4]:
# Dataset -> dataset linkages, indexed by the source dataset id.
dataset_to_dataset = (
    pd.read_json("./data/linkages_dataset_to_dataset.json")
    .set_index('source_dataset_id')
)
dataset_to_dataset

Unnamed: 0_level_0,target_dataset_id
source_dataset_id,Unnamed: 1_level_1
16,20
19,20
31,15
51,78
54,700
...,...
719,722
809,808
811,808
814,808


In [5]:
# Dataset -> publication linkages, indexed by the source dataset id.
dataset_to_publication = (
    pd.read_json("./data/linkages_dataset_to_publication.json")
    .set_index('source_dataset_id')
)
dataset_to_publication

Unnamed: 0_level_0,target_publication_id
source_dataset_id,Unnamed: 1_level_1
11,1411
14,523
14,1537
14,1664
15,1406
...,...
880,2386
880,3015
881,2343
892,2939


In [6]:
# Dataset -> tool linkages, indexed by the source dataset id.
dataset_to_tool = (
    pd.read_json("./data/linkages_dataset_to_tool.json")
    .set_index('source_dataset_id')
)
dataset_to_tool.head(5)

Unnamed: 0_level_0,target_tool_id
source_dataset_id,Unnamed: 1_level_1
778,93
23,102
283,102
413,102
728,102


In [7]:
# Tool -> publication linkages, indexed by the source tool id.
tool_to_publication = (
    pd.read_json("./data/linkages_tool_to_publication.json")
    .set_index('source_tool_id')
)
tool_to_publication

Unnamed: 0_level_0,target_publication_id
source_tool_id,Unnamed: 1_level_1
234,296
242,3166
243,3166


In [8]:
# Build a graph with one node per publication that has a DOI, then connect
# datasets to publications using the linkage table.
G = nx.Graph()
for entry in publications:
    node_id = f"publication_{entry['id']}"
    doi = entry.get('paper_doi')
    if not doi:
        # Publications without a DOI get no labelled node.
        continue
    doi = doi.replace("https://doi.org/","")
    G.add_node(node_id, title=doi, label=doi, group=2)

# The table is indexed by source dataset id; the column holds the publication id.
for dataset_id, publication_id in dataset_to_publication['target_publication_id'].items():
    source = f"dataset_{dataset_id}"
    target = f"publication_{publication_id}"
    G.add_edge(source, target, group=2)

G.number_of_nodes()

3327

In [9]:
import string
import math

In [61]:
# Start a fresh, empty graph for the publication/author network built below
# (rebinds `G`, discarding the graph from the earlier cell).
G = nx.Graph()

def clean_author(x):
    """Return author name `x` with surrounding whitespace trimmed and all
    ASCII punctuation characters removed."""
    drop_punctuation = str.maketrans('', '', string.punctuation)
    trimmed = x.strip()
    return trimmed.translate(drop_punctuation)

# Populate G with one node per publication (group 1) and one per author
# (group 2), linking publications to their authors and co-authors to each other.
for publication in publications:

    pub_node = f"publication_{publication['id']}"
    # Read the DOI from the publication being processed; reading it from a
    # loop variable left over from another cell gives every node the same DOI.
    if (doi := publication.get('paper_doi',None)):
        doi = doi.replace("https://doi.org/","")
        G.add_node(pub_node, title=doi, label=doi, group=1)
    else:
        # Publications without a DOI are left out of this graph.
        continue

    # Skip names that are empty after cleaning (empty author string, stray
    # commas) so they do not become a shared "" node linking unrelated papers.
    authors = [
        name
        for name in (clean_author(x) for x in publication['authors'].split(","))
        if name
    ]
    last = len(authors) - 1
    for i, author in enumerate(authors):
        # Publication-author edges weaken the later the author is listed.
        weight = 1/math.sqrt(i+1)
        G.add_node(author, group=2)
        G.add_edge(pub_node, author, weight=weight, group=2)

        for j in range(i + 1, len(authors)):
            # Co-author edges weaken with the combined list positions, except
            # that a pairing with the final author is weighted by position i only.
            weight = 1/math.sqrt(i+1) if j == last else 1/math.sqrt(i+j)
            G.add_edge(author, authors[j], weight=weight, group=3)

len(G.nodes)

22916

In [41]:
# Detect communities with greedy modularity maximisation (using the edge
# weights), then order them from largest to smallest.
communities = list(
    nx.community.greedy_modularity_communities(G, weight='weight', resolution=10)
)
communities.sort(key=len, reverse=True)

In [42]:
# Number of communities found.
len(communities)

276

In [43]:
# Number of publication nodes in each community (same order as `communities`).
[
    sum(1 for node in c if 'publication_' in node)
    for c in communities
]

[218,
 155,
 2,
 86,
 101,
 94,
 121,
 3,
 94,
 91,
 68,
 103,
 52,
 19,
 58,
 68,
 9,
 62,
 36,
 21,
 62,
 42,
 32,
 70,
 21,
 51,
 1,
 28,
 42,
 37,
 17,
 27,
 3,
 30,
 27,
 50,
 37,
 19,
 31,
 17,
 54,
 19,
 28,
 7,
 20,
 26,
 20,
 28,
 2,
 24,
 3,
 5,
 11,
 25,
 15,
 8,
 11,
 26,
 2,
 31,
 8,
 20,
 1,
 20,
 21,
 6,
 11,
 15,
 4,
 14,
 9,
 33,
 26,
 8,
 10,
 5,
 9,
 18,
 3,
 11,
 8,
 3,
 11,
 10,
 6,
 5,
 17,
 5,
 4,
 13,
 6,
 21,
 10,
 17,
 6,
 5,
 4,
 3,
 2,
 2,
 7,
 13,
 1,
 1,
 7,
 12,
 2,
 10,
 4,
 1,
 1,
 4,
 9,
 11,
 4,
 2,
 2,
 3,
 3,
 7,
 1,
 2,
 7,
 6,
 3,
 5,
 1,
 1,
 4,
 5,
 2,
 1,
 5,
 4,
 1,
 2,
 4,
 2,
 1,
 5,
 1,
 2,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 5,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 3,
 2,
 2,
 1,
 1,
 1,
 7,
 1,
 1,
 1,
 3,
 1,
 2,
 1,
 2,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [62]:
# Drop every node whose "group" attribute is not 1, i.e. keep only the
# publication nodes added with group=1.
author_nodes = [node for node, group in G.nodes(data="group") if group != 1]
G.remove_nodes_from(author_nodes)
G.number_of_nodes()

3199

In [63]:
# Fully connect the publications of every community that contains between 10
# and 20 publications (inclusive), marking those edges in red.
for comm in communities:
    community_nodes = [node for node in comm if 'publication' in node]
    n = len(community_nodes)
    if not (10 <= n <= 20):
        continue

    # The graph is undirected, so each unordered pair needs adding only once.
    for u, v in combinations(community_nodes, 2):
        G.add_edge(u, v, weight=1, group=1, color='red')
    

In [64]:
# Remove the nodes that have no edges at all.
isolated_nodes = [node for node in nx.isolates(G)]
G.remove_nodes_from(isolated_nodes)
G.number_of_nodes()

411

In [65]:
# Add every dataset as a node labelled with its short title.
for entry in datasets:
    node_id = f"dataset_{entry['id']}"
    summary = entry.get('metadata', {}).get('summary', {})
    short_title = summary.get('shortTitle', f"Dataset {node_id}")
    G.add_node(node_id, title=short_title, label=short_title, group=1)

# Add (or re-label) every publication that has a DOI.
for entry in publications:
    node_id = f"publication_{entry['id']}"
    doi = entry.get('paper_doi')
    if not doi:
        continue
    doi = doi.replace("https://doi.org/","")
    G.add_node(node_id, title=doi, label=doi, group=2)


# Connect datasets to publications using the linkage table.
for dataset_id, publication_id in dataset_to_publication['target_publication_id'].items():
    source = f"dataset_{dataset_id}"
    target = f"publication_{publication_id}"
    G.add_edge(source, target, group=2)
G.number_of_nodes()

4242

In [66]:
# Remove the nodes that ended up with no edges.
isolated_nodes = [node for node in nx.isolates(G)]
G.remove_nodes_from(isolated_nodes)
G.number_of_nodes()

1034

In [67]:
# Keep only nodes with at least three connections.
low_degree_nodes = [node for node, degree in G.degree() if degree < 3]
G.remove_nodes_from(low_degree_nodes)
G.number_of_nodes()

510

In [68]:
# Render the graph as an interactive pyvis page using Barnes-Hut physics.
physics_options = {
    "gravity": -2000,
    "central_gravity": 0.3,
    "spring_length": 95,
    "spring_strength": 0.04,
    "damping": 0.09,
    "overlap": 0.1,
}

net = Network(notebook=True)
net.from_nx(G)
net.barnes_hut(**physics_options)

net.show("temp.html")

temp.html


### Building a Simple Network Graph

Given the data we have provided, we can build a simple network graph to visualise the existing entities and linkages between them:

First, add all datasets as nodes...

In [None]:
# Start a new graph and add every dataset as a node labelled with its short title.
G = nx.Graph()

for entry in datasets:
    node_id = f"dataset_{entry['id']}"
    summary = entry.get('metadata', {}).get('summary', {})
    short_title = summary.get('shortTitle', f"Dataset {node_id}")
    G.add_node(node_id, title=short_title, label=short_title, group=1)

G.number_of_nodes()

Add linkages as edges...

In [None]:
# One edge per dataset -> dataset linkage; the table index is the source id.
for source_id, target_id in dataset_to_dataset['target_dataset_id'].items():
    source = f"dataset_{source_id}"
    target = f"dataset_{target_id}"
    G.add_edge(source, target, group=1)

Add publications and linkages between them and datasets

In [None]:
# Add a node for every publication that has a DOI, labelled with the bare DOI.
for entry in publications:
    node_id = f"publication_{entry['id']}"
    doi = entry.get('paper_doi')
    if not doi:
        continue
    doi = doi.replace("https://doi.org/","")
    G.add_node(node_id, title=doi, label=doi, group=2)

# Connect datasets to publications using the linkage table.
for dataset_id, publication_id in dataset_to_publication['target_publication_id'].items():
    source = f"dataset_{dataset_id}"
    target = f"publication_{publication_id}"
    G.add_edge(source, target, group=2)

G.number_of_nodes()

Do the same for tools...

In [None]:
# Add a node for every tool that has a name.
for entry in tools:
    node_id = f"tool_{entry['id']}"
    name = entry.get('name')
    if not name:
        continue
    G.add_node(node_id, title=name, label=name, group=3)

# Dataset -> tool linkages.
for dataset_id, tool_id in dataset_to_tool['target_tool_id'].items():
    source = f"dataset_{dataset_id}"
    target = f"tool_{tool_id}"
    G.add_edge(source, target, group=3)

# Tool -> publication linkages.
for tool_id, publication_id in tool_to_publication['target_publication_id'].items():
    source = f"tool_{tool_id}"
    target = f"publication_{publication_id}"
    G.add_edge(source, target, group=3)

G.number_of_nodes()

Make a copy of the graph with all entity nodes

In [None]:
# Keep an independent deep copy of the full graph (all entities, including
# isolated ones) before G is pruned; later graphs are derived from this copy.
Gorig = copy.deepcopy(G) 

Remove isolated nodes - in the gateway we have a lot of entities that have no linkages (yet!)

In [None]:
# Remove entities that have no linkages at all.
isolated_nodes = [node for node in nx.isolates(G)]
G.remove_nodes_from(isolated_nodes)
G.number_of_nodes()

Draw a visualisation of this graph

In [None]:
# Render the direct-linkage graph as an interactive pyvis page.
physics_options = {
    "gravity": -2000,
    "central_gravity": 0.3,
    "spring_length": 95,
    "spring_strength": 0.04,
    "damping": 0.09,
    "overlap": 0.1,
}

net = Network(notebook=True)
net.from_nx(G)
net.barnes_hut(**physics_options)

net.show("./public/direct_orginal_linkages.html")

### Finding additional linkages

Now we can explore trying to build up more indirect linkages

Firstly, in a simple example, we can extract all authors from the publications 

In [None]:
# One set of normalised author names per publication, in publication order.
# The author list ends with a period (e.g. "..., Paranjothy S."), so the final
# author's name is stripped of it; otherwise that author never matches the same
# name appearing mid-list in another paper. Empty names are dropped so that
# publications with no listed authors do not all "share" the author "".
publication_authors = [
    {
        cleaned
        for cleaned in (
            auth.strip().rstrip(".").strip()
            for auth in pub["authors"].split(",")
        )
        if cleaned
    }
    for pub in publications
]
publication_authors[0]

Build a similarity matrix based on how many overlapping authors two papers have..

In [None]:
# Pairwise author overlap: shared authors divided by the size of the smaller
# author set (0 when the smaller set is empty).
n = len(publication_authors)
similarity_matrix = np.zeros((n, n))

for i, j in combinations(range(n), 2):
    authors_i = publication_authors[i]
    authors_j = publication_authors[j]
    common_authors = authors_i & authors_j
    min_authors = min(len(authors_i), len(authors_j))
    if min_authors > 0:
        overlap = len(common_authors) / min_authors
    else:
        overlap = 0
    similarity_matrix[i, j] = overlap
    similarity_matrix[j, i] = overlap

# Keep only the strict lower triangle (k=-1 also zeroes the diagonal), so each
# pair of publications appears once.
similarity_matrix = np.tril(similarity_matrix, k=-1)

similarity_matrix

Find the pairs of publications whose author overlap is at least 80%

In [None]:
# (row, column) index pairs of the matrix entries with an overlap of at least 0.8.
high_similarity_indices = np.argwhere((similarity_matrix >= 0.8))

For example...

In [None]:
# Pick one of the high-overlap pairs as an example (positions in `publications`).
similar = high_similarity_indices[10]
similar

In [None]:
# Author string of the first publication in the example pair.
publications[similar[0]]['authors']

In [None]:
# Author string of the second publication in the example pair.
publications[similar[1]]['authors']

Now add these assumed linkages, inferred from authorship overlap, as additional edges between publications

In [None]:
# Start from the full entity graph and add an edge (group 4) between every
# pair of publications with high author overlap.
G2 = copy.deepcopy(Gorig)
for i,j in high_similarity_indices:
    source = f"publication_{publications[i]['id']}"
    target = f"publication_{publications[j]['id']}"
    G2.add_edge(source, target, group=4)

isolated_nodes = list(nx.isolates(G2))
G2.remove_nodes_from(isolated_nodes)
# Report the size of the graph just built (G2), not the earlier graph G.
len(G2.nodes)

Draw the new graph with these added relationships

In [None]:
# Render the graph that includes the author-overlap edges.
physics_options = {
    "gravity": -2000,
    "central_gravity": 0.3,
    "spring_length": 95,
    "spring_strength": 0.04,
    "damping": 0.09,
    "overlap": 0.1,
}

net = Network(notebook=True)
net.from_nx(G2)
net.barnes_hut(**physics_options)
net.show("./public/direct_linkages_with_authors.html")

#### Additional dataset linkages

The dataset metadata contains an abstract about the datasets - we can use this and some simple NLP to try and extract some indirect linkages

For example, an abstract may look like this:

In [None]:
# Show the abstract of the first dataset as an example.
datasets[0]['metadata']['summary']['abstract']

Extract the abstracts:

In [None]:
# Abstract of every dataset, substituting "" when the abstract is missing or empty.
dataset_abstracts = [
    dataset['metadata']['summary']["abstract"] or ""
    for dataset in datasets
]

Tokenise the words in the abstracts

In [None]:
# Lower-case each abstract, tokenise it, and re-join the tokens with single
# spaces so the result can be fed to the vectoriser as plain text.
# NOTE(review): word_tokenize presumably needs NLTK tokenizer data to be
# installed locally - confirm it is available on a fresh environment.
lowered_abstracts = (abstract.lower() for abstract in dataset_abstracts)
tokenized_abstracts = [
    " ".join(word_tokenize(text))
    for text in lowered_abstracts
]

Vectorise and calculate a similarity matrix between all the dataset abstracts

In [None]:
# TF-IDF vectors of the tokenised abstracts (English stop words removed,
# vocabulary capped at 5000 features).
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = vectorizer.fit_transform(tokenized_abstracts)

# Pairwise cosine similarity, keeping only the strict lower triangle
# (k=-1 also zeroes the diagonal) so each pair of datasets appears once.
similarity_matrix = np.tril(cosine_similarity(tfidf_matrix), k=-1)
similarity_matrix

In [None]:
# Compare the matrix shape with the number of datasets.
similarity_matrix.shape,len(datasets)

Find some examples of high overlap in the abstracts:

In [None]:
# Index pairs whose abstract similarity is at least 0.75 but below 0.98,
# then pick one pair as an example.
high_similarity_indices = np.argwhere((similarity_matrix >= 0.75) & (similarity_matrix < 0.98))
similar = high_similarity_indices[12]
similar

In [None]:
# Similarity score of the example pair.
similarity_matrix[similar[0],similar[1]]

In [None]:
# Abstract of the first dataset in the example pair.
dataset_abstracts[similar[0]]

In [None]:
# Abstract of the second dataset in the example pair.
dataset_abstracts[similar[1]]

Construct a third network graph, to add in these additional indirect linkages based on the abstracts..

In [None]:
# Work on an independent deep copy of G2 so G2 itself is left untouched.
G3 = copy.deepcopy(G2)

In [None]:
# Add a red, half-weight edge between every pair of datasets with similar abstracts.
similarity_edge_attrs = {"color": 'red', "weight": 0.5}
for i, j in high_similarity_indices:
    source = f"dataset_{datasets[i]['id']}"
    target = f"dataset_{datasets[j]['id']}"
    G3.add_edge(source, target, **similarity_edge_attrs)

# Drop any nodes that still have no edges.
isolated_nodes = [node for node in nx.isolates(G3)]
G3.remove_nodes_from(isolated_nodes)
G3.number_of_nodes()

In [None]:
# Adds a red edge between every pair of datasets with similar abstracts, then
# drops isolated nodes.
# NOTE(review): this cell repeats the previous cell except that it omits
# weight=0.5 - it looks like a leftover duplicate; confirm whether it can go.
for i,j in high_similarity_indices:
    source = f"dataset_{datasets[i]['id']}"
    target = f"dataset_{datasets[j]['id']}"
    G3.add_edge(source, target, color='red')

isolated_nodes = list(nx.isolates(G3))
G3.remove_nodes_from(isolated_nodes)
len(G3.nodes)

In [None]:
# Render the graph that includes both the direct and the inferred linkages.
physics_options = {
    "gravity": -2000,
    "central_gravity": 0.3,
    "spring_length": 95,
    "spring_strength": 0.04,
    "damping": 0.09,
    "overlap": 0.1,
}

net = Network(notebook=True)
net.from_nx(G3)
net.barnes_hut(**physics_options)
net.show("./public/direct_linkages_and_indirect.html")

Visualise in a "simpler" manner... 

In [None]:
# Static overview of G3: small light-blue nodes joined by thin dashed black edges.
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 8))
pos = nx.kamada_kawai_layout(G3)

nx.draw_networkx_nodes(G3, pos, node_size=10, node_color='lightblue')

# Every edge shares the same colour and dash style, so pass them as single
# values instead of building per-edge lists.
edges = G3.edges()
nx.draw_networkx_edges(G3, pos, edgelist=edges, edge_color='black', style='--', width=0.5)


plt.show()