In [1]:
from pyvis.network import Network
import networkx as nx

In [17]:
from itertools import combinations
import numpy as np
import pandas as pd
import json

In [3]:
datasets = []
publications = []
tools = []
with open ('data/datasets.json') as f:
    datasets = json.load(f)
with open ('data/publications.json') as f:
    publications = json.load(f)
with open ('data/tools.json') as f:
    tools = json.load(f)

len(datasets), len(publications), len(tools)

(1059, 3199, 243)

In [4]:
dataset_to_dataset = pd.read_json("data/linkages_dataset_to_dataset.json").set_index('source_dataset_id')
dataset_to_dataset

Unnamed: 0_level_0,target_dataset_id
source_dataset_id,Unnamed: 1_level_1
16,20
19,20
31,15
51,78
54,700
...,...
719,722
809,808
811,808
814,808


In [5]:
dataset_to_publication = pd.read_json("data/linkages_dataset_to_publication.json").set_index('source_dataset_id')
dataset_to_publication

Unnamed: 0_level_0,target_publication_id
source_dataset_id,Unnamed: 1_level_1
11,1411
14,523
14,1537
14,1664
15,1406
...,...
880,2386
880,3015
881,2343
892,2939


In [6]:
dataset_to_tool = pd.read_json("data/linkages_dataset_to_tool.json").set_index('source_dataset_id')
dataset_to_tool.head(5)

Unnamed: 0_level_0,target_tool_id
source_dataset_id,Unnamed: 1_level_1
778,93
23,102
283,102
413,102
728,102


In [7]:
tool_to_publication = pd.read_json("data/linkages_tool_to_publication.json").set_index('source_tool_id')
tool_to_publication

Unnamed: 0_level_0,target_publication_id
source_tool_id,Unnamed: 1_level_1
234,296
242,3166
243,3166


In [8]:
G = nx.Graph()

for entry in datasets:
    node_id = f"dataset_{entry['id']}"
    short_title = entry.get('metadata', {}).get('summary', {}).get('shortTitle', f"Dataset {node_id}")
    G.add_node(node_id, title=short_title, label=short_title, group=1)

len(G.nodes)

1044

In [9]:
for _, row in dataset_to_dataset.iterrows():
    source = f"dataset_{row.name}"
    target = f"dataset_{row['target_dataset_id']}"
    G.add_edge(source, target, group=1)

In [10]:
for entry in publications:
    node_id = f"publication_{entry['id']}"
    if (doi := entry.get('paper_doi',None)):
        doi = doi.replace("https://doi.org/","")
        G.add_node(node_id, title=doi, label=doi, group=2)

for _, row in dataset_to_publication.iterrows():
    source = f"dataset_{row.name}"
    target = f"publication_{row['target_publication_id']}"
    G.add_edge(source, target, group=2)

len(G.nodes)

4242

In [11]:
for entry in tools:
    node_id = f"tool_{entry['id']}"
    if (name := entry.get('name',None)):
        G.add_node(node_id, title=name, label=name, group=3)

for _, row in dataset_to_tool.iterrows():
    source = f"dataset_{row.name}"
    target = f"tool_{row['target_tool_id']}"
    G.add_edge(source, target, group=3)

for _, row in tool_to_publication.iterrows():
    source = f"tool_{row.name}"
    target = f"publication_{row['target_publication_id']}"
    G.add_edge(source, target, group=3)

len(G.nodes)

4485

In [12]:
isolated_nodes = list(nx.isolates(G))
G.remove_nodes_from(isolated_nodes)
len(G.nodes)

801

In [13]:
net = Network(notebook=True)
net.from_nx(G)

net.barnes_hut(
    gravity=-2000,
    central_gravity=0.3,
    spring_length=95,
    spring_strength=0.04,
    damping=0.09,
    overlap=0.1
)

net.show("direct_linkages.html")

direct_linkages.html


In [18]:
publication_authors = [set(pub["authors"].split(",")) for pub in publications]

n = len(publication_authors)

similarity_matrix = np.zeros((n, n))

for i, j in combinations(range(n), 2):
    common_authors = publication_authors[i] & publication_authors[j]  
     min_authors = min(len(publication_authors[i]), len(publication_authors[j])) 
    similarity_matrix[i, j] = similarity_matrix[j, i] = len(common_authors) / min_authors if min_authors > 0 else 0

    
similarity_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 2., 0., 0.],
       ...,
       [0., 0., 2., ..., 0., 2., 0.],
       [0., 1., 0., ..., 2., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

publication_authors = [pub["authors"] for pub in publications]
publication_titles = [pub["paper_title"] for pub in publications]
publication_abstracts = [pub["abstract"] for pub in publications]

vectorizer = TfidfVectorizer().fit_transform(publication_authors)
similarity_matrix = np.tril(cosine_similarity(vectorizer))
similarity_matrix

In [None]:
reduced_matrix = np.where(similarity_matrix >= 0.9, similarity_matrix, 0)
np.fill_diagonal(reduced_matrix, 0)
nonzero_rows = np.any(reduced_matrix > 0, axis=1) 
nonzero_cols = np.any(reduced_matrix > 0, axis=0) 

final_matrix = reduced_matrix[np.ix_(nonzero_rows, nonzero_cols)]
final_matrix
final_matrix.shape

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
plt.figure(figsize=(10, 8))
sns.heatmap(np.tril(similarity_matrix), annot=True, cmap="coolwarm")# xticklabels=publication_titles, yticklabels=publication_titles)
plt.title("Publication Similarity Based on Authors (TF-IDF & Cosine Similarity)")
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.show()