In [1]:
from helpers import find_unconnected_pairs
import typer
import os
import networkx as nx
from pprint import pprint
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
graph_path = ".litmap/research_graph.xml"

In [3]:
# Load the graph
G = nx.read_graphml(graph_path)

In [4]:
print("Graph Report")
print("Number of nodes: ", G.number_of_nodes())

Graph Report
Number of nodes:  2012


In [5]:
# Grabbing pagerank results
pagerank_results = nx.pagerank(G)

# Sort the pagerank results
pagerank_results = sorted(pagerank_results.items(), key=lambda x: x[1], reverse=True)
pagerank_results

[('2023.acl-long.338.pdf', 0.14203101430857407),
 ('2023.acl-long.656.pdf', 0.1347378567244267),
 ('2023.findings-acl.309.pdf', 0.09220981320360556),
 ('2023.acl-long.895.pdf', 0.09135470380200494),
 ('Computational Linguistics', 0.0007608915705460566),
 ('-', 0.0007608915705460566),
 ('it', 0.0007608915705460566),
 ('evaluation', 0.0007608915705460566),
 ('data', 0.0007608915705460566),
 ('models', 0.0007608915705460566),
 ('BERT', 0.0007608915705460566),
 ('text', 0.0007608915705460566),
 ('model', 0.0007608915705460566),
 ('Natural Language Processing', 0.0007608915705460566),
 ('Empirical Methods', 0.0007608915705460566),
 ('artifacts', 0.0007608915705460566),
 ('linguistic phenomena', 0.0007608915705460566),
 ('GPU hours', 0.0007608915705460566),
 ('preprocessing', 0.0007608915705460566),
 ('hyperparameter search', 0.0007608915705460566),
 ('normalization', 0.0007608915705460566),
 ('NLTK', 0.0007608915705460566),
 ('Spacy', 0.0007608915705460566),
 ('ROUGE', 0.0007608915705460566

In [6]:
# Print the top 10 pagerank results
print("Top 10 pagerank results:")
pprint(pagerank_results[:10])

Top 10 pagerank results:
[('2023.acl-long.338.pdf', 0.14203101430857407),
 ('2023.acl-long.656.pdf', 0.1347378567244267),
 ('2023.findings-acl.309.pdf', 0.09220981320360556),
 ('2023.acl-long.895.pdf', 0.09135470380200494),
 ('Computational Linguistics', 0.0007608915705460566),
 ('-', 0.0007608915705460566),
 ('it', 0.0007608915705460566),
 ('evaluation', 0.0007608915705460566),
 ('data', 0.0007608915705460566),
 ('models', 0.0007608915705460566)]


In [7]:
# Link prediction over unconnected pairs
# Grab the unconnected pairs of nodes
unconnected_pairs = find_unconnected_pairs(G)

# Predict the links
predicted_links = []
for pair in unconnected_pairs:
    predicted_links.append((pair, nx.jaccard_coefficient(G, [pair])[0][2]))

# Sort the predicted links
predicted_links.sort(key=lambda x: x[1], reverse=True)

# Print the top 10 predicted links
print("Top 10 predicted links:")
pprint(predicted_links[:10])

Top 10 predicted links:
[]


In [10]:
nodes = list(G.nodes())
unconnected_pairs = []

for i in range(len(nodes)):
    for j in range(i+1, len(nodes)):
        node1 = nodes[i]
        node2 = nodes[j]
        
        if not nx.has_path(G, node1, node2):
            unconnected_pairs.append((node1, node2))

unconnected_pairs

[]