#

# Article Graph Example

This notebook contains a quick overview of the `article_graph` module together with the `topic_modeling`, `similarity` and `ner` modules.

In [1]:
# We make available the packages inside all the modules

import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))
os.path.dirname(os.getcwd())

from pdf_analyzer.config_load import load_config
from pdf_analyzer.api import API
from omegaconf import OmegaConf
from pdf_analyzer.config_load import load_config

from pdf_analyzer.api.extract.elements import extract_element_soup
from article_graph.recon import recon_generic


In [2]:
server_config = load_config("config/api/grobid-server-config.yaml")
extract_config = load_config("config/api/api-base-config.yaml")
print("SERVER_CONFIG\n"+OmegaConf.to_yaml(server_config))
print("CLOUD_CONFIG\n"+OmegaConf.to_yaml(extract_config))

base_api = API.BaseAPI(extract_config,server_config)

files = base_api.proccesed_files

SERVER_CONFIG
url:
  protocol: http
  api_domain: yordi111nas.synology.me
  port: 8070

CLOUD_CONFIG
data:
  data_dir: data/PDFs
  format: .pdf
  recursive: true
grobid:
  cache: true
  cache_dir: data/xmls
  operation_key: processFulltextDocument
  format: .grobid.tei.xml
  recursive: true

http://yordi111nas.synology.me:8070/api/isalive
GROBID server is up and running
data/xmls\Bert.grobid.tei.xml already exist, skipping... (use --force to reprocess pdf input files)
data/xmls\DistillBERT.grobid.tei.xml already exist, skipping... (use --force to reprocess pdf input files)
data/xmls\Dont_stop_pretraining.grobid.tei.xml already exist, skipping... (use --force to reprocess pdf input files)
data/xmls\GPT-3.grobid.tei.xml already exist, skipping... (use --force to reprocess pdf input files)
data/xmls\LIME.grobid.tei.xml already exist, skipping... (use --force to reprocess pdf input files)
data/xmls\LoRA.grobid.tei.xml already exist, skipping... (use --force to reprocess pdf input files)
da

In [3]:
# List of mock papers to be used in this example

papers = [
    {
        'title': 'Title 1',
        'abstract': 'The universe is a vast expanse of space containing countless galaxies, stars, planets, and other celestial objects.',
        'release_date': '2024-01-24',
        'acknowledgements': '''
            We thank the referees for their constructive feedback, which has helped us to improve the quality of this manuscript.
            This work is based on spectropolarimetric observations obtained at the TBL, AATand 3.6-m ESO telescope.
            We thank the technical staff at each of these facilities for their time and data.
            We also acknowledge the use of the PolarBase database, which makes TBL observations publicly available,
            and is operated by the Centre National de la Recherche Scientifique of France (CNRS), Observatoire''',
    },
    {
        'title': 'Title 2',
        'abstract': 'Ancient civilizations such as the Egyptians, Greeks, and Romans have left behind rich legacies of art, architecture, and knowledge.',
        'release_date': '2024-02-15',
        'acknowledgements': '''
            Acknowledgements. We are grateful to our referee, Nicolas Cowan.
            We gratefully acknowledge the open source software which made this work possible:
                astropy (Astropy Collaboration et al. 2013Collaboration et al. , 2018Collaboration et al. , 2022)),
                ipython (Pérez &amp; Granger 2007),
                numpy (Harris et al. 2020),
                scipy (Virtanen et al. 2020),
                matplotlib (Hunter 2007),
                JAX (Bradbury et al. 2018),
                arviz (Kumar et al. 2019),
                numpyro (Phan et al. 2019),
                FastChem (Stock et al. 2018(Stock et al. , 2022;;Kitzmann et al. 2023),
                LX-MIE (Kitzmann &amp; Heng 2018b),
                celerite2 (Foreman-Mackey et al. 2017; Foreman-Mackey 2018) exoplanet (Foreman-Mackey et al. 2021b),
                lightkurve (Lightkurve Collaboration et al. 2018),
                corner (Foreman-Mackey 2016),
                kelp (Morris et al. 2022).

            This research has made use of the SVO Filter Profile Service (http://svo2.cab.inta-csic.es/theory/fps/) supported
            from the Spanish MINECO through grant AYA2017-84089.'''
    },
    {
        'title': 'Title 3',
        'abstract': 'Climate change is a pressing global issue that requires urgent action to mitigate its impacts on the environment and human societies.',
        'release_date': '2024-03-10',
        'acknowledgements': '''Acknowledgements. This work is based on data from eROSITA, the soft X-ray instrument aboard SRG,
        a joint Russian-German science mission supported by the Russian Space Agency (Roskosmos),
        in the interests of the Russian Academy of Sciences represented by its Space Research Institute (IKI),
        and the Deutsches Zentrum für Luft-und Raumfahrt (DLR).'''
    },
]

## Adding the Papers to the Graph

In this section, we will be adding all the papers to the graph!

In [4]:
from article_graph.article_graph import ArticleGraph

# We create the graph
g = ArticleGraph()

# We add the documents to the graph
for paper_id, paper_info in enumerate(papers):
    
    g.add_paper(paper_id=paper_id,
                title=paper_info['title'],
                abstract=paper_info['abstract'],
                release_date=paper_info['release_date'])
    
# Explore the graph by printing the titles of the papers
for s, p, o in g.graph.triples((None, g.ns.title, None)):
    print(s, p, o)

http://open_science.com/paper#0 http://open_science.com/title Title 1
http://open_science.com/paper#1 http://open_science.com/title Title 2
http://open_science.com/paper#2 http://open_science.com/title Title 3


In [5]:
from article_graph.article_graph import ArticleGraph
from get_paper_metadata import get_paper_metadata

# We create the graph
g = ArticleGraph()

# We add the documents to the graph
for paper_id, file in enumerate(files):
    paper_info = get_paper_metadata(file)
    g.add_paper(paper_id=paper_id,
                title=paper_info['title'],
                abstract=paper_info['abstract'],
                release_date=paper_info['release_date'])
    
# Explore the graph by printing the titles of the papers
for s, p, o in g.graph.triples((None, g.ns.title, None)):
    print(s, p, o)

http://open_science.com/paper#0 http://open_science.com/title Exploring teachers’ confidence in addressing mental health issues in learners with Profound and Multiple Learning Difficulties (PMLD) pre and post training
http://open_science.com/paper#1 http://open_science.com/title DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter
http://open_science.com/paper#2 http://open_science.com/title Don’t Stop Pretraining: Adapt Language Models to Domains and Tasks
http://open_science.com/paper#3 http://open_science.com/title Pronunciation and good language learners
http://open_science.com/paper#4 http://open_science.com/title "Why Should I Trust You?"
http://open_science.com/paper#5 http://open_science.com/title LORA: LOW-RANK ADAPTATION OF LARGE LAN-GUAGE MODELS
http://open_science.com/paper#6 http://open_science.com/title A Robustly Optimized BERT Pre-training Approach with Post-training
http://open_science.com/paper#7 http://open_science.com/title Preface to the bo

## Topic Modeling

In this section, we will be exploring the use of topic modeling inside the **Article Graph** !

### Generating the Topics

In this subsection, we will be extracting topics from the papers' abstracts using the `topic_modeling` module!

In [6]:
from topic_modeling.lda import LDA

# Create the LDA model for Topic Modeling
# We need to specify the number of topics and the number of words per topic
lda_model = LDA(corpus=[paper['abstract'] for paper in papers],
                num_topics=3,
                num_words=7)
lda_model.fit()

# Display the generated topics
for i, topic in enumerate(lda_model.topics):
    print(f'Topic {i}: {topic}')

Topic 0: ['the', 'is', 'issue', 'on', 'change', 'climate', 'environment']
Topic 1: ['and', 'the', 'of', 'behind', 'knowledge', 'legacies', 'civilizations']
Topic 2: ['the', 'is', 'of', 'vast', 'planets', 'galaxies', 'countless']


In [7]:
from topic_modeling.lda import LDA

# Create the LDA model for Topic Modeling
# We need to specify the number of topics and the number of words per topic
from article_graph._utils import get_abstract

lda_model = LDA(corpus=[get_abstract(file) for file in files],
                num_topics=3,
                num_words=7)
lda_model.fit()

# Display the generated topics
for i, topic in enumerate(lda_model.topics):
    print(f'Topic {i}: {topic}')

Topic 0: ['and', 'of', 'the', 'to', 'on', 'language', 'model']
Topic 1: ['of', 'the', 'and', 'we', 'task', 'to', 'in']
Topic 2: ['and', 'the', 'of', 'in', 'to', 'we', 'model']


### Adding Topics to Graph

In this subsection, we will be adding the generated topics to the graph!

In [8]:
# We add the topics to the graph
for topic_id, keywords in enumerate(lda_model.topics):
    g.add_topic(topic_id, keywords)

# We explore the graph by printing the keywords of each topic
for s, p, o in g.graph.triples((None, g.ns.keyword, None)):
    print(s, p, o)

http://open_science.com/topic#0 http://open_science.com/keyword and
http://open_science.com/topic#1 http://open_science.com/keyword and
http://open_science.com/topic#2 http://open_science.com/keyword and
http://open_science.com/topic#0 http://open_science.com/keyword of
http://open_science.com/topic#1 http://open_science.com/keyword of
http://open_science.com/topic#2 http://open_science.com/keyword of
http://open_science.com/topic#0 http://open_science.com/keyword the
http://open_science.com/topic#1 http://open_science.com/keyword the
http://open_science.com/topic#2 http://open_science.com/keyword the
http://open_science.com/topic#0 http://open_science.com/keyword to
http://open_science.com/topic#1 http://open_science.com/keyword to
http://open_science.com/topic#2 http://open_science.com/keyword to
http://open_science.com/topic#0 http://open_science.com/keyword on
http://open_science.com/topic#0 http://open_science.com/keyword language
http://open_science.com/topic#0 http://open_scienc

### Adding TopicBelongings to Graph

In this subsection, we will be adding the topic belonging relationships to the graph! These relationships represent the topic dostributions of each paper to every topic in the graph.

In [9]:
# We predict the topic distributions for each paper to all the topics
lda_model.predict_all()

# We add the topic belonging for each topic and paper storing the degree of belonging
for paper_id, paper_info in enumerate(lda_model.topic_distributions):
    for topic_id, topic_dist in paper_info.items():
        g.add_topic_belonging(paper_id, topic_id, topic_dist)

# We explore the graph by printing the topic belonging for each paper to all the topics
for s, p, o in g.graph.triples((None, g.ns.belongs_to_topic, None)):
    for _, p1, o1 in g.graph.triples((o, g.ns.degree, None)):
        print(s, p, o, p1, o1)

http://open_science.com/paper#0 http://open_science.com/belongs_to_topic http://open_science.com/topic_belonging#0-0 http://open_science.com/degree 0.9952436694572805
http://open_science.com/paper#0 http://open_science.com/belongs_to_topic http://open_science.com/topic_belonging#0-1 http://open_science.com/degree 0.0023750987470174834
http://open_science.com/paper#0 http://open_science.com/belongs_to_topic http://open_science.com/topic_belonging#0-2 http://open_science.com/degree 0.0023812317957020938
http://open_science.com/paper#1 http://open_science.com/belongs_to_topic http://open_science.com/topic_belonging#1-0 http://open_science.com/degree 0.9957504216711487
http://open_science.com/paper#1 http://open_science.com/belongs_to_topic http://open_science.com/topic_belonging#1-1 http://open_science.com/degree 0.002149765930953944
http://open_science.com/paper#1 http://open_science.com/belongs_to_topic http://open_science.com/topic_belonging#1-2 http://open_science.com/degree 0.0020998

## Named Entity Recognition

In this section, we will be exploring the use of named entity recognition inside the **Article Graph** !

In [10]:
from extract_authors import get_all_author_metadata
from ner.extract_ner import get_all_ners,extract_ners,get_all_projects,get_projects_names,extract_award_identifiers

In [11]:
from transformers import pipeline

pipe = pipeline("token-classification", model="dslim/bert-base-NER")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:

all_orgs_rel,all_orgs=get_all_ners(files,pipe)
#authors_list,relation_author_paper,all_orgs,relation_author_org =get_all_author_metadata(files,all_orgs)

In [20]:
all_ners=[]
for acno in [paper["acknowledgements"] for paper in papers]:
    all_ners.extend(extract_ners(acno,pipe))

In [21]:
all_ners

[{'name': 'Centre National de la Recherche Scientifique of France',
  'score': 0.95020586,
  'type': 'ORG'},
 {'name': 'CNRS', 'score': 0.92107177, 'type': 'ORG'},
 {'name': 'MINECO', 'score': 0.9240498, 'type': 'ORG'},
 {'name': 'Russian Space Agency', 'score': 0.99701536, 'type': 'ORG'},
 {'name': 'Ros kosmos', 'score': 0.97655296, 'type': 'ORG'},
 {'name': 'Russian Academy of Sciences', 'score': 0.9965687, 'type': 'ORG'},
 {'name': 'Space Research Institute', 'score': 0.9974501, 'type': 'ORG'},
 {'name': 'IKI', 'score': 0.9900939, 'type': 'ORG'},
 {'name': 'Deutsches Zentrum für Luft-und Raumfahrt',
  'score': 0.9719548,
  'type': 'ORG'},
 {'name': 'DLR', 'score': 0.9951084, 'type': 'ORG'}]

In [17]:
all_orgs

[{'name': 'Office of Naval Research', 'type': 'ORG', 'org_id': 0},
 {'name': 'Google', 'type': 'ORG', 'org_id': 1},
 {'name': 'OpenAI', 'type': 'ORG', 'org_id': 2},
 {'name': 'TerraSwarm', 'type': 'ORG', 'org_id': 3},
 {'name': 'STARnet', 'type': 'ORG', 'org_id': 4},
 {'name': 'Semiconductor Research Corporation', 'type': 'ORG', 'org_id': 5},
 {'name': 'MARCO', 'type': 'ORG', 'org_id': 6},
 {'name': 'DARPA', 'type': 'ORG', 'org_id': 7}]

## Projects Extraction

Regex can be custom if necessary, in case their are not setup default values will be used

In [None]:
from ner.extract_ner import get_all_projects,get_projects_names,extract_award_identifiers

In [None]:
regex_patterns = {
    "NIH": r'(?:#)?\b[1-9][A-Z\d]{3}[A-Z]{2}\d{6}(?:-[AS]?\d+)?\b',
    "DOD": r'(?:#)?\b[A-Z\d]{6}-\d{2}-[123]-\d{4}\b',
    "NASA": r'(?:#)?\b(?:80|NN)[A-Z]+\d{2}[A-Z\d]+\b',
    "Education": r'(?:#)?\b[A-Z]+\d+[A-Z]\d{2}[A-Z\d]+\b',
    "Universal":r'[A-Z]{3,}[0-9]+-[0-9]+'
}

In [None]:
all_projects,all_projects_relation=get_all_projects(files)

In [None]:
all_projects,all_projects_relation=get_all_projects(files,regex_patterns)

In [None]:
all_projects

[{'project_name': 'MURI',
  'project_federal_id': 'N00014-18-1-2670',
  'project_id': 0},
 {'project_name': 'ONR',
  'project_federal_id': '#N00014-13-1-0023',
  'project_id': 1}]

In [None]:
[{"project_federal_ids":extract_award_identifiers(paper["acknowledgements"]),"project_names":get_projects_names(paper["acknowledgements"])} for paper in papers]

[{'project_federal_ids': [], 'project_names': []},
 {'project_federal_ids': [], 'project_names': []},
 {'project_federal_ids': [], 'project_names': []}]

## Author Extraction

In [None]:
authors_list,relation_author_paper,all_orgs,relation_author_org =get_all_author_metadata(files,all_orgs)

In [None]:
authors_list

[{'name': 'Jacob  ',
  'last_name': 'Devlin ',
  'label': 'Jacob   Devlin ',
  'email': <email>jacobdevlin@google.com</email>,
  'author_id': 0},
 {'name': 'Ming-Wei  ',
  'last_name': 'Chang ',
  'label': 'Ming-Wei   Chang ',
  'email': <email>mingweichang@google.com</email>,
  'author_id': 1},
 {'name': 'Kenton  ',
  'last_name': 'Lee ',
  'label': 'Kenton   Lee ',
  'email': <email>kentonl@google.com</email>,
  'author_id': 2},
 {'name': 'Kristina  ',
  'last_name': 'Toutanova ',
  'label': 'Kristina   Toutanova ',
  'email': None,
  'author_id': 3},
 {'name': 'Victor  ',
  'last_name': 'Sanh ',
  'label': 'Victor   Sanh ',
  'email': <email>victor@huggingface.co</email>,
  'author_id': 4},
 {'name': 'Suchin  ',
  'last_name': 'Gururangan ',
  'label': 'Suchin   Gururangan ',
  'email': <email>suching@allenai.org</email>,
  'author_id': 5},
 {'name': 'Ana  ',
  'last_name': 'Marasović ',
  'label': 'Ana   Marasović ',
  'email': <email>anam@allenai.org</email>,
  'author_id': 6},
 {

## Adding Organizations to graph

In [None]:
for org in all_orgs:
    g.add_organization(org["org_id"],org["name"])

# We explore the graph by printing the topic belonging for each paper to all the topics
for s, p, o in g.graph.triples((None, None, g.ns.Organization)):
    for _, _, org_name in g.graph.triples((s, g.ns.name, None)):
        print(s,org_name)

http://open_science.com/Organization/0 Office of Naval Research
http://open_science.com/Organization/1 Google
http://open_science.com/Organization/2 OpenAI
http://open_science.com/Organization/3 TerraSwarm
http://open_science.com/Organization/4 STARnet
http://open_science.com/Organization/5 Semiconductor Research Corporation
http://open_science.com/Organization/6 MARCO
http://open_science.com/Organization/7 DARPA
http://open_science.com/Organization/8 Lysandre DEBUT
http://open_science.com/Organization/9 Julien CHAUMOND
http://open_science.com/Organization/10 Thomas WOLF Hugging Face
http://open_science.com/Organization/11 Allen Institute for Artificial Intelligence
http://open_science.com/Organization/12 Paul G
http://open_science.com/Organization/13 Allen School of Computer Science & Engineering
http://open_science.com/Organization/14 University of Washington
http://open_science.com/Organization/15 Johns Hopkins University
http://open_science.com/Organization/16 University of Washing

In [None]:
for relation in all_orgs_rel:
    g.add_organization_paper_relation(relation["paper_id"],relation["org_id"])

# We explore the graph by printing the topic belonging for each paper to all the topics
for s, p, o in g.graph.triples((None, g.ns.acknowledges,None)):
    if str(o).startswith(str(g.ns.Organization)):
        print(s, p, o)

http://open_science.com/Paper/2 http://open_science.com/acknowledges http://open_science.com/Organization/0
http://open_science.com/Paper/2 http://open_science.com/acknowledges http://open_science.com/Organization/1
http://open_science.com/Paper/3 http://open_science.com/acknowledges http://open_science.com/Organization/2
http://open_science.com/Paper/4 http://open_science.com/acknowledges http://open_science.com/Organization/3
http://open_science.com/Paper/4 http://open_science.com/acknowledges http://open_science.com/Organization/4
http://open_science.com/Paper/4 http://open_science.com/acknowledges http://open_science.com/Organization/5
http://open_science.com/Paper/4 http://open_science.com/acknowledges http://open_science.com/Organization/6
http://open_science.com/Paper/4 http://open_science.com/acknowledges http://open_science.com/Organization/7


In [None]:
for relation in relation_author_org:
    g.add_organization_author_relation(relation["author_id"],relation["org_id"])

# We explore the graph by printing the topic belonging for each paper to all the topics
for s, p, o in g.graph.triples((None, g.ns.member, None)):
    print(s,p,o)

http://open_science.com/Author/4 http://open_science.com/member http://open_science.com/Organization/8
http://open_science.com/Author/4 http://open_science.com/member http://open_science.com/Organization/9
http://open_science.com/Author/4 http://open_science.com/member http://open_science.com/Organization/10
http://open_science.com/Author/5 http://open_science.com/member http://open_science.com/Organization/11
http://open_science.com/Author/6 http://open_science.com/member http://open_science.com/Organization/11
http://open_science.com/Author/7 http://open_science.com/member http://open_science.com/Organization/11
http://open_science.com/Author/8 http://open_science.com/member http://open_science.com/Organization/11
http://open_science.com/Author/9 http://open_science.com/member http://open_science.com/Organization/11
http://open_science.com/Author/10 http://open_science.com/member http://open_science.com/Organization/11
http://open_science.com/Author/11 http://open_science.com/member 

## Adding Projects to graph

In [None]:
for project in all_projects:
    g.add_project(project["project_id"],project["project_name"],project["project_federal_id"])

# We explore the graph by printing the topic belonging for each paper to all the topics
for s, p, o in g.graph.triples((None, None, g.ns.Project)):
    for _, _, org_name in g.graph.triples((s, g.ns.name, None)):
        print(s,org_name)

http://open_science.com/Project/0 MURI
http://open_science.com/Project/1 ONR


In [None]:
for relation in all_projects_relation:
    g.add_project_relation(relation["paper_id"],relation["project_id"])

# We explore the graph by printing the topic belonging for each paper to all the topics
for s, p, o in g.graph.triples((None, g.ns.acknowledges, None)):
    if str(o).startswith(str(g.ns.Project)):
        print(s, p, o)

http://open_science.com/Paper/2 http://open_science.com/acknowledges http://open_science.com/Project/0
http://open_science.com/Paper/4 http://open_science.com/acknowledges http://open_science.com/Project/1


## Add Authors to Graph

In [None]:
authors_list

[{'name': 'Jacob  ',
  'last_name': 'Devlin ',
  'label': 'Jacob   Devlin ',
  'email': <email>jacobdevlin@google.com</email>,
  'author_id': 0},
 {'name': 'Ming-Wei  ',
  'last_name': 'Chang ',
  'label': 'Ming-Wei   Chang ',
  'email': <email>mingweichang@google.com</email>,
  'author_id': 1},
 {'name': 'Kenton  ',
  'last_name': 'Lee ',
  'label': 'Kenton   Lee ',
  'email': <email>kentonl@google.com</email>,
  'author_id': 2},
 {'name': 'Kristina  ',
  'last_name': 'Toutanova ',
  'label': 'Kristina   Toutanova ',
  'email': None,
  'author_id': 3},
 {'name': 'Victor  ',
  'last_name': 'Sanh ',
  'label': 'Victor   Sanh ',
  'email': <email>victor@huggingface.co</email>,
  'author_id': 4},
 {'name': 'Suchin  ',
  'last_name': 'Gururangan ',
  'label': 'Suchin   Gururangan ',
  'email': <email>suching@allenai.org</email>,
  'author_id': 5},
 {'name': 'Ana  ',
  'last_name': 'Marasović ',
  'label': 'Ana   Marasović ',
  'email': <email>anam@allenai.org</email>,
  'author_id': 6},
 {

In [None]:
for author in authors_list:
    g.add_author(author["author_id"],author["label"],author["name"],author["last_name"],author["email"])

# We explore the graph by printing the topic belonging for each paper to all the topics
for s, p, o in g.graph.triples((None, None, g.ns.Author)):
    for _, _, full_name in g.graph.triples((s, g.ns.label, None)):
        print(s,full_name)

http://open_science.com/Author/0 Jacob   Devlin 
http://open_science.com/Author/1 Ming-Wei   Chang 
http://open_science.com/Author/2 Kenton   Lee 
http://open_science.com/Author/3 Kristina   Toutanova 
http://open_science.com/Author/4 Victor   Sanh 
http://open_science.com/Author/5 Suchin   Gururangan 
http://open_science.com/Author/6 Ana   Marasović 
http://open_science.com/Author/7 Swabha   Swayamdipta 
http://open_science.com/Author/8 Kyle   Lo 
http://open_science.com/Author/9 Iz   Beltagy 
http://open_science.com/Author/10 Doug   Downey 
http://open_science.com/Author/11 Noah  A  Smith 
http://open_science.com/Author/12 Tom  B  Brown 
http://open_science.com/Author/13 Benjamin   Mann 
http://open_science.com/Author/14 Nick   Ryder 
http://open_science.com/Author/15 Melanie   Subbiah 
http://open_science.com/Author/16 Jared   Kaplan 
http://open_science.com/Author/17 Prafulla   Dhariwal 
http://open_science.com/Author/18 Arvind   Neelakantan 
http://open_science.com/Author/19 Prana

## Reconciling Authors to graph

In [None]:
authors_ids = recon_generic([author["label"] for author in authors_list],"Q5")

100%|██████████| 9/9 [00:24<00:00,  2.75s/it]


In [None]:
authors_ids

[{'name': 'Jacob Devlin', 'match': True, 'score': 100.0, 'id': 'Q57954376'},
 {'name': 'Ming-Wei Chang', 'match': False, 'score': 100.0, 'id': 'Q88254730'},
 {'name': 'Kenton Lee', 'match': False, 'score': 100.0, 'id': 'Q83190952'},
 {'name': 'Kristina N. Toutanova',
  'match': True,
  'score': 100.0,
  'id': 'Q57267422'},
 {'name': 'Victor Sanh', 'match': True, 'score': 100.0, 'id': 'Q108941293'},
 {'name': 'Suchin Gururangan',
  'match': True,
  'score': 100.0,
  'id': 'Q57005661'},
 {'name': nan, 'match': False, 'score': nan, 'id': nan},
 {'name': 'Swabha Swayamdipta',
  'match': False,
  'score': 100.0,
  'id': 'Q57005591'},
 {'name': 'Kyle Lo', 'match': True, 'score': 100.0, 'id': 'Q104433094'},
 {'name': 'Iz Beltagy', 'match': True, 'score': 100.0, 'id': 'Q104433087'},
 {'name': 'Doug Downey', 'match': False, 'score': 100.0, 'id': 'Q55081048'},
 {'name': 'Noah A. Smith', 'match': False, 'score': 100.0, 'id': 'Q38882473'},
 {'name': 'Tom B. Brown', 'match': False, 'score': 100.0, 

In [None]:
for idx,author_data in enumerate(authors_ids):
    if author_data["match"] or (author_data["score"])>99:
        g.add_wikidata_owl("Author",idx,author_data["id"])

## Reconciling Orgs to graph

In [None]:
orgs_ids = recon_generic([org["name"] for org in all_orgs],"Q43229")

100%|██████████| 3/3 [00:08<00:00,  2.67s/it]


In [None]:
orgs_ids

[{'name': 'Office of Naval Research',
  'match': False,
  'score': 100.0,
  'id': 'Q1063818'},
 {'name': 'Google', 'match': False, 'score': 100.0, 'id': 'Q95'},
 {'name': 'OpenAI', 'match': False, 'score': 100.0, 'id': 'Q21708200'},
 {'name': nan, 'match': False, 'score': nan, 'id': nan},
 {'name': 'StarNet', 'match': False, 'score': 100.0, 'id': 'Q4050035'},
 {'name': 'Semiconductor Research Corporation',
  'match': True,
  'score': 100.0,
  'id': 'Q7449388'},
 {'name': 'Museum of Contemporary Art, Vigo',
  'match': False,
  'score': 100.0,
  'id': 'Q3395689'},
 {'name': 'Defense Advanced Research Projects Agency',
  'match': True,
  'score': 100.0,
  'id': 'Q207361'},
 {'name': nan, 'match': False, 'score': nan, 'id': nan},
 {'name': nan, 'match': False, 'score': nan, 'id': nan},
 {'name': nan, 'match': False, 'score': nan, 'id': nan},
 {'name': 'Allen Institute for Artificial Intelligence',
  'match': True,
  'score': 100.0,
  'id': 'Q16002567'},
 {'name': 'J. Paul Getty Museum',
  

In [None]:
for idx,org_data in enumerate(orgs_ids):
    if org_data["match"] or (org_data["score"])>99:
        g.add_wikidata_owl("Organization",idx,org_data["id"])

## Similarity

In this section, we will be exploring the use of similarity inside the **Article Graph** !

### Calculating similarity

In this subsection, we will be calculating the similarity between the papers' abstracts using the `similarity` module!

In [None]:
# pip install -U sentence-transformers
from similarity.Model import Model

# Name of the SentenceTransformer model to use
model_name = 'sentence-transformers/all-mpnet-base-v2'

# Create an instance of the class
Model_instance = Model([paper['abstract'] for paper in papers], model_name)

# Calculate similarity and retrieve the results
similarity_results = Model_instance.calculate_similarity()

# Print similarity results
print("Similarity results:")
for result in similarity_results:
    print(result)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# pip install -U sentence-transformers
from similarity.Model import Model

# Name of the SentenceTransformer model to use
model_name = 'sentence-transformers/all-mpnet-base-v2'

# Create an instance of the class
Model_instance = Model([get_abstract(file) for file in files], model_name)

# Calculate similarity and retrieve the results
similarity_results = Model_instance.calculate_similarity()

# Print similarity results
print("Similarity results:")
for result in similarity_results:
    print(result)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Similarity results:
{'text_id1': 0, 'text_id2': 1, 'similarity': 0.7155712}
{'text_id1': 0, 'text_id2': 2, 'similarity': 0.6356489}
{'text_id1': 0, 'text_id2': 3, 'similarity': 0.729566}
{'text_id1': 0, 'text_id2': 4, 'similarity': 0.3876726}
{'text_id1': 0, 'text_id2': 5, 'similarity': 0.69620013}
{'text_id1': 0, 'text_id2': 6, 'similarity': 0.64963436}
{'text_id1': 0, 'text_id2': 7, 'similarity': 0.42570996}
{'text_id1': 0, 'text_id2': 8, 'similarity': 0.52333754}
{'text_id1': 0, 'text_id2': 9, 'similarity': 0.6604143}
{'text_id1': 1, 'text_id2': 2, 'similarity': 0.6893159}
{'text_id1': 1, 'text_id2': 3, 'similarity': 0.7637905}
{'text_id1': 1, 'text_id2': 4, 'similarity': 0.421143}
{'text_id1': 1, 'text_id2': 5, 'similarity': 0.7398964}
{'text_id1': 1, 'text_id2': 6, 'similarity': 0.6985089}
{'text_id1': 1, 'text_id2': 7, 'similarity': 0.39783502}
{'text_id1': 1, 'text_id2': 8, 'similarity': 0.46126038}
{'text_id1': 1, 'text_id2': 9, 'similarity': 0.6164546}
{'text_id1': 2, 'text_id

### Adding similarity to Graph

In this subsection, we will be adding the calculated similarity to the graph!

In [None]:
# Iterate over the similarity results and add them to the graph
for result in similarity_results:
    text_id1 = result['text_id1']
    text_id2 = result['text_id2']
    similarity_score = result['similarity']
    
    # Add the similarity to the graph using the add_similarity method
    g.add_similarity(text_id1, text_id2, similarity_score)

# Explore the graph by printing the similarity between papers
for s, p, o in g.graph.triples((None, g.ns.similar_to, None)):
    for _, p1, o1 in g.graph.triples((s, g.ns.degree, None)):
        print(f"Paper 1: {s}, Paper 2: {o}, Similarity Score: {o1}")

for s, p, o in g.graph.triples((None, g.ns.similar_from, None)):
    for _, p1, o1 in g.graph.triples((s, g.ns.degree, None)):
        print(f"Paper 1: {o1}, Paper 2: {s}, Similarity Score: {o1}")

 

Paper 1: 0.7155712, Paper 2: http://open_science.com/similarity#0-1, Similarity Score: 0.7155712
Paper 1: 0.6356489, Paper 2: http://open_science.com/similarity#0-2, Similarity Score: 0.6356489
Paper 1: 0.6893159, Paper 2: http://open_science.com/similarity#1-2, Similarity Score: 0.6893159
Paper 1: 0.729566, Paper 2: http://open_science.com/similarity#0-3, Similarity Score: 0.729566
Paper 1: 0.7637905, Paper 2: http://open_science.com/similarity#1-3, Similarity Score: 0.7637905
Paper 1: 0.7682411, Paper 2: http://open_science.com/similarity#2-3, Similarity Score: 0.7682411
Paper 1: 0.3876726, Paper 2: http://open_science.com/similarity#0-4, Similarity Score: 0.3876726
Paper 1: 0.421143, Paper 2: http://open_science.com/similarity#1-4, Similarity Score: 0.421143
Paper 1: 0.3319754, Paper 2: http://open_science.com/similarity#2-4, Similarity Score: 0.3319754
Paper 1: 0.43565473, Paper 2: http://open_science.com/similarity#3-4, Similarity Score: 0.43565473
Paper 1: 0.69620013, Paper 2: ht