In [1]:
from py2neo import Graph
import pandas as pd

from utilities import neo4j_utilities

In [2]:
from tqdm.notebook import tqdm
from tqdm import tqdm as progress_tqdm
tqdm.pandas()

In [3]:
api_key = "..."

In [4]:
# Connect to the Neo4j database
graph = Graph("bolt://localhost:7687", name = "smallstc", auth=("neo4j", "Stargate90"))

In [5]:
query = """
MATCH (a:doi)
RETURN a.title as title, a.abstract as abstract, a.languages AS languages, a.type AS type, a.tags as tags, a.name as doi
"""

In [6]:
# Execute the query
data = graph.run(query).data()

In [7]:
# Convert the result to a DataFrame
df = pd.DataFrame(data)

In [8]:
df_clean_articles = df.dropna(subset=['title'])

In [9]:
df_clean_articles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20666 entries, 0 to 372181
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      20666 non-null  object
 1   abstract   6603 non-null   object
 2   languages  20072 non-null  object
 3   type       20662 non-null  object
 4   tags       18065 non-null  object
 5   doi        20666 non-null  object
dtypes: object(6)
memory usage: 1.1+ MB


## Preprocessing: Filling missing article language using LLM

In [10]:
df_clean_articles[df_clean_articles['languages'].isna()]

Unnamed: 0,title,abstract,languages,type,tags,doi
616,Medical Malpractice and Black-Box Medicine,,,book-chapter,,10.1017/9781108147972.027
1114,Feature Visualization,,,journal-article,[General Materials Science],10.23915/distill.00007
1942,Prescription Drug Monitoring Programs: Examini...,,,journal-article,"[General Medicine, Emergency Medicine]",10.5811/westjem.2014.10.24197
3597,Mass spectrometry,,,journal-article,"[Biochemistry, Molecular Biology, Structural B...",10.1002/(sici)1097-0134(1998)33:2+<1::aid-prot...
3638,[26] Raster3D: Photorealistic molecular graphics,"This chapter discusses Raster3D, which is a su...",,book-chapter,,10.1016/s0076-6879(97)77028-9
...,...,...,...,...,...,...
351285,HLT-FBK: a Complete Temporal Processing System...,,,proceedings-article,,10.18653/v1/s15-2135
351286,HCTI at SemEval-2017 Task 1: Use convolutional...,,,proceedings-article,,10.18653/v1/s17-2016
351834,Multilingual semantic role labeling,This paper describes our contribution to the s...,,proceedings-article,,10.3115/1596409.1596416
351838,International standard for a linguistic annota...,,,proceedings-article,,10.3115/1119226.1119230


In [11]:
language_options = ",".join([x[0] for x in list(df_clean_articles['languages'].value_counts().keys())])
language_options

'en,de,no,da,fr,it,nl,jv,lb,af,zu,la,cy,ca,gl,et,es,fil,sn,ga,fy,sv,pt,el,ceb,yo,el-Latn,ng,ny,co,gd,ru-Latn'

In [12]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=api_key, temperature=0)

template = """
Return the language of the article by title: {title}. 
""" + f"Available options: {language_options}"

language_prompt = ChatPromptTemplate.from_template(template)

In [13]:
language_chain = language_prompt | llm | StrOutputParser()

In [14]:
_t = df_clean_articles[df_clean_articles['languages'].isna()].iloc[0, 0]
_t 

'Medical Malpractice and Black-Box Medicine'

In [15]:
language_chain.invoke({"title": _t})

'en'

In [16]:
df_clean_articles['languages'] = df_clean_articles.progress_apply(
    lambda row: [language_chain.invoke({"title": row['title']})] if pd.isna(row['languages']) else row['languages'], axis=1 #
)

  0%|          | 0/20666 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean_articles['languages'] = df_clean_articles.progress_apply(


In [17]:
df_clean_articles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20666 entries, 0 to 372181
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      20666 non-null  object
 1   abstract   6603 non-null   object
 2   languages  20666 non-null  object
 3   type       20662 non-null  object
 4   tags       18065 non-null  object
 5   doi        20666 non-null  object
dtypes: object(6)
memory usage: 1.1+ MB


In [18]:
df_clean_articles['languages'].value_counts()

languages
[en]         20523
[de]            18
[no]            11
[da]            11
[fr]            10
[it]            10
[nl]             7
[jv]             6
[af]             6
[lb]             6
[zu]             5
[la]             5
[ca]             4
[gu]             4
[es]             4
[cy]             4
[et]             4
[gl]             4
[fil]            3
[pt]             2
[fy]             2
[el]             2
[sn]             2
[sv]             2
[ga]             2
[yo]             1
[ar]             1
[ceb]            1
[el-Latn]        1
[co]             1
[ng]             1
[gd]             1
[ny]             1
[ru-Latn]        1
Name: count, dtype: int64

In [19]:
df_clean_articles.to_csv("languages_filled.csv")

## Preprocessing: Creating tag anthology using LLM

In [20]:
unique_tags = set(value for sublist in df['tags'] if sublist is not None for value in sublist)

In [21]:
list(unique_tags)[:20]

['',
 'Probability',
 'Diathermy',
 'Dextran Sulfate',
 'Nursing (miscellaneous)',
 'Anthrax',
 'Health Care Surveys',
 'Medical Records',
 'Verbal Behavior',
 'Language',
 'E-mail',
 'Self Care',
 'Double-Blind Method',
 'Adaptation, Psychological',
 'Multidisciplinary',
 'Stem Cells',
 'Mannose-Binding Lectins',
 'Bone Neoplasms',
 'Estrogens, Conjugated (USP)',
 'Molecular Medicine']

In [22]:
llm = ChatOpenAI(model="gpt-4o", api_key=api_key, temperature=0)

template = """You will be given a set of article tags. 
Your task is to create and return a 3/4-level anthology
to be used to further map the tags.

Article tags: {tags}

Make sure to include all tags into the anthology
Return the anthology as nested python dictionary"""

anthology_prompt = ChatPromptTemplate.from_template(template)

In [23]:
anthology_chain = anthology_prompt | llm | StrOutputParser()

In [24]:
tag_anthology = anthology_chain.invoke({"tags": unique_tags})

In [25]:
with open('anthology.txt', 'w') as f:
    f.write(tag_anthology)

## Authors

In [26]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
embedding = SentenceTransformerEmbeddings()

In [27]:
query = """
MATCH (a:author)-[:author]->(d:doi)
RETURN a.given AS given, a.family AS family, collect(d.title) AS titles
"""

In [28]:
data = graph.run(query).data()

In [29]:
df = pd.DataFrame(data)

In [30]:
df_clean_authors = df.dropna(subset=['family'])

In [31]:
df_clean_authors.shape

(94092, 3)

In [32]:
def is_not_empty_list(lst):
    return len(lst) > 0

In [33]:
# clean authors with not articles

In [34]:
df_clean_authors = df_clean_authors[df_clean_authors['titles'].apply(is_not_empty_list)]

In [35]:
# sometimes the given name is in the family name, especially for single publication
df_clean_authors[df_clean_authors['given'].isna()]

Unnamed: 0,given,family,titles
1114,,The Tobacco Use and Dependence Clinical Practi...,[A Clinical Practice Guideline for Treating To...
1120,,for the Evidence-Based Medicine Working Group,[Users' Guides to the Medical Literature: XXII...
1129,,for the CONSORT Group,[Use of the CONSORT Statement and Quality of R...
11586,,SHARP Collaborative Group,[Study of Heart and Renal Protection (SHARP): ...
22676,,ChatGPT,[Open artificial intelligence platforms in nur...
30474,,Madhusudan,[A transient interaction between two phosphore...
30960,,The Lancet Infectious Diseases,[The COVID-19 infodemic]
31762,,The Lancet Digital Health,[ChatGPT: friend or foe?]
36228,,NHLBI Exome Sequencing Project,"[Analysis of 6,515 exomes reveals the recent o..."
47385,,Prem Ramaswami,[A randomized controlled trial of online sympt...


In [36]:
def word_count(text):
    return len(text.split())

In [37]:
# remove coorporations from authors
df_clean_authors = df_clean_authors[df_clean_authors['family'].apply(word_count) < 3]

In [38]:
df_clean_authors[df_clean_authors['given'].isna()].shape

(44, 3)

In [39]:
df_clean_authors[df_clean_authors['given'].isna()]

Unnamed: 0,given,family,titles
22676,,ChatGPT,[Open artificial intelligence platforms in nur...
30474,,Madhusudan,[A transient interaction between two phosphore...
47385,,Prem Ramaswami,[A randomized controlled trial of online sympt...
48737,,Sobral,[An appraisal of medical students' reflection-...
48741,,Harden,[Task-based learning: the answer to integratio...
48742,,Crosby,[Task-based learning: the answer to integratio...
48743,,Davis,[Task-based learning: the answer to integratio...
48744,,Howie,[Task-based learning: the answer to integratio...
48745,,Struthers,[Task-based learning: the answer to integratio...
52856,,Willyanto,[Protein sequence design by conformational lan...


In [40]:
given_names = df_clean_authors[df_clean_authors['given'].isna()]['family'].apply(lambda x: x.split(' ')[0] if len(x.split(' ')) == 2 else "_")

In [41]:
given_names.values

array(['_', '_', 'Prem', '_', '_', '_', '_', '_', '_', '_', 'Hyosig',
       'Ding', 'Hong', 'May', 'Guang-Zhong', 'Zhuo', 'Jiang', 'Jun',
       'Xiong', 'Liang', 'Song-Chun', 'Xiao-Yu', 'Dong', 'Li', 'Jia',
       'Xiao-Yu', 'Xiao-Qin', 'Qin', 'Yuanjie', 'Minlong', 'Ke', 'Xin',
       'Zhuowen', 'Xiang', 'Lixin', 'Dong', '_', 'Abhilash', 'Jennifer',
       'Paul', '_', '_', '_', '_'], dtype=object)

In [42]:
df_clean_authors.loc[given_names.index, 'given'] = given_names.values

In [43]:
# in some cases there is extra punctuation

In [44]:
df_clean_authors[(df_clean_authors['family'] == "Johnson") & (df_clean_authors['given'].str[0] == "J")].sort_values(['given'])

Unnamed: 0,given,family,titles
34480,J,Johnson,[A prognostic index in primary breast cancer]
10863,J.,Johnson,[Crystallographic structure of an RNA helix: [...
59689,James,Johnson,"[The MEME Suite, Diagnostic accuracy of X-ray ..."
85208,Jason M,Johnson,[Mapping the Genetic Architecture of Gene Expr...
87440,Jean,Johnson,[Creating Opportunities for Parent Empowerment...
69566,Jeffery,Johnson,[The Sequence of the Human Genome]
73143,Jeffrey A,Johnson,[A meta-analysis of the association between ad...
9456,Jeffrey A.,Johnson,[Understanding the complex associations betwee...
30094,Jeffrey C,Johnson,[Beliefs about and responses to childhood ear ...
74138,Jill R,Johnson,[Effectiveness of screening colonoscopy in red...


In [45]:
df_clean_authors['given'] = df_clean_authors['given'].apply(lambda x: x.replace(".", "") if x else x)

In [46]:
df_clean_authors[(df_clean_authors['family'] == "Johnson") & (df_clean_authors['given'].str[0] == "J")].sort_values(['given'])

Unnamed: 0,given,family,titles
10863,J,Johnson,[Crystallographic structure of an RNA helix: [...
34480,J,Johnson,[A prognostic index in primary breast cancer]
59689,James,Johnson,"[The MEME Suite, Diagnostic accuracy of X-ray ..."
85208,Jason M,Johnson,[Mapping the Genetic Architecture of Gene Expr...
87440,Jean,Johnson,[Creating Opportunities for Parent Empowerment...
69566,Jeffery,Johnson,[The Sequence of the Human Genome]
9456,Jeffrey A,Johnson,[Understanding the complex associations betwee...
73143,Jeffrey A,Johnson,[A meta-analysis of the association between ad...
30094,Jeffrey C,Johnson,[Beliefs about and responses to childhood ear ...
74138,Jill R,Johnson,[Effectiveness of screening colonoscopy in red...


In [47]:
one_letter_name = df_clean_authors[df_clean_authors['given'].apply(lambda x: len(x.replace(".","")) if x else 0) == 1]

In [48]:
def concatenate_lists(series):
    return series.sum()

In [49]:
# Group by 'given' and 'family' columns and concatenate lists in grouped rows
grouped_df = df_clean_authors.groupby(['given', 'family']).agg({
    'titles': concatenate_lists,
}).reset_index()

In [50]:
grouped_df.shape

(91429, 3)

In [51]:
embedding.embed_query("; ".join(grouped_df.iloc[0,-1]))[:10]

[0.039591751992702484,
 -0.07843189686536789,
 0.026333099231123924,
 0.03865569829940796,
 0.004176840651780367,
 0.022404426708817482,
 -0.0013168402947485447,
 0.008029144257307053,
 -0.003868338419124484,
 -0.015063636936247349]

In [52]:
embeddings = grouped_df['titles'].progress_apply(lambda x: embedding.embed_query("; ".join(x)))
grouped_df['embeddings'] = embeddings

  0%|          | 0/91429 [00:00<?, ?it/s]

In [53]:
grouped_df.to_csv("embedding_authors_3.csv")

In [54]:
grouped_df.head()

Unnamed: 0,given,family,titles,embeddings
0,(The Late) B,Tilander,[Crystal structure of human erythrocyte carbon...,"[0.039591751992702484, -0.07843189686536789, 0..."
1,>,Kleeff,[Insulin-Like Growth Factor Signaling as a The...,"[0.05088701471686363, -0.012295341119170189, 0..."
2,??eslovas,Venclovas,[Assessment of progress over the CASP experime...,"[-0.046196773648262024, -0.009418211877346039,..."
3,?eslovas,Venclovas,[Processing and analysis of CASP3 protein stru...,"[-0.037653807550668716, -0.04398495331406593, ..."
4,A,ANDERSON,[Cancer patients' concerns regarding access to...,"[0.05151272565126419, 0.06267254054546356, -0...."


In [55]:
from sklearn.metrics.pairwise import cosine_similarity
def process_single_letter_names(df, similarity_threshold=0.75):
    # Group by family name
    grouped = df.groupby('family')

    # Prepare a list to store the final results
    results = []

    for family_name, group in progress_tqdm(grouped):
        # Find single-letter given names
        single_letter_names = group[group['given'].str.len() == 1]

        # Process each single-letter given name
        for _, single_letter_row in single_letter_names.iterrows():
            single_letter_embedding = single_letter_row['embeddings']
            single_letter_given = single_letter_row['given']

            # Compare with other given names in the same family
            for _, row in group.iterrows():
                if row['given'] != single_letter_given and (row['given'][0] == single_letter_given or single_letter_given == "_"):
                    other_embedding = row['embeddings']
                    similarity = cosine_similarity(
                        [single_letter_embedding], [other_embedding]
                    )[0][0]
                    if similarity > similarity_threshold:
                        results.append({
                            'given': single_letter_given,
                            'family': family_name,
                            'matched_given': row['given'],
                            'similarity': similarity
                        })

    return pd.DataFrame(results)

In [56]:
candidates_df = process_single_letter_names(grouped_df, similarity_threshold=0.7)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 45765/45765 [01:08<00:00, 672.57it/s]


In [57]:
candidates_df['matched_given'].head(50)

0      Gonçalo R
1           Linn
2          Keith
3           Faiz
4           Kaat
5          Kevin
6            S J
7      Stephen J
8         Younes
9          Joana
10     Stephen F
11        Sophia
12       Richard
13      Fumihito
14       Susan G
15        Michal
16        Alvaro
17         Helen
18         Elena
19         Chris
20       Gillian
21         Marin
22         Helen
23       Helen M
24     Christian
25        Andrew
26       Agatino
27       Jessica
28        Hugues
29       Roberto
30           Tom
31      Catalina
32       Valérie
33      Adrian P
34     Charlotte
35       Andreas
36         Nadia
37      Miloslav
38        Thomas
39         Sally
40     Florian P
41          Sean
42       Jessica
43        Andrew
44       Monique
45        Durado
46          Dirk
47     Matthew J
48          Timo
49    Philippe M
Name: matched_given, dtype: object

In [58]:
candidates_df.to_csv("candidates.csv")

## Applying Preprocessing

In [59]:
# Connect to the Neo4j database
graph = Graph("bolt://localhost:7687", name = "smallstc", auth=("neo4j", "Stargate90"))

In [60]:
df_clean_articles = pd.read_csv("languages_filled.csv", index_col=[0])

In [61]:
df_clean_articles

Unnamed: 0,title,abstract,languages,type,tags,doi
0,Information Overload and Missed Test Results i...,,['en'],journal-article,['Internal Medicine'],10.1001/2013.jamainternmed.61
1,Ethical Dimensions of Using Artificial Intelli...,,['en'],journal-article,"['Health (social science)', 'Health Policy', '...",10.1001/amajethics.2019.121
2,Should Watson Be Consulted for a Second Opinion?,,['en'],journal-article,"['Health (social science)', 'Health Policy', '...",10.1001/amajethics.2019.131
3,How Should Clinicians Communicate With Patient...,,['en'],journal-article,"['Health (social science)', 'Health Policy', '...",10.1001/amajethics.2019.138
4,Are Current Tort Liability Doctrines Adequate ...,,['en'],journal-article,"['Health (social science)', 'Health Policy', '...",10.1001/amajethics.2019.160
...,...,...,...,...,...,...
367240,Socioeconomic deprivation scores as predictors...,Background\nA previous study found that variab...,['en'],journal-article,['Family Practice'],10.3399/bjgp19x704549
367277,Level or not?,,['en'],report,,10.37829/hf-2020-rc13
368135,"Understanding and using sensitivity, specifici...",,['en'],journal-article,['Ophthalmology'],10.4103/0301-4738.41424
372050,Cervical Pessary for Preventing Preterm Birth ...,,['en'],journal-article,"['Radiology, Nuclear Medicine and imaging', 'R...",10.7863/ultra.16.08054


In [62]:
candidates_df = pd.read_csv("candidates.csv", index_col=[0])

In [63]:
candidates_df

Unnamed: 0,given,family,matched_given,similarity
0,G,Abecasis,Gonçalo R,0.823202
1,L,Abraham,Linn,0.800957
2,K,Abrams,Keith,0.799398
3,F,Ahmad Khan,Faiz,0.769182
4,K,Alaerts,Kaat,0.867004
...,...,...,...,...
452,C,Zywietz,Christoph,0.714291
453,M,de Boer,Maaike,0.868919
454,M,de Jonge,Maretha,0.782835
455,M,de Mathelin,MF,0.771620


In [64]:
query = """
MATCH (a:doi)
OPTIONAL MATCH (a)-[:reference]->(ref:doi)
RETURN a.title AS title, 
       a.abstract AS abstract, 
       a.languages AS languages, 
       a.type AS type, 
       a.tags AS tags,
       a.updated_at AS updated_at, 
       a.container_title AS container_title, 
       a.is_content_present AS is_content_present,
       a.name AS doi, 
       a.issued_at AS issued_at,
       collect(ref.name) AS references
"""

In [65]:
# Execute the query
data = graph.run(query).data()

In [66]:
# Convert the result to a DataFrame
doi_df = pd.DataFrame(data)

In [67]:
query = """
MATCH (a:author)-[:author]->(d:doi)
RETURN a.given AS given, a.family AS family, a.orcid as orcid, d.name as doi, a.id_hash as id_hash
"""

In [68]:
# Execute the query
data = graph.run(query).data()

In [69]:
author_df = pd.DataFrame(data)

In [70]:
result_df = pd.merge(author_df, doi_df, on="doi")

In [71]:
result_df.drop(['languages'], axis=1, inplace=True)

In [72]:
result_df = result_df.dropna(subset=['family'])

In [73]:
result_df['given'] = result_df['given'].apply(lambda x: x.replace(".", "") if (x and len(x) >= 3) else x)

In [74]:
result_df = pd.merge(result_df, df_clean_articles[['doi', 'languages']], on="doi")

In [75]:
result_df = pd.merge(result_df, candidates_df[['given','family','matched_given']], on=['given','family'], how="left")

In [76]:
def is_nan(value):
    return pd.isna(value)

In [77]:
result_df["given"] = result_df.apply(lambda x: x['matched_given'] if not is_nan(x['matched_given']) else x['given'], axis=1)

In [78]:
result_df.drop("matched_given", axis=1, inplace=True)

In [79]:
result_df.head()

Unnamed: 0,given,family,orcid,doi,id_hash,title,abstract,type,tags,updated_at,container_title,is_content_present,issued_at,references,languages
0,Hardeep,Singh,,10.1001/2013.jamainternmed.61,b985366129f0c2a7e0ba05e5de448161963c124b684400...,Information Overload and Missed Test Results i...,,journal-article,[Internal Medicine],1695848000.0,JAMA Internal Medicine,False,1366589000.0,[],['en']
1,Hardeep,Singh,,10.1001/jamainternmed.2013.2777,b985366129f0c2a7e0ba05e5de448161963c124b684400...,Types and Origins of Diagnostic Errors in Prim...,,journal-article,[Internal Medicine],1695840000.0,JAMA Internal Medicine,False,1364170000.0,[],['en']
2,Christiane,Spitzmueller,,10.1001/2013.jamainternmed.61,1028cd415ed3557f212fb355330f99ff25253215b48e35...,Information Overload and Missed Test Results i...,,journal-article,[Internal Medicine],1695848000.0,JAMA Internal Medicine,False,1366589000.0,[],['en']
3,Nancy J,Petersen,,10.1001/2013.jamainternmed.61,160a8b32b0df7105d254186517dfb2aab0c1c3418744a4...,Information Overload and Missed Test Results i...,,journal-article,[Internal Medicine],1695848000.0,JAMA Internal Medicine,False,1366589000.0,[],['en']
4,Mona K,Sawhney,,10.1001/2013.jamainternmed.61,addfa31adbe848ff77d9f120cc42ae3cef02e4574ff5b9...,Information Overload and Missed Test Results i...,,journal-article,[Internal Medicine],1695848000.0,JAMA Internal Medicine,False,1366589000.0,[],['en']


In [80]:
result_df.columns

Index(['given', 'family', 'orcid', 'doi', 'id_hash', 'title', 'abstract',
       'type', 'tags', 'updated_at', 'container_title', 'is_content_present',
       'issued_at', 'references', 'languages'],
      dtype='object')

In [81]:
result_df.to_csv("result_df.csv")

## Upload to neo4j

In [82]:
from py2neo import Graph
system_graph = Graph("bolt://localhost:7687", auth=("neo4j", "Stargate90"), name="system")

# Create the new database "updated graph" if it doesn't exist
try:
    system_graph.run("CREATE DATABASE `smallstc-upd`")
except Exception as e:
    print(e)

[Database.ExistingDatabaseFound] Failed to create the specified database 'smallstc-upd': Database name or alias already exists.


In [83]:
graph = Graph("bolt://localhost:7687", auth=("neo4j", "Stargate90"), name="smallstc-upd")
try:
    graph.run("MATCH (n) DETACH DELETE n")
except Exception as e:
    print(e)

In [84]:
def create_author_nodes(row):
    author = Node("author", given=row['given'], family=row['family'], orcid=row['orcid'], id_hash=row["id_hash"])
    return author

def create_article_nodes(row):
    article = Node("article", doi=row['doi'], title=row['title'], abstract=row['abstract'], 
                   type=row['type'], tags=row['tags'], updated_at=row['updated_at'], 
                   container_title=row['container_title'], is_content_present=row['is_content_present'], 
                   issued_at=row['issued_at'], languages=row['languages'])
    return article

In [85]:
from py2neo import Node, Relationship

for idx, row in progress_tqdm(result_df.iterrows()):
    author = create_author_nodes(row)
    article = create_article_nodes(row)

    # Merge nodes to avoid duplicates
    graph.merge(author, "author", "id_hash")
    graph.merge(article, "article", "doi")

    # Create relationships between author and article
    written_by = Relationship(article, "AUTHORED_BY", author)
    graph.create(written_by)

    # Optionally: Add tags as nodes and create relationships to articles
    if row['tags']:
        tags = row['tags']
        for tag in tags:
            tag_node = Node("tag", name=tag.strip())
            graph.merge(tag_node, "tag", "name")
            tagged_with = Relationship(article, "TAGGED_WITH", tag_node)
            graph.create(tagged_with)

137905it [3:42:44, 10.32it/s]


In [86]:
'''for idx, row in progress_tqdm(result_df[['doi', 'references']].explode('references').dropna().iterrows()):
    ref_article = graph.nodes.match("article", doi=row['references']).first()
    doi_article = graph.nodes.match("article", doi=row['doi']).first()
    if ref_article and doi_article:
        graph.create(Relationship(doi_article, "REFERENCES", ref_article))
'''
        
        
        
def create_relationships_in_batches(df, batch_size=100):
    query_template = """
    UNWIND $rows AS row
    MATCH (a:article {doi: row.doi}), (r:article {doi: row.references})
    MERGE (a)-[:REFERENCES]->(r)
    """
    total_rows = len(df)
    for start in progress_tqdm(range(0, total_rows, batch_size)):
        end = min(start + batch_size, total_rows)
        batch = df.iloc[start:end].to_dict('records')
        graph.run(query_template, rows=batch)

df = result_df[['doi', 'references']].explode('references').dropna()        
        
# Call the function with the flattened DataFrame
create_relationships_in_batches(df, batch_size=100)

 12%|████████████▎                                                                                           | 7109/59896 [7:05:50<52:42:02,  3.59s/it]


KeyboardInterrupt: 

In [87]:
import json
with open("anthology.json", 'r') as f:
    tag_anthology = json.load(f)

In [88]:
# Recursive function to create nodes and relationships in Neo4j
def create_relationships(parent_name, children):
    if isinstance(children, dict):
        for child_name, grand_children in children.items():
            parent_node = Node("tag", name=parent_name)
            child_node = Node("tag", name=child_name)
            graph.merge(parent_node, "tag", "name")
            graph.merge(child_node, "tag", "name")
            related_to = Relationship(child_node, "RELATED_TO", parent_node)
            graph.merge(related_to)
            create_relationships(child_name, grand_children)
    elif isinstance(children, list):
        parent_node = Node("tag", name=parent_name)
        graph.merge(parent_node, "tag", "name")
        for child_name in children:
            child_node = Node("tag", name=child_name)
            graph.merge(child_node, "tag", "name")
            related_to = Relationship(child_node, "RELATED_TO", parent_node)
            graph.merge(related_to)

In [89]:
for root_tag, sub_tags in progress_tqdm(tag_anthology.items()):
    create_relationships(root_tag, sub_tags)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00,  1.42it/s]


In [90]:
df

Unnamed: 0,doi,references
5,10.1016/j.amjmed.2011.07.029,10.1200/jop.091098
5,10.1016/j.amjmed.2011.07.029,10.1001/archinte.163.21.2625
5,10.1016/j.amjmed.2011.07.029,10.1046/j.1365-2710.2002.00434.x
5,10.1016/j.amjmed.2011.07.029,10.1097/01.jac.0000290402.89284.a9
5,10.1016/j.amjmed.2011.07.029,10.1197/jamia.m3200
...,...,...
137886,10.7863/ultra.34.2.225,10.1097/sla.0b013e318204a892
137886,10.7863/ultra.34.2.225,10.1016/s0002-9610(03)00042-4
137886,10.7863/ultra.34.2.225,10.1186/1471-2407-9-335
137886,10.7863/ultra.34.2.225,10.1016/j.acra.2008.01.010


## Querying

In [91]:
#check the examples

In [92]:
from langchain.chains import GraphCypherQAChain
from langchain_community.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI

graph = Neo4jGraph(url="bolt://localhost:7687", username="neo4j", password="Stargate90", database="smallstc-upd")

In [93]:
from langchain_core.prompts.prompt import PromptTemplate

CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Examples: Here are a few examples of generated Cypher statements for particular questions:
# List the titles of articles written by John Doe.
MATCH (a:author {given: "John", family: "Doe"})<-[:AUTHORED_BY]-(art:article)
RETURN art.title

# Find all articles that reference the article with DOI '10.1234/abc123'.
MATCH (a:article {doi: "10.1234/abc123"})<-[:REFERENCES]-(referenced_by:article)
RETURN referenced_by.doi, referenced_by.title

# Get the authors, titles, and citation counts of the most cited articles tagged with 'Breast Neoplasms'.
MATCH (t:tag {name: "Breast Neoplasms"})-[:RELATED_TO*0..]->(subTag:tag)
MATCH (a:article)-[:TAGGED_WITH]->(subTag)
OPTIONAL MATCH (a)<-[:REFERENCES]-(citingArticle:article)
OPTIONAL MATCH (a)-[:AUTHORED_BY]->(auth:author)
RETURN a.title, count(citingArticle) AS citationCount, collect(DISTINCT auth.given + " " + auth.family) AS authors
ORDER BY citationCount DESC
LIMIT 10


The question is:
{question}"""

CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], template=CYPHER_GENERATION_TEMPLATE
)

chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(model="gpt-4o", api_key=api_key, temperature=0), graph=graph, verbose=True, validate_cypher=True
)

In [94]:
chain.run("How many articles reference doi: 10.1016/s0002-9394(02)01522-2?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (a:article)-[:REFERENCES]->(b:article {doi: "10.1016/s0002-9394(02)01522-2"})
RETURN count(a)
[0m
Full Context:
[32;1m[1;3m[{'count(a)': 14}][0m

[1m> Finished chain.[0m


'14 articles reference doi: 10.1016/s0002-9394(02)01522-2.'

In [95]:
chain.run("Who are the top coauthors of Nilsson Matthew?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (a:author {given: "Matthew", family: "Nilsson"})<-[:AUTHORED_BY]-(art:article)-[:AUTHORED_BY]->(coauthor:author)
RETURN coauthor.given, coauthor.family, COUNT(*) AS collaborations
ORDER BY collaborations DESC
LIMIT 5
[0m
Full Context:
[32;1m[1;3m[{'coauthor.given': 'Susan D', 'coauthor.family': 'Block', 'collaborations': 2}, {'coauthor.given': 'Jennifer W', 'coauthor.family': 'Mack', 'collaborations': 2}, {'coauthor.given': 'Holly G', 'coauthor.family': 'Prigerson', 'collaborations': 2}, {'coauthor.given': 'Elizabeth', 'coauthor.family': 'Trice', 'collaborations': 2}, {'coauthor.given': 'Alexi', 'coauthor.family': 'Wright', 'collaborations': 1}][0m

[1m> Finished chain.[0m


'Susan D. Block, Jennifer W. Mack, Holly G. Prigerson, and Elizabeth Trice are the top coauthors of Nilsson Matthew.'

In [96]:
chain.run("What are the fields of study of Liu Yun?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (a:author {family: "Liu", given: "Yun"})-[:AUTHORED_BY]-(art:article)-[:TAGGED_WITH]->(t:tag)
RETURN DISTINCT t.name AS fields_of_study
[0m
Full Context:
[32;1m[1;3m[{'fields_of_study': 'General Medicine'}, {'fields_of_study': 'Ophthalmology'}, {'fields_of_study': 'Anatomy'}, {'fields_of_study': 'Surgery'}, {'fields_of_study': 'Pathology and Forensic Medicine'}, {'fields_of_study': 'Multidisciplinary'}, {'fields_of_study': 'Medical Laboratory Technology'}, {'fields_of_study': 'Radiological and Ultrasound Technology'}, {'fields_of_study': 'Radiology, Nuclear Medicine and imaging'}, {'fields_of_study': 'Computer Vision and Pattern Recognition'}][0m

[1m> Finished chain.[0m


'General Medicine, Ophthalmology, Anatomy, Surgery, Pathology and Forensic Medicine, Multidisciplinary, Medical Laboratory Technology, Radiological and Ultrasound Technology, Radiology, Nuclear Medicine and imaging, Computer Vision and Pattern Recognition.'

In [97]:
# Possible todo next: compare coauthors, but that's outside of the milestone

In [98]:
# Possible todo next: Aldington	Stephen J == Aldington	S J == Aldington	S , but that's outside of the milestone