In [2]:
import os

import networkx as nx
import numpy as np
import pandas as pd
import psycopg2 as pg_driver

from matplotlib import pyplot as plt
%matplotlib inline

from Bio import Entrez # pip install biopython
Entrez.email = 'nikolay.kapralov@gmail.com'

In [3]:
def search(*terms):
    """Run a PubMed ESearch for the given terms and return the parsed response.

    The terms are joined with single spaces into one query string, which is
    sent to the ``pubmed`` database with ``retmax='100000'`` and XML output.

    Args:
        *terms: Search terms (strings) to combine into the query.

    Returns:
        The object produced by ``Entrez.read`` for the response; callers in
        this notebook read its ``'IdList'`` entry.
    """
    query = ' '.join(terms)
    handle = Entrez.esearch(db='pubmed',
                            retmax='100000',
                            retmode='xml',
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the response handle whether parsing succeeded or raised.
        handle.close()

In [6]:
# Search terms; `search` joins them with spaces into a single query string.
terms = ['DNA', 'methylation', 'clock']

# Keep only the 'IdList' entry of the parsed search response.
results = search(*terms)['IdList']
print(f'{len(results)} articles about {terms}')

290 articles about ['DNA', 'methylation', 'clock']


In [7]:
# Connection settings are read from environment variables so that credentials
# do not have to live in the notebook; the fallbacks reproduce the original
# local setup when a variable is unset.
conn = pg_driver.connect(
    dbname=os.environ.get('PUBMED_DB_NAME', 'pubmed'),
    user=os.environ.get('PUBMED_DB_USER', 'biolabs'),
    password=os.environ.get('PUBMED_DB_PASSWORD', 'pubtrends'),
    host=os.environ.get('PUBMED_DB_HOST', 'localhost'),
)
cursor = conn.cursor()

In [8]:
%%time

# Integer form of the search result IDs, passed to the query as an array parameter.
pmids = list(map(int, results))

# Select citation edges whose citing and cited article are both in `pmids`.
query = '''
SELECT pmid_citing, pmid_cited
FROM Citations
WHERE pmid_citing = ANY(%s) AND pmid_cited = ANY(%s);
'''

with conn:
    cursor.execute(query, (pmids, pmids))

Wall time: 11.8 s


In [10]:
# Drain the cursor into a list of result rows.
data = list(cursor)

print(len(data))

628


In [12]:
# Directed graph with one edge per (pmid_citing, pmid_cited) row.
G = nx.DiGraph()
G.add_edges_from(data)

In [32]:
# List every simple cycle in the directed citation graph; a single-node cycle
# means that node has an edge to itself.
list(nx.simple_cycles(G))

[[26684672, 26678252], [24884411]]

In [34]:
# Print every edge that starts at node 24884411.
for source, target in G.edges():
    if source != 24884411:
        continue
    print(source, target)

24884411 24884411


In [35]:
# Save the directed citation graph as a GraphML file in the working directory.
nx.write_graphml(G, 'dna-methylation-clock.graphml')

In [37]:
%%time

# Integer form of the search result IDs, passed to the query as an array parameter.
pmids = list(map(int, results))

# Self-join Citations on the citing article to get pairs of cited articles that
# are both in `pmids`, together with the citing article's publication year.
# The join puts no ordering constraint on the pair, so a pair can come back in
# both orders and an article can be paired with itself.
query = '''
SELECT C1.pmid_citing, C1.pmid_cited, C2.pmid_cited, P.year
FROM Citations C1
JOIN Citations C2
ON C1.pmid_citing = C2.pmid_citing 
JOIN Publications P
ON C1.pmid_citing = P.pmid
WHERE C1.pmid_cited = ANY(%s) AND C2.pmid_cited = ANY(%s);
'''

with conn:
    cursor.execute(query, (pmids, pmids))

Wall time: 18min 6s


In [38]:
# Keep only rows whose two cited articles differ (drops self-pairs).
cocit_data = [row for row in cursor if row[1] != row[2]]

print(len(cocit_data))

12974


In [42]:
# NOTE(review): this cell used `real_cocit_data`, which no visible cell
# defines; `cocit_data` is assumed to be the intended list - confirm.
cocit_data[0]

(16457718, 15975143, 16314580, 2006)

In [43]:
# One row per (citing article, cited article 1, cited article 2, year).
# NOTE(review): this cell used `real_cocit_data`, which no visible cell
# defines; `cocit_data` is assumed to be the intended source - confirm.
df = pd.DataFrame(cocit_data, columns=['citing', 'cited_1', 'cited_2', 'year'])

In [46]:
# Preview the first rows of the co-citation table.
df.head()

Unnamed: 0,citing,cited_1,cited_2,year
0,16457718,15975143,16314580,2006.0
1,16457718,16314580,15975143,2006.0
2,16760426,1722018,15975143,2006.0
3,16760426,15975143,1722018,2006.0
4,17335343,15975143,16314580,2007.0


In [51]:
# Store each co-cited pair as (smaller PMID, larger PMID) so that (a, b) and
# (b, a) fall into the same group later on (this could also be done in SQL).
cited_pair = df[['cited_1', 'cited_2']]
df['min_cited'] = cited_pair.min(axis=1)
df['max_cited'] = cited_pair.max(axis=1)

In [52]:
# Preview the table again, now with the min_cited / max_cited columns.
df.head()

Unnamed: 0,citing,cited_1,cited_2,year,min_cited,max_cited
0,16457718,15975143,16314580,2006.0,15975143,16314580
1,16457718,16314580,15975143,2006.0,15975143,16314580
2,16760426,1722018,15975143,2006.0,1722018,15975143
3,16760426,15975143,1722018,2006.0,1722018,15975143
4,17335343,15975143,16314580,2007.0,15975143,16314580


In [55]:
# Count, for every unordered pair of cited articles, the rows that cite both.
# The table holds each co-citation twice - once as (a, b) and once as (b, a) -
# so the mirrored rows are collapsed first; otherwise every count is doubled.
cocit = (
    df.drop_duplicates(subset=['citing', 'min_cited', 'max_cited'])
      .groupby(['min_cited', 'max_cited'])
      .count()
)

In [66]:
# Build one (pmid_a, pmid_b, count) triple per co-cited pair.
# Only the 'citing' count is used. Looping over every column emitted each pair
# once per column, and the last column ('year') skips rows with a missing year,
# so it could under-count. `.items()` replaces `iteritems`, which was removed
# in pandas 2.0.
# Values are cast to built-in ints so the graph holds plain Python numbers.
cocit_grouped = [
    (int(pair[0]), int(pair[1]), int(count))
    for pair, count in cocit['citing'].items()
]

In [68]:
# Undirected co-citation graph; each triple supplies (node, node, weight).
CG = nx.Graph()
CG.add_weighted_edges_from(cocit_grouped)

In [70]:
# Save the weighted co-citation graph as a GraphML file in the working directory.
nx.write_graphml(CG, 'dna-methylation-clock-cocitations.graphml')