**TOPICS OF INTEREST** - from the meeting in January

* inflammation aging chronic (2004) - 13k papers
* genome editing / manipulation, CRISPR - 13k papers
* induced stem cells - 73k papers - 3h for calculating co-citations
* single-cell sequencing (2012) - 3k papers
* ATAC-seq (2015) - 276 papers
* immunomodulation cancer - 71k papers
* Telomere Theories of Aging - ??
* mTOR pathway - 14255
* autophagy - ??
* Calorie restriction - 3933

Complement Factor H + Age-Related Macular Degeneration - investigate

A file with citation stats is required to run several cells of this notebook successfully.
The file can be obtained with the following commands if the user has access to the `pubmed` database (use crawler).

1. Run `psql`.
2. Run the following commands in the `psql` prompt:
    * `\f ','`
    * `\a`
    * `\t`
    * `\o '/path/to/the/file'`
    * `SELECT C.pmid_cited AS pmid, P.year, COUNT(1) AS count`
    * `FROM Citations C`
    * `JOIN Publications P`
    * `ON C.pmid_citing = P.pmid`
    * `GROUP BY C.pmid_cited, P.year;`
    * `\o`
3. Pass `/path/to/the/file` as the `cit_stats_file` argument of `analyzer.load_citation_stats(...)` in the cell that runs the analysis.

**Issues**:

1. Tooltips with long titles may extend beyond the plot bounds, so part of the information is cut off.
2. How should I place articles with the same year? (currently y-axis position is random in [0,1]...)
3. Some research on clustering algorithms is needed! (also `networkx.algorithms.community`)

**Functions**:

1. Subtopic Analysis based on co-citation graph clustering
2. Top Cited Papers detection (overall and for certain year)
3. Citation Dynamics for a certain article

## Search Terms

In [None]:
# Search terms for the topic of interest; unpacked into analyzer.search(*terms) further down.
terms = ['DNA', 'methylation', 'clock']

In [1]:
from importlib import reload
import logging
reload(logging)

import gc
import ipywidgets as widgets
import networkx as nx
import numpy as np
import pandas as pd
import psycopg2 as pg_driver

from bokeh.io import push_notebook
from bokeh.models import ColumnDataSource, LabelSet, OpenURL, TapTool, CustomJS
from bokeh.plotting import figure, show, output_notebook
from IPython.display import display
from matplotlib import pyplot as plt
%matplotlib inline

from Bio import Entrez

# Contact e-mail that Biopython attaches to Entrez requests.
Entrez.email = 'nikolay.kapralov@gmail.com'
# URL prefix; a PMID is appended to it to build a link to the article.
PUBMED_ARTICLE_BASE_URL = 'https://www.ncbi.nlm.nih.gov/pubmed/?term='

# Timestamped log lines at DEBUG level (the logging module was reloaded above
# before being configured here).
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.DEBUG)
# Direct Bokeh output to the notebook.
output_notebook()

In [2]:
class KeyPaperAnalyzer:
    """Collect publications, citation stats and co-citations for a PubMed search.

    ``search`` must be called first: it sets ``self.pmids``, which every
    ``load_*`` method reads.

    Attributes set by the methods:
        terms: lower-cased search terms (``search``).
        pmids: list of integer PMIDs returned by the search (``search``).
        pub_df: ``pmid`` / ``title`` / ``year`` rows from the local database
            (``load_publications``).
        cit_df: one row per cited article, one column per citing year and a
            ``total`` column, sorted by ``total`` descending
            (``load_citation_stats``).
        cocit_df: raw co-citation rows ``citing, cited_1, cited_2, year``
            (``load_cocitations``).
        cocit_grouped_df: co-citation counts per pair and year plus ``total``
            (``load_cocitations``).
        CG: ``networkx.Graph`` whose edges connect co-cited articles and are
            weighted by the total number of co-citations (``load_cocitations``).
    """

    def __init__(self):
        """Open a connection and a cursor to the local ``pubmed`` database."""
        self.conn = pg_driver.connect(dbname='pubmed', user='biolabs', password='pubtrends', host='localhost')
        self.cursor = self.conn.cursor()

    def load_publications(self):
        """Load pmid, title and year of the found articles into ``self.pub_df``."""
        logging.info('Loading publication data')

        query = '''
        SELECT pmid, title, year
        FROM Publications
        WHERE pmid = ANY(%s);
        '''

        with self.conn:
            self.cursor.execute(query, (self.pmids,))
        pub_data = [list(row) for row in self.cursor]
        self.pub_df = pd.DataFrame(pub_data, columns=['pmid', 'title', 'year'])
        logging.info(f'Found {len(self.pub_df)} publications in the local database')

    def load_citation_stats(self, cit_stats_file):
        """Read per-year citation counts from a CSV file into ``self.cit_df``.

        Args:
            cit_stats_file: path to a text file with one ``pmid,year,count``
                record per line. Empty ``year`` / ``count`` fields are read
                as 0; records with a non-positive year are dropped.
        """
        logging.info('Started loading citation stats')

        # Set membership is O(1); testing against the list made every line of
        # the (large) stats file cost O(len(self.pmids)).
        pmid_set = set(self.pmids)

        data = []
        with open(cit_stats_file, 'r') as f:
            for line in f:
                # Strip only the line terminator: slicing off the last
                # character corrupts a final line that has no newline.
                line = line.rstrip('\r\n')
                if not line:
                    continue
                pmid, year, count = line.split(',')
                pmid = int(pmid)
                if pmid in pmid_set:
                    data.append([pmid, int(year or 0), int(count or 0)])

        # astype(int) keeps integer dtypes even when no record matched.
        self.cit_df = pd.DataFrame(data, columns=['pmid', 'year', 'count']).astype(int)
        self.cit_df = self.cit_df[self.cit_df['year'] > 0]
        self.cit_df = self.cit_df.pivot(index='pmid', columns='year', values='count').reset_index().replace(np.nan, 0)
        # Column 0 is 'pmid'; all remaining columns are per-year counts.
        self.cit_df['total'] = self.cit_df.iloc[:, 1:].sum(axis=1)
        self.cit_df = self.cit_df.sort_values(by='total', ascending=False)

        logging.info(f"Loaded citation stats for {len(self.cit_df)} of {len(self.pmids)} articles. " +
                     "Others may either have zero citations or be absent in the local database.")

    def load_cocitations(self):
        """Query co-citations among the found articles and build the graph ``self.CG``."""
        logging.info('Calculating co-citations for selected articles')

        # C1.pmid_cited < C2.pmid_cited keeps each unordered pair once per citing paper.
        query = '''
        SELECT C1.pmid_citing, C1.pmid_cited, C2.pmid_cited, P.year
        FROM Citations C1
        JOIN Citations C2
        ON C1.pmid_citing = C2.pmid_citing AND C1.pmid_cited < C2.pmid_cited
        JOIN Publications P
        ON C1.pmid_citing = P.pmid
        WHERE C1.pmid_cited = ANY(%s) AND C2.pmid_cited = ANY(%s);
        '''

        with self.conn:
            self.cursor.execute(query, (self.pmids, self.pmids,))

        cocit_data = [list(row) for row in self.cursor]
        self.cocit_df = pd.DataFrame(cocit_data, columns=['citing', 'cited_1', 'cited_2', 'year'])
        logging.info(f'Found {len(self.cocit_df)} co-cited pairs of articles')

        self.cocit_grouped_df = self.cocit_df.groupby(['cited_1', 'cited_2', 'year']).count().reset_index()
        self.cocit_grouped_df = self.cocit_grouped_df.pivot_table(index=['cited_1', 'cited_2'],
                                                                  columns=['year'], values=['citing']).reset_index()
        self.cocit_grouped_df = self.cocit_grouped_df.replace(np.nan, 0)
        # Columns 0-1 are the pair ids; the rest are per-year co-citation counts.
        self.cocit_grouped_df['total'] = self.cocit_grouped_df.iloc[:, 2:].sum(axis=1)
        self.cocit_grouped_df = self.cocit_grouped_df.sort_values(by='total', ascending=False)

        self.CG = nx.Graph()
        # Use this instance's data (the original read the global `analyzer`,
        # which fails or uses stale data when the instance has another name).
        for el in self.cocit_grouped_df[['cited_1', 'cited_2', 'total']].values.astype(int):
            self.CG.add_edge(el[0], el[1], weight=el[2])

    def plot_total_citations(self):
        """Draw a bar chart of ``cit_df['total']``, one unlabeled bar per article."""
        plt.figure(figsize=(10, 5))
        ax = self.cit_df['total'].plot.bar()
        ax.set_xticklabels([])
        ax.set_xlabel('Articles')
        ax.set_ylabel('Number of citations')

    def search(self, *terms):
        """Search PubMed for the space-joined ``terms`` and store the PMIDs.

        Sets ``self.terms`` (lower-cased terms) and ``self.pmids``.
        """
        # TODO: handle queries which return more than 100000 items
        # TODO: use local database instead of PubMed API
        self.terms = [t.lower() for t in terms]
        query = ' '.join(terms)
        handle = Entrez.esearch(db='pubmed', retmax='100000', retmode='xml', term=query)
        try:
            record = Entrez.read(handle)
        finally:
            # Release the underlying HTTP response even if parsing fails.
            handle.close()
        self.pmids = [int(pmid) for pmid in record['IdList']]
        logging.info(f'Found {len(self.pmids)} articles about {terms}')

    def top_cited_papers(self, threshold=0.05):
        """Return the most cited articles as an iterator of ``(pmid, url, total)``.

        Args:
            threshold: fraction of the rows of ``cit_df`` to return, taken
                from the top (``cit_df`` is sorted by ``total`` descending).
        """
        n_top = round(len(self.cit_df) * threshold)
        ids = self.cit_df.iloc[:n_top, 0].values
        counts = self.cit_df.iloc[:n_top, -1].values
        urls = [PUBMED_ARTICLE_BASE_URL + str(i) for i in ids]
        return zip(ids, urls, counts)

In [3]:
def build_data_source(df):
    """Wrap the plotting columns of ``df`` in a Bokeh ``ColumnDataSource``.

    The source carries ``pmid``, ``title``, ``year`` and ``total`` from ``df``
    plus two derived columns: ``size`` (``log(total) / 10``) and ``pos`` (a
    uniform random number in [0, 1) per row).
    """
    # TODO: use d = ColumnDataSource(df)
    totals = df['total']
    columns = {
        'pmid': df['pmid'],
        'title': df['title'],
        'year': df['year'],
        'total': totals,
        # NOTE(review): log(1) == 0, so an article with total == 1 gets size 0.
        'size': np.log(totals) / 10,
        'pos': np.random.random(size=len(df)),
    }
    return ColumnDataSource(data=columns)

In [34]:
# Bokeh tools enabled on every figure in this notebook.
TOOLS = "hover,pan,tap,wheel_zoom,box_zoom,reset,save"

def serve_scatter_article_layout(source, title, year_range=None):
    """Build a scatter plot of articles: x is the year, y is the ``pos`` column.

    Tapping the plot while exactly one glyph is selected opens that article
    (``PUBMED_ARTICLE_BASE_URL`` + PMID) in a new browser tab.

    Args:
        source: ``ColumnDataSource`` with ``pmid``, ``title``, ``year``,
            ``total``, ``size`` and ``pos`` columns.
        title: plot title.
        year_range: optional x-axis range passed to ``figure`` as ``x_range``.

    Returns:
        The Bokeh figure (not yet shown).
    """
    # `selected_id` is declared with `var`: the undeclared assignment created
    # an implicit global (a ReferenceError when the code runs in strict mode).
    # The selected row already holds the PMID, so the URL is opened directly
    # instead of scanning all rows for an equal PMID.
    callback = CustomJS(args=dict(source=source, base=PUBMED_ARTICLE_BASE_URL), code="""
        var data = source.data, selected = source.selected.indices;
        if (selected.length == 1) {
            // only consider case where one glyph is selected by user
            var selected_id = data['pmid'][selected[0]];
            window.open(base + selected_id, '_blank');
        }
    """)

    p = figure(tools=TOOLS, toolbar_location="above", plot_width=960, plot_height=400, x_range=year_range, title=title)
    p.xaxis.axis_label = 'Year'
    p.hover.tooltips = [
        ("PMID", '@pmid'),
        ("Title", '@title'),
        ("Year", '@year'),
        ("Cited by", '@total paper(s) total')
    ]
    p.js_on_event('tap', callback)

    # The circle radius comes from the 'size' column of the data source.
    p.circle(x='year', y='pos', fill_alpha=0.2, source=source, radius='size')

    return p

def serve_citation_dynamics_layout():
    """Show a PMID input with a "citations per year" bar chart below it.

    Clicking the button looks the entered PMID up in ``analyzer.cit_df`` and,
    when exactly one row matches, replaces the bar chart data with that row's
    per-year counts. Otherwise the text box is set to ``'Bad ID'``; input that
    raises ``ValueError`` clears the text box.

    Returns:
        Tuple ``(figure, notebook_handle, widget_panel)``.
    """
    plot = figure(tools=TOOLS, toolbar_location="above", plot_width=960, plot_height=300,
                  title="Number of Citations per Year")
    plot.xaxis.axis_label = "Year"
    plot.yaxis.axis_label = "Number of citations"
    plot.hover.tooltips = [
        ("Year", "@x"),
        ("Cited by", "@y paper(s) in @x"),
    ]

    # The chart starts empty; the button callback fills it in.
    empty_source = ColumnDataSource(data=dict(x=[], y=[]))
    bar = plot.vbar(x='x', width=0.8, top='y', source=empty_source, color='#A6CEE3', line_width=3)

    text = widgets.Text(
        value='',
        placeholder='Enter PMID',
        description='PMID:',
        disabled=False
    )

    button = widgets.Button(
        description='Show',
        disabled=False,
        button_style='info',
        tooltip='Show'
    )

    def update(_button):
        # `handle` is bound further down, before any click can happen.
        try:
            pmid = int(text.value)
            rows = analyzer.cit_df[analyzer.cit_df['pmid'] == pmid]
            if len(rows) != 1:
                text.value = 'Bad ID'
            else:
                # Skip the first ('pmid') and last ('total') columns.
                years = rows.columns[1:-1].values.astype(int)
                counts = rows[years].values[0]
                bar.data_source.data = {'x': years, 'y': counts}
            push_notebook(handle=handle)
        except ValueError:
            text.value = ''

    button.on_click(update)

    panel = widgets.HBox([text, button])

    display(panel)
    handle = show(plot, notebook_handle=True)

    return plot, handle, panel

In [49]:
from collections import Counter

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

# How many of the most frequent n-grams get_most_common_ngrams() keeps.
NUM_MOST_COMMON = 5

def get_ngrams(string):
    """Return the 1-, 2- and 3-grams of ``string`` as a list of strings.

    The text is lower-cased, commas and periods are removed and it is split
    on single spaces. Tokens without any letter, English stop words and the
    current search terms (``analyzer.terms``) are dropped before the n-grams
    are built. Unigrams come first, then bigrams, then trigrams.
    """
    stop_words = set(stopwords.words('english'))
    cleaned = string.lower().replace(',', '').replace('.', '')

    tokens = []
    for word in cleaned.split(' '):
        if not any(ch.isalpha() for ch in word):
            continue
        if word in stop_words or word in analyzer.terms:
            continue
        tokens.append(word)

    bigrams = [' '.join(pair) for pair in zip(tokens[:-1], tokens[1:])]
    trigrams = [' '.join(triple) for triple in zip(tokens[:-2], tokens[1:-1], tokens[2:])]
    return tokens + bigrams + trigrams

def get_most_common_ngrams(titles):
    """Map the ``NUM_MOST_COMMON`` most frequent n-grams to their frequency.

    The frequency of an n-gram is its number of occurrences over all
    ``titles`` divided by the number of titles.
    """
    counts = Counter()
    for title in titles:
        counts.update(get_ngrams(title))
    return {ngram: cnt / len(titles) for ngram, cnt in counts.most_common(NUM_MOST_COMMON)}

In [6]:
# Force a garbage collection pass and log the number gc.collect() returns.
logging.info(f'{gc.collect()} objects collected')

analyzer = KeyPaperAnalyzer()
# search() must run first: it sets analyzer.pmids, which all load_* calls read.
analyzer.search(*terms)
analyzer.load_publications()
analyzer.load_cocitations()
# In the run logged below this step took about ten minutes.
analyzer.load_citation_stats(cit_stats_file='../citations_per_year.csv')

2019-04-29 22:45:14,646 INFO: 0 objects collected
2019-04-29 22:45:21,746 INFO: Found 294 articles about ('DNA', 'methylation', 'clock')
2019-04-29 22:45:21,747 INFO: Loading publication data
2019-04-29 22:46:04,581 INFO: Found 223 publications in the local database
2019-04-29 22:46:04,584 INFO: Calculating co-citations for selected articles
2019-04-29 22:47:53,856 INFO: Found 4876 co-cited pairs of articles
2019-04-29 22:47:54,555 INFO: Started loading citation stats
2019-04-29 22:57:42,800 INFO: Loaded citation stats for 174 of 294 articles. Others may either have zero citations or be absent in the local database.


In [20]:
analyzer.cit_df.head()

year,pmid,1985,1986,1988,1990,1991,1992,1993,1994,1995,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,total
59,24138928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,32.0,90.0,135.0,150.0,29.0,0.0,436.0
72,25313081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,25.0,37.0,38.0,4.0,0.0,104.0
6,15790588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,10.0,11.0,13.0,7.0,7.0,9.0,2.0,0.0,102.0
2,2777259,0.0,0.0,0.0,5.0,6.0,4.0,2.0,1.0,0.0,...,9.0,5.0,7.0,7.0,5.0,9.0,2.0,0.0,0.0,101.0
7,15860628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,15.0,12.0,9.0,8.0,6.0,8.0,1.0,0.0,94.0


In [8]:
analyzer.pub_df.head()

Unnamed: 0,pmid,title,year
0,1722018,DNA methylation and cellular ageing.,1991.0
1,1943146,Quantitative genetic variation and development...,1991.0
2,2777259,Cytosine methylation and the fate of CpG dinuc...,1989.0
3,2857475,Control of haemoglobin switching by a developm...,1985.0
4,11032969,Crisis periods and apoptotic commitment: death...,2000.0


In [9]:
# pd.merge defaults to an inner join, so only publications that also have
# citation stats are kept.
df_all = pd.merge(analyzer.pub_df, analyzer.cit_df, on='pmid')

In [10]:
df_all.head()

Unnamed: 0,pmid,title,year,1985,1986,1988,1990,1991,1992,1993,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,total
0,1722018,DNA methylation and cellular ageing.,1991.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0
1,1943146,Quantitative genetic variation and development...,1991.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2777259,Cytosine methylation and the fate of CpG dinuc...,1989.0,0.0,0.0,0.0,5.0,6.0,4.0,2.0,...,9.0,5.0,7.0,7.0,5.0,9.0,2.0,0.0,0.0,101.0
3,2857475,Control of haemoglobin switching by a developm...,1985.0,2.0,1.0,1.0,0.0,1.0,1.0,2.0,...,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,19.0
4,11820819,Regulation of stage-specific nuclear transloca...,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,3.0,3.0,3.0,1.0,0.0,0.0,0.0,16.0


## Citation per Year Dynamics

In [36]:
p, h, panel = serve_citation_dynamics_layout()

## Subtopics a.k.a. Clusters in the Co-citation Graph

In [13]:
import community

# Partition the co-citation graph; `p` is turned into pmid -> component rows in the next cell.
# NOTE: other cells bind Bokeh figures to the same name `p`; re-run this cell
# before the next one if `p` has been overwritten.
p = community.best_partition(analyzer.CG)
# q = list(greedy_modularity_communities(analyzer.CG))

In [14]:
# Convert the partition mapping into a two-column frame ('pmid', 'comp') ...
df_comp = pd.Series(p).reset_index().rename(columns={'index': 'pmid', 0: 'comp'})
# ... and attach it to df_all. The default inner join drops articles that have
# no entry in the partition.
df_all = pd.merge(df_all, df_comp, on='pmid')

In [15]:
# The per-year columns sit between the three publication columns
# (pmid, title, year) and the two trailing columns (total, comp), hence [3:-2].
# Assumes the cells above were run once, in order — TODO confirm before re-running.
years = df_all.columns.values[3:-2].astype(int)
min_year, max_year = np.min(years), np.max(years)

In [50]:
# Iterate over the component ids actually present in df_all rather than
# range(n_comps): if an id is missing after the inner merges, range() would
# skip the highest component and plot an empty one instead.
comp_ids = sorted(df_all['comp'].unique())
n_comps = len(comp_ids)
ds = [None] * n_comps
layouts = [None] * n_comps
most_common = [None] * n_comps
for i, c in enumerate(comp_ids):
    comp_articles = df_all[df_all['comp'] == c]
    ds[i] = build_data_source(comp_articles)
    most_common[i] = dict(get_most_common_ngrams(comp_articles['title'].values))
    # Title lists the most frequent n-grams with their per-title frequency.
    kwd = ', '.join([f'{k} ({v:.2f})' for k, v in most_common[i].items()])
    title = f'Subtopic #{c + 1}: {kwd}'
    layouts[i] = serve_scatter_article_layout(ds[i], title, year_range=[min_year, max_year])
    show(layouts[i])

## Top Cited Papers Overall

In [17]:
df_all = df_all.sort_values(by='total', ascending=False)

In [37]:
THRESHOLD = 0.1 # 10 %

In [38]:
# Relies on df_all having been sorted by 'total' (descending) in an earlier cell.
# NOTE(review): the cut-off uses len(analyzer.cit_df) while the rows come from
# df_all, which can be shorter after the inner merges — confirm which
# population THRESHOLD is meant to apply to.
ds_top = build_data_source(df_all.iloc[:round(len(analyzer.cit_df) * THRESHOLD), :])
layout_top = serve_scatter_article_layout(ds_top, 'Top cited papers', year_range=[min_year, max_year])
show(layout_top)

## Top Cited Papers for Each Year

In [67]:
from bokeh.palettes import Category10
from bokeh.transform import factor_cmap

# For every year column, find the article that received the most citations in
# that year. Years in which no article was cited are skipped.
max_gain_data = []
# Per-year columns: everything between (pmid, title, year) and (total, comp).
cols = df_all.columns[3:-2]
for col in cols:
    max_gain = df_all[col].astype(int).max()
    if max_gain > 0:
        # On ties the first matching row in df_all's current order is used.
        sel = df_all[df_all[col] == max_gain]
        max_gain_data.append([col, sel['pmid'].values[0],
                              sel['title'].values[0], max_gain])

max_gain_df = pd.DataFrame(max_gain_data, columns=['year', 'pmid', 'title', 'count'])

# PMIDs are converted to strings so they can serve as categorical factors.
ds_max = ColumnDataSource(data=dict(year=max_gain_df['year'], pmid=max_gain_df['pmid'].astype(str),
                                   title=max_gain_df['title'], count=max_gain_df['count']))
colors = factor_cmap('pmid', palette=Category10[10], factors=max_gain_df['pmid'].astype(str).unique())

# Derive the x-range from the year columns instead of hard-coding 1985-2019.
year_values = [int(c) for c in cols]
year_range = [min(year_values), max(year_values)]
p = figure(tools=TOOLS, toolbar_location="above", plot_width=960, plot_height=300, x_range=year_range, title='Max gain')
p.xaxis.axis_label = 'Year'
p.yaxis.axis_label = 'Number of citations'
p.hover.tooltips = [
    ("PMID", '@pmid'),
    ("Title", '@title'),
    ("Year", '@year'),
    ("Cited by", '@count papers in @year')
]

p.vbar(x='year', width=0.8, top='count', fill_alpha=0.2, source=ds_max, fill_color=colors, line_color=colors)

show(p)