# Pubtrends

This Jupyter Notebook can be used to perform basic publication analysis.

**IMPORTANT:** Turn on the experimental features in the config file before running this notebook!

## Getting Started

1. Set the `SEARCH_QUERY` variable in the cell below to the keywords that describe the research area you are interested in.
2. Run all cells and review the results.

In [None]:
# Query string handed to the analyzer's term search, the paper analysis and the
# frequent-keywords computation in the cells below.
SEARCH_QUERY = 'human aging'

## Publication Analysis

In [None]:
import logging
import json


from bokeh.plotting import show, output_notebook
from matplotlib import pyplot as plt


from pysrc.papers.config import PubtrendsConfig
from pysrc.papers.db.pm_postgres_loader import PubmedPostgresLoader
from pysrc.papers.db.ss_postgres_loader import SemanticScholarPostgresLoader
from pysrc.papers.analyzer import PapersAnalyzer
from pysrc.papers.plot.plotter import Plotter
from pysrc.papers.utils import SORT_MOST_CITED, SORT_MOST_RECENT, cut_authors_list

# Timestamped INFO-level logging, so the progress messages emitted by the cells below are visible.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
# Send Bokeh `show()` output to the notebook.
output_notebook()
# IPython magic: render matplotlib figures inline.
%matplotlib inline

In [None]:
# Sort order passed to the search (SORT_MOST_RECENT is another sort constant imported above).
SEARCH_SORT = SORT_MOST_CITED
# Value passed as the `limit` of the search.
SEARCH_PAPERS = 1000

In [None]:
# Configure the run; the evolution feature flag is switched on explicitly
# (the "Topic Evolution" cell calls plotter.topic_evolution()).
config = PubtrendsConfig(test=False)
config.feature_evolution_enabled = True
loader = PubmedPostgresLoader(config)
analyzer = PapersAnalyzer(loader, config)
try:
    # Search for papers matching the query, then analyze the papers found.
    ids = analyzer.search_terms(SEARCH_QUERY, limit=SEARCH_PAPERS, sort=SEARCH_SORT)
    analyzer.analyze_papers(ids, SEARCH_QUERY)
finally:
    # Always release resources. The nested try/finally guarantees that the
    # analyzer is torn down even when closing the connection raises.
    try:
        loader.close_connection()
    finally:
        analyzer.teardown()

# Report plots

In [None]:
# All report figures in the cells below are produced through this plotter.
plotter = Plotter(analyzer)

In [None]:
# Show the figure returned by plotter.papers_by_year().
show(plotter.papers_by_year())

In [None]:
# The call returns a pair; only the first element is used and drawn as an image
# (the second is a callback, going by the method name - not needed here).
wc, _ = plotter.papers_word_cloud_and_callback()
plt.imshow(wc, interpolation="bilinear")
# Hide the axes around the image.
plt.axis("off")
plt.show()

# Trends

In [None]:
# Show the figure returned by plotter.top_cited_papers().
show(plotter.top_cited_papers())

In [None]:
# Show the figure returned by plotter.most_cited_per_year_papers().
show(plotter.most_cited_per_year_papers())

In [None]:
# Show the figure returned by plotter.fastest_growth_per_year_papers().
show(plotter.fastest_growth_per_year_papers())

# Single-paper citation dynamics

In [None]:
# Citations-per-year figure for one paper: the id in the first row of analyzer.df.
show(plotter.paper_citations_per_year(analyzer.df, analyzer.df['id'].values[0]))

## Topics a.k.a. Clusters in the Co-citation Graph

In [None]:
# Show the figure returned by plotter.topic_years_distribution().
show(plotter.topic_years_distribution())

In [None]:
# Show the figure returned by plotter.heatmap_topics_similarity().
show(plotter.heatmap_topics_similarity())

In [None]:
# Show the figure returned by plotter.topics_hierarchy().
show(plotter.topics_hierarchy())

In [None]:
# First cluster papers: only element [0][0] of the plotter's result is shown,
# the rest of the returned structure is not used here.
show(plotter.topics_info_and_word_cloud_and_callback()[0][0])

# Other Features

## Frequent keywords timeline

In [None]:
from pysrc.papers.analysis.text import vectorize_corpus, get_frequent_tokens

# Reuse the terms and counts already stored on the analyzer. The commented-out
# call below is an alternative that vectorizes the corpus again with custom bounds.
corpus_terms, corpus_counts = analyzer.corpus_terms, analyzer.corpus_counts
# corpus_terms, corpus_counts = vectorize_corpus(
#     analyzer.pub_df,
#     max_features=PapersAnalyzer.VECTOR_WORDS,
#     min_df=0.4,
#     max_df=0.9
# )

# Number of frequent terms kept for the timeline.
TERMS = 10

logging.info('Computing frequent terms')
ftkwds = get_frequent_tokens(analyzer.df, query=SEARCH_QUERY)
# Keep the first TERMS keys in the iteration order of the result
# (presumably most frequent first - confirm against get_frequent_tokens).
freq_terms = [t for t, _ in list(ftkwds.items())[:TERMS]]
# Bare expression: displayed as the cell output.
freq_terms

In [None]:
import numpy as np 
import pandas as pd

logging.info('Grouping papers by year')
# Map every year to the positional indices of its papers. Positions (not the
# dataframe index) are used below to select rows of the counts matrix, which
# assumes its rows follow the order of analyzer.df - TODO confirm.
t = analyzer.df[['year']].copy()
t['i'] = range(len(t))
papers_by_year = t[['year', 'i']].groupby('year')['i'].apply(list).to_dict()


logging.info('Collecting numbers of papers with term per year')
# Binarize the counts so that summing rows counts papers rather than occurrences.
binary_counts = corpus_counts.copy()
binary_counts[binary_counts.nonzero()] = 1
numbers_per_year = np.zeros(shape=(len(papers_by_year), len(corpus_terms)))
for i, (year, iss) in enumerate(papers_by_year.items()):
    numbers_per_year[i, :] = binary_counts[iss].sum(axis=0)[0, :]


logging.info('Collect top terms with maximum sum of numbers over years')
# Resolve the frequent terms to column indices with a single pass over
# corpus_terms. The first occurrence wins, matching list.index().
term_positions = {}
for position, corpus_term in enumerate(corpus_terms):
    term_positions.setdefault(corpus_term, position)
# A frequent token is not guaranteed to be among corpus_terms; such terms
# are skipped (and reported) instead of aborting the cell with a ValueError.
missing_terms = [term for term in freq_terms if term not in term_positions]
if missing_terms:
    logging.warning(f'Frequent terms missing from corpus terms, skipped: {missing_terms}')
top_term_idxs = [term_positions[term] for term in freq_terms if term in term_positions]

logging.info('Collecting dataframe with numbers for terms')
# Years in the same order as the rows of numbers_per_year.
years = list(papers_by_year)
term_dfs = []
for idx in top_term_idxs:
    term = corpus_terms[idx]
    # Column order (number, term, year) matters: the plotting cell unpacks rows positionally.
    term_df = pd.DataFrame(data=numbers_per_year[:, idx].astype(int), columns=['number'])
    term_df['term'] = term
    term_df['year'] = years
    term_dfs.append(term_df)
terms_df = pd.concat(term_dfs, axis=0).reset_index(drop=True)

# display(terms_df.head())

In [None]:
import holoviews as hv
from holoviews import opts
# Load the bokeh and matplotlib plotting backends for HoloViews.
hv.extension('bokeh', 'matplotlib')
from pysrc.papers.plot.plotter import Plotter

In [None]:
# Value dimension: number of papers, with the axis range padded by 10 on both sides.
max_numbers = terms_df['number'].max()
vdim = hv.Dimension('number', range=(-10, max_numbers + 10))

# Dataset and one curve per term, overlaid in a single plot. The year axis is
# extended 10 years past the last year (presumably to leave room for the labels).
ds = hv.Dataset(terms_df, vdims=vdim)
curves = ds.to(hv.Curve, 'year', groupby='term').overlay().redim(year=dict(range=(min(years)-1, max(years) + 10)))


# Label data: the rows of the last year, one per term.
max_year = ds['year'].max()
label_df = terms_df[terms_df.year==max_year].copy().reset_index(drop=True)


# Spread labels that share the same value vertically so that they do not overlap:
# a group of k equal values is placed at steps of `delta`, centered on the value.
pgroups = label_df.groupby('number')['term'].apply(list).to_dict()
delta = max_numbers / 10 # 3 for 0-100 looks fine, using extrapolation here
deltas = {}
for i, row in label_df.iterrows():
    # Positional unpacking relies on the column order: number, term, year.
    papers, term, year = row
    if papers in deltas:
        deltas[papers] += delta
    else:
        # First label of the group starts at the lowest offset.
        deltas[papers] = -delta * (len(pgroups[papers]) - 1) / 2
    label_df.loc[i, 'number'] = papers + deltas[papers]
label_df.sort_values(by='term', inplace=True)
labels = hv.Labels(label_df, ['year', 'number'], 'term')


overlay = (curves * labels).relabel('Number of papers, containing most frequent keywords')

# One color per term, shared by the curves and the labels.
cmap = Plotter.factors_colormap(len(label_df))
palette = [Plotter.color_to_rgb(cmap(i)).to_hex() for i in range(len(label_df))]
# The options call is the last expression of the cell, so its result is displayed.
overlay.opts(
    opts.Curve(show_frame=False, labelled=[], tools=['hover'],
               height=600, width=900, show_legend=False, xticks=list(reversed(range(max(years), min(years), -5))),
               color=hv.Cycle(values=palette), alpha=0.3, line_width=2, show_grid=True),
    opts.Labels(text_color='term', cmap=palette, text_align='left'),
    opts.NdOverlay(batched=False, 
                   gridstyle={'grid_line_dash': [6, 4], 'grid_line_width': 1, 'grid_bounds': (0, 100)})
)

## Authors graph

In [None]:
import networkx as nx
import itertools


def update_edge(graph, a1, a2, name, value):
    """Add ``value`` to the attribute ``name`` of the edge between ``a1`` and ``a2``.

    Self-loops are ignored. The edge is created when it does not exist yet, and a
    missing attribute is treated as 0.

    :param graph: graph providing ``has_edge``, ``add_edge`` and ``graph[u][v]`` access
    :param a1: first endpoint
    :param a2: second endpoint
    :param name: edge attribute to accumulate into
    :param value: amount added to the attribute
    """
    if a1 == a2:
        return
    # Address the edge with its endpoints in ascending order.
    first, second = (a2, a1) if a1 > a2 else (a1, a2)
    if not graph.has_edge(first, second):
        graph.add_edge(first, second)
    attributes = graph[first][second]
    attributes[name] = attributes.get(name, 0) + value


def build_authors_graph(df, texts_similarity, citations_graph, cocit_grouped_df, bibliographic_coupling_df):
    """Build an undirected graph of authors with accumulated relation weights.

    Nodes are the author names obtained by splitting the ``authors`` column of
    ``df`` on ``', '``. Edge attributes are accumulated with ``update_edge``:

    * ``authorship`` - +1 for every pair of authors of the same paper;
    * ``cocitation`` - ``total`` of each ``cocit_grouped_df`` row, for every pair of
      authors across the papers ``cited_1`` and ``cited_2``;
    * ``bibcoupling`` - ``total`` of each ``bibliographic_coupling_df`` row, for every
      pair of authors across the papers ``citing_1`` and ``citing_2``;
    * ``text`` - similarity of each ``(similarity, j)`` item taken from
      ``texts_similarity[i]``, where ``i`` and ``j`` are row positions in ``df``;
    * ``citation`` - +1 for every edge of ``citations_graph``.

    Paper ids that are not present in ``df`` contribute no edges.

    NOTE: the ``texts_similarity`` queues are consumed with ``get()``, so they are
    left empty after the call.

    :param df: papers dataframe with ``id`` and ``authors`` columns
    :param texts_similarity: per-paper queues of ``(similarity, index)`` items
    :param citations_graph: graph whose nodes are paper ids
    :param cocit_grouped_df: dataframe with ``cited_1``, ``cited_2``, ``total`` columns
    :param bibliographic_coupling_df: dataframe with ``citing_1``, ``citing_2``, ``total`` columns
    :return: ``nx.Graph`` of authors
    """
    result = nx.Graph()

    # Split every author list once instead of filtering the dataframe for each
    # relation. The first occurrence of an id wins, like a `.values[0]` lookup.
    authors_by_id = {}
    for pid, names in zip(df['id'], df['authors']):
        authors_by_id.setdefault(pid, names.split(', '))

    def link_papers(pid1, pid2, name, value):
        # Accumulate `value` on every author pair across the two papers.
        authors1 = authors_by_id.get(pid1, ())
        authors2 = authors_by_id.get(pid2, ())
        for a1, a2 in itertools.product(authors1, authors2):
            update_edge(result, a1, a2, name, value)

    logging.info('Processing papers')
    for names in df['authors']:
        for a1, a2 in itertools.combinations(names.split(', '), 2):
            update_edge(result, a1, a2, 'authorship', 1)

    logging.info('Processing co-citations')
    # Ids are converted to str before the lookup (presumably df['id'] holds strings).
    for el in cocit_grouped_df[['cited_1', 'cited_2', 'total']].values:
        link_papers(str(el[0]), str(el[1]), 'cocitation', float(el[2]))

    logging.info('Bibliographic coupling')
    if len(bibliographic_coupling_df) > 0:
        for el in bibliographic_coupling_df[['citing_1', 'citing_2', 'total']].values:
            link_papers(str(el[0]), str(el[1]), 'bibcoupling', float(el[2]))

    logging.info('Text similarity')
    pids = list(df['id'])
    if len(df) >= 2:
        for i, pid1 in enumerate(pids):
            similarity_queue = texts_similarity[i]
            while not similarity_queue.empty():
                similarity, j = similarity_queue.get()
                link_papers(pid1, pids[j], 'text', similarity)

    logging.info('Citations')
    for u, v in citations_graph.edges:
        link_papers(u, v, 'citation', 1)

    return result

In [None]:
# Build the authors graph from the analyzer's results: papers, text similarities,
# citations graph, co-citations and bibliographic coupling.
authors_graph = build_authors_graph(
    analyzer.df, analyzer.texts_similarity, analyzer.citations_graph, 
    analyzer.cocit_grouped_df, analyzer.bibliographic_coupling_df
)
logging.info(f'Built authors graph - {len(authors_graph.nodes())} nodes and {len(authors_graph.edges())} edges')

In [None]:
logging.info('Compute author citations')
# author name -> sum of the `total` column over all papers listing the author
author_citations = {}
for _, paper in analyzer.df[['authors', 'total']].iterrows():
    for author in paper['authors'].split(', '):
        author_citations[author] = author_citations.get(author, 0) + paper['total']

logging.info('Compute number of papers per author')
# author name -> titles of the papers listing the author, in dataframe order
author_papers = {}
for _, paper in analyzer.df[['title', 'authors']].iterrows():
    for author in paper['authors'].split(', '):
        author_papers.setdefault(author, []).append(paper['title'])

In [None]:
import community

connected_components = nx.number_connected_components(authors_graph)
logging.info(f'Authors graph has {connected_components} connected components')

logging.info('Compute aggregated similarity')
# Collapse the per-relation edge attributes into one 'similarity' weight.
# Co-authorship uses a hard-coded weight of 100; the other relations use the
# PapersAnalyzer constants. Missing attributes count as 0.
# NOTE(review): the text attribute is weighted with SIMILARITY_TEXT_CITATION - confirm
# this is the intended constant.
for _, _, d in authors_graph.edges(data=True):
    d['similarity'] = \
        100 * d.get('authorship', 0) + \
        PapersAnalyzer.SIMILARITY_COCITATION * d.get('cocitation', 0) + \
        PapersAnalyzer.SIMILARITY_BIBLIOGRAPHIC_COUPLING * d.get('bibcoupling', 0) + \
        PapersAnalyzer.SIMILARITY_CITATION * d.get('citation', 0) + \
        PapersAnalyzer.SIMILARITY_TEXT_CITATION * d.get('text', 0)

logging.info('Graph clustering via Louvain community algorithm')
# Partition maps every author to a cluster id; the edge 'similarity' is the weight.
partition_louvain = community.best_partition(
    authors_graph, weight='similarity', random_state=42
)
logging.info(f'Best partition {len(set(partition_louvain.values()))} components')
# Number of authors in every cluster.
components = set(partition_louvain.values())
comp_sizes = {c: sum([partition_louvain[node] == c for node in partition_louvain.keys()]) for c in components}
logging.info(f'Components: {comp_sizes}')

In [None]:
# Number of most cited authors kept from every cluster.
TOP_CITED_AUTHORS_PER_COMP = 10
top_cited_authors = set()
for group in sorted(set(partition_louvain.values())):
    members = [author for author, cluster in partition_louvain.items() if cluster == group]
    # Most cited first; ties keep the partition order.
    ranked = sorted(members, key=author_citations.__getitem__, reverse=True)
    top = ranked[:TOP_CITED_AUTHORS_PER_COMP]
    top_cited_authors.update(top)
    print(f'#{group} {", ".join(top)}')

In [None]:
# Graph restricted to top cited authors: keep an edge, with all of its
# attributes, only when both endpoints are top cited authors.
filtered_authors_graph = nx.Graph()
filtered_authors_graph.add_edges_from(
    (a1, a2, d)
    for a1, a2, d in authors_graph.edges(data=True)
    if a1 in top_cited_authors and a2 in top_cited_authors
)
logging.info(f'Built top authors graph - '
             f'{len(filtered_authors_graph.nodes())} nodes and {len(filtered_authors_graph.edges())} edges')

In [None]:
from bokeh.plotting import figure
from bokeh.models import GraphRenderer, StaticLayoutProvider, Circle, HoverTool, MultiLine
from bokeh.models.graphs import NodesAndLinkedEdges

from pysrc.papers.analysis.graph import local_sparse

# Graph to draw: the top authors graph passed through local_sparse with e=0.8
# (presumably removes weak edges - confirm against pysrc.papers.analysis.graph).
g = local_sparse(filtered_authors_graph, e=0.8)

# Node positions from the networkx spring layout.
# NOTE(review): no seed is passed, so the layout differs between runs.
pos = nx.spring_layout(g)
# nx.draw_networkx(g, pos)
# labels = nx.get_edge_attributes(g, 'similarity')
# nx.draw_networkx_edge_labels(g, pos, edge_labels=labels)

nodes = [a for a, _ in pos.items()]
graph = GraphRenderer()

# Cluster id of every plotted node and one color per cluster.
# NOTE(review): the colormap is sized by the clusters present among the plotted
# nodes, while the color list ranges over all clusters of the partition; zip()
# stops at the shorter of the two sequences.
clusters = [partition_louvain[n] for n in nodes]
cmap = Plotter.factors_colormap(len(set(clusters)))
palette = dict(zip(set(clusters), [Plotter.color_to_rgb(cmap(i)).to_hex() 
                                   for i in range(len(set(partition_louvain.values())))]))

# Node data columns referenced by the glyphs and by the hover tooltip.
graph.node_renderer.data_source.add(nodes, 'index')
graph.node_renderer.data_source.data['id'] = nodes
graph.node_renderer.data_source.data['cited'] = [author_citations[n] for n in nodes]
graph.node_renderer.data_source.data['papers'] = [len(author_papers[n]) for n in nodes]
graph.node_renderer.data_source.data['titles'] = ['\n'.join(author_papers[n]) for n in nodes]
# Node size grows with both the number of papers and the number of citations (log scale).
graph.node_renderer.data_source.data['size'] = [np.log1p(len(author_papers[n])) * np.log1p(author_citations[n]) 
                                                for n in nodes]
graph.node_renderer.data_source.data['cluster'] = clusters
graph.node_renderer.data_source.data['color'] = [palette[partition_louvain[n]] for n in nodes]
graph.edge_renderer.data_source.data = dict(start=[a for a, _ in g.edges], 
                                            end=[a for _, a in g.edges])

# Layout: the figure ranges are fitted to the extent of the node positions.
x = [v[0] for _, v in pos.items()]
y = [v[1] for _, v in pos.items()]
plot = figure(title="Authors plot",
              width=900,
              height=800,
              x_range=(min(x), max(x)), y_range=(min(y), max(y)),
              tools="pan,tap,wheel_zoom,box_zoom,reset,save")

# HTML template of the hover tooltip; @name refers to a node data column.
TOOLTIPS = """
    <div style="max-width: 320px">
        <div>
            <span style="font-size: 12px; font-weight: bold;">@id</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">Papers</span>
            <span style="font-size: 10px;">@papers</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">Cited</span>
            <span style="font-size: 10px;">@cited</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">Cluster</span>
            <span style="font-size: 10px;">@cluster</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">Titles</span>
            <span style="font-size: 10px;">@titles</span>
        </div>
    </div>
"""

plot.add_tools(HoverTool(tooltips=TOOLTIPS))


# Fixed node coordinates taken from the spring layout.
graph_layout = dict(zip(nodes, zip(x, y)))
graph.layout_provider = StaticLayoutProvider(graph_layout=graph_layout)

# Nodes are colored by cluster and turn green when hovered.
graph.node_renderer.glyph = Circle(size='size', fill_color='color')
graph.node_renderer.hover_glyph = Circle(size='size', fill_color='green')

# Edges are faint by default and highlighted when hovered.
graph.edge_renderer.glyph = MultiLine(line_color='grey', line_alpha=0.1, line_width=1)
graph.edge_renderer.hover_glyph = MultiLine(line_color='green', line_alpha=1.0, line_width=2)

# Inspecting a node also inspects the edges linked to it.
graph.inspection_policy = NodesAndLinkedEdges()

plot.renderers.append(graph)
plot.min_border_left = 75
show(plot)

## Topic Evolution

In [None]:
# topic_evolution() returns two values: the first is shown as a figure, the second is printed.
# NOTE(review): presumably requires config.feature_evolution_enabled, which the setup cell sets - confirm.
evolution_data, keywords_data = plotter.topic_evolution()
show(evolution_data)
print(keywords_data)

## PageRank for Citation Analysis

In [None]:
import networkx as nx

# Apply the PageRank algorithm to the citations graph with a damping factor of 0.5
# and a convergence tolerance of 1e-9; the result maps every node to its score.
pr_nx = nx.pagerank(analyzer.citations_graph, alpha=0.5, tol=1e-9)

In [None]:
# Every node starts with the placeholder (0, 0): no ancestor, zero PageRank.
ancestor = {node: (0, 0) for node in analyzer.citations_graph}

# For every node u keep the neighbour v (one with u in its adjacency) that has
# the highest PageRank, stored as (v, PageRank of v).
for v in analyzer.citations_graph:
    candidate_rank = pr_nx[v]
    for u in analyzer.citations_graph[v]:
        if candidate_rank > ancestor[u][1]:
            ancestor[u] = (v, candidate_rank)

In [None]:
# Directed graph with one edge ancestor -> node for every node that received
# an ancestor with a positive PageRank.
PRG = nx.DiGraph()
PRG.add_edges_from(
    (best_ancestor, node)
    for node, (best_ancestor, best_rank) in ancestor.items()
    if best_rank > 0
)

In [None]:
# Split the PRG edges into parallel tuples of sources and targets; they are only
# referenced by the commented-out edge renderer in the plotting cell below.
# NOTE(review): the unpacking fails when PRG has no edges.
start, end = zip(*list(PRG.edges()))

In [None]:
from bokeh.plotting import figure
from bokeh.models import GraphRenderer, StaticLayoutProvider, Circle, HoverTool, MultiLine
from bokeh.models.graphs import NodesAndLinkedEdges

# Look up year and title once per paper id instead of filtering the dataframe
# for every node. The first occurrence of an id wins, like a `.values[0]` lookup.
paper_info = {}
for pid, year, title in zip(analyzer.df['id'], analyzer.df['year'], analyzer.df['title']):
    paper_info.setdefault(pid, (year, title))

# Only nodes that are present in the dataframe can be plotted.
node_indices = [node for node in PRG.nodes() if node in paper_info]

years = []
year_counts = {}
titles = []
pageranks = []
size = []
y = []
for node in node_indices:
    year, title = paper_info[node]
    year_counts[year] = year_counts.get(year, 0) + 1
    years.append(year)
    titles.append(title)
    # PageRank is scaled by 100 for the tooltip and by 1000 for the node size.
    pageranks.append(pr_nx[node] * 100)
    size.append(pr_nx[node] * 1000)
    # Papers of the same year are stacked vertically: the k-th paper gets y = k.
    y.append(year_counts[year])
max_year_count = max(year_counts.values())
min_year, max_year = min(years), max(years)

plot = figure(title="PageRank applied to citation filtering", 
              x_range=(min_year - 1, max_year+1), y_range=(0, max_year_count + 1),
              tools="", toolbar_location=None)

# HTML template of the hover tooltip; @name refers to a node data column.
TOOLTIPS = """
    <div style="max-width: 320px">
        <div>
            <span style="font-size: 12px; font-weight: bold;">@title</span>
        </div>
        <div>
            <span style="font-size: 11px;">Year</span>
            <span style="font-size: 10px;">@year</span>
        </div>
        <div>
            <span style="font-size: 11px;">PMID</span>
            <span style="font-size: 10px;">@id</span>
        </div>
        <div>
            <span style="font-size: 11px;">PageRank</span>
            <span style="font-size: 10px;">@pagerank</span>
        </div>
    </div>
"""

plot.add_tools(HoverTool(tooltips=TOOLTIPS))

graph = GraphRenderer()

graph.node_renderer.data_source.add(node_indices, 'index')
graph.node_renderer.data_source.data['id'] = node_indices
graph.node_renderer.data_source.data['year'] = years
graph.node_renderer.data_source.data['title'] = titles
graph.node_renderer.data_source.data['pagerank'] = pageranks
graph.node_renderer.data_source.data['size'] = size
# graph.edge_renderer.data_source.data = dict(start=start, end=end)

# Layout: x is the publication year, y the position within the year's stack.
x = list(years)
graph_layout = dict(zip(node_indices, zip(x, y)))
graph.layout_provider = StaticLayoutProvider(graph_layout=graph_layout)

graph.node_renderer.glyph = Circle(size='size', fill_color='blue')
graph.node_renderer.hover_glyph = Circle(size='size', fill_color='green')

# graph.edge_renderer.glyph = MultiLine(line_color='black', line_alpha=1, line_width=1)
# graph.edge_renderer.hover_glyph = MultiLine(line_color='green', line_width=2)

graph.inspection_policy = NodesAndLinkedEdges()

plot.min_border_left = 75
plot.renderers.append(graph)

show(plot)

### Top Papers by PageRank

In [None]:
# id -> title, built once; the first occurrence of an id wins.
title_by_id = {}
for pid, title in zip(analyzer.df['id'], analyzer.df['title']):
    title_by_id.setdefault(pid, title)

# Ten highest PageRank papers that have a row in analyzer.df. Graph nodes that
# are missing from the dataframe are skipped instead of raising an IndexError,
# consistent with the filtering done by the other PageRank cells.
ranked_by_pagerank = sorted(pr_nx.items(), key=lambda el: el[1], reverse=True)
top_ranked = [(pmid, pagerank) for pmid, pagerank in ranked_by_pagerank if pmid in title_by_id][:10]
for pmid, pagerank in top_ranked:
    print(f"{(100*pagerank):.2f} {title_by_id[pmid]}")

### PageRank and citation ranking correlation

In [None]:
import numpy as np
from scipy.stats import spearmanr

# Rank papers by total citations (1 = most cited); ties are broken by row order.
# NOTE: this adds a 'citation_rank' column to analyzer.df.
analyzer.df['citation_rank'] = analyzer.df['total'].rank(method='first', ascending=False)
pagerank_rank = sorted(pr_nx.items(), key=lambda el: el[1], reverse=True)

# id -> citation rank, built once; the first occurrence of an id wins.
citation_rank_by_id = {}
for pid, rank in zip(analyzer.df['id'], analyzer.df['citation_rank']):
    citation_rank_by_id.setdefault(pid, rank)

# Rows of (position in the PageRank ordering, citation rank). Papers missing
# from analyzer.df get no row at all: leaving them as (0, 0) placeholders
# would feed fake observations into the correlation.
r = np.array(
    [(i, int(citation_rank_by_id[pmid]))
     for i, (pmid, _) in enumerate(pagerank_rank)
     if pmid in citation_rank_by_id],
    dtype=float,
).reshape(-1, 2)

TOP_X = [5, 10, 30, 50, 100]
for x in TOP_X:
    # Papers among the first x positions of the PageRank ordering.
    top = r[r[:, 0] < x]
    rho, _ = spearmanr(top[:, 0], top[:, 1])
    print(f'Spearman correlation coefficient for top {x}: {rho}')

## Hub nodes

In [None]:
# For every node of the similarity graph, count its neighbours per component
# (adj) and sum the similarity of the connecting edges per component (w).

import numpy as np

# id -> component, built once. Filtering the dataframe for every neighbour
# is what made the original version of this cell very slow.
# The first occurrence of an id wins, like a `.values[0]` lookup.
comp_by_id = {}
for pid, comp in zip(analyzer.df['id'], analyzer.df['comp']):
    comp_by_id.setdefault(pid, comp)

# One row per graph node, one column per component; component values are used
# directly as column indices (assumes they are 0-based integers - TODO confirm).
adj = np.zeros((analyzer.similarity_graph.number_of_nodes(), analyzer.df['comp'].nunique()))
w = np.zeros(adj.shape)

for i, v in enumerate(analyzer.similarity_graph.nodes()):
    for u in analyzer.similarity_graph[v]:
        c = comp_by_id[u]
        adj[i][c] += 1
        w[i][c] += analyzer.similarity_graph[v][u]['similarity']

In [None]:
# Number of hub nodes to report.
size = 10

# Score every node by the number of components it has at least one neighbour in
# (non-zero columns of adj). argsort is ascending, so the last `size` entries
# are the nodes linked to the most components.
hub_indices = np.argsort(np.sum(adj > 0, axis=1))[-size:]

# Map row positions back to node ids; rows of adj follow the graph's node order.
nodes_list = list(analyzer.similarity_graph.nodes)
hub_pmids = [nodes_list[idx] for idx in hub_indices]

In [None]:
# Print id and title of the hub papers, in dataframe order.
print('Hub nodes')
print(analyzer.df[analyzer.df['id'].isin(hub_pmids)][['id', 'title']])