# Pubtrends

This Jupyter Notebook can be used to perform basic publication analysis.

**IMPORTANT:** 
Turn on the experimental features in the config file before running this notebook!

## Getting Started

1. Set the `SEARCH_QUERY` variable in the cell below to keywords that describe the field of science you are interested in.
2. Run all cells & see the results.

In [None]:
# Keywords describing the research area; passed to the paper search and to the analysis below.
SEARCH_QUERY = 'human aging'

## Publication Analysis

In [None]:
import logging
import json


from bokeh.plotting import show, output_notebook
from matplotlib import pyplot as plt


from pysrc.papers.config import PubtrendsConfig
from pysrc.papers.db.pm_postgres_loader import PubmedPostgresLoader
from pysrc.papers.db.ss_postgres_loader import SemanticScholarPostgresLoader
from pysrc.papers.analyzer import PapersAnalyzer
from pysrc.papers.plot.plotter import Plotter
from pysrc.papers.utils import SORT_MOST_CITED, SORT_MOST_RECENT, cut_authors_list

# Emit INFO-level log records prefixed with a timestamp and the level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
# Direct Bokeh's show() output into the notebook.
output_notebook()
# IPython magic (not plain Python): draw matplotlib figures inline in the notebook.
%matplotlib inline

In [None]:
# Ordering passed to the search as `sort` (SORT_MOST_RECENT is the other imported option).
SEARCH_SORT = SORT_MOST_CITED
# Passed to the search as `limit`.
SEARCH_PAPERS = 1000

In [None]:
# Non-test configuration with `feature_evolution_enabled` switched on
# (presumably required by the "Topic Evolution" section - verify against PubtrendsConfig).
config = PubtrendsConfig(test=False)
config.feature_evolution_enabled = True
loader = PubmedPostgresLoader(config)
analyzer = PapersAnalyzer(loader, config)
try:
    # Search for paper ids matching the query, then run the analysis on them.
    ids = analyzer.search_terms(SEARCH_QUERY, limit=SEARCH_PAPERS, sort=SEARCH_SORT)
    analyzer.analyze_papers(ids, SEARCH_QUERY)
finally:
    # Always release the loader connection and tear the analyzer down, even if the analysis fails.
    # NOTE(review): if close_connection() raises, teardown() is skipped.
    loader.close_connection()
    analyzer.teardown()

# Report plots

In [None]:
# Plotter built on top of the analyzer; used by the plotting cells that follow.
plotter = Plotter(analyzer)

In [None]:
# Render the figure returned by plotter.papers_by_year().
show(plotter.papers_by_year())

In [None]:
from pysrc.papers.analysis.text import get_frequent_tokens, get_topic_word_cloud_data

# Frequent tokens are computed from the top cited papers and the search query.
freq_kwds = get_frequent_tokens(analyzer.top_cited_df, query=analyzer.query)
# Only the first element of the returned pair (the word cloud image) is used here.
wc, _ = plotter.papers_word_cloud_and_callback(freq_kwds)
# Draw the word cloud as an image without axes.
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

# Trends

In [None]:
# Render the figure returned by plotter.top_cited_papers().
show(plotter.top_cited_papers())

In [None]:
# Render the figure returned by plotter.most_cited_per_year_papers().
show(plotter.most_cited_per_year_papers())

In [None]:
# Render the figure returned by plotter.fastest_growth_per_year_papers().
show(plotter.fastest_growth_per_year_papers())

# Single-paper citation dynamics

In [None]:
# Citations per year for one paper: here the first id in analyzer.df; pass another id to inspect a different paper.
show(plotter.paper_citations_per_year(analyzer.df, analyzer.df['id'].values[0]))

## Topics a.k.a. Clusters in the Co-citation Graph

In [None]:
# Render the figure returned by plotter.topic_years_distribution().
show(plotter.topic_years_distribution())

In [None]:
# Render the figure returned by plotter.heatmap_topics_similarity().
show(plotter.heatmap_topics_similarity())

In [None]:
# Render the figure returned by plotter.topics_hierarchy().
show(plotter.topics_hierarchy())

In [None]:
# First cluster papers: show element [0][0] of the structure returned by
# topics_info_and_word_cloud_and_callback() (presumably one entry per topic - verify against Plotter).
show(plotter.topics_info_and_word_cloud_and_callback()[0][0])

# Other Features

## Frequent keywords timeline

In [None]:
from pysrc.papers.analysis.text import get_frequent_tokens
# Recompute the frequent tokens here so this cell does not depend on the word cloud cell having been run.
freq_kwds = get_frequent_tokens(analyzer.top_cited_df, query=analyzer.query)
# Render the figure returned by plotter.plot_keywords_frequencies() for those tokens.
show(plotter.plot_keywords_frequencies(freq_kwds))

## Authors graph

In [None]:
# numpy is needed here; the notebook previously imported it only in a much later cell,
# so running the cells in order failed with NameError.
import numpy as np

from pysrc.papers.analysis.metadata import popular_authors, popular_journals, build_authors_similarity_graph, \
    compute_authors_citations_and_papers, cluster_authors

logging.info("Analyzing groups of similar authors")
authors_citations, authors_papers = compute_authors_citations_and_papers(analyzer.df)
# Productivity score per author: log(1 + citations) multiplied by the author's value in
# authors_papers (presumably the paper count - verify). Authors missing from
# authors_citations are treated as having 1 citation.
author_productivity = {a: np.log1p(authors_citations.get(a, 1)) * p
                       for a, p in authors_papers.items()}
# 95th percentile of the scores; used as the inclusion threshold when building the authors graph.
min_author_productivity = np.percentile(list(author_productivity.values()), 95)
# Last expression of the cell: displayed as the cell output.
min_author_productivity

In [None]:
# Build the authors similarity graph, keeping only authors whose productivity score reaches
# the threshold. The function name matches the one imported from
# pysrc.papers.analysis.metadata (`build_authors_graph` was never imported).
authors_similarity_graph = build_authors_similarity_graph(
    analyzer.df, analyzer.texts_similarity, analyzer.citations_graph,
    analyzer.cocit_grouped_df, analyzer.bibliographic_coupling_df,
    lambda a: author_productivity[a] >= min_author_productivity
)
# Report the size of the graph that was just built (the previous code read an undefined `authors_graph`).
logging.info(f'Built authors graph - {len(authors_similarity_graph.nodes())} nodes '
             f'and {len(authors_similarity_graph.edges())} edges')

In [None]:
# Cluster the authors graph; the result is used below as a mapping author -> cluster id.
authors_clusters = cluster_authors(authors_similarity_graph, analyzer.similarity)

In [None]:
# Render the figure returned by plotter.authors_graph().
show(plotter.authors_graph())

In [None]:
from collections import defaultdict

# Maximum number of authors listed per cluster.
AUTHORS_PER_COMP = 20

# Bucket authors by cluster in one pass instead of rescanning all authors for every cluster.
authors_by_group = defaultdict(list)
for a, group in authors_clusters.items():
    authors_by_group[group].append(a)

group_authors = {}  # cluster id -> comma-separated names of its most productive authors
top_authors = set()  # union of the listed authors over all clusters
for group in sorted(authors_by_group):
    # Most productive authors first.
    authors = sorted(authors_by_group[group], key=lambda a: author_productivity[a], reverse=True)
    # Truncate with the constant defined above (the previous name, TOP_CITED_AUTHORS_PER_COMP, was undefined).
    top = authors[:AUTHORS_PER_COMP]
    top_authors.update(top)
    group_authors[group] = ", ".join(top)
    # The trailing ', ...' marks clusters whose author list was truncated.
    print(f'#{group} ({len(authors)}) {group_authors[group]}' + (', ...' if len(authors) > AUTHORS_PER_COMP else ''))

## Papers per group of authors

In [None]:
from collections import Counter

import numpy as np

# Number of authors in each cluster, used to normalise the votes below.
# NOTE(review): `part_sizes` was never defined anywhere in this notebook; author-cluster
# sizes are assumed to be the intended normalisation - confirm.
part_sizes = Counter(authors_clusters.values())

# paper_groups[i, g] accumulates the normalised votes of paper i's authors for cluster g.
# Cluster ids are used directly as column indices, i.e. they are expected to be 0..k-1.
paper_groups = np.zeros(shape=(len(analyzer.df), len(part_sizes)))
# Enumerate positionally so the row index always matches the positional indexing used
# for `groups_partition` below, whatever index analyzer.df carries.
for i, paper_authors in enumerate(analyzer.df['authors']):
    for a in paper_authors.split(', '):
        group = authors_clusters.get(a)
        if group is not None:
            paper_groups[i, group] += 1 / part_sizes[group]
# Each paper goes to the cluster with the highest vote; papers with no clustered author stay unassigned.
groups = np.argmax(paper_groups, axis=1)
papers_assigned = paper_groups.sum(axis=1) > 0
groups_partition = {pid: groups[i] for i, pid in enumerate(analyzer.df['id']) if papers_assigned[i]}

In [None]:
from collections import Counter

# Number of papers assigned to each author group, counted in a single pass.
groups_part_sizes = dict(Counter(groups_partition.values()))
logging.info(f'Components: {groups_part_sizes}')

In [None]:
# for pid, gp in groups_partition.items():
#     if groups_part_sizes[gp] < 10:
#         groups_partition[pid] = -1  # Join

In [None]:
# numpy and pandas are imported here because the notebook does not import them in any earlier cell.
import numpy as np
import pandas as pd

from pysrc.papers.analysis.topics import get_topics_description

# group id -> list of the paper ids assigned to that group.
groups_pids = pd.DataFrame(list(groups_partition.items()), columns=['id', 'comp']) \
    .groupby('comp')['id'].apply(list).to_dict()

# Positions of the papers that were assigned to a group; used to select matching rows
# from both the papers dataframe and the term-count matrix.
assigned_idx = np.flatnonzero(papers_assigned)
# NOTE(review): bare `corpus_terms` / `corpus_counts` were undefined in this notebook;
# they are assumed to be the analyzer's attributes of the same name - confirm against PapersAnalyzer.
groups_description = get_topics_description(
    analyzer.df.iloc[assigned_idx, :], groups_pids,
    analyzer.corpus_terms, analyzer.corpus_counts[assigned_idx, :],
    query=analyzer.query,
    n_words=analyzer.TOPIC_DESCRIPTION_WORDS
)

In [None]:
# pandas is imported here because the notebook does not import it in any earlier cell.
import pandas as pd

# One summary row per group that has both a listed-authors string and a description:
# group id, top authors, number of papers, and up to 10 description keywords.
group_rows = [
    (g, group_authors[g], len(pids), ', '.join(v[0] for v in groups_description[g][:10]))
    for g, pids in groups_pids.items()
    if g in group_authors and g in groups_description
]
# Build the frame in one go instead of appending rows one at a time.
groups_df = pd.DataFrame(group_rows, columns=['group', 'authors', 'papers', 'keywords'])

# `display` is provided by the IPython/Jupyter runtime.
display(groups_df)

## Topic Evolution

In [None]:
# topic_evolution() returns a pair: the first element is shown as a figure, the second is printed.
evolution_data, keywords_data = plotter.topic_evolution()
show(evolution_data)
print(keywords_data)

## PageRank for Citation Analysis

In [None]:
import networkx as nx

# Apply PageRank algorithm with damping factor of 0.5
# (tol is the convergence tolerance); the result maps each graph node to its PageRank score.
pr_nx = nx.pagerank(analyzer.citations_graph, alpha=0.5, tol=1e-9)

In [None]:
# For every node, the best "ancestor" found so far as a (node, PageRank) pair;
# (0, 0) is the initial "nothing found yet" value.
ancestor = {node: (0, 0) for node in analyzer.citations_graph}

# Each node v is offered as ancestor to all of its neighbours u in the graph;
# u keeps the candidate with the highest PageRank.
for v in analyzer.citations_graph:
    v_rank = pr_nx[v]
    for u in analyzer.citations_graph[v]:
        if v_rank > ancestor[u][1]:
            ancestor[u] = (v, v_rank)

In [None]:
# Directed graph with one edge "best ancestor -> node" for every node that received
# an ancestor with a positive PageRank.
PRG = nx.DiGraph()
PRG.add_edges_from((u, v) for v, (u, pr) in ancestor.items() if pr > 0)

In [None]:
# Edge endpoints as two parallel tuples (sources, targets). A graph without edges gives
# two empty tuples; unpacking zip() of nothing would raise ValueError.
start, end = zip(*PRG.edges()) if PRG.number_of_edges() > 0 else ((), ())

In [None]:
from bokeh.plotting import figure
from bokeh.models import GraphRenderer, StaticLayoutProvider, Circle, HoverTool, MultiLine
from bokeh.models.graphs import NodesAndLinkedEdges

# Only nodes that have a row in analyzer.df can be drawn: year and title are read from it.
# A set gives constant-time membership tests instead of one dataframe scan per node.
known_ids = set(analyzer.df['id'])
node_indices = [node for node in PRG.nodes() if node in known_ids]

# A single pass over the nodes collects the hover data and the layout coordinates together:
# x is the publication year, y is the running number of papers already placed in that year,
# so papers of the same year are stacked vertically.
years = []
year_counts = {}
titles = []
pageranks = []
size = []
y = []
for node in node_indices:
    sel = analyzer.df[analyzer.df['id'] == node]
    year = sel['year'].values[0]

    year_counts[year] = year_counts.get(year, 0) + 1
    years.append(year)
    y.append(year_counts[year])

    titles.append(sel['title'].values[0])
    # PageRank is scaled by 100 for the tooltip and by 1000 for the glyph size.
    pageranks.append(pr_nx[node] * 100)
    size.append(pr_nx[node] * 1000)
x = list(years)
max_year_count = max(year_counts.values())
min_year, max_year = min(years), max(years)

# Axis ranges leave a one-unit margin around the data.
plot = figure(title="PageRank applied to citation filtering",
              x_range=(min_year - 1, max_year+1), y_range=(0, max_year_count + 1),
              tools="", toolbar_location=None)

# HTML hover template; the @fields refer to the node data source columns filled in below.
TOOLTIPS = """
    <div style="max-width: 320px">
        <div>
            <span style="font-size: 12px; font-weight: bold;">@title</span>
        </div>
        <div>
            <span style="font-size: 11px;">Year</span>
            <span style="font-size: 10px;">@year</span>
        </div>
        <div>
            <span style="font-size: 11px;">PMID</span>
            <span style="font-size: 10px;">@id</span>
        </div>
        <div>
            <span style="font-size: 11px;">PageRank</span>
            <span style="font-size: 10px;">@pagerank</span>
        </div>
    </div>
"""

plot.add_tools(HoverTool(tooltips=TOOLTIPS))

graph = GraphRenderer()

graph.node_renderer.data_source.add(node_indices, 'index')
graph.node_renderer.data_source.data['id'] = node_indices
graph.node_renderer.data_source.data['year'] = years
graph.node_renderer.data_source.data['title'] = titles
graph.node_renderer.data_source.data['pagerank'] = pageranks
graph.node_renderer.data_source.data['size'] = size
# Edges are intentionally not drawn; uncomment to render them.
# graph.edge_renderer.data_source.data = dict(start=start, end=end)

# Fixed layout: every node is placed at its (year, position within year) coordinates.
graph_layout = dict(zip(node_indices, zip(x, y)))
graph.layout_provider = StaticLayoutProvider(graph_layout=graph_layout)

graph.node_renderer.glyph = Circle(size='size', fill_color='blue')
graph.node_renderer.hover_glyph = Circle(size='size', fill_color='green')

# graph.edge_renderer.glyph = MultiLine(line_color='black', line_alpha=1, line_width=1)
# graph.edge_renderer.hover_glyph = MultiLine(line_color='green', line_width=2)

graph.inspection_policy = NodesAndLinkedEdges()

plot.min_border_left = 75
plot.renderers.append(graph)

show(plot)

### Top Papers by PageRank

In [None]:
# id -> title lookup built once; for a duplicated id the first row wins, as with `.values[0]`.
known_papers = analyzer.df.drop_duplicates('id')
titles_by_id = dict(zip(known_papers['id'], known_papers['title']))

# Ten papers with the highest PageRank, printed as "<PageRank in percent> <title>".
for pmid, pagerank in sorted(pr_nx.items(), key=lambda el: el[1], reverse=True)[:10]:
    # Graph nodes may be missing from analyzer.df (other cells guard against this too);
    # print a placeholder for them instead of failing with IndexError.
    title = titles_by_id.get(pmid, f'<paper {pmid} is not in the loaded papers>')
    print(f"{(100*pagerank):.2f} {title}")

### PageRank and citation ranking correlation

In [None]:
import numpy as np
from scipy.stats import spearmanr

# Rank papers by total citations (1 = most cited); ties are broken by order of appearance.
analyzer.df['citation_rank'] = analyzer.df['total'].rank(method='first', ascending=False)
pagerank_rank = sorted(pr_nx.items(), key=lambda el: el[1], reverse=True)

# id -> citation rank lookup built once; for a duplicated id the first row wins.
ranked_papers = analyzer.df.drop_duplicates('id')
citation_rank_by_id = dict(zip(ranked_papers['id'], ranked_papers['citation_rank']))

# One (PageRank position, citation rank) row per paper that is present in analyzer.df.
# Graph nodes without a dataframe row are skipped; previously they stayed as (0, 0) rows
# and distorted the correlation.
r = np.array(
    [(i, int(citation_rank_by_id[pmid]))
     for i, (pmid, _) in enumerate(pagerank_rank) if pmid in citation_rank_by_id],
    dtype=float,
).reshape(-1, 2)

# Correlation between the two rankings over the top-x papers by PageRank.
TOP_X = [5, 10, 30, 50, 100]
for x in TOP_X:
    rho, _ = spearmanr(r[:x, 0], r[:x, 1])
    print(f'Spearman correlation coefficient for top {x}: {rho}')

## Hub nodes

In [None]:
# Very slow!

import numpy as np

# id -> component lookup built once; scanning the dataframe for every edge was what made
# this cell very slow. For a duplicated id the first row wins, as with `.values[0]`.
comp_papers = analyzer.df.drop_duplicates('id')
comp_by_id = dict(zip(comp_papers['id'], comp_papers['comp']))

# adj[i, c]: number of neighbours of node i that belong to component c.
# w[i, c]:   summed 'similarity' edge attribute over those neighbours.
# Component ids are used directly as column indices, i.e. they are expected to be 0..k-1.
adj = np.zeros((analyzer.similarity_graph.number_of_nodes(), analyzer.df['comp'].nunique()))
w = np.zeros(adj.shape)

for i, v in enumerate(analyzer.similarity_graph.nodes()):
    for u, edge in analyzer.similarity_graph[v].items():
        c = comp_by_id[u]
        adj[i, c] += 1
        w[i, c] += edge['similarity']

In [None]:
# Number of hub papers to report.
size = 10

# For every node, count the components (columns of adj) in which it has at least one
# neighbour, then keep the positions of the `size` nodes with the highest counts.
components_reached = (adj > 0).sum(axis=1)
hub_indices = np.argsort(components_reached)[-size:]

# Translate row positions back into graph node ids (rows of adj follow the graph's node order).
nodes_list = list(analyzer.similarity_graph.nodes)
hub_pmids = [nodes_list[position] for position in hub_indices]

In [None]:
# Print the id and title of every paper selected as a hub node.
print('Hub nodes')
print(analyzer.df[analyzer.df['id'].isin(hub_pmids)][['id', 'title']])