# Pubtrends

Experimental notebook for hypothesis testing and development purposes.

**IMPORTANT** 
Turn on experimental features in the config file!

## Getting Started

1. Set the `SEARCH_QUERY` variable in the cell below to a string of keywords describing the branch of science you are interested in.
2. Run all cells & see the results.

In [None]:
# Keywords used as the search query for the analysis below
SEARCH_QUERY = 'human aging'

## Publication Analysis

In [None]:
import logging
import json
import seaborn as sns
import numpy as np
import pandas as pd
from scipy import stats
from collections import Counter


from bokeh.plotting import figure
from bokeh.plotting import show, output_notebook
from matplotlib import pyplot as plt


from pysrc.papers.config import PubtrendsConfig
from pysrc.papers.db.pm_postgres_loader import PubmedPostgresLoader
from pysrc.papers.db.ss_postgres_loader import SemanticScholarPostgresLoader
from pysrc.papers.analyzer import PapersAnalyzer
from pysrc.papers.plot.plotter import Plotter
from pysrc.papers.utils import SORT_MOST_CITED, SORT_MOST_RECENT, cut_authors_list

# DEBUG level logging with timestamps, so that progress messages show up in the notebook output
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s: %(message)s')
logger = logging.getLogger('notebook')

# Configure bokeh to display figures inside the notebook
output_notebook()
%matplotlib inline

In [None]:
# Sort order and maximum number of papers passed to analyzer.search_terms below
SEARCH_SORT = SORT_MOST_CITED
SEARCH_PAPERS = 1000

In [None]:
# Create the configuration and switch on the `feature_evolution_enabled` flag
config = PubtrendsConfig(test=False)
config.feature_evolution_enabled = True
loader = PubmedPostgresLoader(config)
analyzer = PapersAnalyzer(loader, config)
try:
    # Search for papers matching the query, then analyze the found ids
    ids = analyzer.search_terms(SEARCH_QUERY, limit=SEARCH_PAPERS, sort=SEARCH_SORT)
    analyzer.analyze_papers(ids, SEARCH_QUERY)
finally:
    # Release the loader connection and the analyzer even if search / analysis failed
    loader.close_connection()
    analyzer.teardown()

# Report plots

In [None]:
# Plotter built on top of the analyzer; used by the plotting cells below
plotter = Plotter(analyzer)

In [None]:
# Display the figure returned by plotter.papers_by_year()
show(plotter.papers_by_year())

In [None]:
from pysrc.papers.analysis.text import get_frequent_tokens, get_topic_word_cloud_data

# Frequent tokens computed from analyzer.top_cited_df for the analyzed query
freq_kwds = get_frequent_tokens(analyzer.top_cited_df, query=analyzer.query)
# Only the first returned value (the word cloud) is used, the second one is ignored
wc, _ = plotter.papers_word_cloud_and_callback(freq_kwds)
# Draw the word cloud as an image without axes
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

# Trends

In [None]:
# show(plotter.top_cited_papers())

In [None]:
# Display the figure returned by plotter.most_cited_per_year_papers()
show(plotter.most_cited_per_year_papers())

In [None]:
# Display the figure returned by plotter.fastest_growth_per_year_papers()
show(plotter.fastest_growth_per_year_papers())

## Frequent keywords timeline

In [None]:
from pysrc.papers.analysis.text import get_frequent_tokens

# Frequent tokens from analyzer.top_cited_df; reused by both keyword timeline plots below
freq_kwds = get_frequent_tokens(analyzer.top_cited_df, query=analyzer.query)

In [None]:
# Non-smoothed keyword frequencies plot, for comparison with the smoothed version below
print('Original keywords frequencies')
show(plotter.plot_keywords_frequencies(freq_kwds))

In [None]:
from statsmodels.nonparametric.smoothers_lowess import lowess
from holoviews import opts
import holoviews as hv
hv.extension('bokeh')


from pysrc.papers.plot.plot_preprocessor import PlotPreprocessor

def plot_keywords_frequencies_smooth(freq_kwds, n=20):
    """
    Plot per-year keyword frequencies smoothed with LOWESS.

    Reads the notebook-level `analyzer` for the papers dataframe and corpus data.

    :param freq_kwds: frequent keywords, passed to PlotPreprocessor.frequent_keywords_data
    :param n: passed to PlotPreprocessor.frequent_keywords_data as the last argument
    :return: bokeh figure rendered from the holoviews overlay
    """
    keywords_df, years = PlotPreprocessor.frequent_keywords_data(
        freq_kwds, analyzer.df, analyzer.corpus_terms, analyzer.corpus_counts, n
    )

    logging.debug('Local weighted linear regression smoothing')

    def _smoothed(keyword):
        # Smooth the numbers of a single keyword along the years
        part = keywords_df.loc[keywords_df['keyword'] == keyword].copy()
        part['number'] = lowess(part['number'], part['year'], frac=0.2, return_sorted=False)
        return part

    keywords_df = pd.concat([_smoothed(keyword) for keyword in set(keywords_df['keyword'])])

    # Value dimension with some padding below and above the data
    max_numbers = keywords_df['number'].max()
    number_dim = hv.Dimension('number', range=(-10, max_numbers + 10))

    # One curve per keyword; extra space on the right is left for text labels
    dataset = hv.Dataset(keywords_df, vdims=number_dim)
    year_range = (min(years) - 1, max(years) + 5)
    curves = dataset.to(hv.Curve, 'year', groupby='keyword').overlay().redim(
        year=dict(range=year_range))

    # Text annotations are placed at the last year of the dataset
    last_year = dataset['year'].max()
    label_df = keywords_df[keywords_df.year == last_year].copy().reset_index(drop=True)

    # Spread labels evenly along the vertical axis keeping their relative order
    label_df = label_df.sort_values(by='number')
    n_labels = len(label_df)
    if n_labels > 1:
        label_df['number'] = [i * max_numbers / (n_labels - 1) for i in range(n_labels)]
    label_df = label_df.sort_values(by='keyword')
    labels = hv.Labels(label_df, ['year', 'number'], 'keyword')

    overlay = curves * labels

    cmap = Plotter.factors_colormap(n_labels)
    palette = [Plotter.color_to_rgb(cmap(i)).to_hex() for i in range(n_labels)]
    curve_opts = opts.Curve(
        show_frame=False, labelled=[], tools=['hover'],
        width=600, height=600, show_legend=False,
        xticks=list(reversed(range(max(years), min(years), -5))),
        color=hv.Cycle(values=palette), alpha=0.8, line_width=2, show_grid=True
    )
    label_opts = opts.Labels(text_color='keyword', cmap=palette, text_align='left')
    overlay_opts = opts.NdOverlay(
        batched=False,
        gridstyle={'grid_line_dash': [6, 4], 'grid_line_width': 1, 'grid_bounds': (0, 100)}
    )
    overlay.opts(curve_opts, label_opts, overlay_opts)

    fig = hv.render(overlay, backend='bokeh')
    fig.xaxis.axis_label = 'Year'
    fig.yaxis.axis_label = 'Number of papers'
    fig.sizing_mode = 'stretch_width'
    return fig


In [None]:
# Smoothed keyword frequencies using the function defined in the previous cell
print('Smooth version of keyword frequencies')
show(plot_keywords_frequencies_smooth(freq_kwds))

# Single paper citations dynamics

In [None]:
# Citations per year for the first paper of analyzer.df
show(plotter.paper_citations_per_year(analyzer.df, analyzer.df['id'].values[0]))

# Topics analysis

In [None]:
# First cluster papers
# show(plotter.topics_info_and_word_cloud_and_callback()[0][0])

In [None]:
# Display the figure returned by plotter.topic_years_distribution()
show(plotter.topic_years_distribution())

In [None]:
# Display the figure returned by plotter.structure_graph()
show(plotter.structure_graph())

In [None]:
# Display the figure returned by plotter.heatmap_topics_similarity()
show(plotter.heatmap_topics_similarity())

In [None]:
# Similarity data for the current partition of the similarity graph
similarity_df, topics = PlotPreprocessor.topics_similarity_data(
    analyzer.similarity_graph, analyzer.partition
)

# Mark rows where both components are the same as 'Inside', all the others as 'Outside'
similarity_df['type'] = ['Inside' if x == y else 'Outside' 
                         for (x, y) in zip(similarity_df['comp_x'], similarity_df['comp_y'])]
# Compare the density of similarities inside topics vs between topics
sns.displot(similarity_df, x="similarity", hue="type", kind="kde")
plt.show()

In [None]:
# Display the figure returned by plotter.topics_hierarchy()
print('Looks not useful at all for louvain clustering')
show(plotter.topics_hierarchy())

## Similarities analysis

We hope that the distribution of similarity edge weights illustrates that the majority of linked nodes are only insignificantly similar in terms of their attributes.

In [None]:
# One value per edge of the similarity graph for each kind of similarity
bibcoupling_array = np.zeros(len(analyzer.similarity_graph.edges))
cocitations_array = np.zeros(len(analyzer.similarity_graph.edges))
citations_array = np.zeros(len(analyzer.similarity_graph.edges))
similarities_array = np.zeros(len(analyzer.similarity_graph.edges))
text_similarities_array = np.zeros(len(analyzer.similarity_graph.edges))

# Missing edge attributes count as 0; bibcoupling and co-citations are stored as log(1 + x)
for i, (u, v, data) in enumerate(analyzer.similarity_graph.edges(data=True)):
    bibcoupling_array[i] = np.log1p(data.get('bibcoupling', 0))
    cocitations_array[i] = np.log1p(data.get('cocitation', 0))
    citations_array[i] = data.get('citation', 0)
    text_similarities_array[i] = data.get('text', 0)
    similarities_array[i] = PapersAnalyzer.similarity(data)
    
# Four density plots side by side; zero values are dropped for the first three
fig = plt.figure(figsize=(5 * 4, 5))
ax = plt.subplot(1, 4, 1)
print(f'Bibcoupling, non-zero {np.count_nonzero(bibcoupling_array)} of {len(bibcoupling_array)}')
bibcoupling_array = bibcoupling_array[np.nonzero(bibcoupling_array)]
print(stats.describe(bibcoupling_array))
sns.kdeplot(bibcoupling_array)
plt.title('Bibcoupling')
# plt.show()

ax = plt.subplot(1, 4, 2)
print(f'Co-citations, non-zero {np.count_nonzero(cocitations_array)} of {len(cocitations_array)}')
cocitations_array = cocitations_array[np.nonzero(cocitations_array)]
print(stats.describe(cocitations_array))
sns.kdeplot(cocitations_array)
plt.title('Co-citations')
# plt.show()

ax = plt.subplot(1, 4, 3)
print(f'Text similarities, non-zero {np.count_nonzero(text_similarities_array)} of {len(text_similarities_array)}')
text_similarities_array = text_similarities_array[np.nonzero(text_similarities_array)]
print(stats.describe(text_similarities_array))
sns.kdeplot(text_similarities_array)
plt.title('Text')
# plt.show

# The aggregated similarity is plotted with zero values included
ax = plt.subplot(1, 4, 4)
print(f'Similarities, non-zero {np.count_nonzero(similarities_array)} of {len(similarities_array)}')
print(stats.describe(similarities_array))
sns.kdeplot(similarities_array)
plt.title('Similarity')

plt.show()

# Direct citations are only counted, no density plot is drawn for them
print(f'Citations, non-zero {np.count_nonzero(citations_array)} of {len(citations_array)}')

### Additional text similarities exploration
We use cutoff = 0.1 as min text similarity, and limit those to 20 max.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarities between all rows of analyzer.corpus_counts
print('Analyze similarities between all papers')
cos_similarities = cosine_similarity(analyzer.corpus_counts)
# Flatten the full square matrix, i.e. every pair is taken into account
cos_similarities_array = cos_similarities.reshape(-1)
print(stats.describe(cos_similarities_array))
print('Q1', np.percentile(cos_similarities_array, 25), 
      'Q2', np.percentile(cos_similarities_array, 50), 
      'Q3', np.percentile(cos_similarities_array, 75))

fig = plt.figure(figsize=(5 * 2, 5))
ax = plt.subplot(1, 2, 1)
sns.kdeplot(cos_similarities_array)
plt.title('Cosine similarities among all papers')
# plt.show()

print('Analyze similarities between papers with direct citations')
# Paper id -> position in analyzer.df
# NOTE(review): assumes rows of corpus_counts follow the order of analyzer.df - confirm
pid_indx = {pid: i for i, pid in enumerate(analyzer.df['id'])}
cited_cos_similarities = []
# Keep cosine similarity only for edges with a non-zero 'citation' attribute
for i, (u, v, data) in enumerate(analyzer.similarity_graph.edges(data=True)):
    if data.get('citation', 0) != 0:
        cited_cos_similarities.append(cos_similarities[pid_indx[u], pid_indx[v]])

print(stats.describe(cited_cos_similarities))
print('Q1', np.percentile(cited_cos_similarities, 25), 
      'Q2', np.percentile(cited_cos_similarities, 50), 
      'Q3', np.percentile(cited_cos_similarities, 75))
ax = plt.subplot(1, 2, 2)
sns.kdeplot(cited_cos_similarities)
plt.title('Cosine similarity between cited papers')
          
plt.show()                                     

# Similarities graph embeddings with Node2Vec

In [None]:
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn import preprocessing

import numpy as np

from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph

from gensim.models import Word2Vec

import warnings
import collections
from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
import networkx as nx

def node2vec(graph, weight_func):
    """
    Compute node embeddings with weighted biased random walks followed by Word2Vec.

    :param graph: networkx graph, only its edges and their data are used
    :param weight_func: function computing the edge weight from the edge data dict
    :return: tuple of node ids and embedding vectors, both taken from the Word2Vec model
    """
    logger.debug('Translating nx graph into stellar graph representation')
    weighted_graph = nx.Graph()
    weighted_graph.add_weighted_edges_from(
        (u, v, weight_func(data)) for u, v, data in graph.edges(data=True)
    )

    stellar_graph = StellarGraph.from_networkx(
        weighted_graph, node_type_default="id", edge_type_default="similarity"
    )
    logger.debug(stellar_graph.info())

    logger.debug('Performing random walks')
    walk_length = 100  # maximum length of a random walk
    walker = BiasedRandomWalk(stellar_graph)
    walks = walker.run(
        nodes=stellar_graph.nodes(),  # every node is used as a root
        length=walk_length,
        n=10,  # number of random walks per root node
        p=0.5,  # Defines (unormalised) probability, 1/p, of returning to source node
        q=2.0,  # Defines (unormalised) probability, 1/q, for moving away from source node
        weighted=True,  # walks take edge weights into account
        seed=42,  # random seed fixed for reproducibility
    )
    logger.debug(f"Number of random walks: {len(walks)}")    

    logger.debug('Representation learning using word2vec')
    model = Word2Vec(walks, size=128, window=5, min_count=0, sg=1, workers=1, iter=1)

    # Node ids and embeddings come from the same model
    return model.wv.index2word, model.wv.vectors

In [None]:
# Embeddings of papers; edges are weighted with PapersAnalyzer.similarity of the edge data
node_ids, weighted_node_embeddings = node2vec(analyzer.similarity_graph, 
                                              weight_func=lambda d: PapersAnalyzer.similarity(d))

## Embeddings visualization

In [None]:
# Project node embeddings to 2 dimensions for visualization; random state is fixed
logger.debug('Apply t-SNE transformation on node embeddings')
tsne = TSNE(n_components=2, random_state=42)
weighted_node_embeddings_2d = tsne.fit_transform(weighted_node_embeddings)

In [None]:
# from umap import UMAP 

# logger.debug('Apply UMAP transformation on node embeddings')
# umap = UMAP(n_components=2, random_state=42)
# weighted_node_embeddings_2d = umap.fit_transform(weighted_node_embeddings)

In [None]:
# Build dataframe combining information about papers and projected coordinates
df = analyzer.df[['id', 'title', 'year', 'type', 'total', 'authors', 'journal', 'comp']].copy()
indx = [pid_indx[pid] for pid in node_ids]
df['d1'] = pd.Series(index=indx, data=weighted_node_embeddings_2d[:, 0])
df['d2'] = pd.Series(index=indx, data=weighted_node_embeddings_2d[:, 1])

In [None]:
from bokeh.models import ColumnDataSource, CustomJS
from bokeh.models import HoverTool


from pysrc.papers.utils import cut_authors_list


def plot_embeddings(df, clusters):
    """
    Show a scatter plot of papers in 'd1' / 'd2' coordinates colored by cluster.

    NOTE: df is modified in place - the 'size' column is added and the 'authors' column is
    replaced with the result of cut_authors_list. Tooltips are built with the notebook-level
    `plotter`.

    :param df: papers dataframe with 'd1', 'd2', 'total', 'authors' and tooltip columns
    :param clusters: cluster id for each row of df, used for coloring
    """
    cluster_ids = sorted(set(clusters))
    cmap = Plotter.factors_colormap(len(cluster_ids))
    palette = {c: Plotter.color_to_rgb(cmap(i)).to_hex() for i, c in enumerate(cluster_ids)}

    # Size is based on the citations number, at least 1
    df['size'] = 1 + 2 * np.log1p(df['total'])

    # Shorten authors
    df['authors'] = df['authors'].apply(cut_authors_list)

    source = ColumnDataSource(df)
    # Color of each point is defined by its cluster
    source.add([palette[c] for c in clusters], 'color')

    plot = figure(plot_width=600, plot_height=600,
                  tools="hover,pan,tap,wheel_zoom,box_zoom,reset,save")
    plot.sizing_mode = 'stretch_width'
    plot.xaxis.axis_label = 'd1'
    plot.yaxis.axis_label = 'd2'

    tooltip_fields = [
        ("Author(s)", '@authors'),
        ("Journal", '@journal'),
        ("Year", '@year'),
        ("Type", '@type'),
        ("Cited by", '@total paper(s) total'),
    ]
    plot.hover.tooltips = plotter._html_tooltips(tooltip_fields)

    plot.circle(x='d1', y='d2', fill_alpha=0.8, source=source, size='size',
                line_color='black', fill_color='color', legend_field='comp')
    plot.legend.location = None
    show(plot)

In [None]:
# Color points with the 'comp' column of analyzer.df
print('Plot default clusters in embeddings coordinates')
plot_embeddings(df, analyzer.df['comp'])

## Clustering of embeddings

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Min cluster size
TOPIC_MIN_SIZE = 20
# Max number of topics should be "deliverable"
TOPICS_MAX_NUMBER = 20

logger.debug('Looking for an appropriate number of clusters')
# Binary search on the number of clusters within (l, r)
r = TOPICS_MAX_NUMBER + 1
l = 1

while l < r - 2:
    n_clusters = int((l + r) / 2)
    logger.debug(f'l {l} r {r} n_clusters {n_clusters}')
    model = AgglomerativeClustering(n_clusters=n_clusters).fit(weighted_node_embeddings)
    clusters = model.labels_.astype(int)
    clusters_counter = Counter(clusters)
    # Size of the smallest cluster - the last item of most_common()
    min_size = clusters_counter.most_common()[-1][1]
    logger.debug(f'min_size {min_size}')
    if min_size < TOPIC_MIN_SIZE:
        # Smallest cluster is too small - try fewer clusters
        r = n_clusters + 1
    elif min_size > TOPIC_MIN_SIZE:
        # Smallest cluster is bigger than required - try more clusters
        l = n_clusters
    else:
        break

# NOTE(review): model / clusters hold the last attempt of the loop, which is not necessarily
# an attempt where min_size >= TOPIC_MIN_SIZE - confirm this is intended
logger.debug(f'Number of clusters = {n_clusters}')        
logger.debug('Reorder clusters by size descending')
clusters_reord = np.zeros(len(clusters), dtype=int)    
for i, (c, n) in enumerate(clusters_counter.most_common()):
    clusters_reord[clusters == c] = i
# NOTE(review): model.labels_ keeps the original numbering, while `clusters` is renumbered
# here; later cells use both (convert_clusters_paths(model) and Counter(clusters)) - verify
clusters = clusters_reord
    
print('Cluster sizes')
t = pd.DataFrame({'Cluster': clusters, 
                  'size': np.ones(len(clusters))}).groupby(['Cluster']).sum().astype(int).reset_index()    
display(t)
sns.barplot(data=t, x='Cluster', y='size')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
from pysrc.papers.analysis.topics import get_topics_description

print('Computing clusters keywords')
# Cluster id -> list of paper ids belonging to the cluster
clusters_pids = pd.DataFrame(dict(id=node_ids, comp=clusters)).groupby('comp')['id'].apply(list).to_dict()

# Describe the new clusters with the same corpus data and query as used by the analyzer
clusters_description = get_topics_description(
    analyzer.df, clusters_pids,
    analyzer.corpus_terms, analyzer.corpus_counts,
    query=analyzer.query,
    n_words=analyzer.TOPIC_DESCRIPTION_WORDS
)

### Clustering dendrogram

In [None]:
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    """
    Plot a dendrogram for a fitted hierarchical clustering model.

    Merge distances are not taken from the model: a uniformly growing sequence is used
    instead, so only the structure of the tree is meaningful on the plot.

    :param model: fitted clustering model, only its `children_` attribute is used
    :param kwargs: passed to scipy.cluster.hierarchy.dendrogram as is
    """
    merges = model.children_
    n_merges = merges.shape[0]

    # Linkage matrix columns: two children, artificial distance, artificial observations count
    linkage_matrix = np.column_stack(
        [merges, np.arange(n_merges), np.arange(2, n_merges + 2)]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)
    
# Dendrogram of the last fitted clustering model, truncated with truncate_mode='level' and p=10
plot_dendrogram(model, truncate_mode='level', p=10)

In [None]:
from more_itertools import unique_everseen

def compute_clusters_dendrogram_children(m):
    """
    Reduce the full merge tree of a hierarchical clustering to merges between clusters.

    Merges where both children belong to the same cluster are dropped, all the others are
    returned with children / parent replaced by a cluster label whenever a label is known
    for them, or left as the node index otherwise.

    :param m: fitted model, `labels_` (cluster per leaf) and `children_` (pairs of merged
              nodes, where merge number i creates node `len(labels_) + i`) are used
    :return: list of (child, child, parent) tuples
    """
    leaves_map = dict(enumerate(m.labels_))
    n_leaves = len(leaves_map)
    # Inner node -> cluster label, or None when no single label is known for the node
    nodes_map = {}
    children = []

    def cluster_of(node):
        if node in leaves_map:
            return leaves_map[node]
        return nodes_map.get(node)

    for i, (u, v) in enumerate(m.children_):
        u_cluster = cluster_of(u)
        v_cluster = cluster_of(v)
        node = n_leaves + i
        if u_cluster is not None and v_cluster is not None:
            if u_cluster == v_cluster:
                # Merge inside a single cluster is not a part of the clusters dendrogram
                nodes_map[node] = u_cluster
                continue
            nodes_map[node] = None
        else:
            # Explicit None check is required: cluster label 0 is a valid label, and
            # `u_cluster or v_cluster` would treat it as a missing one
            nodes_map[node] = u_cluster if u_cluster is not None else v_cluster
        children.append((u, v, node))

    def rwc(v):
        # Replace a node with its cluster label when the label is known
        if v in leaves_map:
            return leaves_map[v]
        res = nodes_map.get(v)
        return res if res is not None else v

    return [(rwc(u), rwc(v), rwc(n)) for u, v, n in children]

def convert_clusters_paths(m):
    """
    Convert the clusters dendrogram of a hierarchical clustering model into paths.

    Each path starts with a cluster label and gets one more element per merge returned by
    compute_clusters_dendrogram_children, i.e. the group the cluster belongs to afterwards.

    :param m: fitted clustering model with `labels_` and `children_`
    :return: tuple of paths and a dict mapping the first path element to its position
    """
    logger.debug('Computing dendrogram for clusters')
    merges = compute_clusters_dendrogram_children(m)

    logger.debug('Converting agglomerative clustering clusters dendrogram format to louvain')
    paths = [[label] for label in sorted(set(m.labels_))]
    for level, (left, right, parent) in enumerate(merges):
        for path in paths:
            current = path[level]
            merged = current == left or current == right
            path.append(parent if merged else current)

    def first_seen_order(values):
        # Value -> index of its first occurrence among distinct values
        order = {}
        for value in values:
            if value not in order:
                order[value] = len(order)
        return order

    logger.debug('Radix sort or paths to ensure no overlaps')
    for level in range(len(merges)):
        paths.sort(key=lambda path: path[level])
        # Renumber the next level to keep the order of the current one if possible,
        # the next level always exists since paths have len(merges) + 1 elements
        order = first_seen_order([path[level + 1] for path in paths])
        for path in paths:
            path[level + 1] = order[path[level + 1]]

    leaves_order = first_seen_order([path[0] for path in paths])
    return paths, leaves_order

In [None]:
# Paths and leaves order are computed for the last fitted clustering model
paths, leaves_order = convert_clusters_paths(model)

In [None]:
import math
from math import pi, sin, cos, fabs
from bokeh.colors import RGB

from pysrc.papers.utils import contrast_color

def topics_hierarchy_with_keywords(kwd_df, comp_sizes, paths, leaves_order):
    """
    Draw a circular dendrogram of topics with the most popular words next to each topic.

    :param kwd_df: keywords dataframe, passed to PlotPreprocessor.topics_words as is
    :param comp_sizes: mapping topic -> number of papers, defines leaf marker sizes and
                       the number of colors
    :param paths: one path per topic, element number i is the group of the topic at
                  level i of the dendrogram
    :param leaves_order: mapping topic -> position of the topic on the circle
    :return: bokeh figure
    """
    # Configure dimensions
    p = figure(x_range=[-190, 190],
               y_range=[-160, 160],
               tools="save",
               width=600, height=600)
    x_coefficient = 1.5  # Ellipse x coefficient
    y_delta = 60  # Extra space near pi / 2 and 3 * pi / 2
    n_topics = len(leaves_order)
    radius = 80  # Radius of circular dendrogram
    dendrogram_len = len(paths[0])
    d_radius = radius / (dendrogram_len + 2)
    d_degree = 2 * pi / n_topics
    char_delta = 2  # Multiplier to compute approximate width of text
    delta = 5  # Space between dendrogram and text
    max_words = min(5, max(1, int(120 / n_topics)))

    # Leaves coordinates
    leaves_degrees = dict((v, i * d_degree) for v, i in leaves_order.items())

    # Draw levels
    for i in range(2, dendrogram_len + 1):
        p.ellipse(0, 0, fill_alpha=0, line_color='lightgray', line_alpha=0.5,
                  width=2 * d_radius * i,
                  height=2 * d_radius * i,
                  line_dash='dotted')

    # Draw dendrogram - from bottom to top
    ds = leaves_degrees.copy()
    for i in range(1, dendrogram_len):
        # Angle of a group is the mean angle of its members on the previous level
        next_ds = {}
        for path in paths:
            if path[i] not in next_ds:
                next_ds[path[i]] = []
            next_ds[path[i]].append(ds[path[i - 1]])
        for group, nds in next_ds.items():
            next_ds[group] = np.mean(nds)

        for path in paths:
            current_d = ds[path[i - 1]]
            next_d = next_ds[path[i]]
            p.line([cos(current_d) * d_radius * (dendrogram_len + 2 - i),
                    cos(next_d) * d_radius * (dendrogram_len + 2 - i - 1)],
                   [sin(current_d) * d_radius * (dendrogram_len + 2 - i),
                    sin(next_d) * d_radius * (dendrogram_len + 2 - i - 1)],
                   line_color='lightgray')
        ds = next_ds

    # Draw leaves
    n_comps = len(comp_sizes)
    cmap = Plotter.factors_colormap(n_comps)
    topics_colors = dict((i, Plotter.color_to_rgb(cmap(i))) for i in range(n_comps))
    xs = [cos(d) * d_radius * (dendrogram_len + 1) for _, d in leaves_degrees.items()]
    ys = [sin(d) * d_radius * (dendrogram_len + 1) for _, d in leaves_degrees.items()]
    sizes = [20 + int(min(10, math.log(comp_sizes[v]))) for v, _ in leaves_degrees.items()]
    comps = [v + 1 for v, _ in leaves_degrees.items()]
    colors = [topics_colors[v] for v, _ in leaves_degrees.items()]
    ds = ColumnDataSource(data=dict(x=xs, y=ys, size=sizes, comps=comps, color=colors))
    p.circle(x='x', y='y', size='size', fill_color='color', line_color='black', source=ds)

    def contrast_color_rbg(rgb):
        cr, cg, cb = contrast_color(rgb.r, rgb.g, rgb.b)
        return RGB(cr, cg, cb)

    # Topics labels, each label gets the color contrast to the color of its own leaf.
    # The color is computed per leaf: a single value outside of the comprehensions would
    # depend on a variable left from a previous loop, or fail when there is no such loop.
    p.text(x=[cos(d) * d_radius * (dendrogram_len + 1) - char_delta * len(str(v + 1))
              for v, d in leaves_degrees.items()],
           y=[sin(d) * d_radius * (dendrogram_len + 1) for _, d in leaves_degrees.items()],
           text=[str(v + 1) for v, _ in leaves_degrees.items()],
           text_baseline='middle', text_font_size='10pt',
           text_color=[contrast_color_rbg(topics_colors[v]) for v, _ in leaves_degrees.items()])

    # Show words for components - most popular words per component
    topics = leaves_order.keys()
    words2show = PlotPreprocessor.topics_words(kwd_df, max_words, topics)

    # Visualize words
    for v, d in leaves_degrees.items():
        if v not in words2show:  # No super-specific words for topic
            continue
        words = words2show[v]
        xs = []
        ys = []
        for i, word in enumerate(words):
            # Spread words of the topic within the sector of the topic
            wd = d + d_degree * (i - len(words) / 2) / len(words)
            # Make word degree in range 0 - 2 * pi
            if wd < 0:
                wd += 2 * pi
            elif wd > 2 * pi:
                wd -= 2 * pi
            xs.append(cos(wd) * (radius * x_coefficient + delta))
            y = sin(wd) * (radius + delta)
            # Additional vertical space around pi/2 and 3*pi/2
            if pi / 4 <= wd < 3 * pi / 4:
                y += (pi / 4 - fabs(pi / 2 - wd)) * y_delta
            elif 5 * pi / 4 <= wd < 7 * pi / 4:
                y -= (pi / 4 - fabs(3 * pi / 2 - wd)) * y_delta
            ys.append(y)

        # Different text alignment for left | right parts
        p.text(x=[x for x in xs if x > 0], y=[y for i, y in enumerate(ys) if xs[i] > 0],
               text=[w for i, w in enumerate(words) if xs[i] > 0],
               text_align='left', text_baseline='middle', text_font_size='10pt',
               text_color=topics_colors[v])
        p.text(x=[x for x in xs if x <= 0], y=[y for i, y in enumerate(ys) if xs[i] <= 0],
               text=[w for i, w in enumerate(words) if xs[i] <= 0],
               text_align='right', text_baseline='middle', text_font_size='10pt',
               text_color=topics_colors[v])

    # Hide axes, grid and outline - only the dendrogram and words are shown
    p.sizing_mode = 'stretch_width'
    p.axis.major_tick_line_color = None
    p.axis.minor_tick_line_color = None
    p.axis.major_label_text_color = None
    p.axis.major_label_text_font_size = '0pt'
    p.axis.axis_line_color = None
    p.grid.grid_line_color = None
    p.outline_line_color = None
    return p


In [None]:
# Encode the description of each cluster as a 'term:value,term:value,...' string
kwds = [(comp, ','.join([f'{t}:{v:.3f}' for t, v in vs[:analyzer.TOPIC_DESCRIPTION_WORDS]]))
        for comp, vs in clusters_description.items()]
kwd_df = pd.DataFrame(kwds, columns=['comp', 'kwd'])
# Number of papers per cluster
comp_sizes = Counter(clusters)
show(topics_hierarchy_with_keywords(kwd_df, comp_sizes, paths, leaves_order))

## Clusters visualization

In [None]:
# Keep the original clustering, it is used by one of the cells below
original_df_clusters = analyzer.df['comp'].copy()
original_partition = analyzer.partition

# Assign the clusters computed on embeddings, aligned with df rows by index
df['comp'] = pd.Series(index=indx, data=clusters)
clusters_partition = dict(zip(df['id'], df['comp']))

# Replace clusters within the analyzer, so that plotter methods below use the new ones
analyzer.df['comp'] = df['comp']
analyzer.partition = clusters_partition

In [None]:
# Placeholder cell, the plot call below is disabled
print('TODO')
# show(plotter.topic_years_distribution())

In [None]:
# Runs after the analyzer clusters were replaced in the cell above
print('Structure graph visualization with new clusters')
show(plotter.structure_graph())

In [None]:
# Runs after the analyzer clusters were replaced in the cell above
show(plotter.heatmap_topics_similarity())

In [None]:
# Same analysis as for the original partition, computed for clusters_partition
similarity_df, topics = PlotPreprocessor.topics_similarity_data(
    analyzer.similarity_graph, clusters_partition
)

# Mark rows where both components are the same as 'Inside', all the others as 'Outside'
similarity_df['type'] = ['Inside' if x == y else 'Outside' 
                         for (x, y) in zip(similarity_df['comp_x'], similarity_df['comp_y'])]
sns.displot(similarity_df, x="similarity", hue="type", kind="kde")
plt.show()

## Clusters visualization

In [None]:
from bokeh.models import GraphRenderer, StaticLayoutProvider, Circle, HoverTool, MultiLine
from bokeh.models.graphs import NodesAndLinkedEdges


def structure_graph(g, df):
    """
    Plot the graph with nodes placed at the 'd1' / 'd2' coordinates of the dataframe.

    :param g: graph, only its edges are used
    :param df: papers dataframe with 'id', 'title', 'authors', 'journal', 'year', 'total',
               'comp', 'd1' and 'd2' columns
    :return: bokeh figure
    """
    node_ids = df['id']
    comps = df['comp']
    unique_comps = sorted(set(comps))
    cmap = Plotter.factors_colormap(len(unique_comps))
    palette = {c: Plotter.color_to_rgb(cmap(i)).to_hex() for i, c in enumerate(unique_comps)}

    graph = GraphRenderer()

    # Nodes data, it is used for glyphs and for the tooltip
    node_source = graph.node_renderer.data_source
    node_source.add(df['id'], 'index')
    node_columns = {
        'id': df['id'],
        'title': df['title'],
        'authors': df['authors'],
        'journal': df['journal'],
        'year': df['year'],
        'cited': df['total'],
        # Limit size
        'size': df['total'] * 20 / df['total'].max() + 5,
        'topic': [c + 1 for c in comps],
        'color': [palette[c] for c in comps],
    }
    for column, values in node_columns.items():
        node_source.data[column] = values

    edges = list(g.edges)
    graph.edge_renderer.data_source.data = dict(start=[u for u, _ in edges],
                                                end=[v for _, v in edges])

    # Plot ranges cover all the points with a 5% margin on each side
    x = df['d1']
    y = df['d2']
    x_min, x_max = min(x), max(x)
    y_min, y_max = min(y), max(y)
    x_margin = 0.05 * (x_max - x_min)
    y_margin = 0.05 * (y_max - y_min)
    p = figure(plot_width=600,
               plot_height=600,
               x_range=(x_min - x_margin, x_max + x_margin),
               y_range=(y_min - y_margin, y_max + y_margin),
               tools="pan,tap,wheel_zoom,box_zoom,reset,save")

    # Coordinates have no meaning, so ticks, tick labels, grid and outline are hidden
    for axis in (p.xaxis, p.yaxis):
        axis.major_tick_line_color = None
        axis.minor_tick_line_color = None
    p.xaxis.major_label_text_font_size = '0pt'
    p.yaxis.major_label_text_font_size = '0pt'
    p.grid.grid_line_color = None
    p.outline_line_color = None
    p.sizing_mode = 'stretch_width'

    tooltip = """
    <div style="max-width: 500px">
        <div>
            <span style="font-size: 12px; font-weight: bold;">@title</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">Author(s)</span>
            <span style="font-size: 10px;">@authors</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">Journal</span>
            <span style="font-size: 10px;">@journal</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">Year</span>
            <span style="font-size: 10px;">@year</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">Cited</span>
            <span style="font-size: 10px;">@cited</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">Topic</span>
            <span style="font-size: 10px;">@topic</span>
        </div>
    </div>
    """

    p.add_tools(HoverTool(tooltips=tooltip))

    # Fixed layout: each node is placed at its projected coordinates
    graph.layout_provider = StaticLayoutProvider(graph_layout=dict(zip(node_ids, zip(x, y))))

    graph.node_renderer.glyph = Circle(size='size', fill_color='color')
    graph.node_renderer.hover_glyph = Circle(size='size', fill_color='green')

    graph.edge_renderer.glyph = MultiLine(line_color='grey', line_alpha=0.1, line_width=1)
    graph.edge_renderer.hover_glyph = MultiLine(line_color='blue', line_alpha=1.0, line_width=2)

    # Hovering a node highlights the node together with its edges
    graph.inspection_policy = NodesAndLinkedEdges()

    p.renderers.append(graph)
    return p


In [None]:
from pysrc.papers.analysis.graph import local_sparse

# Graph returned by local_sparse(..., 0.5) is drawn at the projected coordinates from df
print('Visualize structure graph using projected coordinates')
show(structure_graph(local_sparse(analyzer.similarity_graph,  0.5), df))

In [None]:
# Same plot as in the previous cell, colored with the clusters saved before the replacement
print('Original louvain community clusters shown on projection coordinates')
t = df.copy()
t['comp'] = original_df_clusters # Restore original clusters
show(structure_graph(local_sparse(analyzer.similarity_graph,  0.5), t))

## Authors graph

In [None]:
import numpy as np

from pysrc.papers.analysis.metadata import popular_authors, popular_journals, build_authors_similarity_graph, \
    compute_authors_citations_and_papers, cluster_authors

logging.info("Analyzing groups of similar authors")
authors_citations, authors_papers = compute_authors_citations_and_papers(analyzer.df)
# Productivity score: log(1 + citations) * papers, citations default to 1 when missing
authors_productivity = {a: np.log1p(authors_citations.get(a, 1)) * p for a, p in authors_papers.items()}
# 95th percentile of the score, used as a threshold when building the authors graph below
min_threshold = np.percentile(list(authors_productivity.values()), 95)
# Display the threshold value as the cell output
min_threshold

In [None]:
import networkx as nx
import community
import itertools

# Rebinds the notebook-wide `logger` to a logger named 'Test'
logger = logging.getLogger('Test')

# Build the authors similarity graph from papers text similarity, the citations graph,
# co-citations and bibliographic coupling. check_author_func restricts the graph to authors
# whose productivity reaches min_threshold.
authors_similarity_graph = build_authors_similarity_graph(
    analyzer.df, analyzer.texts_similarity, analyzer.citations_graph, 
    analyzer.cocit_grouped_df, analyzer.bibliographic_coupling_df,
    check_author_func=lambda a: authors_productivity[a] >= min_threshold
)

# authors_similarity_graph = analyzer.authors_similarity_graph
# NOTE: this message goes through the root `logging` module, not through `logger` defined above
logging.info(f'Built authors graph - '
             f'{len(authors_similarity_graph.nodes())} nodes and {len(authors_similarity_graph.edges())} edges')

In [None]:
logger.debug('Compute aggregated similarity using co-authorship')
# Store a single 'similarity' value on every edge: 100 x the edge's 'authorship' attribute
# (0 when the attribute is missing) plus PapersAnalyzer.similarity computed on the edge data.
# The edge data dicts are modified in place.
for _, _, d in authors_similarity_graph.edges(data=True):
    d['similarity'] = 100 * d.get('authorship', 0) + PapersAnalyzer.similarity(d)

### Node2vec embeddings for authors graph

In [None]:
# Embed authors graph nodes with node2vec (defined elsewhere in the notebook), using the
# aggregated 'similarity' edge attribute as the edge weight.
# The result is unpacked as (node ids, embeddings); presumably row i of the embeddings
# corresponds to authors_node_ids[i] — confirm against the node2vec helper.
authors_node_ids, authors_weighted_node_embeddings = node2vec(authors_similarity_graph, 
                                                              weight_func=lambda d: d['similarity'])

In [None]:
logger.debug('Apply t-SNE transformation on node embeddings')
# Project authors embeddings to 2D with a dedicated, reproducible (random_state=42) model
authors_tsne = TSNE(n_components=2, random_state=42)
# Fit the model created on the line above. Calling an unrelated `tsne` variable here would
# either fail with NameError or silently depend on the state of another notebook cell.
authors_weighted_node_embeddings_2d = authors_tsne.fit_transform(authors_weighted_node_embeddings)

In [None]:
# Assemble one row per author: identity, 2D projection coordinates, citation and paper
# counts, and a marker size derived from the productivity score.
authors_df = pd.DataFrame({
    'author': authors_node_ids,
    'd1': authors_weighted_node_embeddings_2d[:, 0],
    'd2': authors_weighted_node_embeddings_2d[:, 1],
})
authors_df['cited'] = [authors_citations[name] for name in authors_df['author']]
authors_df['papers'] = [authors_papers[name] for name in authors_df['author']]
authors_df['size'] = [1 + authors_productivity[name] for name in authors_df['author']]
# Rescale marker sizes so that the largest one equals 23 (20 + 3)
authors_df['size'] = authors_df['size'] * 20 / authors_df['size'].max() + 3

In [None]:
# Scatter plot of authors in the projected (d1, d2) space; marker size comes from 'size'
ds = ColumnDataSource(authors_df)
p = figure(
    plot_width=600,
    plot_height=600,
    sizing_mode='stretch_width',
    x_axis_label='d1',
    y_axis_label='d2',
    tools="hover,pan,tap,wheel_zoom,box_zoom,reset,save",
    toolbar_location="right",
    tooltips=[("Author", '@author'), ("Papers", '@papers'), ("Cited", '@cited')],
)

p.circle(
    x='d1', y='d2', size='size', source=ds,
    fill_alpha=0.8, fill_color='blue', line_color='black',
    legend_field='author',
)
# A legend is created by legend_field, its location is reset right after
p.legend.location = None
show(p)

## Authors clustering

In [None]:
# Cluster the authors similarity graph with the library helper cluster_authors.
# NOTE(review): the similarity function is passed as analyzer.similarity (instance access);
# presumably the same callable as PapersAnalyzer.similarity — confirm.
authors_clusters = cluster_authors(authors_similarity_graph, analyzer.similarity)

In [None]:
print('Graph based visualization of authors graph')
# NOTE(review): plotter.authors_graph() takes no arguments, so it presumably draws the graph
# stored in the analyzer rather than authors_similarity_graph built in this notebook — confirm.
show(plotter.authors_graph())

In [None]:
def plot_authors_clusters(authors_df):
    """Show a scatter plot of authors at (d1, d2), colored by the 'cluster' column.

    The dataframe is expected to provide the columns 'author', 'papers', 'cited', 'cluster',
    'd1', 'd2' and 'size'. A 'color' column is written into ``authors_df`` in place.
    """
    # One color per distinct cluster, assigned in sorted cluster order
    n_colors = len(set(authors_df['cluster']))
    cmap = Plotter.factors_colormap(n_colors)
    cluster_colors = {
        cluster: Plotter.color_to_rgb(cmap(position)).to_hex()
        for position, cluster in enumerate(sorted(set(authors_df['cluster'])))
    }
    authors_df['color'] = [cluster_colors[cluster] for cluster in authors_df['cluster']]

    source = ColumnDataSource(authors_df)
    hover_fields = [("Author", '@author'), ("Papers", '@papers'), ("Cited", '@cited'), ('Cluster', '@cluster')]
    plot = figure(plot_width=600, plot_height=600,
                  tools="hover,pan,tap,wheel_zoom,box_zoom,reset,save", toolbar_location="right",
                  tooltips=hover_fields)
    plot.sizing_mode = 'stretch_width'
    plot.xaxis.axis_label = 'd1'
    plot.yaxis.axis_label = 'd2'

    plot.circle(x='d1', y='d2', fill_alpha=0.8, source=source, size='size',
                line_color='black', fill_color='color', legend_field='author')
    plot.legend.location = None
    show(plot)


In [None]:
# Label every author row with its cluster from authors_clusters (overwrites any previous
# 'cluster' column), then draw the colored scatter plot.
authors_df['cluster'] = [authors_clusters[a] for a in authors_df['author']]
plot_authors_clusters(authors_df)

## Authors clustering based on embeddings and number of clusters in papers

In [None]:
logger.debug('Clustering authors based on embeddings')
# Twice as many author clusters as n_clusters (defined in another cell)
model = AgglomerativeClustering(n_clusters=2 * n_clusters)
model.fit(authors_weighted_node_embeddings)
# Replaces the previous authors_clusters mapping: author -> integer cluster label
authors_clusters = {
    author: label
    for author, label in zip(authors_node_ids, model.labels_.astype(int))
}
authors_clusters_counter = Counter(authors_clusters.values())
print(authors_clusters_counter)

In [None]:
# Overwrite the 'cluster' column with the labels currently stored in authors_clusters
# and redraw the colored scatter plot.
authors_df['cluster'] = [authors_clusters[a] for a in authors_df['author']]
plot_authors_clusters(authors_df)

In [None]:
# Maximum number of authors listed per group
AUTHORS_PER_COMP = 20
# group label -> comma separated names of its most productive authors
group_authors = {}
for group in sorted(set(authors_clusters.values())):
    # Members of the group, most productive first
    authors = sorted(
        (name for name, label in authors_clusters.items() if label == group),
        key=lambda a: authors_productivity[a],
        reverse=True,
    )
    top = authors[:AUTHORS_PER_COMP]
    group_authors[group] = ", ".join(top)
    suffix = ', ...' if len(authors) > AUTHORS_PER_COMP else ''
    print(f'#{group} ({len(authors)}) {", ".join(top)}' + suffix)
del suffix  # temporary helper, do not leave it in the notebook namespace

In [None]:
# Number of authors in every authors cluster; used to normalize votes below
part_sizes = Counter(authors_clusters.values())
# paper_groups[i, g] accumulates the votes of paper i (row position in analyzer.df) for group g.
# NOTE(review): the column count assumes cluster labels are exactly 0..k-1 — confirm.
paper_groups = np.zeros(shape=(len(analyzer.df), len(set(authors_clusters.values()))))
# Enumerate rows by POSITION: dataframe index labels are not guaranteed to be 0..n-1, while
# paper_groups, the groups_partition comprehension below and later iloc-based selections
# are all positional.
for i, paper_authors in enumerate(analyzer.df['authors']):
    for a in paper_authors.split(', '):
        if a in authors_clusters:
            group = authors_clusters[a]
            # Each author votes with weight 1 / (size of the author's cluster)
            paper_groups[i, group] += 1 / part_sizes[group]
# Winning group per paper; papers without any clustered author have an all-zero row
groups = np.argmax(paper_groups, axis=1)
papers_assigned = paper_groups.sum(axis=1) > 0
groups_partition = {pid: groups[i] for i, pid in enumerate(analyzer.df['id']) if papers_assigned[i]}

# Number of assigned papers per group
groups_part_sizes = {c: sum([groups_partition[node] == c for node in groups_partition.keys()]) 
                     for c in set(groups_partition.values())}
logging.info(f'Components: {groups_part_sizes}')

In [None]:
import pandas as pd
from pysrc.papers.analysis.topics import get_topics_description

# Invert the paper id -> group mapping into group -> list of paper ids
groups_pids = pd.DataFrame(groups_partition.items(), columns=['id', 'comp']). \
                groupby('comp')['id'].apply(list).to_dict()
# Describe every group with get_topics_description, restricted to papers that received a
# group: the same positional selection is applied to analyzer.df rows and corpus_counts rows.
groups_description = get_topics_description(
    analyzer.df.iloc[np.flatnonzero(papers_assigned), :], groups_pids,
    analyzer.corpus_terms, analyzer.corpus_counts[np.flatnonzero(papers_assigned), :],
    query=analyzer.query,
    n_words=analyzer.TOPIC_DESCRIPTION_WORDS
)

In [None]:
# Summary table: one row per group that has both an authors list and a description
groups_df = pd.DataFrame(columns=['group', 'authors', 'papers', 'keywords'], dtype=object)
for g, pids in groups_pids.items():
    if g in group_authors and g in groups_description:
        # Append a row; keywords are the first element of each of the first 10 description entries
        groups_df.loc[len(groups_df)] = (g, group_authors[g], len(pids), 
                                         ', '.join(v[0] for v in groups_description[g][:10]))

display(groups_df)

## Topic Evolution

In [None]:
# plotter.topic_evolution() returns a pair: the first item is shown as a plot, the second printed.
# NOTE(review): presumably requires config.feature_evolution_enabled = True (set when the
# analyzer is configured at the top of the notebook) — confirm.
evolution_data, keywords_data = plotter.topic_evolution()
show(evolution_data)
print(keywords_data)

## PageRank for Citation Analysis

In [None]:
import networkx as nx

# Apply PageRank algorithm with damping factor of 0.5
# pr_nx maps every node of the citations graph to its PageRank value;
# tol=1e-9 is the convergence tolerance passed to networkx.
pr_nx = nx.pagerank(analyzer.citations_graph, alpha=0.5, tol=1e-9)

In [None]:
# Every node starts with the placeholder (0, 0): no ancestor chosen yet, PageRank 0
ancestor = dict.fromkeys(analyzer.citations_graph, (0, 0))

# Select ancestor with highest PR for each node
for v in analyzer.citations_graph:
    # u runs over the neighbors of v as reported by the graph adjacency
    for u in analyzer.citations_graph[v]:
        anc, pr = ancestor[u]
        # Strict comparison: on equal PageRank the candidate found first is kept
        if pr_nx[v] > pr:
            ancestor[u] = (v, pr_nx[v])

In [None]:
# Directed graph with one edge per node that has a chosen ancestor: ancestor -> node.
# Entries whose stored PageRank is not positive (the initial placeholder) add no edge.
PRG = nx.DiGraph()
PRG.add_edges_from(
    (parent, child)
    for child, (parent, parent_pr) in ancestor.items()
    if parent_pr > 0
)

In [None]:
# Split PRG edges into two parallel tuples: edge sources and edge targets.
# zip(*...) over an edgeless graph yields nothing and the unpacking would raise ValueError,
# so fall back to a pair of empty tuples in that case.
start, end = zip(*PRG.edges()) if PRG.number_of_edges() > 0 else ((), ())

In [None]:
from bokeh.models import GraphRenderer, StaticLayoutProvider, Circle, HoverTool, MultiLine
from bokeh.models.graphs import NodesAndLinkedEdges

# Index paper metadata by id once instead of scanning the whole dataframe for every node.
# drop_duplicates keeps the first row per id, i.e. the row a `.values[0]` lookup would return.
papers_by_id = analyzer.df.drop_duplicates(subset='id').set_index('id')

# Only graph nodes that are present in the papers dataframe can be drawn
node_indices = [node for node in PRG.nodes() if node in papers_by_id.index]

years = []
year_counts = {}
titles = []
pageranks = []
size = []
y = []
for node in node_indices:
    year = papers_by_id.at[node, 'year']
    year_counts[year] = year_counts.get(year, 0) + 1
    years.append(year)
    # Vertical position: running number of papers already placed in this year's column
    y.append(year_counts[year])

    titles.append(papers_by_id.at[node, 'title'])
    # PageRank is shown as a percentage in the tooltip and scaled by 1000 for the marker size
    pageranks.append(pr_nx[node] * 100)
    size.append(pr_nx[node] * 1000)
max_year_count = max(list(year_counts.values()))
min_year, max_year = min(years), max(years)

plot = figure(title="PageRank applied to citation filtering", 
              x_range=(min_year - 1, max_year+1), y_range=(0, max_year_count + 1),
              tools="", toolbar_location=None)

TOOLTIPS = """
    <div style="max-width: 320px">
        <div>
            <span style="font-size: 12px; font-weight: bold;">@title</span>
        </div>
        <div>
            <span style="font-size: 11px;">Year</span>
            <span style="font-size: 10px;">@year</span>
        </div>
        <div>
            <span style="font-size: 11px;">PMID</span>
            <span style="font-size: 10px;">@id</span>
        </div>
        <div>
            <span style="font-size: 11px;">PageRank</span>
            <span style="font-size: 10px;">@pagerank</span>
        </div>
    </div>
"""

plot.add_tools(HoverTool(tooltips=TOOLTIPS))

graph = GraphRenderer()

# Per-node columns referenced by the tooltip template and by the glyph size
graph.node_renderer.data_source.add(node_indices, 'index')
graph.node_renderer.data_source.data['id'] = node_indices
graph.node_renderer.data_source.data['year'] = years
graph.node_renderer.data_source.data['title'] = titles
graph.node_renderer.data_source.data['pagerank'] = pageranks
graph.node_renderer.data_source.data['size'] = size
# graph.edge_renderer.data_source.data = dict(start=start, end=end)

### start of layout code
# Horizontal position is the publication year; vertical position was computed above
x = list(years)

graph_layout = dict(zip(node_indices, zip(x, y)))
graph.layout_provider = StaticLayoutProvider(graph_layout=graph_layout)

graph.node_renderer.glyph = Circle(size='size', fill_color='blue')
graph.node_renderer.hover_glyph = Circle(size='size', fill_color='green')

# graph.edge_renderer.glyph = MultiLine(line_color='black', line_alpha=1, line_width=1)
# graph.edge_renderer.hover_glyph = MultiLine(line_color='green', line_width=2)

graph.inspection_policy = NodesAndLinkedEdges()

plot.min_border_left = 75
plot.renderers.append(graph)

show(plot)

### Top Papers by PageRank

In [None]:
# Print the 10 nodes with the highest PageRank (as a percentage) together with their titles
for pmid, pagerank in sorted(pr_nx.items(), key=lambda el: el[1], reverse=True)[:10]:
    sel = analyzer.df[analyzer.df['id'] == pmid]
    # A citations graph node may be missing from analyzer.df; indexing `.values[0]` on an
    # empty selection would raise IndexError, so show a placeholder instead of a title.
    title = sel['title'].values[0] if len(sel) > 0 else f'[paper {pmid} is not in the loaded dataset]'
    print(f"{(100*pagerank):.2f} {title}")

### PageRank and citation ranking correlation

In [None]:
import numpy as np
from scipy.stats import spearmanr

# Rank papers by total citations (1 = most cited); ties are broken by order of appearance.
# The rank is stored as a new 'citation_rank' column of analyzer.df.
analyzer.df['citation_rank'] = analyzer.df['total'].rank(method='first', ascending=False)
pagerank_rank = sorted(pr_nx.items(), key=lambda el: el[1], reverse=True)

# Pair the PageRank position of every paper with its citation rank. Graph nodes that are
# missing from analyzer.df are skipped entirely: keeping them as (0, 0) placeholder rows
# would feed fake observations into the correlation.
matched_ranks = []
for i, (pmid, pr) in enumerate(pagerank_rank):
    sel = analyzer.df[analyzer.df['id'] == pmid]
    if len(sel) > 0:
        matched_ranks.append((i, int(sel['citation_rank'].values[0])))
# reshape keeps the (n, 2) layout even when no paper was matched
r = np.array(matched_ranks, dtype=float).reshape(-1, 2)

# Correlation over the top-x matched papers by PageRank
TOP_X = [5, 10, 30, 50, 100]
for x in TOP_X:
    rho, _ = spearmanr(r[:x, 0], r[:x, 1])
    print(f'Spearman correlation coefficient for top {x}: {rho}')