# Pubtrends

Experimental notebook for hypothesis testing and development purposes.

**IMPORTANT**
Turn on experimental features in the config file before running this notebook!

## Getting Started

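1. Define the `SEARCH_QUERY` variable in the cell below with a list of keywords that describe the branch of science you are interested in. If `FILE` is not `None`, paper ids are read from that file instead of being searched for.
2. Run all cells and see the results.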

In [None]:
from pysrc.papers.utils import SORT_MOST_CITED

# Search query; it is also passed to the analyzer together with the paper ids
SEARCH_QUERY = 'Human Aging'
# Sort order used when searching for papers
SEARCH_SORT = SORT_MOST_CITED
# Limit on the number of papers requested from the search
SEARCH_PAPERS = 100000

# File with ids to analyze, one id per line; when it is None the search is used instead
FILE = '/mnt/stripe/shpynov/pubtrends/pmid-humanaging-set.txt'

## Publication Analysis

In [None]:
import logging
import json
import seaborn as sns
import numpy as np
import pandas as pd
from scipy import stats
from collections import Counter


from bokeh.plotting import figure
from bokeh.plotting import show, output_notebook
from matplotlib import pyplot as plt


from pysrc.papers.config import PubtrendsConfig
from pysrc.papers.db.pm_postgres_loader import PubmedPostgresLoader
from pysrc.papers.db.ss_postgres_loader import SemanticScholarPostgresLoader
from pysrc.papers.analyzer import PapersAnalyzer
from pysrc.papers.plot.plotter import Plotter
from pysrc.papers.utils import SORT_MOST_CITED, SORT_MOST_RECENT, cut_authors_list

# Verbose (DEBUG) logging with timestamps for the whole notebook session
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s: %(message)s')
# Dedicated logger for messages emitted by the notebook cells
logger = logging.getLogger('notebook')

# Avoid info message about compilation flags
import tensorflow as tf
# tf.get_logger().setLevel('ERROR')

output_notebook()
%matplotlib inline

In [None]:
# Configure the analysis and explicitly switch on the evolution feature flag
config = PubtrendsConfig(test=False)
config.feature_evolution_enabled = True
loader = PubmedPostgresLoader(config)
analyzer = PapersAnalyzer(loader, config)
try:
    if FILE is not None:
        # Ids are read from the file, one per line. Blank lines (e.g. a trailing
        # newline at the end of the file) are skipped so that no empty id is analyzed.
        with open(FILE) as f:
            ids = [line.strip() for line in f if line.strip()]
    else:
        ids = analyzer.search_terms(SEARCH_QUERY, limit=SEARCH_PAPERS, sort=SEARCH_SORT)
    analyzer.analyze_papers(ids, SEARCH_QUERY)
finally:
    # Release the loader connection and the analyzer even if the analysis fails
    loader.close_connection()
    analyzer.teardown()

# Report plots

In [None]:
plotter = Plotter(analyzer)

In [None]:
show(plotter.papers_by_year())

In [None]:
from pysrc.papers.analysis.text import get_frequent_tokens, get_topic_word_cloud_data

# Frequent tokens are computed on the top cited papers for the analyzed query
freq_kwds = get_frequent_tokens(analyzer.top_cited_df, query=analyzer.query)
# Only the first returned element is used here - it is rendered as an image below
wc, _ = plotter.papers_word_cloud_and_callback(freq_kwds)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

# Trends

In [None]:
# show(plotter.top_cited_papers())

In [None]:
show(plotter.most_cited_per_year_papers())

In [None]:
show(plotter.fastest_growth_per_year_papers())

## Frequent keywords timeline

In [None]:
from pysrc.papers.analysis.text import get_frequent_tokens

freq_kwds = get_frequent_tokens(analyzer.top_cited_df, query=analyzer.query)

In [None]:
print('Original keywords frequencies')
show(plotter.plot_keywords_frequencies(freq_kwds))

In [None]:
from bokeh.plotting import figure, output_file, save, reset_output, output_notebook

# Export the keywords frequencies plot to a standalone HTML file
logging.info('Save frequent tokens to file')
output_file(filename="frequent.html", title="Frequents topics")
save(plotter.plot_keywords_frequencies(freq_kwds))
# Switch bokeh output back to the notebook for the following cells
reset_output()
output_notebook()

# Single paper citations dynamics

In [None]:
show(plotter.paper_citations_per_year(analyzer.df, analyzer.df['id'].values[0]))

# Topics analysis

In [None]:
# First cluster papers
# show(plotter.topics_info_and_word_cloud_and_callback()[0][0])

In [None]:
show(plotter.topic_years_distribution())

## Topics similarity

In [None]:
from pysrc.papers.analysis.topics import compute_similarity_matrix
from itertools import product

def topics_similarity_data(similarity_graph, partition):
    """
    Compute pairwise similarities between topics in a long format suitable for plotting.

    :param similarity_graph: papers similarity graph
    :param partition: mapping whose values are topic numbers (e.g. paper id -> topic)
    :return: dataframe with comp_x, comp_y (topic labels numbered from 1, as strings)
             and similarity columns; list of topic labels numbered from 1, as strings
    """
    matrix = compute_similarity_matrix(similarity_graph, PapersAnalyzer.similarity, partition)

    # Topics are shown to the user numbered from 1
    topic_ids = sorted(set(partition.values()))
    components = [str(topic + 1) for topic in topic_ids]
    size = len(components)

    records = []
    for row, col in product(range(size), repeat=2):
        records.append({'comp_x': row, 'comp_y': col, 'similarity': matrix[row, col]})
    similarity_topics_df = pd.DataFrame(records)

    # Convert 0-based matrix indices into 1-based string labels
    for column in ('comp_x', 'comp_y'):
        similarity_topics_df[column] = (similarity_topics_df[column] + 1).astype(str)
    return similarity_topics_df, components



similarity_df, topics = topics_similarity_data(analyzer.similarity_graph, analyzer.partition)

# Diagonal cells describe similarity inside a topic, all the others - between different topics
same_topic = similarity_df['comp_x'] == similarity_df['comp_y']
similarity_df['type'] = same_topic.map({True: 'Inside', False: 'Outside'})
sns.displot(similarity_df, x="similarity", hue="type", kind="kde")
plt.show()

In [None]:
from bokeh.colors import RGB
from bokeh.models import LinearColorMapper, PrintfTickFormatter, ColorBar

def heatmap_topics_similarity(similarity_df, topics):
    """
    Draw topics similarity as a square heatmap with a color bar.

    :param similarity_df: dataframe with comp_x, comp_y and similarity columns
    :param topics: topic labels used as categories of both axes
    :return: bokeh figure
    """
    logger.debug('Visualizing topics similarity with heatmap')

    # Discrete PuBu palette converted to bokeh colors
    n_colors = 10
    cmap = plt.cm.get_cmap('PuBu', n_colors)
    palette = []
    for idx in range(n_colors):
        red, green, blue = (round(channel * 255) for channel in cmap(idx)[:3])
        palette.append(RGB(red, green, blue))
    similarities = similarity_df.similarity
    mapper = LinearColorMapper(palette=palette, low=similarities.min(), high=similarities.max())

    hover_tooltips = [('Topic 1', '@comp_x'),
                      ('Topic 2', '@comp_y'),
                      ('Similarity', '@similarity')]
    p = figure(x_range=topics, y_range=topics,
               x_axis_location="below", plot_width=600, plot_height=600,
               tools="hover,pan,tap,wheel_zoom,box_zoom,reset,save", toolbar_location="right",
               tooltips=hover_tooltips)

    # Minimalistic look: no grid, no axis lines, no ticks
    p.sizing_mode = 'stretch_width'
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "10pt"
    p.axis.major_label_standoff = 0

    # One unit cell per pair of topics, colored by similarity
    p.rect(x="comp_x", y="comp_y", width=1, height=1,
           source=similarity_df,
           fill_color=dict(field='similarity', transform=mapper),
           line_color=None)

    p.add_layout(
        ColorBar(color_mapper=mapper, major_label_text_font_size="10pt",
                 formatter=PrintfTickFormatter(format="%.2f"),
                 label_standoff=11, border_line_color=None, location=(0, 0)),
        'right'
    )
    return p


show(heatmap_topics_similarity(similarity_df, topics))

### Similarities function analysis

We expect the distribution of similarity edge weights to show that the majority of linked nodes are only weakly similar in terms of their attributes.

In [None]:
# Collect components of similarity per edge; counters are log-scaled with log1p
edges_data = [data for _, _, data in analyzer.similarity_graph.edges(data=True)]
bibcoupling_array = np.array([np.log1p(d.get('bibcoupling', 0)) for d in edges_data], dtype=float)
cocitations_array = np.array([np.log1p(d.get('cocitation', 0)) for d in edges_data], dtype=float)
citations_array = np.array([d.get('citation', 0) for d in edges_data], dtype=float)
text_similarities_array = np.array([d.get('text', 0) for d in edges_data], dtype=float)
similarities_array = np.array([PapersAnalyzer.similarity(d) for d in edges_data], dtype=float)

# Four density plots side by side; zeros are dropped for individual components
fig = plt.figure(figsize=(20, 5))

ax = plt.subplot(1, 4, 1)
print(f'Bibcoupling, non-zero {np.count_nonzero(bibcoupling_array)} of {len(bibcoupling_array)}')
bibcoupling_array = bibcoupling_array[bibcoupling_array != 0]
print(stats.describe(bibcoupling_array))
sns.kdeplot(bibcoupling_array)
plt.title('Bibcoupling')

ax = plt.subplot(1, 4, 2)
print(f'Co-citations, non-zero {np.count_nonzero(cocitations_array)} of {len(cocitations_array)}')
cocitations_array = cocitations_array[cocitations_array != 0]
print(stats.describe(cocitations_array))
sns.kdeplot(cocitations_array)
plt.title('Co-citations')

ax = plt.subplot(1, 4, 3)
print(f'Text similarities, non-zero {np.count_nonzero(text_similarities_array)} of {len(text_similarities_array)}')
text_similarities_array = text_similarities_array[text_similarities_array != 0]
print(stats.describe(text_similarities_array))
sns.kdeplot(text_similarities_array)
plt.title('Text')

# Combined similarity is plotted as is, including zeros
ax = plt.subplot(1, 4, 4)
print(f'Similarities, non-zero {np.count_nonzero(similarities_array)} of {len(similarities_array)}')
print(stats.describe(similarities_array))
sns.kdeplot(similarities_array)
plt.title('Similarity')

plt.show()

# Direct citations are only counted, not plotted
print(f'Citations, non-zero {np.count_nonzero(citations_array)} of {len(citations_array)}')

## Additional Text analysis

In [None]:
from pysrc.papers.analysis.text import build_corpus, tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

logging.info('Analyzing text vectorization')
corpus = build_corpus(analyzer.df)
# Bag of words limited to terms occurring in 1% - 50% of documents
vectorizer = CountVectorizer(
    tokenizer=lambda t: tokenize(t),
    max_features=PapersAnalyzer.VECTOR_WORDS,
    min_df=0.01,
    max_df=0.5
)
counts = vectorizer.fit_transform(corpus)
logger.debug(f'Vectorized corpus size {counts.shape}')

# Total number of occurrences per term, flattened to a 1D array
terms_counts = np.asarray(np.sum(counts, axis=0)).ravel()
print(terms_counts.shape)

# Occurrences per term normalized by the number of papers
n_papers = len(analyzer.df)
terms_freqs = terms_counts / n_papers
logger.debug(f'Terms frequencies min={terms_freqs.min()}, max={terms_freqs.max()}, '
             f'mean={terms_freqs.mean()}, std={terms_freqs.std()}')

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); support both so the cell works across scikit-learn versions
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
# Sort terms once, ascending by frequency, and reuse the order for both reports
freq_order = terms_freqs.argsort()
print('Max frequent terms:', ', '.join(terms[k] for k in freq_order[::-1][:20]))
print('Min frequent terms:', ', '.join(terms[k] for k in freq_order[:20]))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

print('Analyze similarities between all papers')
cos_similarities = cosine_similarity(analyzer.corpus_counts)
cos_similarities_array = cos_similarities.reshape(-1)
print(stats.describe(cos_similarities_array))
q1, q2, q3 = np.percentile(cos_similarities_array, [25, 50, 75])
print('Q1', q1, 'Q2', q2, 'Q3', q3)

# Left plot - all pairs of papers, right plot - only pairs linked by a direct citation
fig = plt.figure(figsize=(10, 5))
ax = plt.subplot(1, 2, 1)
sns.kdeplot(cos_similarities_array)
plt.title('Cosine similarities among all papers')

print('Analyze similarities between papers with direct citations')
# Row / column of the similarity matrix by paper id, following the order of analyzer.df
pid_indx = {pid: i for i, pid in enumerate(analyzer.df['id'])}
cited_cos_similarities = [
    cos_similarities[pid_indx[u], pid_indx[v]]
    for u, v, data in analyzer.similarity_graph.edges(data=True)
    if data.get('citation', 0) != 0
]

print(stats.describe(cited_cos_similarities))
q1, q2, q3 = np.percentile(cited_cos_similarities, [25, 50, 75])
print('Q1', q1, 'Q2', q2, 'Q3', q3)
ax = plt.subplot(1, 2, 2)
sns.kdeplot(cited_cos_similarities)
plt.title('Cosine similarity between cited papers')

plt.show()

In [None]:
# Distribution of node degrees in the similarity graph
G = analyzer.similarity_graph
degrees = [degree for _, degree in G.degree()]
sns.kdeplot(data=degrees)
plt.title('Similarity graph degrees')
plt.show()
print('Average degree', sum(degrees) / float(G.number_of_nodes()))

In [None]:
show(plotter.plot_similarity_graph())

# Similarities graph embeddings with various Node2Vec params

In [None]:
from pysrc.papers.analysis.node2vec import node2vec
from pysrc.papers.analysis.graph import local_sparse, to_weighted_graph

logger.debug('Preparing node2vec + tsne layout for similarity graph')
wsg = to_weighted_graph(analyzer.similarity_graph, weight_func=PapersAnalyzer.similarity)
e = 1.0
gs = local_sparse(wsg, e)
# Limit total number of edges to estimate walk probabilities.
# e is rounded after each step: a plain `e -= 0.1` accumulates float error
# (1.0 -> ... -> 0.10000000000000014), which still passes the `e > 0.1` check and
# triggers one extra sparsification with e ~ 1.4e-16 instead of stopping at 0.1.
while e > 0.1 and gs.number_of_edges() / gs.number_of_nodes() > 20:
    e = round(e - 0.1, 1)
    gs = local_sparse(wsg, e)
logger.debug(f'Sparse graph for node2vec e={e} nodes={gs.number_of_nodes()} edges={gs.number_of_edges()}')
node_ids, weighted_node_embeddings = node2vec(gs)

## Embeddings visualization

In [None]:
from sklearn.manifold import TSNE

# Project node embeddings to 2 dimensions for plotting; the random state is fixed
logger.debug('Apply t-SNE transformation on node embeddings')
tsne = TSNE(n_components=2, random_state=42)
weighted_node_embeddings_2d = tsne.fit_transform(weighted_node_embeddings)

In [None]:
# from umap import UMAP 

# logger.debug('Apply UMAP transformation on node embeddings')
# umap = UMAP(n_components=2, random_state=42)
# weighted_node_embeddings_2d = umap.fit_transform(weighted_node_embeddings)

In [None]:
# Build dataframe combining information about papers and projected coordinates
df = analyzer.df[['id', 'title', 'year', 'type', 'abstract', 'total', 'authors', 'journal', 'comp',
                 'keywords', 'mesh']].copy()
# Position of each paper in df by its id
pid_indx = {pid: i for i, pid in enumerate(df['id'])}
# Assigning a Series to a column aligns on index LABELS, so positions are translated into
# labels of df.index. For the default 0..n-1 index the values are the same as positions,
# but a filtered or otherwise non-default index no longer misaligns the coordinates.
indx = df.index[[pid_indx[pid] for pid in node_ids]]
df['x'] = pd.Series(index=indx, data=weighted_node_embeddings_2d[:, 0])
df['y'] = pd.Series(index=indx, data=weighted_node_embeddings_2d[:, 1])

In [None]:
from bokeh.models import ColumnDataSource, CustomJS
from bokeh.models import HoverTool


from pysrc.papers.utils import cut_authors_list


def plot_embeddings(df, clusters):
    """
    Show a scatter plot of papers in 2D embeddings coordinates, colored by cluster.

    NOTE: the given dataframe is modified in place - a 'size' column is added and
    the 'authors' column is replaced with the result of cut_authors_list.

    :param df: papers dataframe with x, y, total, authors and the columns used in tooltips
    :param clusters: cluster of each paper, one value per row of df
    """
    unique_clusters = sorted(set(clusters))
    cmap = Plotter.factors_colormap(len(unique_clusters))
    palette = {cluster: Plotter.color_to_rgb(cmap(i)).to_hex()
               for i, cluster in enumerate(unique_clusters)}

    # Marker size grows with the number of citations
    df['size'] = 5 + df['total'] / df['total'].max() * 20

    # Split authors
    df['authors'] = df['authors'].apply(cut_authors_list)

    ds = ColumnDataSource(df)
    # Add clusters coloring
    colors = [palette[cluster] for cluster in clusters]
    ds.add(colors, 'color')

    p = figure(plot_width=600, plot_height=600,
               tools="hover,pan,tap,wheel_zoom,box_zoom,reset,save")
    p.sizing_mode = 'stretch_width'
    p.xaxis.axis_label = 'x'
    p.yaxis.axis_label = 'y'

    tooltip_fields = [
        ("Author(s)", '@authors'),
        ("Journal", '@journal'),
        ("Year", '@year'),
        ("Type", '@type'),
        ("Cited by", '@total paper(s) total'),
        ("Topic", '@comp')]
    p.hover.tooltips = plotter._html_tooltips(tooltip_fields)

    p.circle(x='x', y='y', fill_alpha=0.8, source=ds, size='size',
             line_color='black', fill_color='color', legend_field='comp')
    p.legend.visible = False
    show(p)

In [None]:
print('Plot default Louvain clusters in embeddings coordinates')
plot_embeddings(df, analyzer.df['comp'])

## Clustering of embeddings

In [None]:
from sklearn.cluster import AgglomerativeClustering

def cluster_and_sort(x, min_cluster_size, max_clusters):
    """
    Pick the number of clusters with a binary search over Ward agglomerative clusterings
    and relabel the resulting clusters by size descending (label 0 is the biggest cluster).

    :param x: object representations (X x Features)
    :param min_cluster_size: smallest cluster size the search tries to keep
    :param max_clusters: upper bound on the number of clusters
    :return: List[cluster] with a label per row of x, and `children_` of the fitted
             AgglomerativeClustering model (hierarchical dendrogram of splits);
             ([0] * number of rows, None) when the search range is too narrow to fit anything
    """
    logger.debug('Looking for an appropriate number of clusters,'
                 f'min_cluster_size={min_cluster_size}, max_clusters={max_clusters}')
    # Search bounds for the number of clusters
    r = min(int(x.shape[0] / min_cluster_size), max_clusters) + 1
    l = 1

    # Nothing to search for - everything goes to a single cluster
    if l >= r - 2:
        return [0] * x.shape[0], None

    prev_min_size = None
    while l < r - 2:
        n_clusters = int((l + r) / 2)
        logger.debug(f'l = {l}; r = {r}; n_clusters = {n_clusters}')
        model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward').fit(x)
        clusters_counter = Counter(model.labels_)
        assert len(clusters_counter.keys()) == n_clusters, "Incorrect clusters number"
        # Size of the smallest cluster
        min_size = clusters_counter.most_common()[-1][1]
        # Track previous_min_size to cope with situation with super distant tiny clusters
        if prev_min_size != min_size and min_size < min_cluster_size or n_clusters > max_clusters:
            logger.debug(f'prev_min_size({prev_min_size}) != min_size({min_size}) < {min_cluster_size} or '
                         f'n_clusters = {n_clusters}  > {max_clusters}')
            # Too many clusters - shrink the upper bound
            r = n_clusters + 1
        else:
            # Acceptable clustering - raise the lower bound
            l = n_clusters
        prev_min_size = min_size

    # NOTE(review): `model` and `clusters_counter` belong to the LAST fitted clustering,
    # which may be the one rejected by the size check above - confirm this is intended.
    logger.debug(f'Number of clusters = {n_clusters}')
    logger.debug(f'Min cluster size = {prev_min_size}')
    logger.debug('Reorder clusters by size descending')
    reorder_map = {c: i for i, (c, _) in enumerate(clusters_counter.most_common())}
    return [reorder_map[c] for c in model.labels_], model.children_

In [None]:
# Cluster embeddings with min_cluster_size=30 and max_clusters=50
clusters, dendrogram_children = cluster_and_sort(weighted_node_embeddings, 30, 50)

print('Cluster sizes')
# Number of papers per cluster
t = pd.DataFrame({'cluster': clusters}).groupby('cluster').size().reset_index(name='size')
sns.barplot(data=t, x='cluster', y='size')
plt.tight_layout()
plt.show()

In [None]:
df['comp'] = pd.Series(index=indx, data=clusters, dtype=int)

In [None]:
def components_ratio_data(df):
    """
    Compute the percentage of papers per topic among papers with an assigned topic.

    :param df: dataframe with id and comp columns; rows with comp >= 0 are considered assigned
    :return: topic labels numbered from 1 (as strings), percentage of papers for each topic
    """
    assigned = df[df['comp'] >= 0]
    papers_per_comp = dict(assigned.groupby('comp')['id'].count())
    n_assigned = len(assigned)

    comps, ratios = [], []
    for comp, n_papers in papers_per_comp.items():
        # comp + 1 is used to start numbering from 1
        comps.append(str(comp + 1))
        ratios.append(100 * n_papers / n_assigned)
    return comps, ratios

def plot_components_ratio(df, plot_width=1200, plot_height=800):
    """
    Bar chart with the percentage of papers per topic.

    :param df: dataframe with id and comp columns
    :param plot_width: figure width
    :param plot_height: figure height
    :return: bokeh figure
    """
    comps, ratios = components_ratio_data(df)
    n_comps = len(comps)
    cmap = Plotter.factors_colormap(n_comps)
    colors = [Plotter.color_to_rgb(cmap(i)) for i in range(n_comps)]
    source = ColumnDataSource(data=dict(comps=comps, ratios=ratios, colors=colors))

    # The hover tool has to be requested explicitly: with tools="save" only, `p.hover`
    # is empty and the tooltips assignment below has nothing to configure
    p = figure(plot_width=plot_width, plot_height=plot_height,
               toolbar_location="right", tools="hover,save", x_range=comps)
    p.vbar(x='comps', top='ratios', width=0.8, fill_alpha=0.5, color='colors', source=source)
    p.hover.tooltips = [("Topic", '@comps'), ("Amount", '@ratios %')]
    p.sizing_mode = 'stretch_width'
    p.xaxis.axis_label = 'Topic'
    p.yaxis.axis_label = 'Percentage of papers'
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    p.axis.minor_tick_line_color = None
    p.outline_line_color = None

    return p

show(plot_components_ratio(df))

In [None]:
from bokeh.plotting import figure, output_file, save, reset_output, output_notebook

# Export the topics ratios plot to a standalone HTML file
logging.info('Save topics ratios to file')
output_file(filename="sizes.html", title="Topics sizes")
save(plot_components_ratio(df))
# Switch bokeh output back to the notebook for the following cells
reset_output()
output_notebook()

In [None]:
import pandas as pd
from pysrc.papers.analysis.topics import get_topics_description

print('Computing clusters keywords')
# Group paper ids by the cluster computed on embeddings: cluster -> list of ids
clusters_pids = pd.DataFrame(dict(id=node_ids, comp=clusters)).groupby('comp')['id'].apply(list).to_dict()

# Describe every cluster, reusing corpus terms and counts already computed by the analyzer
clusters_description = get_topics_description(
    analyzer.df, clusters_pids,
    analyzer.corpus_terms, analyzer.corpus_counts,
    query=analyzer.query,
    n_words=analyzer.TOPIC_DESCRIPTION_WORDS
)

In [None]:
# Encode up to 20 first description terms of each cluster as a comma separated
# "term:value" string, values are formatted with 3 digits after the decimal point
kwds = [(comp, ','.join([f'{t}:{v:.3f}' for t, v in vs[:20]]))
        for comp, vs in clusters_description.items()]
kwd_df = pd.DataFrame(kwds, columns=['comp', 'kwd'])
display(kwd_df.head())

## Clusters visualization

In [None]:
print('Plot clusters in embeddings coordinates')
plot_embeddings(df, df['comp'])

In [None]:
# Partition of papers by clusters computed on embeddings: paper id -> cluster
# NOTE(review): df['comp'] was assigned from a Series indexed by `indx`; rows missing from
# that index would hold NaN here - confirm that every paper in df got a cluster
clusters_partition = dict(zip(df['id'], df['comp']))

similarity_df, topics = topics_similarity_data(
    analyzer.similarity_graph, clusters_partition
)

# Diagonal cells describe similarity inside a cluster, all the others - between clusters
similarity_df['type'] = ['Inside' if x == y else 'Outside' 
                         for (x, y) in zip(similarity_df['comp_x'], similarity_df['comp_y'])]
sns.displot(similarity_df, x="similarity", hue="type", kind="kde")
plt.show()

In [None]:
show(heatmap_topics_similarity(similarity_df, topics))

In [None]:
from bokeh.plotting import figure, output_file, save, reset_output, output_notebook

# Export the topics similarity heatmap to a standalone HTML file
logging.info('Save similarity heatmap to file')
output_file(filename="similarity.html", title="Topics mean similarity")
save(heatmap_topics_similarity(similarity_df, topics))
# Switch bokeh output back to the notebook for the following cells
reset_output()
output_notebook()

In [None]:
from pysrc.papers.plot.plot_preprocessor import PlotPreprocessor

# Range of publication years among the papers
max_year, min_year = df['year'].max(), df['year'].min()
# Summary data is computed for all the distinct values of df['comp']
plot_components, data = PlotPreprocessor.component_size_summary_data(
    df, sorted(set(df['comp'])), min_year, max_year
)

# NOTE(review): relies on the private Plotter._topics_years_distribution helper
show(Plotter._topics_years_distribution(df, kwd_df, plot_components, data, max_year, min_year))

In [None]:
from bokeh.plotting import figure, output_file, save, reset_output, output_notebook

# Export the topics by years plot to a standalone HTML file
logging.info('Save topics years to file')
output_file(filename="years.html", title="Topics by years")
save(Plotter._topics_years_distribution(df, kwd_df, plot_components, data, max_year, min_year))
# Switch bokeh output back to the notebook for the following cells
reset_output()
output_notebook()

### Topics hierarchy

In [None]:
import math
from math import pi, sin, cos, fabs, pow
from bokeh.colors import RGB
from queue import PriorityQueue

from pysrc.papers.utils import rgb2hex


from more_itertools import unique_everseen

def compute_clusters_dendrogram_children(clusters, children):
    """
    Reduce a dendrogram over individual objects to a dendrogram over clusters.

    Merges of nodes that belong to the same cluster are collapsed; only merges that
    join different clusters are kept.

    :param clusters: cluster per leaf, leaf i has number i
    :param children: pairs (u, v) of merged nodes, merge number i creates node len(clusters) + i
    :return: list of (u, v, node) merges, where nodes lying within a single cluster
             are replaced with the number of that cluster
    """
    leaf_cluster = dict(enumerate(clusters))
    n_leaves = len(leaf_cluster)
    # Cluster of an inner node, None if the node spans several clusters
    merged_cluster = {}
    cross_cluster_merges = []

    def cluster_of(vertex):
        return leaf_cluster[vertex] if vertex in leaf_cluster else merged_cluster[vertex]

    for step, (left, right) in enumerate(children):
        parent = n_leaves + step
        left_cluster = cluster_of(left)
        right_cluster = cluster_of(right)
        within_single_cluster = (left_cluster is not None and right_cluster is not None
                                 and left_cluster == right_cluster)
        if within_single_cluster:
            merged_cluster[parent] = left_cluster
        else:
            merged_cluster[parent] = None  # Different clusters
            cross_cluster_merges.append((left, right, parent))

    def label(vertex):
        # Leaves and single-cluster nodes are named after their cluster, others keep their number
        if vertex in leaf_cluster:
            return leaf_cluster[vertex]
        cluster = merged_cluster.get(vertex)
        return vertex if cluster is None else cluster

    # Rename nodes to clusters
    return [(label(left), label(right), label(parent))
            for left, right, parent in cross_cluster_merges]


def convert_clusters_dendrogram_to_paths(clusters, children):
    """
    Convert clusters dendrogram to paths from each cluster up to the top of the dendrogram.

    :param clusters: cluster per object
    :param children: list of (u, v, node) merges of the clusters dendrogram
    :return: list of paths, one per cluster, with len(children) + 1 levels each;
             dict with the order of leaves, taken from the first level of the sorted paths
    """
    logger.debug('Converting agglomerate clustering clusters dendrogram format to path for visualization')
    # Every cluster starts its own path, each merge adds one more level to all the paths
    paths = [[cluster] for cluster in sorted(set(clusters))]
    for level, (u, v, merged) in enumerate(children):
        for path in paths:
            current = path[level]
            path.append(merged if current == u or current == v else current)

    logger.debug('Radix sort or paths to ensure no overlaps')
    for level in range(len(children)):
        paths.sort(key=lambda path: path[level])
        # Reorder next level to keep order of previous if possible.
        # The next level always exists, since every path has len(children) + 1 levels.
        next_level = level + 1
        order = {node: rank
                 for rank, node in enumerate(unique_everseen(path[next_level] for path in paths))}
        for path in paths:
            path[next_level] = order[path[next_level]]

    leaves_order = {leaf: rank
                    for rank, leaf in enumerate(unique_everseen(path[0] for path in paths))}
    return paths, leaves_order

def contrast_color(rgb):
    """
    Light foreground for dark background and vice versa.
    Idea taken from https://stackoverflow.com/a/1855903/418358

    :param rgb: background color with r, g and b attributes
                (presumably 0-255 channels, given the division by 255)
    :return: black RGB color for light backgrounds, white RGB color for dark ones
    """
    r, g, b = rgb.r, rgb.g, rgb.b
    # Counting the perceptive luminance - human eye favors green color...
    if 1 - (0.299 * r + 0.587 * g + 0.114 * b) / 255 < 0.5:
        return RGB(0, 0, 0)
    return RGB(255, 255, 255)


def topics_words(kwd_df, max_words):
    """
    Extract the first words of each topic from its encoded keywords.

    :param kwd_df: dataframe with the topic in the first column and keywords encoded as
                   comma separated "word:value" pairs in the second column
    :param max_words: maximum number of words per topic
    :return: dict topic -> list of words; topics with empty keywords are omitted
    """
    words2show = {}
    # Columns are addressed by position with iloc: `row[0]` on a row with string labels
    # relies on the deprecated positional fallback of Series.__getitem__
    for comp, kwds in zip(kwd_df.iloc[:, 0], kwd_df.iloc[:, 1]):
        if kwds != '':  # Correctly process empty freq_kwds encoding
            words2show[comp] = [p.split(':')[0] for p in kwds.split(',')[:max_words]]
    return words2show


def topics_hierarchy_with_keywords(df, kwd_df, clusters, dendrogram_children, 
                                   max_words=3, plot_width=1200, plot_height=800):
    """
    Plot topics hierarchy as a circular dendrogram: topics are placed on a circle,
    merges are drawn towards the center, and words of each topic are written around.

    :param df: papers dataframe, comp column is used to compute topics sizes
    :param kwd_df: dataframe with topics and their encoded keywords, see topics_words
    :param clusters: cluster per object used to build the dendrogram
    :param dendrogram_children: merges of the hierarchical clustering, or None
    :param max_words: maximum number of words shown per topic
    :param plot_width: figure width
    :param plot_height: figure height
    :return: bokeh figure, or None when dendrogram_children is None
    """
    comp_sizes = Counter(df['comp'])
    logger.debug('Computing dendrogram for clusters')
    if dendrogram_children is None:
        return None
    clusters_dendrogram = compute_clusters_dendrogram_children(clusters, dendrogram_children)
    paths, leaves_order = convert_clusters_dendrogram_to_paths(clusters, clusters_dendrogram)

    # Configure dimensions
    p = figure(x_range=(-180, 180),
               y_range=(-160, 160),
               tools="save",
               width=plot_width, height=plot_height)
    x_coefficient = 1.2  # Ellipse x coefficient
    y_delta = 40  # Extra space near pi / 2 and 3 * pi / 2
    n_topics = len(leaves_order)
    radius = 100  # Radius of circular dendrogram
    dendrogram_len = len(paths[0])
    # Radial distance between two consecutive levels of the dendrogram
    d_radius = radius / dendrogram_len
    # Angle between two neighbour topics
    d_degree = 2 * pi / n_topics

    # Leaves coordinates
    leaves_degrees = dict((v, i * d_degree) for v, i in leaves_order.items())

    # Draw dendrogram - from bottom to top
    ds = leaves_degrees.copy()
    for i in range(1, dendrogram_len):
        # Angle of a node on the next level is the mean of angles of the paths coming into it
        next_ds = {}
        for path in paths:
            if path[i] not in next_ds:
                next_ds[path[i]] = []
            next_ds[path[i]].append(ds[path[i - 1]])
        for v, nds in next_ds.items():
            next_ds[v] = np.mean(nds)

        # Segment from the current level to the next one, which is closer to the center
        for path in paths:
            current_d = ds[path[i - 1]]
            next_d = next_ds[path[i]]
            p.line([cos(current_d) * d_radius * (dendrogram_len - i),
                    cos(next_d) * d_radius * (dendrogram_len - i - 1)],
                   [sin(current_d) * d_radius * (dendrogram_len - i),
                    sin(next_d) * d_radius * (dendrogram_len - i - 1)],
                   line_color='lightgray')
        ds = next_ds

    # Draw leaves
    n_comps = len(comp_sizes)
    cmap = Plotter.factors_colormap(n_comps)
    # NOTE(review): colors are keyed by 0..n_comps-1, this assumes topics are numbered the same way
    topics_colors = dict((i, Plotter.color_to_rgb(cmap(i))) for i in range(n_comps))
    xs = [cos(d) * d_radius * (dendrogram_len - 1) for _, d in leaves_degrees.items()]
    ys = [sin(d) * d_radius * (dendrogram_len - 1) for _, d in leaves_degrees.items()]
    # Marker size grows with logarithm of the topic size, the growth is capped by 10
    sizes = [20 + int(min(10, math.log(comp_sizes[v]))) for v, _ in leaves_degrees.items()]
    comps = [v + 1 for v, _ in leaves_degrees.items()]
    colors = [topics_colors[v] for v, _ in leaves_degrees.items()]
    # NOTE: `ds` is reused here as a data source, it no longer holds angles
    ds = ColumnDataSource(data=dict(x=xs, y=ys, size=sizes, comps=comps, color=colors))
    p.circle(x='x', y='y', size='size', fill_color='color', line_color='black', source=ds)

    # Topics labels
    p.text(x=[cos(d) * d_radius * (dendrogram_len - 1) for _, d in leaves_degrees.items()],
           y=[sin(d) * d_radius * (dendrogram_len - 1) for _, d in leaves_degrees.items()],
           text=[str(v + 1) for v, _ in leaves_degrees.items()],
           text_align='center', text_baseline='middle', text_font_size='10pt',
           text_color=[contrast_color(topics_colors[v]) for v, _ in leaves_degrees.items()])

    # Show words for components - most popular words per component
    # NOTE(review): `topics` is not used below
    topics = leaves_order.keys()
    words2show = topics_words(kwd_df, max_words)

    # Visualize words
    for v, d in leaves_degrees.items():
        if v not in words2show:  # No super-specific words for topic
            continue
        words = words2show[v]
        xs = []
        ys = []
        for i, word in enumerate(words):
            # Spread words of the topic within its sector around the topic angle
            wd = d + d_degree * (i - len(words) / 2) / len(words)
            # Make word degree in range 0 - 2 * pi
            if wd < 0:
                wd += 2 * pi
            elif wd > 2 * pi:
                wd -= 2 * pi
            xs.append(cos(wd) * radius * x_coefficient)
            y = sin(wd) * radius
            # Additional vertical space around pi/2 and 3*pi/2
            if pi / 4 <= wd < 3 * pi / 4:
                y += pow(pi / 4 - fabs(pi / 2 - wd), 1.5) * y_delta
            elif 5 * pi / 4 <= wd < 7 * pi / 4:
                y -= pow(pi / 4 - fabs(3 * pi / 2 - wd), 1.5) * y_delta
            ys.append(y)

        # Different text alignment for left | right parts
        p.text(x=[x for x in xs if x > 0], y=[y for i, y in enumerate(ys) if xs[i] > 0],
               text=[w for i, w in enumerate(words) if xs[i] > 0],
               text_align='left', text_baseline='middle', text_font_size='10pt',
               text_color=topics_colors[v])
        p.text(x=[x for x in xs if x <= 0], y=[y for i, y in enumerate(ys) if xs[i] <= 0],
               text=[w for i, w in enumerate(words) if xs[i] <= 0],
               text_align='right', text_baseline='middle', text_font_size='10pt',
               text_color=topics_colors[v])

    # Hide axes, grid and outline - only the dendrogram itself is shown
    p.sizing_mode = 'stretch_width'
    p.axis.major_tick_line_color = None
    p.axis.minor_tick_line_color = None
    p.axis.major_label_text_color = None
    p.axis.major_label_text_font_size = '0pt'
    p.axis.axis_line_color = None
    p.grid.grid_line_color = None
    p.outline_line_color = None
    return p



In [None]:
logging.info('Plotting topics hierarchy with keywords')
show(topics_hierarchy_with_keywords(df, kwd_df, clusters, dendrogram_children, max_words=2))

In [None]:
from bokeh.plotting import figure, output_file, save, reset_output, output_notebook

# Export the topics dendrogram to a standalone HTML file, as a square plot with 3 words per topic
logging.info('Save topics hierarchy with keywords to file')
output_file(filename="topics.html", title="Topics dendrogram")
save(topics_hierarchy_with_keywords(df, kwd_df, clusters, dendrogram_children, 
                                    max_words=3, plot_height=1200, plot_width=1200))
# Switch bokeh output back to the notebook for the following cells
reset_output()
output_notebook()

## Plot and save similarity graph

In [None]:
# Tags of a paper are up to 5 first description terms of its cluster, comma separated
df['tags'] = [','.join(t for t, _ in clusters_description[c][:5]) for c in df['comp']]

logging.info('Saving papers and components dataframes')
df.to_csv('papers.csv', index=False)
# Topics are numbered from 1 in the exported file; a copy keeps kwd_df itself unchanged
t = kwd_df.copy()
t['comp'] += 1
t.to_csv('tags.csv', index=False)

In [None]:
from bokeh.models import GraphRenderer, StaticLayoutProvider, Circle, HoverTool, MultiLine, LabelSet

from bokeh.models.graphs import NodesAndLinkedEdges


def similarity_graph(g, df, plot_width=600, plot_height=600):
    """
    Plot papers similarity graph using precomputed 2D coordinates of papers.

    :param g: graph whose edges are drawn, nodes are paper ids
    :param df: papers dataframe with id, title, authors, journal, year, total, tags, comp,
               x and y columns
    :param plot_width: figure width
    :param plot_height: figure height
    :return: bokeh figure
    """
    nodes = df['id']
    comps = df['comp']
    unique_comps = sorted(set(comps))
    cmap = Plotter.factors_colormap(len(unique_comps))
    palette = {c: Plotter.color_to_rgb(cmap(i)).to_hex() for i, c in enumerate(unique_comps)}

    graph = GraphRenderer()
    node_source = graph.node_renderer.data_source
    node_source.add(df['id'], 'index')
    node_source.data['id'] = df['id']
    node_source.data['title'] = df['title']
    node_source.data['authors'] = df['authors']
    node_source.data['journal'] = df['journal']
    node_source.data['year'] = df['year']
    node_source.data['cited'] = df['total']
    node_source.data['tags'] = df['tags']
    # Limit size
    node_source.data['size'] = df['total'] * 20 / df['total'].max() + 5
    # Topics are shown to the user numbered from 1
    node_source.data['topic'] = [c + 1 for c in comps]
    node_source.data['color'] = [palette[c] for c in comps]

    graph.edge_renderer.data_source.data = dict(start=[u for u, _ in g.edges],
                                                end=[v for _, v in g.edges])

    # start of layout code
    x = df['x']
    y = df['y']
    xrange = max(x) - min(x)
    yrange = max(y) - min(y)
    # Leave 5% margins around the points
    p = figure(plot_width=plot_width,
               plot_height=plot_height,
               x_range=(min(x) - 0.05 * xrange, max(x) + 0.05 * xrange),
               y_range=(min(y) - 0.05 * yrange, max(y) + 0.05 * yrange),
               tools="pan,tap,wheel_zoom,box_zoom,reset,save")
    p.xaxis.major_tick_line_color = None  # turn off x-axis major ticks
    p.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks
    p.yaxis.major_tick_line_color = None  # turn off y-axis major ticks
    p.yaxis.minor_tick_line_color = None  # turn off y-axis minor ticks
    p.xaxis.major_label_text_font_size = '0pt'  # preferred method for removing tick labels
    p.yaxis.major_label_text_font_size = '0pt'  # preferred method for removing tick labels
    p.grid.grid_line_color = None
    p.outline_line_color = None
    p.sizing_mode = 'stretch_width'

    # Tooltip fields must match the columns of the node data source: the number of
    # citations is stored as 'cited' there, so '@total' would never be resolved
    p.add_tools(HoverTool(tooltips=plotter._html_tooltips([
        ("Author(s)", '@authors'),
        ("Journal", '@journal'),
        ("Year", '@year'),
        ("Cited by", '@cited paper(s) total'),
        ("Topic", '@topic'),
        ("Tags", '@tags')])))

    graph_layout = dict(zip(nodes, zip(x, y)))
    graph.layout_provider = StaticLayoutProvider(graph_layout=graph_layout)

    graph.node_renderer.glyph = Circle(size='size', fill_alpha=0.7, line_alpha=0.7, fill_color='color')
    graph.node_renderer.hover_glyph = Circle(size='size', fill_alpha=1.0, line_alpha=1.0, fill_color='color')

    graph.edge_renderer.glyph = MultiLine(line_color='lightgrey', line_alpha=0.5, line_width=1)
    graph.edge_renderer.hover_glyph = MultiLine(line_color='grey', line_alpha=1.0, line_width=2)

    graph.inspection_policy = NodesAndLinkedEdges()
    p.renderers.append(graph)

    # Add labels at the mean coordinates of the papers of each topic
    lxs = [df.loc[df['comp'] == c]['x'].mean() for c in unique_comps]
    lys = [df.loc[df['comp'] == c]['y'].mean() for c in unique_comps]
    comp_labels = [f"#{c + 1}" for c in unique_comps]
    source = ColumnDataSource({'x': lxs, 'y': lys, 'name': comp_labels})
    labels = LabelSet(x='x', y='y', text='name', source=source,
                      background_fill_color='white', text_font_size='11px', background_fill_alpha=.9)
    p.renderers.append(labels)

    return p


#### Save similarity plots to html

In [None]:
from pysrc.papers.analysis.graph import local_sparse

logging.info('Visualize structure graph using projected coordinates')
wsg = to_weighted_graph(analyzer.similarity_graph, weight_func=PapersAnalyzer.similarity)

# Limit total number of edges in sparse graph: lower the sparsification exponent `e`
# in steps of 0.1 until the graph has at most 3 edges per node on average,
# or until `e` reaches its lower bound of 0.1.
gs = wsg
e = 1.0
while e > 0.1 and gs.number_of_edges() / gs.number_of_nodes() > 3:
    # Round after every step: repeated `e -= 0.1` accumulates binary floating point error,
    # ends up slightly above 0.1 and would trigger one extra pass with e close to 0.
    e = round(e - 0.1, 1)
    logging.info(f'Testing e={e}')
    gs = local_sparse(wsg, e)
logger.info(f'Sparse graph e={e} nodes={gs.number_of_nodes()} edges={gs.number_of_edges()}')

show(similarity_graph(gs, df))

In [None]:
from bokeh.plotting import figure, output_file, save, reset_output, output_notebook

logging.info('Saving papers similarity graph for bokeh')
# Redirect bokeh output to a standalone HTML file and save a larger version of the plot into it
output_file(filename="papers.html", title="Papers similarity graph")
save(similarity_graph(gs, df, plot_width=1200, plot_height=1200))
# Switch bokeh back to inline notebook output for the following cells
reset_output()
output_notebook()

In [None]:
import json
import jinja2
import networkx as nx

from pysrc.papers.utils import cut_authors_list

logging.info('Saving papers similarity graph for cytoscape.js')

# Short description of every topic: its first 5 terms joined by commas
topics_tags = {c: ','.join(t for t, _ in clusters_description[c][:5]) for c in sorted(set(df['comp']))}

logger.debug('Creating graph')
# Copy only the topology of the sparse graph (edge attributes are not transferred);
# nodes without any edge are added explicitly afterwards.
gss = nx.Graph()
gss.add_edges_from(gs.edges())
gss.add_nodes_from(gs.nodes())

logger.debug('Collect attributes for nodes')
# Index papers by id once instead of scanning the whole dataframe for every node.
# keep='first' matches the former `.values[0]` lookup should an id ever be repeated.
papers_by_id = df.drop_duplicates(subset='id', keep='first').set_index('id')
attrs = {}
for node, sel in papers_by_id.iterrows():
    attrs[node] = dict(
        title=sel['title'],
        authors=cut_authors_list(sel['authors']),
        journal=sel['journal'],
        year=int(sel['year']),
        cited=int(sel['total']),
        topic=int(sel['comp']),
        # These can be heavy
        abstract=sel['abstract'],
        mesh=sel['mesh'],
        keywords=sel['keywords'],
    )

nx.set_node_attributes(gss, attrs)
graph_cs = nx.cytoscape_data(gss)['elements']

logger.debug('Layout')
maxy = df['y'].max()
for node_cs in graph_cs['nodes']:
    nid = node_cs['data']['id']
    # Adjust vertical axis with bokeh graph: y is mirrored around the maximal y value
    node_cs['position'] = dict(x=int(papers_by_id.at[nid, 'x'] * 8),
                               y=int((maxy - papers_by_id.at[nid, 'y']) * 6))

# Explicit encoding keeps reading the template and writing the page independent of the OS locale
with open('papers_template.html', encoding='utf-8') as f:
    text = f.read()

# Render the template from a plain string; all data is passed as JSON snippets
html = jinja2.Environment(loader=jinja2.BaseLoader()).from_string(text).render(
    topics_palette_json=json.dumps(Plotter.topics_palette(df)),
    topics_description_json=json.dumps(topics_tags),
    graph_cytoscape_json=json.dumps(graph_cs)
)

with open('papers_interactive.html', 'w', encoding='utf-8') as f:
    f.write(html)

logger.debug('Done')

## Authors graph

In [None]:
def fa(authors, first_last_only=True):
    """
    Pick the authors of a paper that are taken into account.

    Args:
        authors: list of author names.
        first_last_only: when True and there are more than two authors,
            only the first and the last one are kept.

    Returns:
        The given list itself when it is returned unchanged, otherwise a new
        two-element list `[first, last]`.
    """
    if len(authors) > 2 and first_last_only:
        return [authors[0], authors[-1]]
    return authors

In [None]:
from tqdm.auto import tqdm

def compute_authors_citations_and_papers(df):
    """
    Aggregate citations and number of papers per author.

    Only the authors selected by `fa` (with its default arguments) from the
    ', '-separated 'authors' string of every paper are counted.

    Args:
        df: papers dataframe with 'authors', 'total' and 'title' columns.

    Returns:
        Tuple of two dicts keyed by author name: summed 'total' values of
        the author's papers, and the number of the author's papers.
    """
    logger.debug('Compute author citations')
    citations_by_author = {}
    for _, paper in tqdm(df[['authors', 'total']].iterrows()):
        for name in fa(paper['authors'].split(', ')):
            citations_by_author[name] = citations_by_author.get(name, 0) + paper['total']

    logger.debug('Compute number of papers per author')
    papers_by_author = {}
    for _, paper in df[['title', 'authors']].iterrows():
        for name in fa(paper['authors'].split(', ')):
            papers_by_author[name] = papers_by_author.get(name, 0) + 1

    return citations_by_author, papers_by_author

In [None]:
import numpy as np
from pysrc.papers.analysis.metadata import popular_authors, popular_journals

logging.info("Analyzing groups of similar authors")
authors_citations, authors_papers = compute_authors_citations_and_papers(analyzer.df)
logging.info(f"Authors {len(authors_papers)}")
# Threshold on the number of papers per author: the 90th percentile over all authors
min_threshold = np.percentile(list(authors_papers.values()), 90)
logging.info(f'Min papers for author {min_threshold}')
# Number of authors that reach the threshold
logging.info(f'Filtered authors: {sum(v >= min_threshold for v in authors_papers.values())}')

In [None]:
def build_authors_similarity_graph(df,
                                   cocit_grouped_df, bibcoupling_df, citations_graph, texts_similarity,
                                   check_author_func=lambda a: True):
    """
    Build an undirected graph of authors from several kinds of relations between papers.

    Every relation between two papers is propagated to all pairs of their authors
    (as selected by `fa`) and accumulated by `update_edge` in a dedicated edge attribute:
    'authorship', 'cocitation', 'bibcoupling', 'text' and 'citation'.

    Args:
        df: papers dataframe with 'id' and 'authors' (', '-separated string) columns.
        cocit_grouped_df: dataframe with 'cited_1', 'cited_2' and 'total' columns.
        bibcoupling_df: dataframe with 'citing_1', 'citing_2' and 'total' columns, may be empty.
        citations_graph: graph whose `edges` are pairs of paper ids.
        texts_similarity: sequence aligned with the rows of df; item i is a queue-like object
            (`empty()` / `get()`) yielding `(similarity, j)` pairs, j being a row position in df.
            NOTE(review): the queues are read with `get()`, which presumably drains them,
            so the caller's `texts_similarity` may be empty afterwards - confirm.
        check_author_func: predicate on an author name; an edge is recorded only when
            both of its authors pass.

    Returns:
        networkx.Graph with authors as nodes.

    Raises:
        KeyError: when a relation references a paper id that is not present in df.
    """
    # Resolve the authors of every paper once instead of scanning df for each relation.
    # The first row wins for a repeated id, as with a `.values[0]` lookup.
    paper_authors = {}
    for pid, authors in zip(df['id'], df['authors']):
        if pid not in paper_authors:
            paper_authors[pid] = fa(authors.split(', '))

    result = nx.Graph()

    def link(authors1, authors2, name, value):
        # Add `value` to attribute `name` for every accepted pair of authors of two papers
        for a1, a2 in itertools.product(authors1, authors2):
            if check_author_func(a1) and check_author_func(a2):
                update_edge(result, a1, a2, name, value)

    logger.debug('Processing papers')
    for authors in tqdm(df['authors']):
        # Co-authorship: every unordered pair of authors of the same paper
        for a1, a2 in itertools.combinations(fa(authors.split(', ')), 2):
            if check_author_func(a1) and check_author_func(a2):
                update_edge(result, a1, a2, 'authorship', 1)

    logger.debug('Processing co-citations')
    for el in tqdm(cocit_grouped_df[['cited_1', 'cited_2', 'total']].values):
        start, end, cocitation = str(el[0]), str(el[1]), float(el[2])
        link(paper_authors[start], paper_authors[end], 'cocitation', cocitation)

    logger.debug('Bibliographic coupling')
    if len(bibcoupling_df) > 0:
        for el in tqdm(bibcoupling_df[['citing_1', 'citing_2', 'total']].values):
            start, end, bibcoupling = str(el[0]), str(el[1]), float(el[2])
            link(paper_authors[start], paper_authors[end], 'bibcoupling', bibcoupling)

    logger.debug('Text similarity')
    pids = list(df['id'])
    if len(df) >= 2:
        for i, pid1 in enumerate(tqdm(df['id'])):
            similarity_queue = texts_similarity[i]
            while not similarity_queue.empty():
                similarity, j = similarity_queue.get()
                link(paper_authors[pid1], paper_authors[pids[j]], 'text', similarity)

    logger.debug('Citations')
    for u, v in tqdm(citations_graph.edges):
        link(paper_authors[u], paper_authors[v], 'citation', 1)

    return result


def update_edge(graph, a1, a2, name, value):
    """
    Add `value` to the attribute `name` of the edge between `a1` and `a2`.

    The edge is created when missing, and a missing attribute starts from 0.
    Nothing is done for equal endpoints, so self-loops are never created.
    Endpoints are ordered before the edge is accessed.
    """
    if a1 == a2:
        return
    low, high = (a2, a1) if a1 > a2 else (a1, a2)
    if not graph.has_edge(low, high):
        graph.add_edge(low, high)
    attributes = graph[low][high]
    attributes[name] = attributes.get(name, 0) + value

In [None]:
import networkx as nx
import community
import itertools

# NOTE: rebinds the notebook-wide `logger` name to a logger called 'Test'
logger = logging.getLogger('Test')

# Only authors with at least `min_threshold` papers are accepted as edge endpoints
authors_similarity_graph = build_authors_similarity_graph(
    analyzer.df, analyzer.cocit_grouped_df,
    analyzer.bibliographic_coupling_df,
    analyzer.citations_graph,
    analyzer.texts_similarity,
    check_author_func=lambda a: authors_papers[a] >= min_threshold
)

logging.info(f'Built authors graph - '
             f'{len(authors_similarity_graph.nodes())} nodes and {len(authors_similarity_graph.edges())} edges')

### Node2vec embeddings for authors graph

In [None]:
logger.debug('Compute aggregated similarity using co-authorship')
# Co-authorship is weighted 100 times on top of the regular papers similarity
ga = to_weighted_graph(authors_similarity_graph,
                       weight_func=lambda d: 100 * d.get('authorship', 0) + PapersAnalyzer.similarity(d))
e = 1.0
gs = local_sparse(ga, e)
# Limit total number of edges to estimate walk probabilities: lower `e` in steps of 0.1
# until there are at most 10 edges per node on average or `e` reaches 0.1
while e > 0.1 and gs.number_of_edges() / gs.number_of_nodes() > 10:
    # Round after every step: repeated `e -= 0.1` accumulates binary floating point error,
    # ends up slightly above 0.1 and would trigger one extra pass with e close to 0.
    e = round(e - 0.1, 1)
    gs = local_sparse(ga, e)
logger.debug(f'Sparse graph for node2vec e={e} nodes={gs.number_of_nodes()} edges={gs.number_of_edges()}')
authors_node_ids, authors_weighted_node_embeddings = node2vec(gs)

In [None]:
logger.debug('Apply t-SNE transformation on node embeddings')
authors_tsne = TSNE(n_components=2, random_state=42)
# Use the estimator created above; calling `tsne` here would depend on a variable
# left over from another cell and ignore the parameters chosen for authors.
authors_weighted_node_embeddings_2d = authors_tsne.fit_transform(authors_weighted_node_embeddings)

In [None]:
# Build dataframe combining information about authors and projected coordinates
authors_df = pd.DataFrame(dict(author=authors_node_ids, 
                               x=authors_weighted_node_embeddings_2d[:, 0],
                               y=authors_weighted_node_embeddings_2d[:, 1]))
# Per-author totals computed by compute_authors_citations_and_papers
authors_df['cited'] = [authors_citations[a] for a in authors_df['author']]
authors_df['papers'] = [authors_papers[a] for a in authors_df['author']]
# Marker size grows logarithmically with the number of citations
authors_df['size'] = [1 + 10 * np.log1p(authors_citations[a]) for a in authors_df['author']]
# Limit max size: rescale so that the largest marker has size 10 + 3 = 13
authors_df['size'] = authors_df['size'] * 10 / authors_df['size'].max() + 3

In [None]:
# Put every author into a single placeholder cluster 0
authors_df['cluster'] = 0
authors_df.head()

In [None]:
def plot_authors(authors_df, plot_width=600, plot_height=600):
    """
    Scatter plot of authors colored by cluster, with a label at the center of every cluster.

    Args:
        authors_df: dataframe with 'x', 'y', 'size' and 'cluster' columns; the hover tooltips
            additionally reference 'author', 'papers', 'cited' and 'tags'.
            The dataframe is not modified.
        plot_width: width passed to bokeh `figure`.
        plot_height: height passed to bokeh `figure`.

    Returns:
        bokeh figure.
    """
    clusters = sorted(set(authors_df['cluster']))
    cmap = Plotter.factors_colormap(len(clusters))
    palette = {c: Plotter.color_to_rgb(cmap(i)).to_hex() for i, c in enumerate(clusters)}

    # Colors are attached to a copy, so that the caller's dataframe is left untouched
    # (adding and then deleting a 'color' column would destroy a pre-existing one).
    ds = ColumnDataSource(authors_df.assign(color=[palette[c] for c in authors_df['cluster']]))

    x = authors_df['x']
    y = authors_df['y']
    x_min, x_max = min(x), max(x)
    y_min, y_max = min(y), max(y)
    # Leave a 5% margin around the points
    x_pad = 0.05 * (x_max - x_min)
    y_pad = 0.05 * (y_max - y_min)
    p = figure(plot_width=plot_width, plot_height=plot_height,
               x_range=(x_min - x_pad, x_max + x_pad),
               y_range=(y_min - y_pad, y_max + y_pad),
               tools="hover,pan,tap,wheel_zoom,box_zoom,reset,save",
               tooltips=[("Author", '@author'),
                         ("Papers", '@papers'),
                         ("Cited by", '@cited'),
                         ("Cluster", '@cluster'),
                         ("Tags", '@tags')])

    p.xaxis.major_tick_line_color = None  # turn off x-axis major ticks
    p.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks
    p.yaxis.major_tick_line_color = None  # turn off y-axis major ticks
    p.yaxis.minor_tick_line_color = None  # turn off y-axis minor ticks
    p.xaxis.major_label_text_font_size = '0pt'  # preferred method for removing tick labels
    p.yaxis.major_label_text_font_size = '0pt'  # preferred method for removing tick labels
    p.grid.grid_line_color = None
    p.outline_line_color = None
    p.sizing_mode = 'stretch_width'

    p.circle(x='x', y='y', fill_alpha=0.8, source=ds, size='size',
             line_color='black', fill_color='color')

    # Cluster labels are placed at the mean coordinates of the cluster members
    lxs = [authors_df.loc[authors_df['cluster'] == c]['x'].mean() for c in clusters]
    lys = [authors_df.loc[authors_df['cluster'] == c]['y'].mean() for c in clusters]
    cluster_labels = [f'#{c}' for c in clusters]
    source = ColumnDataSource({'x': lxs, 'y': lys, 'name': cluster_labels})
    labels = LabelSet(x='x', y='y', text='name', source=source,
                      background_fill_color='white', text_font_size='11px', background_fill_alpha=.9)
    p.renderers.append(labels)

    return p

In [None]:
# Placeholder values: plot_authors colors by 'cluster' and its tooltips reference '@tags'
authors_df['cluster'] = 0
authors_df['tags'] = 'n/a'
show(plot_authors(authors_df)) 

## Authors clustering

### Hierarchical clustering

In [None]:
# NOTE(review): cluster_and_sort is defined outside of this section; confirm the meaning of 10 and 100 there
author_clusters, _ = cluster_and_sort(authors_weighted_node_embeddings, 10, 100)

print('Cluster sizes')
# Number of authors per cluster: sum a column of ones within every cluster
t = pd.DataFrame({'cluster': author_clusters, 
                  'size': np.ones(len(author_clusters))}).groupby(['cluster']).sum().astype(int).reset_index()    
sns.barplot(data=t, x='cluster', y='size')
plt.tight_layout()
plt.show()

In [None]:
# Replace the placeholder cluster with the computed cluster of every author
authors_df['cluster'] = author_clusters
display(authors_df.head())

logging.info('Saving authors and groups dataframes')
# Only the authors dataframe is written by this cell
authors_df.to_csv('authors.csv', index=False)

### Use ego-splitting to compute possible overlapping groups of authors
Taken from https://github.com/benedekrozemberczki/EgoSplitting

In [None]:
import community
import networkx as nx
from tqdm import tqdm

class EgoNetSplitter:
    """Overlapping clustering of graph nodes with the Ego-Splitting framework.

    Described in the KDD '17 paper "Ego-Splitting Framework: from Non-Overlapping
    to Overlapping Clusters":
    https://www.eecs.yorku.ca/course_archive/2017-18/F/6412/reading/kdd17p145.pdf

    For every node the ego net (the subgraph of its neighbors) is split into connected
    components, each of them producing one persona of the node. The persona graph is
    clustered by the Louvain method and the clusters of the personas are collected
    per original node, so a node may belong to several clusters.

    Args:
        resolution (float): Resolution parameter of Python Louvain. Default 1.0.
    """

    def __init__(self, resolution=1.0):
        self.resolution = resolution

    def _create_egonet(self, node):
        """
        Split the ego net of a node into personas.

        Every connected component of the subgraph induced by the neighbors of the node
        gets a new persona id taken from the running counter `self.index`.

        Args:
            node: Node ID for egonet (ego node).
        """
        neighborhood = self.graph.subgraph(self.graph.neighbors(node))
        neighbor_to_persona = {}
        persona_ids = []
        for component in nx.connected_components(neighborhood):
            persona = self.index
            persona_ids.append(persona)
            for neighbor in component:
                neighbor_to_persona[neighbor] = persona
            self.index = persona + 1
        # neighbor -> persona of `node` that is connected to this neighbor
        self.components[node] = neighbor_to_persona
        # all persona ids created for `node`
        self.personalities[node] = persona_ids

    def _create_egonets(self):
        """
        Reset the state and create personas for every node of the graph.
        """
        self.components = {}
        self.personalities = {}
        self.index = 0
        print("Creating egonets.")
        for ego in tqdm(self.graph.nodes()):
            self._create_egonet(ego)

    def _map_personalities(self):
        """
        Build the reverse mapping: persona id -> original node.
        """
        persona_owner = {}
        for node in self.graph.nodes():
            for persona in self.personalities[node]:
                persona_owner[persona] = node
        self.personality_map = persona_owner

    def _get_new_edge_ids(self, edge):
        """
        Translate an edge of the original graph into an edge between personas.

        Args:
            edge: Edge being mapped to the new identifiers.

        Returns:
            Pair of persona ids: the persona of the first endpoint that sees the second
            endpoint, and the persona of the second endpoint that sees the first one.
        """
        first, second = edge[0], edge[1]
        return self.components[first][second], self.components[second][first]

    def _create_persona_graph(self):
        """
        Create the persona graph from the translated edges of the original graph.
        """
        print("Creating the persona graph.")
        self.persona_graph_edges = list(map(self._get_new_edge_ids, tqdm(self.graph.edges())))
        self.persona_graph = nx.from_edgelist(self.persona_graph_edges)

    def _create_partitions(self):
        """
        Cluster the persona graph and collect the clusters per original node.
        """
        print("Clustering the persona graph.")
        self.partitions = community.best_partition(self.persona_graph, resolution=self.resolution)
        self.overlapping_partitions = memberships = {node: [] for node in self.graph.nodes()}
        for persona, cluster in self.partitions.items():
            memberships[self.personality_map[persona]].append(cluster)

    def fit(self, graph):
        """
        Fit the Ego-Splitter clustering model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be clustered.
        """
        self.graph = graph
        self._create_egonets()
        self._map_personalities()
        self._create_persona_graph()
        self._create_partitions()

    def get_memberships(self):
        """Get the cluster memberships computed by `fit`.

        Return types:
            * **memberships** *(dictionary of lists)* - Cluster memberships.
        """
        return self.overlapping_partitions


In [None]:
# 0.8 is passed as the `resolution` argument of EgoNetSplitter
splitter = EgoNetSplitter(0.8)
splitter.fit(authors_similarity_graph)

# Flatten the per-author membership lists to count how often every cluster id occurs
ego_clusters = []
for a, cs in splitter.overlapping_partitions.items():
    ego_clusters.extend(cs)
print('Total clusters', len(set(ego_clusters)))
print('Clusters', Counter(ego_clusters))

## Analyze authors group topics

In [None]:
from pysrc.papers.analysis.text import compute_tfidf

def compute_groups_topics(authors_df):
    """
    Describe every cluster of authors by the terms with the largest tf-idf.

    Term counts of a paper are added to the cluster of each of its authors (all authors
    of the paper are considered here), divided by the number of authors in that cluster.
    Reads `analyzer.df`, `analyzer.corpus_counts` and `analyzer.corpus_terms` from the notebook.

    Args:
        authors_df: dataframe with 'author' and 'cluster' columns.

    Returns:
        Dict: group row number -> list of up to 10 `(term, tfidf)` pairs with positive tfidf,
        most significant first.
    """
    logging.info('Computing groups of authors topics')
    # NOTE(review): cluster values are used directly as row numbers below,
    # which assumes they are integers 0..k-1 - confirm for the clustering in use.
    groups_counts = \
        np.zeros(shape=(len(set(authors_df['cluster'])), analyzer.corpus_counts.shape[1]), dtype=np.float64)

    part_sizes = Counter(authors_df['cluster'])
    authors_clusters_map = dict(zip(authors_df['author'], authors_df['cluster']))

    # Access the column by name: positional `row[0]` on a label-indexed Series is deprecated in pandas.
    # NOTE(review): the dataframe index `i` is used as a row number of corpus_counts,
    # which assumes that analyzer.df has a default 0..n-1 index - confirm.
    for i, authors in tqdm(analyzer.df['authors'].items()):
        for a in authors.split(', '):
            if a in authors_clusters_map:
                group = authors_clusters_map[a]
                groups_counts[group, :] += analyzer.corpus_counts[i, :] / part_sizes[group]

    tfidf = compute_tfidf(groups_counts)

    logging.info('Take terms with the largest tfidf for topics')
    result = {}
    for g in range(groups_counts.shape[0]):
        counter = Counter()
        for i, t in enumerate(analyzer.corpus_terms):
            counter[t] += tfidf[g, i]
        # Ignore terms with insignificant frequencies
        result[g] = [(t, f) for t, f in counter.most_common(10) if f > 0]
    return result

groups_topics = compute_groups_topics(authors_df)
# One 'term:value' description line per group, values printed with 3 decimals
kwds = [(g, ','.join(f'{t}:{v:.3f}' for t, v in vs)) for g, vs in groups_topics.items()]
logging.info('Description\n' + '\n'.join(f'{g}: {kwd}' for g, kwd in kwds))

In [None]:
# One row per cluster: cluster id, its authors joined by ', ' and its topic terms joined by ','
groups_df = pd.DataFrame(columns=['group', 'authors', 'keywords'], dtype=object)
for g in sorted(set(authors_df['cluster'])):
    authors = ', '.join(authors_df.loc[authors_df['cluster'] == g]['author'])
    # Append at the next integer position
    groups_df.loc[len(groups_df)] = (g, authors, ','.join(t for t, _ in groups_topics[g]))

display(groups_df.head())

In [None]:
logging.info('Saving groups of authors with keywords')
# Written without the dataframe index
groups_df.to_csv('groups.csv', index=False)

In [None]:
# Tooltip tags of an author: the first 5 'term:value' pairs of the author's cluster
authors_df['tags'] = [', '.join(f'{t}:{v:.3f}' for t, v in groups_topics[c][:5]) for c in authors_df['cluster']]
show(plot_authors(authors_df)) 

In [None]:
logging.info('Saving author groups graph for bokeh')
# Redirect bokeh output to a standalone HTML file
output_file(filename="authors.html", title="Authors similarity graph")

save(plot_authors(authors_df, plot_width=1600, plot_height=1200))
# Switch bokeh back to inline notebook output for the following cells
reset_output()
output_notebook()

## Topic Evolution

In [None]:
# topic_evolution returns two values: the first one is shown with bokeh, the second one is printed
evolution_data, keywords_data = plotter.topic_evolution()
show(evolution_data)
print(keywords_data)

## PageRank for Citation Analysis

In [None]:
import networkx as nx

# Apply PageRank algorithm with damping factor of 0.5 (alpha) and convergence tolerance 1e-9 (tol);
# the result maps every node of the citations graph to its PageRank value
pr_nx = nx.pagerank(analyzer.citations_graph, alpha=0.5, tol=1e-9)

In [None]:
# node -> (selected ancestor, PageRank of that ancestor); (0, 0) means "no ancestor found"
ancestor = dict.fromkeys(analyzer.citations_graph, (0, 0))

# Select ancestor with highest PR for each node
for v in analyzer.citations_graph:
    # u iterates over the neighbors of v (its successors if the graph is directed)
    for u in analyzer.citations_graph[v]:
        anc, pr = ancestor[u]
        # Strict comparison: the first candidate found wins among equal PageRank values
        if pr_nx[v] > pr:
            ancestor[u] = (v, pr_nx[v])

In [None]:
# Directed graph with a single edge ancestor -> node for every node that has a selected ancestor
PRG = nx.DiGraph()
for v, anc in ancestor.items():
    u, pr = anc
    # pr == 0 is the placeholder of nodes without any ancestor
    if pr > 0:
        PRG.add_edge(u, v)

In [None]:
# Split the edges into parallel tuples of start and end nodes.
# NOTE: the unpacking raises ValueError when PRG has no edges.
start, end = zip(*list(PRG.edges()))

In [None]:
from bokeh.models import GraphRenderer, StaticLayoutProvider, Circle, HoverTool, MultiLine
from bokeh.models.graphs import NodesAndLinkedEdges

# Index papers by id once: scanning the whole dataframe several times per node is quadratic overall.
# keep='first' matches a `.values[0]` lookup should an id ever be repeated.
papers_by_id = analyzer.df.drop_duplicates(subset='id', keep='first').set_index('id')

# Only nodes that are present in the papers dataframe can be plotted
node_indices = [node for node in PRG.nodes() if node in papers_by_id.index]

years = []
year_counts = {}
titles = []
pageranks = []
size = []
y = []
for node in node_indices:
    year = papers_by_id.at[node, 'year']
    year_counts[year] = year_counts.get(year, 0) + 1
    years.append(year)
    # Layout: papers of the same year are stacked vertically, the k-th paper of a year gets y = k
    y.append(year_counts[year])

    titles.append(papers_by_id.at[node, 'title'])
    pageranks.append(pr_nx[node] * 100)
    size.append(pr_nx[node] * 1000)
max_year_count = max(year_counts.values())
min_year, max_year = min(years), max(years)

plot = figure(title="PageRank applied to citation filtering",
              x_range=(min_year - 1, max_year + 1), y_range=(0, max_year_count + 1),
              tools="", toolbar_location=None)

TOOLTIPS = """
    <div style="max-width: 320px">
        <div>
            <span style="font-size: 12px; font-weight: bold;">@title</span>
        </div>
        <div>
            <span style="font-size: 11px;">Year</span>
            <span style="font-size: 10px;">@year</span>
        </div>
        <div>
            <span style="font-size: 11px;">PMID</span>
            <span style="font-size: 10px;">@id</span>
        </div>
        <div>
            <span style="font-size: 11px;">PageRank</span>
            <span style="font-size: 10px;">@pagerank</span>
        </div>
    </div>
"""

plot.add_tools(HoverTool(tooltips=TOOLTIPS))

graph = GraphRenderer()

# Columns referenced by the tooltips and by the node glyphs
graph.node_renderer.data_source.add(node_indices, 'index')
graph.node_renderer.data_source.data['id'] = node_indices
graph.node_renderer.data_source.data['year'] = years
graph.node_renderer.data_source.data['title'] = titles
graph.node_renderer.data_source.data['pagerank'] = pageranks
graph.node_renderer.data_source.data['size'] = size
# Edges are intentionally not rendered at the moment:
# graph.edge_renderer.data_source.data = dict(start=start, end=end)

# Layout: x is the publication year, y is the position of the paper within its year
x = list(years)
graph_layout = dict(zip(node_indices, zip(x, y)))
graph.layout_provider = StaticLayoutProvider(graph_layout=graph_layout)

graph.node_renderer.glyph = Circle(size='size', fill_color='blue')
graph.node_renderer.hover_glyph = Circle(size='size', fill_color='green')

# graph.edge_renderer.glyph = MultiLine(line_color='black', line_alpha=1, line_width=1)
# graph.edge_renderer.hover_glyph = MultiLine(line_color='green', line_width=2)

graph.inspection_policy = NodesAndLinkedEdges()

plot.min_border_left = 75
plot.renderers.append(graph)

show(plot)

### Top Papers by PageRank

In [None]:
# Print the 10 nodes with the highest PageRank (as a percentage) together with the paper title
for pmid, pagerank in sorted(pr_nx.items(), key=lambda el: el[1], reverse=True)[:10]:
    sel = analyzer.df[analyzer.df['id'] == pmid]
    # A node of the citations graph may be missing from the papers dataframe
    # (other cells guard against it as well), so do not fail on the title lookup.
    title = sel['title'].values[0] if len(sel) > 0 else f'<paper {pmid} is not in the dataset>'
    print(f"{(100*pagerank):.2f} {title}")

### PageRank and citation ranking correlation

In [None]:
import numpy as np
from scipy.stats import spearmanr

# Rank papers by the number of citations, ties are broken by the order of appearance
analyzer.df['citation_rank'] = analyzer.df['total'].rank(method='first', ascending=False)
pagerank_rank = sorted(pr_nx.items(), key=lambda el: el[1], reverse=True)

# Paper id -> citation rank; the first row wins should an id ever be repeated
citation_rank_by_id = {}
for pid, rank in zip(analyzer.df['id'], analyzer.df['citation_rank']):
    if pid not in citation_rank_by_id:
        citation_rank_by_id[pid] = rank

# Rows of r: (position in the PageRank ordering, citation rank).
# Nodes that are missing from the papers dataframe are skipped; keeping a zero-filled (0, 0) row
# for them would add fake observations and bias the correlation.
r = np.array([(i, int(citation_rank_by_id[pmid]))
              for i, (pmid, _) in enumerate(pagerank_rank)
              if pmid in citation_rank_by_id], dtype=float).reshape(-1, 2)

TOP_X = [5, 10, 30, 50, 100]
for x in TOP_X:
    rho, _ = spearmanr(r[:x, 0], r[:x, 1])
    print(f'Spearman correlation coefficient for top {x}: {rho}')