# Pubtrends

Experimental notebook for hypothesis testing and development purposes.

**IMPORTANT** 
Turn on experimental features in the config file!

## Getting Started

1. Set the `SEARCH_QUERY` variable in the cell below to keywords that describe the branch of science you are interested in.
2. Run all cells & see the results.

In [None]:
# Free-text query; passed below to analyzer.search_terms() and analyzer.analyze_papers()
SEARCH_QUERY = 'human aging'

## Publication Analysis

In [None]:
import logging
import json
import seaborn as sns
import numpy as np
import pandas as pd
from scipy import stats
from collections import Counter


from bokeh.plotting import figure
from bokeh.plotting import show, output_notebook
from matplotlib import pyplot as plt


from pysrc.papers.config import PubtrendsConfig
from pysrc.papers.db.pm_postgres_loader import PubmedPostgresLoader
from pysrc.papers.db.ss_postgres_loader import SemanticScholarPostgresLoader
from pysrc.papers.analyzer import PapersAnalyzer
from pysrc.papers.plot.plotter import Plotter
from pysrc.papers.utils import SORT_MOST_CITED, SORT_MOST_RECENT, cut_authors_list

# Timestamped INFO-level logging for the root logger; `logger` is the notebook's own named logger
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
logger = logging.getLogger('notebook')

# Avoid info message about compilation flags
import tensorflow as tf
tf.get_logger().setLevel('ERROR')

# Direct Bokeh's show() output to notebook cells
output_notebook()
%matplotlib inline

In [None]:
# Sort order and maximum number of papers passed to analyzer.search_terms() below
SEARCH_SORT = SORT_MOST_CITED
SEARCH_PAPERS = 500

In [None]:
config = PubtrendsConfig(test=False)
# Switch on the evolution feature flag for this session
# (presumably required by plotter.topic_evolution() used near the end of the notebook - confirm)
config.feature_evolution_enabled = True
loader = PubmedPostgresLoader(config)
analyzer = PapersAnalyzer(loader, config)
try:
    # Search papers by the query, then run the analysis on the ids found
    ids = analyzer.search_terms(SEARCH_QUERY, limit=SEARCH_PAPERS, sort=SEARCH_SORT)
    analyzer.analyze_papers(ids, SEARCH_QUERY)
finally:
    # Always release the loader connection and the analyzer, even when search or analysis fails.
    # NOTE(review): if close_connection() raises, teardown() is skipped.
    loader.close_connection()
    analyzer.teardown()

# Report plots

In [None]:
# Plotter built on top of the analyzer; the report cells below obtain their figures from it
plotter = Plotter(analyzer)

In [None]:
# Display the figure returned by plotter.papers_by_year() in the notebook output
show(plotter.papers_by_year())

In [None]:
from pysrc.papers.analysis.text import get_frequent_tokens, get_topic_word_cloud_data

# Frequent tokens of analyzer.top_cited_df; the analysis query is passed to get_frequent_tokens as well
freq_kwds = get_frequent_tokens(analyzer.top_cited_df, query=analyzer.query)
# Only the first element of the returned pair is used; the second one is discarded
wc, _ = plotter.papers_word_cloud_and_callback(freq_kwds)
# Draw the word cloud as an image without axes
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

# Trends

In [None]:
# show(plotter.top_cited_papers())

In [None]:
# Display the figure returned by plotter.most_cited_per_year_papers()
show(plotter.most_cited_per_year_papers())

In [None]:
# Display the figure returned by plotter.fastest_growth_per_year_papers()
show(plotter.fastest_growth_per_year_papers())

## Frequent keywords timeline

In [None]:
from pysrc.papers.analysis.text import get_frequent_tokens

# Same computation as in the word-cloud cell above; repeated here for the keywords frequencies plot
freq_kwds = get_frequent_tokens(analyzer.top_cited_df, query=analyzer.query)

In [None]:
# Display the figure built by plotter.plot_keywords_frequencies() from the frequent tokens
print('Original keywords frequencies')
show(plotter.plot_keywords_frequencies(freq_kwds))

# Single paper citations dynamics

In [None]:
# Uses the first paper id in analyzer.df; change the index 0 to inspect another paper
show(plotter.paper_citations_per_year(analyzer.df, analyzer.df['id'].values[0]))

# Topics analysis

In [None]:
# First cluster papers
# show(plotter.topics_info_and_word_cloud_and_callback()[0][0])

In [None]:
# Display the figure returned by plotter.topic_years_distribution()
show(plotter.topic_years_distribution())

In [None]:
# Display the figure returned by plotter.heatmap_topics_similarity()
show(plotter.heatmap_topics_similarity())

In [None]:
from pysrc.papers.plot.plot_preprocessor import PlotPreprocessor

# Similarity data for the analyzer's partition; the 'comp_x', 'comp_y' and 'similarity'
# columns of the returned dataframe are used below
similarity_df, topics = PlotPreprocessor.topics_similarity_data(
    analyzer.similarity_graph, analyzer.partition
)

# Label each record by whether both components are the same ('Inside') or differ ('Outside')
similarity_df['type'] = ['Inside' if x == y else 'Outside' 
                         for (x, y) in zip(similarity_df['comp_x'], similarity_df['comp_y'])]
# Compare the similarity densities of the two groups
sns.displot(similarity_df, x="similarity", hue="type", kind="kde")
plt.show()

In [None]:
# Display the figure returned by plotter.topics_hierarchy_with_keywords()
show(plotter.topics_hierarchy_with_keywords())

## Similarities analysis

We expect the distribution of similarity edge weights to show that the majority of linked nodes are only weakly similar in terms of their attributes.

In [None]:
# One slot per edge of the similarity graph for every similarity component
bibcoupling_array = np.zeros(len(analyzer.similarity_graph.edges))
cocitations_array = np.zeros(len(analyzer.similarity_graph.edges))
citations_array = np.zeros(len(analyzer.similarity_graph.edges))
similarities_array = np.zeros(len(analyzer.similarity_graph.edges))
text_similarities_array = np.zeros(len(analyzer.similarity_graph.edges))

# Missing edge attributes count as 0; bibcoupling and co-citation values are stored as log1p
for i, (u, v, data) in enumerate(analyzer.similarity_graph.edges(data=True)):
    bibcoupling_array[i] = np.log1p(data.get('bibcoupling', 0))
    cocitations_array[i] = np.log1p(data.get('cocitation', 0))
    citations_array[i] = data.get('citation', 0)
    text_similarities_array[i] = data.get('text', 0)
    similarities_array[i] = PapersAnalyzer.similarity(data)
    
# Four panels side by side; the first three describe and plot non-zero values only
fig = plt.figure(figsize=(5 * 4, 5))
ax = plt.subplot(1, 4, 1)
print(f'Bibcoupling, non-zero {np.count_nonzero(bibcoupling_array)} of {len(bibcoupling_array)}')
bibcoupling_array = bibcoupling_array[np.nonzero(bibcoupling_array)]
print(stats.describe(bibcoupling_array))
sns.kdeplot(bibcoupling_array)
plt.title('Bibcoupling')
# plt.show()

ax = plt.subplot(1, 4, 2)
print(f'Co-citations, non-zero {np.count_nonzero(cocitations_array)} of {len(cocitations_array)}')
cocitations_array = cocitations_array[np.nonzero(cocitations_array)]
print(stats.describe(cocitations_array))
sns.kdeplot(cocitations_array)
plt.title('Co-citations')
# plt.show()

ax = plt.subplot(1, 4, 3)
print(f'Text similarities, non-zero {np.count_nonzero(text_similarities_array)} of {len(text_similarities_array)}')
text_similarities_array = text_similarities_array[np.nonzero(text_similarities_array)]
print(stats.describe(text_similarities_array))
sns.kdeplot(text_similarities_array)
plt.title('Text')
# plt.show

# The aggregated similarity is described and plotted over all edges, zeros included
ax = plt.subplot(1, 4, 4)
print(f'Similarities, non-zero {np.count_nonzero(similarities_array)} of {len(similarities_array)}')
print(stats.describe(similarities_array))
sns.kdeplot(similarities_array)
plt.title('Similarity')

plt.show()

# Direct citations are only counted, not plotted
print(f'Citations, non-zero {np.count_nonzero(citations_array)} of {len(citations_array)}')

### Additional text similarities exploration
We use a cutoff of 0.1 as the minimum text similarity and limit the number of such similarities to at most 20.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

print('Analyze similarities between all papers')
# Full pairwise cosine similarity matrix between the rows of analyzer.corpus_counts, flattened
# for the statistics (every pair is counted twice and the diagonal is included)
cos_similarities = cosine_similarity(analyzer.corpus_counts)
cos_similarities_array = cos_similarities.reshape(-1)
print(stats.describe(cos_similarities_array))
print('Q1', np.percentile(cos_similarities_array, 25), 
      'Q2', np.percentile(cos_similarities_array, 50), 
      'Q3', np.percentile(cos_similarities_array, 75))

fig = plt.figure(figsize=(5 * 2, 5))
ax = plt.subplot(1, 2, 1)
sns.kdeplot(cos_similarities_array)
plt.title('Cosine similarities among all papers')
# plt.show()

print('Analyze similarities between papers with direct citations')
# Position of each paper id in analyzer.df
# (assumes the rows of analyzer.corpus_counts follow the same order - TODO confirm)
pid_indx = {pid: i for i, pid in enumerate(analyzer.df['id'])}
cited_cos_similarities = []
# Keep the cosine similarity only for edges that carry a non-zero 'citation' attribute
for i, (u, v, data) in enumerate(analyzer.similarity_graph.edges(data=True)):
    if data.get('citation', 0) != 0:
        cited_cos_similarities.append(cos_similarities[pid_indx[u], pid_indx[v]])

print(stats.describe(cited_cos_similarities))
print('Q1', np.percentile(cited_cos_similarities, 25), 
      'Q2', np.percentile(cited_cos_similarities, 50), 
      'Q3', np.percentile(cited_cos_similarities, 75))
ax = plt.subplot(1, 2, 2)
sns.kdeplot(cited_cos_similarities)
plt.title('Cosine similarity between cited papers')
          
plt.show()                                     

In [None]:
# Degree distribution of the similarity graph
G = analyzer.similarity_graph
degrees = [d for (n, d) in G.degree()]
plt.title('Similarity graph degrees')
sns.kdeplot(data=degrees)          
plt.show()  
# NOTE(review): divides by the number of nodes, so an empty graph raises ZeroDivisionError
print('Average degree', sum(degrees) / float(G.number_of_nodes()))

In [None]:
# Display the figure returned by plotter.structure_graph()
show(plotter.structure_graph())

# Similarities graph embeddings with various Node2Vec params

In [None]:
from pysrc.papers.analysis.node2vec import node2vec
from pysrc.papers.analysis.graph import to_weighted_graph

# Weight every edge of the similarity graph with PapersAnalyzer.similarity
g = to_weighted_graph(analyzer.similarity_graph, weight_func=PapersAnalyzer.similarity)
# node2vec returns the node ids together with their embeddings; vector_size=64 requests 64-dimensional vectors
node_ids, weighted_node_embeddings = node2vec(g, walk_length=100, walks_per_node=10, vector_size=64)

## Embeddings visualization

In [None]:
from sklearn.manifold import TSNE

# NOTE: emitted at DEBUG level, so it is not shown with the INFO-level logging configured at the top
logger.debug('Apply t-SNE transformation on node embeddings')
# Project the embeddings to 2 dimensions; the fixed random_state keeps the projection reproducible
tsne = TSNE(n_components=2, random_state=42)
weighted_node_embeddings_2d = tsne.fit_transform(weighted_node_embeddings)

In [None]:
# from umap import UMAP 

# logger.debug('Apply UMAP transformation on node embeddings')
# umap = UMAP(n_components=2, random_state=42)
# weighted_node_embeddings_2d = umap.fit_transform(weighted_node_embeddings)

In [None]:
# Build dataframe combining information about papers and projected coordinates
df = analyzer.df[['id', 'title', 'year', 'type', 'total', 'authors', 'journal', 'comp']].copy()
indx = [pid_indx[pid] for pid in node_ids]
df['x'] = pd.Series(index=indx, data=weighted_node_embeddings_2d[:, 0])
df['y'] = pd.Series(index=indx, data=weighted_node_embeddings_2d[:, 1])

In [None]:
from bokeh.models import ColumnDataSource, CustomJS
from bokeh.models import HoverTool


from pysrc.papers.utils import cut_authors_list


def plot_embeddings(df, clusters):
    """Show papers as circles at their projected coordinates, colored by cluster.

    NOTE: `df` is modified in place - a 'size' column is added and the 'authors'
    column is replaced with the result of `cut_authors_list`. Tooltips are
    formatted with the notebook-level `plotter`.

    :param df: papers dataframe with 'x', 'y', 'total', 'authors', 'journal',
               'year', 'type' and 'comp' columns
    :param clusters: cluster label for every row of `df`, used for coloring
    """
    labels = sorted(set(clusters))
    cmap = Plotter.factors_colormap(len(labels))
    palette = {label: Plotter.color_to_rgb(cmap(i)).to_hex() for i, label in enumerate(labels)}

    # Circle size grows with the citations count: from 5 up to 25 for the most cited paper
    df['size'] = df['total'] / df['total'].max() * 20 + 5

    # Shorten long authors lists
    df['authors'] = df['authors'].apply(cut_authors_list)

    source = ColumnDataSource(df)
    # One color per row, picked by the cluster of the row
    source.add([palette[c] for c in clusters], 'color')

    p = figure(plot_width=600, plot_height=600,
               tools="hover,pan,tap,wheel_zoom,box_zoom,reset,save")
    p.sizing_mode = 'stretch_width'
    p.xaxis.axis_label = 'x'
    p.yaxis.axis_label = 'y'

    tooltip_fields = [
        ("Author(s)", '@authors'),
        ("Journal", '@journal'),
        ("Year", '@year'),
        ("Type", '@type'),
        ("Cited by", '@total paper(s) total'),
    ]
    p.hover.tooltips = plotter._html_tooltips(tooltip_fields)

    p.circle(x='x', y='y', fill_alpha=0.8, source=source, size='size',
             line_color='black', fill_color='color', legend_field='comp')
    p.legend.visible = False
    show(p)

In [None]:
# Color papers by the 'comp' column of analyzer.df
print('Plot default clusters in embeddings coordinates')
plot_embeddings(df, analyzer.df['comp'])

## Clustering of embeddings

In [None]:
from pysrc.papers.analysis.topics import cluster_and_sort

# Cluster the full (not projected) node embeddings.
# NOTE(review): the meaning of the arguments 10 and 20 is defined by cluster_and_sort - confirm there
clusters, children = cluster_and_sort(weighted_node_embeddings, 10, 20)

print('Cluster sizes')
# Count members per cluster by summing a column of ones
t = pd.DataFrame({'cluster': clusters, 
                  'size': np.ones(len(clusters))}).groupby(['cluster']).sum().astype(int).reset_index()    
sns.barplot(data=t, x='cluster', y='size')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
from pysrc.papers.analysis.topics import get_topics_description

print('Computing clusters keywords')
# Mapping cluster -> list of paper ids belonging to it
clusters_pids = pd.DataFrame(dict(id=node_ids, comp=clusters)).groupby('comp')['id'].apply(list).to_dict()

# Keywords for every cluster, computed from the analyzer's corpus terms and counts
clusters_description = get_topics_description(
    analyzer.df, clusters_pids,
    analyzer.corpus_terms, analyzer.corpus_counts,
    query=analyzer.query,
    n_words=analyzer.TOPIC_DESCRIPTION_WORDS
)

In [None]:
# One row per cluster: up to 20 leading (term, value) pairs formatted as 'term:value'
kwds = [(comp, ','.join([f'{t}:{v:.3f}' for t, v in vs[:20]]))
        for comp, vs in clusters_description.items()]
kwd_df = pd.DataFrame(kwds, columns=['comp', 'kwd'])
display(kwd_df)

## Clusters visualization

In [None]:
# Replace the original components with the embedding clusters (aligned on the index label via indx)
df['comp'] = pd.Series(index=indx, data=clusters, dtype=int)

In [None]:
# Same plot as before, now colored by the clusters stored in df['comp']
print('Plot clusters in embeddings coordinates')
plot_embeddings(df, df['comp'])

In [None]:
# Partition paper id -> cluster built from the embedding clusters
clusters_partition = dict(zip(df['id'], df['comp']))

similarity_df, topics = PlotPreprocessor.topics_similarity_data(
    analyzer.similarity_graph, clusters_partition
)

# Label each record by whether both components are the same ('Inside') or differ ('Outside')
similarity_df['type'] = ['Inside' if x == y else 'Outside' 
                         for (x, y) in zip(similarity_df['comp_x'], similarity_df['comp_y'])]
sns.displot(similarity_df, x="similarity", hue="type", kind="kde")
plt.show()

In [None]:
# show(plotter.heatmap_topics_similarity())

In [None]:
from bokeh.models import GraphRenderer, StaticLayoutProvider, Circle, HoverTool, MultiLine
from bokeh.models.graphs import NodesAndLinkedEdges


def structure_graph(g, df):
    """Build a Bokeh figure of graph `g` with papers placed at their projected coordinates.

    Nodes are colored by the 'comp' column and sized by the 'total' column of `df`.

    :param g: graph connecting paper ids; only `g.edges` is read
    :param df: papers dataframe with 'id', 'title', 'authors', 'journal', 'year',
               'total', 'comp', 'x' and 'y' columns
    :return: Bokeh figure containing the graph renderer
    """
    nodes = df['id']
    comps = df['comp']
    topics = sorted(set(comps))
    cmap = Plotter.factors_colormap(len(topics))
    palette = {topic: Plotter.color_to_rgb(cmap(i)).to_hex() for i, topic in enumerate(topics)}

    graph = GraphRenderer()
    node_source = graph.node_renderer.data_source
    node_source.add(df['id'], 'index')
    node_source.data['id'] = df['id']
    node_source.data['title'] = df['title']
    node_source.data['authors'] = df['authors']
    node_source.data['journal'] = df['journal']
    node_source.data['year'] = df['year']
    node_source.data['cited'] = df['total']
    # Limit size: from 5 up to 25 for the most cited paper
    node_source.data['size'] = df['total'] * 20 / df['total'].max() + 5
    # Components are shown 1-based in the tooltip
    node_source.data['topic'] = [c + 1 for c in comps]
    node_source.data['color'] = [palette[c] for c in comps]

    graph.edge_renderer.data_source.data = dict(start=[u for u, _ in g.edges],
                                                end=[v for _, v in g.edges])

    # start of layout code
    x = df['x']
    y = df['y']
    # Series.min()/max() skip NaN coordinates. The built-in min()/max() used previously
    # return order-dependent results as soon as a single coordinate is NaN.
    x_min, x_max = x.min(), x.max()
    y_min, y_max = y.min(), y.max()
    # Leave a 5% margin around the points
    x_pad = 0.05 * (x_max - x_min)
    y_pad = 0.05 * (y_max - y_min)
    p = figure(plot_width=600,
               plot_height=600,
               x_range=(x_min - x_pad, x_max + x_pad),
               y_range=(y_min - y_pad, y_max + y_pad),
               tools="pan,tap,wheel_zoom,box_zoom,reset,save")
    p.xaxis.major_tick_line_color = None  # turn off x-axis major ticks
    p.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks
    p.yaxis.major_tick_line_color = None  # turn off y-axis major ticks
    p.yaxis.minor_tick_line_color = None  # turn off y-axis minor ticks
    p.xaxis.major_label_text_font_size = '0pt'  # preferred method for removing tick labels
    p.yaxis.major_label_text_font_size = '0pt'  # preferred method for removing tick labels
    p.grid.grid_line_color = None
    p.outline_line_color = None
    p.sizing_mode = 'stretch_width'

    tooltip = """
    <div style="max-width: 500px">
        <div>
            <span style="font-size: 12px; font-weight: bold;">@title</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">Author(s)</span>
            <span style="font-size: 10px;">@authors</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">Journal</span>
            <span style="font-size: 10px;">@journal</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">Year</span>
            <span style="font-size: 10px;">@year</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">Cited</span>
            <span style="font-size: 10px;">@cited</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">Topic</span>
            <span style="font-size: 10px;">@topic</span>
        </div>
    </div>
    """

    p.add_tools(HoverTool(tooltips=tooltip))

    # Fixed layout: every node is placed at its (x, y) coordinates
    graph_layout = dict(zip(nodes, zip(x, y)))
    graph.layout_provider = StaticLayoutProvider(graph_layout=graph_layout)

    graph.node_renderer.glyph = Circle(size='size', fill_color='color')
    graph.node_renderer.hover_glyph = Circle(size='size', fill_color='green')

    graph.edge_renderer.glyph = MultiLine(line_color='grey', line_alpha=0.1, line_width=1)
    graph.edge_renderer.hover_glyph = MultiLine(line_color='blue', line_alpha=1.0, line_width=2)

    # Hovering a node also highlights the edges linked to it
    graph.inspection_policy = NodesAndLinkedEdges()

    p.renderers.append(graph)
    return p


In [None]:
from pysrc.papers.analysis.graph import local_sparse

print('Visualize structure graph using projected coordinates')
# Edges come from local_sparse() applied to the similarity graph with parameter 0.5;
# node positions come from the 'x' and 'y' columns of df
show(structure_graph(local_sparse(analyzer.similarity_graph,  0.5), df))

## Authors graph

In [None]:
def compute_authors_citations_and_papers(df):
    """Aggregate per-author statistics over a papers dataframe.

    The authors of a paper are obtained by splitting its 'authors' string on ', '.

    :param df: dataframe with 'title', 'authors' and 'total' columns
    :return: pair of dicts keyed by author name - the sum of 'total' over the
             author's papers and the number of the author's papers
    """
    logger.debug('Compute author citations')
    author_citations = {}
    for authors_str, cited in df[['authors', 'total']].itertuples(index=False, name=None):
        for author in authors_str.split(', '):
            author_citations[author] = author_citations.get(author, 0) + cited

    logger.debug('Compute number of papers per author')
    author_papers = {}
    for _, authors_str in df[['title', 'authors']].itertuples(index=False, name=None):
        for author in authors_str.split(', '):
            author_papers[author] = author_papers.get(author, 0) + 1

    return author_citations, author_papers

In [None]:
import numpy as np
from pysrc.papers.analysis.metadata import popular_authors, popular_journals

logging.info("Analyzing groups of similar authors")
authors_citations, authors_papers = compute_authors_citations_and_papers(analyzer.df)
# Productivity of an author: log1p(total citations) multiplied by the number of papers
authors_productivity = {a: np.log1p(authors_citations.get(a, 1)) * p for a, p in authors_papers.items()}
# 95th percentile of the productivity values; used later to keep only the most productive authors
min_threshold = np.percentile(list(authors_productivity.values()), 95)
min_threshold

In [None]:
def _select_authors(authors_str, first_last_only):
    """Split a ', '-separated authors string, optionally keeping only the first and last author."""
    authors = authors_str.split(', ')
    if first_last_only and len(authors) > 2:
        return [authors[0], authors[-1]]
    return authors


def _link_author_groups(graph, authors1, authors2, name, value, check_author_func):
    """Add `value` to edge attribute `name` for every accepted pair of authors of two papers."""
    for a1, a2 in itertools.product(authors1, authors2):
        if check_author_func(a1) and check_author_func(a2):
            update_edge(graph, a1, a2, name, value)


def build_authors_similarity_graph(df,
                                   cocit_grouped_df, bibcoupling_df, citations_graph, texts_similarity,
                                   first_last_only=True, check_author_func=lambda a: True):
    """Build an undirected authors graph from paper-level relations.

    Every relation between two papers is transferred to all pairs of their authors
    and accumulated in an edge attribute: 'authorship' (co-authors of one paper),
    'cocitation', 'bibcoupling', 'text' and 'citation'.

    :param df: papers dataframe with 'id' and 'authors' (', '-separated string) columns
    :param cocit_grouped_df: dataframe with 'cited_1', 'cited_2' and 'total' columns
    :param bibcoupling_df: dataframe with 'citing_1', 'citing_2' and 'total' columns; may be empty
    :param citations_graph: graph whose `edges` are pairs of paper ids
    :param texts_similarity: indexable by the row position of `df`; every item is a queue-like
        object with `empty()` and `get()` yielding `(similarity, j)` pairs, where `j` is the row
        position of the other paper. NOTE: the queues are consumed by this function.
    :param first_last_only: when True, only the first and the last author of a paper are used
        if it has more than two authors
    :param check_author_func: predicate on an author name; a pair is linked only if both pass
    :return: networkx Graph with author names as nodes
    :raises IndexError: if a relation refers to a paper id which is absent from `df`
    """
    logger.debug('Processing papers')
    result = nx.Graph()

    # Authors are resolved once per paper instead of scanning `df` for every edge endpoint.
    # setdefault keeps the first row for an id, which is the row `.values[0]` used to return.
    authors_by_id = {}
    for pid, authors_str in zip(df['id'], df['authors']):
        authors = _select_authors(authors_str, first_last_only)
        authors_by_id.setdefault(pid, authors)
        for a1, a2 in itertools.combinations(authors, 2):
            if check_author_func(a1) and check_author_func(a2):
                update_edge(result, a1, a2, 'authorship', 1)

    def authors_of(pid):
        # IndexError is what the former `.values[0]` lookup raised for an unknown id
        try:
            return authors_by_id[pid]
        except KeyError:
            raise IndexError(f'Paper {pid} is not present in the papers dataframe') from None

    logger.debug('Processing co-citations')
    for cited_1, cited_2, total in cocit_grouped_df[['cited_1', 'cited_2', 'total']].values:
        start, end, cocitation = str(cited_1), str(cited_2), float(total)
        _link_author_groups(result, authors_of(start), authors_of(end),
                            'cocitation', cocitation, check_author_func)

    logger.debug('Bibliographic coupling')
    if len(bibcoupling_df) > 0:
        for citing_1, citing_2, total in bibcoupling_df[['citing_1', 'citing_2', 'total']].values:
            start, end, bibcoupling = str(citing_1), str(citing_2), float(total)
            _link_author_groups(result, authors_of(start), authors_of(end),
                                'bibcoupling', bibcoupling, check_author_func)

    logger.debug('Text similarity')
    pids = list(df['id'])
    if len(df) >= 2:
        for i, pid1 in enumerate(pids):
            similarity_queue = texts_similarity[i]
            while not similarity_queue.empty():
                similarity, j = similarity_queue.get()
                _link_author_groups(result, authors_of(pid1), authors_of(pids[j]),
                                    'text', similarity, check_author_func)

    logger.debug('Citations')
    for u, v in citations_graph.edges:
        _link_author_groups(result, authors_of(u), authors_of(v),
                            'citation', 1, check_author_func)

    return result


def update_edge(graph, a1, a2, name, value):
    """Add `value` to attribute `name` of the edge between `a1` and `a2`.

    The edge is created when missing and a missing attribute starts from 0.
    Nothing happens when both endpoints are equal, so self-loops are never created.
    Endpoints are put in ascending order before the graph is accessed.
    """
    if a1 == a2:
        return
    first, second = (a2, a1) if a1 > a2 else (a1, a2)
    if not graph.has_edge(first, second):
        graph.add_edge(first, second)
    attributes = graph[first][second]
    attributes[name] = attributes.get(name, 0) + value

In [None]:
import networkx as nx
import community
import itertools

# Rebinds the notebook-level `logger`: the functions above and the cells below log through it from now on
logger = logging.getLogger('Test')

# Only authors whose productivity reaches the 95th percentile threshold are kept.
# NOTE(review): build_authors_similarity_graph consumes the analyzer.texts_similarity queues via get(),
# so re-running this cell presumably yields no 'text' edges - confirm
authors_similarity_graph = build_authors_similarity_graph(
    analyzer.df, analyzer.cocit_grouped_df,
    analyzer.bibliographic_coupling_df,
    analyzer.citations_graph,
    analyzer.texts_similarity,
    check_author_func=lambda a: authors_productivity[a] >= min_threshold
)

# authors_similarity_graph = analyzer.authors_similarity_graph
logging.info(f'Built authors graph - '
             f'{len(authors_similarity_graph.nodes())} nodes and {len(authors_similarity_graph.edges())} edges')

In [None]:
logger.debug('Compute aggregated similarity using co-authorship')
# Every co-authored paper adds 100 on top of the paper-level similarity of the edge attributes
for _, _, d in authors_similarity_graph.edges(data=True):
    d['similarity'] = 100 * d.get('authorship', 0) + PapersAnalyzer.similarity(d)

### Node2vec embeddings for authors graph

In [None]:
# Weight edges with the aggregated 'similarity' attribute computed in the previous cell
ga = to_weighted_graph(authors_similarity_graph, weight_func=lambda d: d['similarity'])
# Same embedding procedure as for papers, with shorter walks (50 instead of 100)
authors_node_ids, authors_weighted_node_embeddings = node2vec(ga, walk_length=50, walks_per_node=10, vector_size=64)

In [None]:
logger.debug('Apply t-SNE transformation on node embeddings')
# Fit the estimator created for the authors graph. Previously the papers' `tsne` object
# from an earlier cell was fitted here, leaving `authors_tsne` unused and making this cell
# depend on that cell having been run.
authors_tsne = TSNE(n_components=2, random_state=42)
authors_weighted_node_embeddings_2d = authors_tsne.fit_transform(authors_weighted_node_embeddings)

In [None]:
# Build dataframe combining information about authors and projected coordinates
authors_df = pd.DataFrame(dict(author=authors_node_ids, 
                               d1=authors_weighted_node_embeddings_2d[:, 0],
                               d2=authors_weighted_node_embeddings_2d[:, 1]))
authors_df['cited'] = [authors_citations[a] for a in authors_df['author']]
authors_df['papers'] = [authors_papers[a] for a in authors_df['author']]
# Marker size is derived from the productivity of the author
authors_df['size'] = [1 + authors_productivity[a] for a in authors_df['author']]
# Limit max size
authors_df['size'] = authors_df['size'] * 20 / authors_df['size'].max() + 3

In [None]:
# Scatter plot of authors in the projected coordinates (d1, d2), sized by the 'size' column
ds = ColumnDataSource(authors_df)
p = figure(plot_width=600, plot_height=600,
           tools="hover,pan,tap,wheel_zoom,box_zoom,reset,save", toolbar_location="right",
           tooltips=[("Author", '@author'),("Papers", '@papers'), ("Cited", '@cited')])
p.sizing_mode = 'stretch_width'
p.xaxis.axis_label = 'd1'
p.yaxis.axis_label = 'd2'

p.circle(x='d1', y='d2', fill_alpha=0.8, source=ds, size='size',
         line_color='black', fill_color='blue', legend_field='author')
# NOTE(review): plot_embeddings hides its legend with `legend.visible = False`; confirm None is intended here
p.legend.location = None
show(p)

## Authors clustering

In [None]:
# NOTE(review): authors are clustered on the 2-D t-SNE projection, while papers were clustered on the
# full embeddings - confirm this is intentional. The second returned value is not used.
authors_clusters, _ = cluster_and_sort(authors_weighted_node_embeddings_2d, 10, 60)

In [None]:
def plot_authors_clusters(authors_df):
    """Show authors in the projected coordinates, colored by their cluster.

    NOTE: a 'color' column is added to `authors_df` in place.

    :param authors_df: dataframe with 'author', 'papers', 'cited', 'cluster', 'size',
                       'd1' and 'd2' columns
    """
    cluster_labels = sorted(set(authors_df['cluster']))
    cmap = Plotter.factors_colormap(len(cluster_labels))
    palette = {label: Plotter.color_to_rgb(cmap(i)).to_hex()
               for i, label in enumerate(cluster_labels)}
    authors_df['color'] = [palette[c] for c in authors_df['cluster']]

    source = ColumnDataSource(authors_df)
    tooltips = [("Author", '@author'), ("Papers", '@papers'), ("Cited", '@cited'), ('Cluster', '@cluster')]
    p = figure(plot_width=600, plot_height=600,
               tools="hover,pan,tap,wheel_zoom,box_zoom,reset,save", toolbar_location="right",
               tooltips=tooltips)
    p.sizing_mode = 'stretch_width'
    p.xaxis.axis_label = 'd1'
    p.yaxis.axis_label = 'd2'

    p.circle(x='d1', y='d2', fill_alpha=0.8, source=source, size='size',
             line_color='black', fill_color='color', legend_field='author')
    p.legend.location = None
    show(p)


In [None]:
# Row position of every author in authors_df
authors_indx = {a: i for i, a in enumerate(authors_df['author'])}
# Attach cluster labels to authors, aligned on the index label through the positions above
authors_df['cluster'] = pd.Series(index=[authors_indx[a] for a in authors_node_ids], data=authors_clusters)
plot_authors_clusters(authors_df)

## Use ego-splitting to compute possible overlapping groups of authors
Taken from https://github.com/benedekrozemberczki/EgoSplitting

In [None]:
import community
import networkx as nx
from tqdm import tqdm

class EgoNetSplitter:
    """Overlapping clustering of a graph with the "Ego-Splitting" framework.

    See the KDD '17 paper "Ego-Splitting Framework: from Non-Overlapping to Overlapping Clusters":
    https://www.eecs.yorku.ca/course_archive/2017-18/F/6412/reading/kdd17p145.pdf

    The egonet of every node is split into connected components, each of them becomes
    a persona of the node. The graph of personas is clustered with the Louvain method,
    and every node collects the clusters of its personas as overlapping memberships.

    Args:
        resolution (float): Resolution parameter of Python Louvain. Default 1.0.
    """

    def __init__(self, resolution=1.0):
        self.resolution = resolution

    def _create_egonet(self, node):
        """
        Split the neighbourhood of a node into personas.

        Every connected component of the subgraph induced by the node's neighbours
        gets a fresh persona id taken from the running counter ``self.index``.

        Args:
            node: Node ID for egonet (ego node).
        """
        ego_net_minus_ego = self.graph.subgraph(self.graph.neighbors(node))
        neighbour_to_persona = {}
        personas = []
        for component in nx.connected_components(ego_net_minus_ego):
            persona_id = self.index
            personas.append(persona_id)
            for neighbour in component:
                neighbour_to_persona[neighbour] = persona_id
            self.index += 1
        self.components[node] = neighbour_to_persona
        self.personalities[node] = personas

    def _create_egonets(self):
        """
        Reset the persona bookkeeping and process the egonet of every node.
        """
        self.components = {}
        self.personalities = {}
        self.index = 0
        print("Creating egonets.")
        for node in tqdm(self.graph.nodes()):
            self._create_egonet(node)

    def _map_personalities(self):
        """
        Build the reverse mapping from a persona id to the node owning it.
        """
        persona_to_node = {}
        for node in self.graph.nodes():
            for persona in self.personalities[node]:
                persona_to_node[persona] = node
        self.personality_map = persona_to_node

    def _get_new_edge_ids(self, edge):
        """
        Translate an edge of the original graph into an edge between personas.

        Args:
            edge: Edge being mapped to the new identifiers.
        """
        source, target = edge[0], edge[1]
        return (self.components[source][target], self.components[target][source])

    def _create_persona_graph(self):
        """
        Build the persona graph from the translated edges of the original graph.
        """
        print("Creating the persona graph.")
        self.persona_graph_edges = [self._get_new_edge_ids(edge) for edge in tqdm(self.graph.edges())]
        self.persona_graph = nx.from_edgelist(self.persona_graph_edges)

    def _create_partitions(self):
        """
        Cluster the persona graph and collect the clusters of personas per original node.
        """
        print("Clustering the persona graph.")
        self.partitions = community.best_partition(self.persona_graph, resolution=self.resolution)
        self.overlapping_partitions = {node: [] for node in self.graph.nodes()}
        for persona, membership in self.partitions.items():
            owner = self.personality_map[persona]
            self.overlapping_partitions[owner].append(membership)

    def fit(self, graph):
        """
        Fitting an Ego-Splitter clustering model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be clustered.
        """
        self.graph = graph
        self._create_egonets()
        self._map_personalities()
        self._create_persona_graph()
        self._create_partitions()

    def get_memberships(self):
        r"""Getting the cluster membership of nodes.

        Return types:
            * **memberships** *(dictionary of lists)* - Cluster memberships.
        """
        return self.overlapping_partitions


In [None]:
# Ego-splitting with Louvain resolution 5; the last expression shows the memberships per author
splitter = EgoNetSplitter(5)
splitter.fit(authors_similarity_graph)
splitter.overlapping_partitions

## Number of clusters in papers

In [None]:
# Maximum number of authors listed per cluster
AUTHORS_PER_COMP = 20
group_authors = {}
for group in sorted(set(authors_clusters)):
    authors = list(authors_df.loc[authors_df['cluster'] == group]['author'])
    # Most productive authors first
    authors.sort(key=lambda a: authors_productivity[a], reverse=True)
    top = authors[:AUTHORS_PER_COMP]
    group_authors[group] = ", ".join(top)
    print(f'#{group} ({len(authors)}) {", ".join(top)}' + (', ...' if len(authors) > AUTHORS_PER_COMP else ''))

In [None]:
# Cluster sizes, used to normalize the votes so that large clusters of authors do not dominate
part_sizes = Counter(authors_clusters)
authors_clusters_map = dict(zip(authors_df['author'], authors_df['cluster']))
# paper_groups[i, g] accumulates the normalized votes of the authors of paper i for cluster g.
# Cluster labels are used directly as column indices - assumes labels are 0..k-1, TODO confirm
paper_groups = np.zeros(shape=(len(analyzer.df), len(set(authors_clusters))))
# Iterate by position: paper_groups and groups_partition below are indexed by position, whereas
# the row labels of analyzer.df (used here before via iterrows) are not guaranteed to be 0..n-1
for i, paper_authors in enumerate(analyzer.df['authors']):
    for a in paper_authors.split(', '):
        if a in authors_clusters_map:
            group = authors_clusters_map[a]
            paper_groups[i, group] += 1 / part_sizes[group]
# The winning cluster per paper; papers without any clustered author are left unassigned
groups = np.argmax(paper_groups, axis=1)
papers_assigned = paper_groups.sum(axis=1) > 0
groups_partition = {pid: groups[i] for i, pid in enumerate(analyzer.df['id']) if papers_assigned[i]}

groups_part_sizes = {c: sum([groups_partition[node] == c for node in groups_partition.keys()]) 
                     for c in set(groups_partition.values())}
logging.info(f'Components: {groups_part_sizes}')

In [None]:
import pandas as pd
from pysrc.papers.analysis.topics import get_topics_description

# Mapping group -> list of paper ids assigned to it
groups_pids = pd.DataFrame(groups_partition.items(), columns=['id', 'comp']). \
                groupby('comp')['id'].apply(list).to_dict()
# Only the assigned papers are described: the same positions are selected from the papers
# dataframe and from the rows of the corpus counts
groups_description = get_topics_description(
    analyzer.df.iloc[np.flatnonzero(papers_assigned), :], groups_pids,
    analyzer.corpus_terms, analyzer.corpus_counts[np.flatnonzero(papers_assigned), :],
    query=analyzer.query,
    n_words=analyzer.TOPIC_DESCRIPTION_WORDS
)

In [None]:
# Summary table: one row per group with its top authors, papers count and first 10 keywords
groups_df = pd.DataFrame(columns=['group', 'authors', 'papers', 'keywords'], dtype=object)
for g, pids in groups_pids.items():
    # Skip groups lacking either the authors list or the description
    if g in group_authors and g in groups_description:
        groups_df.loc[len(groups_df)] = (g, group_authors[g], len(pids), 
                                         ', '.join(v[0] for v in groups_description[g][:10]))

display(groups_df)

## Topic Evolution

In [None]:
# plotter.topic_evolution() returns a pair: the first element is displayed, the second one is printed
evolution_data, keywords_data = plotter.topic_evolution()
show(evolution_data)
print(keywords_data)

## PageRank for Citation Analysis

In [None]:
import networkx as nx

# Apply PageRank algorithm with damping factor of 0.5
# tol=1e-9 is the error tolerance used to check convergence; pr_nx maps node -> PageRank value
pr_nx = nx.pagerank(analyzer.citations_graph, alpha=0.5, tol=1e-9)

In [None]:
# For every node: (best ancestor, its PageRank); (0, 0) means no ancestor has been found yet
ancestor = dict.fromkeys(analyzer.citations_graph, (0, 0))

# Select ancestor with highest PR for each node
for v in analyzer.citations_graph:
    # u runs over the neighbors of v, as returned by indexing the citations graph
    for u in analyzer.citations_graph[v]:
        anc, pr = ancestor[u]
        if pr_nx[v] > pr:
            ancestor[u] = (v, pr_nx[v])

In [None]:
# Directed graph keeping a single edge (best ancestor -> node) for every node that has an ancestor
PRG = nx.DiGraph()
for v, anc in ancestor.items():
    u, pr = anc
    # pr == 0 is the initial placeholder: no ancestor was selected for v
    if pr > 0:
        PRG.add_edge(u, v)

In [None]:
# Edge endpoints as two parallel tuples; only referenced by the commented-out edge renderer below.
# NOTE(review): raises ValueError when PRG has no edges
start, end = zip(*list(PRG.edges()))

In [None]:
from bokeh.models import GraphRenderer, StaticLayoutProvider, Circle, HoverTool, MultiLine
from bokeh.models.graphs import NodesAndLinkedEdges

# Keep only the nodes of the PageRank graph that are present in analyzer.df
node_indices = list(filter(lambda node: len(analyzer.df[analyzer.df['id'] == node]) > 0, list(PRG.nodes())))

# Per-node attributes for the plot, collected in the order of node_indices
years = []
year_counts = {}
titles = []
pageranks = []
size = []
for node in node_indices:
    sel = analyzer.df[analyzer.df['id'] == node]
    year = sel['year'].values[0]
    
    # Number of plotted papers per year; its maximum defines the height of the plot
    if not year in year_counts:
        year_counts[year] = 1
    else:
        year_counts[year] += 1
    years.append(year)
    
    titles.append(sel['title'].values[0])
    # PageRank is scaled by 100 for the tooltip and by 1000 for the marker size
    pageranks.append(pr_nx[node] * 100)
    size.append(pr_nx[node] * 1000)
max_year_count = max(list(year_counts.values()))
min_year, max_year = min(years), max(years)

plot = figure(title="PageRank applied to citation filtering", 
              x_range=(min_year - 1, max_year+1), y_range=(0, max_year_count + 1),
              tools="", toolbar_location=None)

TOOLTIPS = """
    <div style="max-width: 320px">
        <div>
            <span style="font-size: 12px; font-weight: bold;">@title</span>
        </div>
        <div>
            <span style="font-size: 11px;">Year</span>
            <span style="font-size: 10px;">@year</span>
        </div>
        <div>
            <span style="font-size: 11px;">PMID</span>
            <span style="font-size: 10px;">@id</span>
        </div>
        <div>
            <span style="font-size: 11px;">PageRank</span>
            <span style="font-size: 10px;">@pagerank</span>
        </div>
    </div>
"""

plot.add_tools(HoverTool(tooltips=TOOLTIPS))

graph = GraphRenderer()

graph.node_renderer.data_source.add(node_indices, 'index')
graph.node_renderer.data_source.data['id'] = node_indices
graph.node_renderer.data_source.data['year'] = years
graph.node_renderer.data_source.data['title'] = titles
graph.node_renderer.data_source.data['pagerank'] = pageranks
graph.node_renderer.data_source.data['size'] = size
# graph.edge_renderer.data_source.data = dict(start=start, end=end)

### start of layout code   
# x is the publication year; y is the running number of the paper within its year,
# so papers of the same year are stacked vertically
x = [analyzer.df[analyzer.df['id'] == pmid]['year'].values[0] for pmid in node_indices]
y = []
tmp_year_counts = {}
for node in node_indices:
    year = analyzer.df[analyzer.df['id'] == node]['year'].values[0]
    if not year in tmp_year_counts:
        tmp_year_counts[year] = 1
    else:
        tmp_year_counts[year] += 1
    y.append(tmp_year_counts[year])

graph_layout = dict(zip(node_indices, zip(x, y)))
graph.layout_provider = StaticLayoutProvider(graph_layout=graph_layout)

graph.node_renderer.glyph = Circle(size='size', fill_color='blue')
graph.node_renderer.hover_glyph = Circle(size='size', fill_color='green')

# graph.edge_renderer.glyph = MultiLine(line_color='black', line_alpha=1, line_width=1)
# graph.edge_renderer.hover_glyph = MultiLine(line_color='green', line_width=2)

graph.inspection_policy = NodesAndLinkedEdges()

plot.min_border_left = 75
plot.renderers.append(graph)

show(plot)

### Top Papers by PageRank

In [None]:
# Titles by paper id. The citations graph may contain papers which are absent from analyzer.df
# (other cells of this section guard against that), so such papers are reported by their id
# instead of failing with IndexError on the title lookup.
titles_by_id = dict(zip(analyzer.df['id'], analyzer.df['title']))
for pmid, pagerank in sorted(pr_nx.items(), key=lambda el: el[1], reverse=True)[:10]:
    title = titles_by_id.get(pmid, f'[{pmid}] (not in analyzer.df)')
    print(f"{(100*pagerank):.2f} {title}")

### PageRank and citation ranking correlation

In [None]:
import numpy as np
from scipy.stats import spearmanr

# Rank of every paper by total citations, 1 = most cited; ties are broken by order of appearance
analyzer.df['citation_rank'] = analyzer.df['total'].rank(method='first', ascending=False)
citation_rank_by_id = dict(zip(analyzer.df['id'], analyzer.df['citation_rank']))
pagerank_rank = sorted(pr_nx.items(), key=lambda el: el[1], reverse=True)

# Pairs (position in the PageRank ranking, citation rank), only for papers present in analyzer.df.
# Previously papers missing from analyzer.df were left as (0, 0) rows, which distorted
# the correlation coefficients below.
r = np.array([(i, int(citation_rank_by_id[pmid]))
              for i, (pmid, _) in enumerate(pagerank_rank)
              if pmid in citation_rank_by_id], dtype=float).reshape(-1, 2)

# Correlation between the two rankings over the top-x papers by PageRank
TOP_X = [5, 10, 30, 50, 100]
for x in TOP_X:
    rho, _ = spearmanr(r[:x, 0], r[:x, 1])
    print(f'Spearman correlation coefficient for top {x}: {rho}')