# Pubtrends-experimental

Experimental notebook for hypothesis testing and development purposes.

In [None]:
from Bio import Entrez
# Contact e-mail that Biopython attaches to Entrez requests made below
Entrez.email = 'os@jetbrains.com'

In [None]:
# PubMed query: papers matching "Aging", excluding reviews, published in 2015-2018
QUERY = '((Aging) NOT (Review[Publication Type])) AND (("2015"[Date - Publication] : "2018"[Date - Publication]))'
# retmax limits the number of returned ids, so at most 1000 ids are collected
handle = Entrez.esearch(db='pubmed', retmax='1000', retmode='xml', term=QUERY)
try:
    pmids = Entrez.read(handle)['IdList']
finally:
    # Always release the underlying HTTP response, even if parsing fails
    handle.close()
print(f'Found {len(pmids)} papers')

In [None]:
import os
from pysrc.papers.utils import SORT_MOST_CITED

# NOTE(review): the label says "2018+" while QUERY above covers 2015-2018 - confirm which is intended
SEARCH_QUERY = 'Aging 2018+'
SEARCH_SORT = SORT_MOST_CITED
SEARCH_PAPERS = 1000

# Directory for all files produced by this notebook; created with the standard library
# instead of a shell command so that the cell does not depend on a POSIX shell
OUTPUT = os.path.expanduser(f'~/pubtrends/{SEARCH_QUERY}')
os.makedirs(OUTPUT, exist_ok=True)

# File with ids to analyze
FILE = os.path.join(OUTPUT, 'pmid.itxt')

In [None]:
# Persist the collected ids, one per line and without a trailing newline
ids_text = '\n'.join(pmids)
with open(FILE, 'w') as ids_file:
    ids_file.write(ids_text)

## Publication Analysis

In [None]:
import logging
import seaborn as sns
import pandas as pd
from scipy import stats
from collections import Counter
from matplotlib import pyplot as plt
from bokeh.plotting import show, figure, output_file, save, reset_output, output_notebook
from bokeh.models import ColumnDataSource
from tqdm.auto import tqdm

from pysrc.papers.config import PubtrendsConfig
from pysrc.papers.db.pm_postgres_loader import PubmedPostgresLoader
from pysrc.papers.analyzer import PapersAnalyzer
from pysrc.papers.plot.plotter import Plotter

# Log everything from DEBUG level up, prefixed with timestamp and level
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s: %(message)s')
logger = logging.getLogger('notebook')

# Avoid info message about compilation flags
# tf.get_logger().setLevel('ERROR')

# Send bokeh output to the notebook
output_notebook()

# Show matplotlib figures inline, using the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
from pysrc.papers.analyzer_files import AnalyzerFiles

config = PubtrendsConfig(test=False)
config.feature_evolution_enabled = True
loader = PubmedPostgresLoader(config)
analyzer = AnalyzerFiles(loader, config)

try:
    # Use the sort order configured in SEARCH_SORT rather than repeating the constant here
    analyzer.analyze_ids(pmids, 'Pubmed', SEARCH_QUERY, SEARCH_PAPERS, SEARCH_SORT, 'medium')
finally:
    # Run both cleanup steps even if the first one fails
    try:
        loader.close_connection()
    finally:
        analyzer.teardown()

## Papers text similarities analysis
We hope that the distribution of similarity edge weights illustrates that the majority of linked nodes are only insignificantly similar in terms of their attributes.

In [None]:
import numpy as np

def _plot_edge_feature(values, label, title, position, drop_zeros=True):
    """Print summary statistics and draw a KDE of one edge feature in a 1x4 subplot grid.

    Args:
        values: 1-D numpy array with one value per graph edge.
        label: Name used in the printed "non-zero" summary line.
        title: Title of the subplot.
        position: 1-based index of the subplot within the 1x4 grid.
        drop_zeros: When True, zero values are removed before describing and plotting.
    """
    plt.subplot(1, 4, position)
    print(f'{label}, non-zero {np.count_nonzero(values)} of {len(values)}')
    if drop_zeros:
        values = values[np.nonzero(values)]
    if len(values) > 0:
        # stats.describe raises on empty input, so skip statistics and the plot in that case
        print(stats.describe(values))
        sns.kdeplot(values)
    plt.title(title)


def analyze_similarities_features():
    """Describe and plot the distributions of the edge attributes of `analyzer.papers_graph`.

    Collects, per edge, log1p of 'bibcoupling' and 'cocitation', the raw 'citation' and 'text'
    attributes (missing attributes count as 0) and `PapersAnalyzer.similarity` of the edge data.
    Zeros are dropped for every panel except the aggregated similarity one.
    """
    edges_count = len(analyzer.papers_graph.edges)
    bibcoupling_array = np.zeros(edges_count)
    cocitations_array = np.zeros(edges_count)
    citations_array = np.zeros(edges_count)
    similarities_array = np.zeros(edges_count)
    text_similarities_array = np.zeros(edges_count)

    for i, (_, _, data) in enumerate(analyzer.papers_graph.edges(data=True)):
        bibcoupling_array[i] = np.log1p(data.get('bibcoupling', 0))
        cocitations_array[i] = np.log1p(data.get('cocitation', 0))
        citations_array[i] = data.get('citation', 0)
        text_similarities_array[i] = data.get('text', 0)
        similarities_array[i] = PapersAnalyzer.similarity(data)

    plt.figure(figsize=(5 * 4, 5))
    _plot_edge_feature(bibcoupling_array, 'Bibcoupling', 'Bibcoupling', 1)
    _plot_edge_feature(cocitations_array, 'Co-citations', 'Co-citations', 2)
    _plot_edge_feature(text_similarities_array, 'Text similarities', 'Text', 3)
    # The aggregated similarity is described over all edges, zeros included
    _plot_edge_feature(similarities_array, 'Similarities', 'Similarity', 4, drop_zeros=False)
    plt.show()

    print(f'Citations, non-zero {np.count_nonzero(citations_array)} of {len(citations_array)}')


analyze_similarities_features()

## Additional Text analysis

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def _print_quartiles(values):
    """Print the 25th, 50th and 75th percentiles of `values` on one line."""
    print('Q1', np.percentile(values, 25),
          'Q2', np.percentile(values, 50),
          'Q3', np.percentile(values, 75))


def analyze_similarities():
    """Compare cosine similarities of token counts for all paper pairs and for cited pairs.

    The left panel shows the KDE over the full similarity matrix of `analyzer.corpus_counts`
    (every ordered pair, including each paper with itself). The right panel shows the KDE
    restricted to graph edges with a non-zero 'citation' attribute; it is left empty when
    there are no such edges.
    """
    print('Analyze similarities between tokens counts for all papers')
    cos_similarities = cosine_similarity(analyzer.corpus_counts)
    cos_similarities_array = cos_similarities.reshape(-1)
    print(stats.describe(cos_similarities_array))
    _print_quartiles(cos_similarities_array)

    plt.figure(figsize=(5 * 2, 5))
    plt.subplot(1, 2, 1)
    sns.kdeplot(cos_similarities_array)
    plt.title('Cosine similarities among all papers')

    print('Analyze similarities between papers with direct citations')
    # Rows of the similarity matrix are addressed by the position of the paper in analyzer.df
    pid_indx = {pid: i for i, pid in enumerate(analyzer.df['id'])}
    cited_cos_similarities = [
        cos_similarities[pid_indx[u], pid_indx[v]]
        for u, v, data in analyzer.papers_graph.edges(data=True)
        if data.get('citation', 0) != 0
    ]

    plt.subplot(1, 2, 2)
    if cited_cos_similarities:
        # stats.describe and np.percentile fail on empty input
        print(stats.describe(cited_cos_similarities))
        _print_quartiles(cited_cos_similarities)
        sns.kdeplot(cited_cos_similarities)
    plt.title('Cosine similarity between cited papers')

    plt.show()


analyze_similarities()

In [None]:
# Distribution of node degrees in the papers similarity graph
G = analyzer.papers_graph
degrees = [degree for _, degree in G.degree()]
plt.title('Similarity graph degrees')
sns.kdeplot(data=degrees)
plt.show()
nodes_count = float(G.number_of_nodes())
print('Average degree', sum(degrees) / nodes_count)

## Authors graph analysis

In [None]:
from pysrc.papers.utils import cut_authors_list


def plot_embeddings(df, clusters):
    """Show a bokeh scatter plot of 2D paper embeddings colored by cluster.

    Args:
        df: DataFrame read for the columns 'x', 'y', 'total', 'authors' and 'comp'; the
            tooltips additionally reference 'journal', 'year' and 'type'. It is not modified.
        clusters: Cluster label per row of `df`, in row order; used for coloring.
    """
    unique_clusters = sorted(set(clusters))
    cmap = Plotter.factors_colormap(len(unique_clusters))
    palette = {c: Plotter.color_to_rgb(cmap(i)).to_hex() for i, c in enumerate(unique_clusters)}

    # Work on a copy: the columns below are plot-only and re-applying
    # cut_authors_list to an already cut column on a second call must be avoided
    df = df.copy()
    # Marker size grows linearly with the number of citations, from 5 to 25
    df['size'] = 5 + df['total'] / df['total'].max() * 20

    # Split authors
    df['authors'] = df['authors'].apply(cut_authors_list)

    ds = ColumnDataSource(df)
    # Add clusters coloring
    ds.add([palette[c] for c in clusters], 'color')
    p = figure(plot_width=600, plot_height=600,
               tools="hover,pan,tap,wheel_zoom,box_zoom,reset,save")
    p.sizing_mode = 'stretch_width'
    p.xaxis.axis_label = 'x'
    p.yaxis.axis_label = 'y'

    # NOTE(review): `plotter` (lowercase) is not defined anywhere in the visible notebook;
    # confirm whether a Plotter instance is expected here before calling this function
    p.hover.tooltips = plotter._paper_html_tooltips([
        ("Author(s)", '@authors'),
        ("Journal", '@journal'),
        ("Year", '@year'),
        ("Type", '@type'),
        ("Cited by", '@total paper(s) total'),
        ("Topic", '@comp')])
    p.circle(x='x', y='y', fill_alpha=0.8, source=ds, size='size',
             line_color='black', fill_color='color', legend_field='comp')
    p.legend.visible = False
    show(p)

In [None]:
def fa(authors, first_last_only=True):
    """Optionally reduce a list of authors to its first and last entries.

    Args:
        authors: Sequence of author names.
        first_last_only: When True and there are more than two authors, keep only the
            first and the last one.

    Returns:
        A new two-element list, or `authors` itself when no reduction applies.
    """
    if first_last_only and len(authors) > 2:
        return [authors[0], authors[-1]]
    return authors

In [None]:
def compute_authors_citations_and_papers(df):
    """Aggregate citations and number of papers per author in a single pass.

    Only the authors kept by `fa` (with its defaults) are counted for each paper.

    Args:
        df: DataFrame with an 'authors' column (names joined with ', ') and a 'total'
            column holding the number of citations of each paper.

    Returns:
        Tuple `(author_citations, author_papers)` of dicts keyed by author name: the sum
        of 'total' over the author's papers and the number of those papers.
    """
    logger.debug('Compute author citations')
    logger.debug('Compute number of papers per author')
    author_citations = {}
    author_papers = {}
    # Both statistics come from the same rows, so one iteration over the columns is enough
    for authors_str, total in tqdm(zip(df['authors'], df['total']), total=len(df)):
        for a in fa(authors_str.split(', ')):
            author_citations[a] = author_citations.get(a, 0) + total
            author_papers[a] = author_papers.get(a, 0) + 1

    return author_citations, author_papers

In [None]:
import numpy as np

logging.info("Analyzing groups of similar authors")
authors_citations, authors_papers = compute_authors_citations_and_papers(analyzer.df)
logging.info(f"Authors {len(authors_papers)}")

# Authors are kept when their number of papers reaches the 90th percentile
papers_per_author = list(authors_papers.values())
min_threshold = np.percentile(papers_per_author, 90)
logging.info(f'Min papers for author {min_threshold}')
filtered_authors_count = sum(1 for papers in papers_per_author if papers >= min_threshold)
logging.info(f'Filtered authors: {filtered_authors_count}')

In [None]:
import networkx as nx

def build_authors_similarity_graph(df,
                                   cocit_grouped_df, bibcoupling_df, cit_df,
                                   check_author_func=lambda a: True):
    """Build an undirected graph of authors connected through their papers.

    Edges accumulate, via `update_edge`, the attributes 'authorship' (+1 per shared paper),
    'cocitation' and 'bibcoupling' (+ the 'total' of the paper pair) and 'citation'
    (+1 per citing/cited paper pair). Only the authors kept by `fa` are used for each paper.

    Args:
        df: Papers DataFrame with 'id' and 'authors' (names joined with ', ') columns.
        cocit_grouped_df: DataFrame with 'cited_1', 'cited_2' and 'total' columns.
        bibcoupling_df: DataFrame with 'citing_1', 'citing_2' and 'total' columns; may be empty.
        cit_df: DataFrame with 'id_out' and 'id_in' columns.
        check_author_func: Predicate on an author name; an edge is updated only when
            both of its authors pass.

    Returns:
        networkx.Graph with author names as nodes.

    Raises:
        KeyError: If a paper id referenced by one of the relation frames is missing in `df`.
    """
    # Imported locally: the notebook imports itertools only in a later cell
    import itertools

    # Resolve authors of every paper once instead of scanning `df` for each lookup.
    # Ids are compared as strings; the first row wins for duplicated ids.
    authors_by_id = {}
    for pid, authors_str in zip(df['id'], df['authors']):
        authors_by_id.setdefault(str(pid), fa(authors_str.split(', ')))

    result = nx.Graph()

    def link(pairs, name, value):
        """Add `value` to attribute `name` of the edge of every accepted author pair."""
        for a1, a2 in pairs:
            if check_author_func(a1) and check_author_func(a2):
                update_edge(result, a1, a2, name, value)

    def link_papers(start, end, name, value):
        """Connect every author of paper `start` with every author of paper `end`."""
        link(itertools.product(authors_by_id[str(start)], authors_by_id[str(end)]), name, value)

    logger.debug('Processing papers')
    for authors_str in tqdm(df['authors']):
        link(itertools.combinations(fa(authors_str.split(', ')), 2), 'authorship', 1)

    logger.debug('Processing co-citations')
    for el in tqdm(cocit_grouped_df[['cited_1', 'cited_2', 'total']].values):
        link_papers(el[0], el[1], 'cocitation', float(el[2]))

    logger.debug('Bibliographic coupling')
    if len(bibcoupling_df) > 0:
        for el in tqdm(bibcoupling_df[['citing_1', 'citing_2', 'total']].values):
            link_papers(el[0], el[1], 'bibcoupling', float(el[2]))

    logger.debug('Citations')
    # Citations
    for start, end in zip(cit_df['id_out'], cit_df['id_in']):
        link_papers(start, end, 'citation', 1)

    return result


def update_edge(graph, a1, a2, name, value):
    """Add `value` to attribute `name` of the edge between `a1` and `a2`.

    The edge is created when absent and a missing attribute starts at 0. Endpoints are
    put in ascending order before the graph is touched; nothing happens when both
    endpoints are equal.
    """
    if a1 == a2:
        return
    low, high = (a2, a1) if a1 > a2 else (a1, a2)
    if not graph.has_edge(low, high):
        graph.add_edge(low, high)
    attributes = graph[low][high]
    attributes[name] = attributes.get(name, 0) + value

In [None]:
import itertools

# NOTE(review): this rebinds the notebook-level `logger` name to the logger called 'Test'
logger = logging.getLogger('Test')

# Build the authors graph, keeping only authors with at least `min_threshold` papers
authors_similarity_graph = build_authors_similarity_graph(
    analyzer.df, analyzer.cocit_grouped_df,
    analyzer.bibliographic_coupling_df,
    analyzer.cit_df,
    check_author_func=lambda a: authors_papers[a] >= min_threshold
)

logging.info(f'Built authors graph - '
             f'{len(authors_similarity_graph.nodes())} nodes and {len(authors_similarity_graph.edges())} edges')

### Node2vec embeddings for authors graph

In [None]:
from pysrc.papers.analysis.node2vec import node2vec
from pysrc.papers.analysis.graph import to_weighted_graph, sparse_graph

logger.debug('Compute aggregated similarity using co-authorship')


def _authors_edge_weight(d):
    """Weight of an authors graph edge: 100 per co-authored paper plus the papers similarity."""
    return 100 * d.get('authorship', 0) + PapersAnalyzer.similarity(d)


ga = to_weighted_graph(authors_similarity_graph, weight_func=_authors_edge_weight)
# NOTE(review): the meaning of the second argument (10) is defined by sparse_graph - confirm
gs = sparse_graph(ga, 10)
# Node order here defines the row order of the embeddings
authors_node_ids = list(authors_similarity_graph.nodes)
authors_weighted_node_embeddings = node2vec(authors_node_ids, gs)

In [None]:
from sklearn.manifold import TSNE

logger.debug('Apply t-SNE transformation on node embeddings')
# Project the node embeddings to 2 components; random_state is fixed for reproducibility
authors_tsne = TSNE(n_components=2, random_state=42)
authors_weighted_node_embeddings_2d = authors_tsne.fit_transform(authors_weighted_node_embeddings)

In [None]:
# One row per author: name and projected 2D coordinates
authors_df = pd.DataFrame({
    'author': authors_node_ids,
    'x': authors_weighted_node_embeddings_2d[:, 0],
    'y': authors_weighted_node_embeddings_2d[:, 1],
})
authors_df['cited'] = [authors_citations[author] for author in authors_node_ids]
authors_df['papers'] = [authors_papers[author] for author in authors_node_ids]
# Marker size grows with the logarithm of citations and is then rescaled so that it is at most 13
raw_sizes = pd.Series([1 + 10 * np.log1p(authors_citations[author]) for author in authors_node_ids])
authors_df['size'] = raw_sizes
authors_df['size'] = authors_df['size'] * 10 / authors_df['size'].max() + 3

In [None]:
# Assign every author to a single placeholder cluster 0 and preview the dataframe
authors_df['cluster'] = 0
authors_df.head()

In [None]:
from bokeh.models import LabelSet


def plot_authors(authors_df, plot_width=600, plot_height=600):
    """Create a bokeh scatter plot of authors colored by cluster, with one label per cluster.

    Args:
        authors_df: DataFrame with 'x', 'y', 'size' and 'cluster' columns; the tooltips
            additionally reference 'author', 'papers', 'cited' and 'tags'. It is not modified.
        plot_width: Width passed to the bokeh figure.
        plot_height: Height passed to the bokeh figure.

    Returns:
        The bokeh figure; the caller decides whether to show or save it.
    """
    clusters = sorted(set(authors_df['cluster']))
    cmap = Plotter.factors_colormap(len(clusters))
    palette = {c: Plotter.color_to_rgb(cmap(i)).to_hex() for i, c in enumerate(clusters)}

    # Colors are attached to a copy, so the caller's dataframe (and any 'color'
    # column it may already have) stays untouched
    ds = ColumnDataSource(authors_df.assign(color=[palette[c] for c in authors_df['cluster']]))

    x = authors_df['x']
    y = authors_df['y']
    x_min, x_max = min(x), max(x)
    y_min, y_max = min(y), max(y)
    # Leave a 5% margin around the points on every side
    x_margin = 0.05 * (x_max - x_min)
    y_margin = 0.05 * (y_max - y_min)
    p = figure(plot_width=plot_width, plot_height=plot_height,
               x_range=(x_min - x_margin, x_max + x_margin),
               y_range=(y_min - y_margin, y_max + y_margin),
               tools="hover,pan,tap,wheel_zoom,box_zoom,reset,save",
               tooltips=[("Author", '@author'),
                         ("Papers", '@papers'),
                         ("Cited by", '@cited'),
                         ("Cluster", '@cluster'),
                         ("Tags", '@tags')])

    # Coordinates of an embedding carry no meaning: hide ticks, tick labels, grid and outline
    p.xaxis.major_tick_line_color = None
    p.xaxis.minor_tick_line_color = None
    p.yaxis.major_tick_line_color = None
    p.yaxis.minor_tick_line_color = None
    p.xaxis.major_label_text_font_size = '0pt'
    p.yaxis.major_label_text_font_size = '0pt'
    p.grid.grid_line_color = None
    p.outline_line_color = None
    p.sizing_mode = 'stretch_width'

    p.circle(x='x', y='y', fill_alpha=0.8, source=ds, size='size',
             line_color='black', fill_color='color')

    # Put the label of each cluster at the mean position of its authors
    lxs = [authors_df.loc[authors_df['cluster'] == c, 'x'].mean() for c in clusters]
    lys = [authors_df.loc[authors_df['cluster'] == c, 'y'].mean() for c in clusters]
    cluster_labels = [f'#{c}' for c in clusters]
    source = ColumnDataSource({'x': lxs, 'y': lys, 'name': cluster_labels})
    labels = LabelSet(x='x', y='y', text='name', source=source,
                      background_fill_color='white', text_font_size='11px', background_fill_alpha=.9)
    p.renderers.append(labels)

    return p

In [None]:
# Placeholder cluster and tags so that the plot can be shown before clustering is computed
authors_df['cluster'] = 0
authors_df['tags'] = 'n/a'
show(plot_authors(authors_df)) 

## Authors clustering

In [None]:
from pysrc.papers.analysis.topics import cluster_and_sort

# NOTE(review): the meaning of the arguments 10 and 100 is defined by cluster_and_sort - confirm
author_clusters, _ = cluster_and_sort(authors_weighted_node_embeddings, 10, 100)

print('Cluster sizes')
# Count authors per cluster by summing a column of ones
ones_per_author = pd.DataFrame({'cluster': author_clusters,
                                'size': np.ones(len(author_clusters))})
cluster_sums = ones_per_author.groupby(['cluster']).sum()
t = cluster_sums.astype(int).reset_index()
sns.barplot(data=t, x='cluster', y='size')
plt.tight_layout()
plt.show()

In [None]:
# Replace the placeholder cluster with the computed labels (one per row of authors_df)
authors_df['cluster'] = author_clusters
display(authors_df.head())

logging.info('Saving authors and groups dataframes')
# Written without the index column
authors_df.to_csv(f'{OUTPUT}/authors.csv', index=False)

### Use ego-splitting to compute possible overlapping groups of authors
Taken from https://github.com/benedekrozemberczki/EgoSplitting

In [None]:
import community
from tqdm import tqdm

class EgoNetSplitter(object):
    """Overlapping clustering of graph nodes with the "Ego-Splitting" approach, see:
    https://www.eecs.yorku.ca/course_archive/2017-18/F/6412/reading/kdd17p145.pdf
    (KDD '17 paper "Ego-Splitting Framework: from Non-Overlapping to Overlapping Clusters").

    Every node is split into personas, one per connected component of its ego net
    without the ego node. The persona graph is partitioned with Python Louvain
    (`community.best_partition`) and the memberships of the personas of a node are
    collected into a list per node.

    Args:
        resolution (float): Resolution parameter of Python Louvain. Default 1.0.
    """
    def __init__(self, resolution=1.0):
        self.resolution = resolution

    def _create_egonet(self, node):
        """
        Split the neighbours of a node into personas.

        Each connected component of the subgraph induced by the neighbours of `node`
        gets the next free persona id; the neighbour -> persona mapping and the list
        of persona ids are stored for the node.

        Args:
            node: Node ID for egonet (ego node).
        """
        neighbourhood = self.graph.subgraph(self.graph.neighbors(node))
        persona_of_neighbour = {}
        persona_ids = []
        for component in nx.connected_components(neighbourhood):
            persona_id = self.index
            persona_ids.append(persona_id)
            for neighbour in component:
                persona_of_neighbour[neighbour] = persona_id
            self.index = persona_id + 1
        self.components[node] = persona_of_neighbour
        self.personalities[node] = persona_ids

    def _create_egonets(self):
        """
        Reset the persona bookkeeping and process the ego net of every node.
        """
        self.components = {}
        self.personalities = {}
        self.index = 0
        print("Creating egonets.")
        for ego in tqdm(self.graph.nodes()):
            self._create_egonet(ego)

    def _map_personalities(self):
        """
        Build the reverse mapping from persona id to the original node.
        """
        persona_owner = {}
        for original in self.graph.nodes():
            for persona in self.personalities[original]:
                persona_owner[persona] = original
        self.personality_map = persona_owner

    def _get_new_edge_ids(self, edge):
        """
        Translate an edge of the original graph into an edge between personas.

        Args:
            edge: Edge being mapped to the new identifiers.
        """
        source, target = edge[0], edge[1]
        source_persona = self.components[source][target]
        target_persona = self.components[target][source]
        return (source_persona, target_persona)

    def _create_persona_graph(self):
        """
        Build the persona graph from the translated edges of the original graph.
        """
        print("Creating the persona graph.")
        self.persona_graph_edges = [self._get_new_edge_ids(e) for e in tqdm(self.graph.edges())]
        self.persona_graph = nx.from_edgelist(self.persona_graph_edges)

    def _create_partitions(self):
        """
        Partition the persona graph and collect the memberships per original node.
        """
        print("Clustering the persona graph.")
        self.partitions = community.best_partition(self.persona_graph, resolution=self.resolution)
        memberships = {}
        for original in self.graph.nodes():
            memberships[original] = []
        self.overlapping_partitions = memberships
        for persona, membership in self.partitions.items():
            memberships[self.personality_map[persona]].append(membership)

    def fit(self, graph):
        """
        Fitting an Ego-Splitter clustering model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be clustered.
        """
        self.graph = graph
        self._create_egonets()
        self._map_personalities()
        self._create_persona_graph()
        self._create_partitions()

    def get_memberships(self):
        r"""Getting the cluster membership of nodes.
        Return types:
            * **memberships** *(dictionary of lists)* - Cluster memberships.
        """
        return self.overlapping_partitions


In [None]:
splitter = EgoNetSplitter(0.8)
splitter.fit(authors_similarity_graph)

# Flatten the per-author membership lists into one list of cluster ids
ego_clusters = [cluster
                for memberships in splitter.overlapping_partitions.values()
                for cluster in memberships]
print('Total clusters', len(set(ego_clusters)))
print('Clusters', Counter(ego_clusters))

## Analyze authors group topics

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer


def compute_tfidf(corpus_counts):
    """Fit a default `TfidfTransformer` on a matrix of token counts and return the transformed matrix."""
    logger.debug('Compute TF-IDF on tokens counts')
    tfidf = TfidfTransformer().fit_transform(corpus_counts)
    logger.debug(f'TFIDF shape {tfidf.shape}')
    return tfidf


def compute_groups_topics(authors_df):
    """Describe every cluster of authors by its terms with the largest TF-IDF.

    Token counts of each paper are added to the cluster of every listed author that is
    present in `authors_df`, divided by the number of authors in that cluster. TF-IDF is
    then computed over the per-cluster counts.

    Args:
        authors_df: DataFrame with 'author' and 'cluster' columns. Cluster labels are used
            as row indices, so they are assumed to be 0..k-1 - TODO confirm against the
            clustering step.

    Returns:
        Dict mapping the cluster index to a list of up to 10 `(term, tfidf)` pairs with
        positive TF-IDF, largest first.
    """
    logging.info('Computing groups of authors topics')
    groups_counts = \
        np.zeros(shape=(len(set(authors_df['cluster'])), analyzer.corpus_counts.shape[1]), dtype=np.float64)

    part_sizes = Counter(authors_df['cluster'])
    authors_clusters_map = dict(zip(authors_df['author'], authors_df['cluster']))

    # Rows of corpus_counts are addressed by the position of the paper in analyzer.df,
    # not by the dataframe index label
    for position, authors_str in enumerate(tqdm(analyzer.df['authors'])):
        for a in authors_str.split(', '):
            if a in authors_clusters_map:
                group = authors_clusters_map[a]
                groups_counts[group, :] += analyzer.corpus_counts[position, :] / part_sizes[group]

    tfidf = compute_tfidf(groups_counts)

    logging.info('Take terms with the largest tfidf for topics')
    result = {}
    for g in range(groups_counts.shape[0]):
        # Densify one row at a time: element-wise indexing of a sparse matrix is slow
        weights = tfidf[g].toarray().ravel()
        counter = Counter()
        for t, weight in zip(analyzer.corpus_tokens, weights):
            counter[t] += weight
        # Ignore terms with insignificant frequencies
        result[g] = [(t, f) for t, f in counter.most_common(10) if f > 0]
    return result

groups_topics = compute_groups_topics(authors_df)

# One "term:weight" line per group for the log
kwds = []
for g, vs in groups_topics.items():
    formatted_terms = ','.join(f'{t}:{v:.3f}' for t, v in vs)
    kwds.append((g, formatted_terms))
description_lines = [f'{g}: {kwd}' for g, kwd in kwds]
logging.info('Description\n' + '\n'.join(description_lines))

In [None]:
# Collect one record per cluster and build the dataframe once, instead of
# appending rows to it one by one
group_records = [
    (g,
     ', '.join(authors_df.loc[authors_df['cluster'] == g, 'author']),
     ','.join(t for t, _ in groups_topics[g]))
    for g in sorted(set(authors_df['cluster']))
]
groups_df = pd.DataFrame(group_records, columns=['group', 'authors', 'keywords'], dtype=object)

display(groups_df.head())

In [None]:
logging.info('Saving groups of authors with keywords')
# Written without the index column
groups_df.to_csv(f'{OUTPUT}/groups.csv', index=False)

In [None]:
# Tag every author with the first 5 "term:weight" pairs of the topics of its cluster
authors_df['tags'] = [', '.join(f'{t}:{v:.3f}' for t, v in groups_topics[c][:5]) for c in authors_df['cluster']]
show(plot_authors(authors_df)) 

In [None]:
logging.info('Saving author groups graph for bokeh')
# Temporarily direct bokeh output to an HTML file and save a larger version of the plot
output_file(filename=f"{OUTPUT}/authors.html", title="Authors similarity graph")
save(plot_authors(authors_df, plot_width=1600, plot_height=1200))
# Switch bokeh output back to the notebook for the following cells
reset_output()
output_notebook()

## PageRank for Citation Analysis

In [None]:
# NOTE(review): the citation graph is built as undirected, so PageRank ignores
# the direction of citations - confirm this is intended
citations_graph = nx.Graph()
citations_graph.add_edges_from(zip(analyzer.cit_df['id_out'], analyzer.cit_df['id_in']))

# Apply PageRank algorithm with damping factor of 0.5
pr_nx = nx.pagerank(citations_graph, alpha=0.5, tol=1e-9)

In [None]:
# For every node keep a pair (best neighbour, its PageRank); (0, 0) means "none found yet"
ancestor = {node: (0, 0) for node in citations_graph}

# Select ancestor with highest PR for each node
for v in citations_graph:
    v_rank = pr_nx[v]
    for u in citations_graph[v]:
        best_rank_so_far = ancestor[u][1]
        if v_rank > best_rank_so_far:
            ancestor[u] = (v, v_rank)

In [None]:
# Directed graph with one edge from the selected ancestor to each node;
# nodes whose best PageRank stayed at 0 have no ancestor and get no edge
PRG = nx.DiGraph()
PRG.add_edges_from(
    (best_neighbour, node)
    for node, (best_neighbour, best_rank) in ancestor.items()
    if best_rank > 0
)

In [None]:
# Split the edges into parallel tuples of sources and targets; unpacking zip(*[])
# would fail, so fall back to empty tuples when the graph has no edges
start, end = zip(*PRG.edges()) if PRG.number_of_edges() > 0 else ((), ())

In [None]:
from bokeh.models import GraphRenderer, StaticLayoutProvider, Circle, HoverTool
from bokeh.models.graphs import NodesAndLinkedEdges

# Index papers by id once instead of scanning analyzer.df for every node;
# keeping the first row per id matches a `.values[0]` lookup on the filtered frame
papers_by_id = analyzer.df.drop_duplicates(subset='id').set_index('id')

# Only nodes with known metadata can be placed on the plot
node_indices = [node for node in PRG.nodes() if node in papers_by_id.index]

years = [papers_by_id.at[node, 'year'] for node in node_indices]
titles = [papers_by_id.at[node, 'title'] for node in node_indices]
# PageRank is scaled by 100 for the tooltip and by 1000 for the marker size
pageranks = [pr_nx[node] * 100 for node in node_indices]
size = [pr_nx[node] * 1000 for node in node_indices]

# Layout: x is the publication year, y is the running number of papers already
# placed in that year, so papers of one year are stacked vertically
x = list(years)
y = []
year_counts = {}
for year in years:
    year_counts[year] = year_counts.get(year, 0) + 1
    y.append(year_counts[year])

max_year_count = max(year_counts.values())
min_year, max_year = min(years), max(years)

plot = figure(title="PageRank applied to citation filtering", 
              x_range=(min_year - 1, max_year+1), y_range=(0, max_year_count + 1),
              tools="", toolbar_location=None)

TOOLTIPS = """
    <div style="max-width: 320px">
        <div>
            <span style="font-size: 12px; font-weight: bold;">@title</span>
        </div>
        <div>
            <span style="font-size: 11px;">Year</span>
            <span style="font-size: 10px;">@year</span>
        </div>
        <div>
            <span style="font-size: 11px;">PMID</span>
            <span style="font-size: 10px;">@id</span>
        </div>
        <div>
            <span style="font-size: 11px;">PageRank</span>
            <span style="font-size: 10px;">@pagerank</span>
        </div>
    </div>
"""

plot.add_tools(HoverTool(tooltips=TOOLTIPS))

graph = GraphRenderer()

# Columns referenced by the tooltips and by the node glyphs
graph.node_renderer.data_source.add(node_indices, 'index')
graph.node_renderer.data_source.data['id'] = node_indices
graph.node_renderer.data_source.data['year'] = years
graph.node_renderer.data_source.data['title'] = titles
graph.node_renderer.data_source.data['pagerank'] = pageranks
graph.node_renderer.data_source.data['size'] = size
# graph.edge_renderer.data_source.data = dict(start=start, end=end)

graph_layout = dict(zip(node_indices, zip(x, y)))
graph.layout_provider = StaticLayoutProvider(graph_layout=graph_layout)

graph.node_renderer.glyph = Circle(size='size', fill_color='blue')
graph.node_renderer.hover_glyph = Circle(size='size', fill_color='green')

# graph.edge_renderer.glyph = MultiLine(line_color='black', line_alpha=1, line_width=1)
# graph.edge_renderer.hover_glyph = MultiLine(line_color='green', line_width=2)

graph.inspection_policy = NodesAndLinkedEdges()

plot.min_border_left = 75
plot.renderers.append(graph)

show(plot)

### Top Papers by PageRank

In [None]:
# Print the 10 papers with the highest PageRank (as a percentage) with their titles
for pmid, pagerank in sorted(pr_nx.items(), key=lambda el: el[1], reverse=True)[:10]:
    matched = analyzer.df[analyzer.df['id'] == pmid]
    # The citation graph may contain ids that are absent from analyzer.df
    title = matched['title'].values[0] if len(matched) > 0 else f'<no metadata for {pmid}>'
    print(f"{(100*pagerank):.2f} {title}")

### PageRank and citation ranking correlation

In [None]:
import numpy as np
from scipy.stats import spearmanr

# Rank papers by number of citations: rank 1 is the most cited, ties broken by order of appearance
analyzer.df['citation_rank'] = analyzer.df['total'].rank(method='first', ascending=False)
pagerank_rank = sorted(pr_nx.items(), key=lambda el: el[1], reverse=True)

# Citation rank per paper id; the first row wins for duplicated ids
citation_rank_by_id = analyzer.df.drop_duplicates(subset='id').set_index('id')['citation_rank']

# Pairs (position in PageRank order, citation rank). Papers without metadata are skipped:
# leaving them as (0, 0) rows would distort the correlation
rank_pairs = [(i, int(citation_rank_by_id.at[pmid]))
              for i, (pmid, _) in enumerate(pagerank_rank)
              if pmid in citation_rank_by_id.index]
r = np.array(rank_pairs, dtype=float).reshape(-1, 2)

TOP_X = [5, 10, 30, 50, 100]
for top_n in TOP_X:
    rho, _ = spearmanr(r[:top_n, 0], r[:top_n, 1])
    print(f'Spearman correlation coefficient for top {top_n}: {rho}')