# Pubtrends-experimental

Experimental notebook for hypothesis testing and development purposes.

# Config

In [None]:
import logging
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
from collections import Counter
from matplotlib import pyplot as plt
from bokeh.plotting import show, figure, output_file, save, reset_output, output_notebook
from bokeh.models import ColumnDataSource
from tqdm.auto import tqdm

from pysrc.config import PubtrendsConfig
from pysrc.papers.db.pm_postgres_loader import PubmedPostgresLoader
from pysrc.papers.utils import SORT_MOST_CITED

# Search settings. SEARCH_PAPERS is later passed to analyzer.analyze_papers();
# SEARCH_SORT is not referenced by the visible cells.
SEARCH_SORT = SORT_MOST_CITED
SEARCH_PAPERS = 10_000

# Log at INFO level with timestamps. Note that logger.debug() calls made by the
# helper functions below are filtered out at this level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
logger = logging.getLogger('notebook')

# Avoid info message about compilation flags
# tf.get_logger().setLevel('ERROR')

# Direct Bokeh `show()` output into the notebook cells.
output_notebook()

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Papers lookup

In [None]:
from pysrc.papers.analyzer import PapersAnalyzer

# Build the analysis stack: configuration -> Pubmed Postgres loader -> analyzer.
# NOTE(review): test=False presumably selects the non-test configuration — confirm in PubtrendsConfig.
config = PubtrendsConfig(test=False)
loader = PubmedPostgresLoader(config)
analyzer = PapersAnalyzer(loader, config)

## By titles

In [None]:
# Titles to look up in the next cell; the values here are placeholders.
titles = ['Title1', 'Title2']

In [None]:
import re
from pysrc.papers.db.postgres_utils import preprocess_quotes, preprocess_search_query_for_postgres
from pysrc.papers.utils import SORT_MOST_RECENT

# Resolve every title to paper ids, reporting the titles that produce no match.
pmids = []
for title in tqdm(titles):
    paperids = loader.search_key_value('title', title)
    if not paperids:
        print(f'NOT FOUND: {title}')
        continue
    pmids.extend(paperids)

# All ids returned for a title are kept, so the first number may exceed the second.
print('Found papers', len(pmids), 'of', len(titles))

## By DOI

In [None]:
# from pysrc.papers.utils import cut_authors_list, crc32, \
#     preprocess_doi, preprocess_search_title, rgb2hex
# dois = [preprocess_doi(d) for d in dois]
# pmids = []
# for doi in tqdm(dois):
#    paperids = loader.find('doi', doi)
#    if paperids:
#        pmids.extend(paperids)
#    else:
#        print(doi)

## With Pubmed syntax

In [None]:
# import os
# from Bio import Entrez
# Entrez.email = 'os@jetbrains.com'
# QUERY = '((Aging) NOT (Review[Publication Type])) AND (("2015"[Date - Publication] : "2025"[Date - Publication]))'
# handle = Entrez.esearch(db='pubmed', retmax='1000', retmode='xml', term=QUERY)
# pmids = Entrez.read(handle)['IdList']
# print(f'Found {len(pmids)} papers')

## Regular search

In [None]:
# try:
#     pmids = analyzer.search_terms('Human Immune Aging', 1000, SORT_MOST_CITED)
#     analyzer.analyze_papers(pmids, 'bci', 20)
# finally:
#     loader.close_connection()
#     analyzer.teardown()

# Analysis

In [None]:
# Override the minimum topic size on the shared config before running the analysis.
# NOTE(review): exact semantics of topic_min_size are defined in PubtrendsConfig — confirm.
config.topic_min_size=5
try:
    # NOTE(review): the positional arguments are assumed to match the current
    # PapersAnalyzer.analyze_papers signature — verify against pysrc.papers.analyzer.
    analyzer.analyze_papers(pmids, 'Papers', 'Pubmed', SEARCH_PAPERS, SORT_MOST_RECENT, 10)
finally:
    # Always close the loader connection and tear the analyzer down, even if the analysis fails.
    loader.close_connection()
    analyzer.teardown()

In [None]:
from pysrc.papers.plot.plotter import Plotter
# Store the analysed ids on the analyzer before constructing the plotter.
# NOTE(review): presumably Plotter reads analyzer.search_ids — confirm.
analyzer.search_ids = pmids
plotter = Plotter(config, analyzer)

In [None]:
# Display the figure built by plotter.plot_papers_by_year().
show(plotter.plot_papers_by_year())

In [None]:
# Display the figure built by plotter.plot_top_cited_papers().
show(plotter.plot_top_cited_papers())

In [None]:
# Display the figure built by plotter.plot_most_cited_per_year_papers().
show(plotter.plot_most_cited_per_year_papers())

In [None]:
# Display the figure built by plotter.plot_fastest_growth_per_year_papers().
show(plotter.plot_fastest_growth_per_year_papers())

In [None]:
from pysrc.papers.analysis.text import get_frequent_tokens
from itertools import chain

# Flatten the two-level nesting of the corpus into a single stream of tokens,
# then plot the frequent ones.
freq_kwds = get_frequent_tokens(
    chain.from_iterable(chain.from_iterable(plotter.data.corpus))
)
show(plotter.plot_keywords_frequencies(freq_kwds))

In [None]:
# Display the figure built by plotter.plot_papers_graph().
show(plotter.plot_papers_graph())

In [None]:
# Display the figure built by plotter.topics_hierarchy_with_keywords().
show(plotter.topics_hierarchy_with_keywords())

# Tokens embeddings

In [None]:
import numpy as np
from pysrc.papers.analysis.text import texts_embeddings, vectorize_corpus, tokens_embeddings

# Compute an embedding for the corpus tokens; the result exposes `.shape`,
# which is printed as a sanity check.
# (The former `analyzer.corpus_tokens = analyzer.corpus_tokens` self-assignment
# was a no-op and has been dropped.)
print('Compute global embeddings')
embeddings = tokens_embeddings(analyzer.corpus, analyzer.corpus_tokens)
print(f'Embeddings shape {embeddings.shape}')

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Standardise the token embeddings, reduce them with PCA, then project to 2D with t-SNE.
logger.debug('Computing PCA projection')
t = StandardScaler().fit_transform(embeddings)
# PCA cannot extract more components than min(n_samples, n_features), so the
# requested 15 components are capped for small inputs.
pca = PCA(n_components=min(15, *t.shape))
pca_coords = pca.fit_transform(t)
logger.debug(f'Explained variation {int(np.sum(pca.explained_variance_ratio_) * 100)}%')

logger.debug('Apply TSNE transformation on papers PCA coords')
# t-SNE requires perplexity < n_samples. 30 is the scikit-learn default, so inputs
# with more than 30 rows behave exactly as before; the cap mirrors the authors cell.
tsne_embeddings_2d = TSNE(
    n_components=2,
    random_state=42,
    perplexity=min(30, pca_coords.shape[0] - 1),
).fit_transform(pca_coords)
xs = tsne_embeddings_2d[:, 0]
ys = tsne_embeddings_2d[:, 1]

In [None]:
# Scatter every token at its 2D t-SNE coordinates; the low alpha keeps dense regions readable.
plt.figure(figsize=(5, 5))
plt.scatter(xs, ys, linewidths=0.1, color='black', alpha=0.1)
plt.xlabel('tSNE1')
plt.ylabel('tSNE2')
plt.title('All tokens in global word embedding space')
plt.show()

In [None]:
from pysrc.papers.utils import factors_colormap
import matplotlib.pyplot as plt
from pysrc.papers.plot.plot_preprocessor import PlotPreprocessor
from pysrc.papers.analysis.topics import get_topics_description
from itertools import chain

# The analyzer itself is used as the data holder for the topic helpers.
data = analyzer

# Number of words requested per topic.
n = 10

print('Show words for components')
topics_description = get_topics_description(
    data.df, data.corpus, data.corpus_tokens, data.corpus_counts, n_words=n,
)
kwd_df = PlotPreprocessor.compute_kwds(topics_description, n)
words2show = PlotPreprocessor.topics_words(kwd_df, n)
print(words2show)

# Concatenate the per-topic word collections into one flat list, in topic order.
words = list(chain.from_iterable(words2show.values()))
print(f'Total words {len(words)}')

In [None]:
# Resolve each keyword's position in the token list once: the same index is
# needed for its x/y coordinates and for its column in the counts matrix
# (previously three separate linear scans per word).
word_idx = [analyzer.corpus_tokens.index(w) for w in words]
wxs = [xs[i] for i in word_idx]
wys = [ys[i] for i in word_idx]

cmap = factors_colormap(len(words2show))
# Marker size: column sum of the counts matrix divided by the number of rows
# in analyzer.df, scaled by 500.
sizes = [analyzer.corpus_counts[:, i].sum() / len(analyzer.df) * 500
         for i in word_idx]
# One colour per topic, repeated for each of its words so that `colors` lines up with `words`.
colors = []
for i, ws in words2show.items():
    colors.extend([cmap(i)] * len(ws))

plt.figure(figsize=(10, 10))
plt.scatter(wxs, wys, sizes=sizes, color=colors, alpha=0.5)
plt.xlabel('tSNE1')
plt.ylabel('tSNE2')
plt.title('Main keywords in global word embedding space')

# Label every point slightly below its marker.
for word, x, y in zip(words, wxs, wys):
    plt.annotate(word, xy=(x, y - 0.1), size=7)
plt.show()

## Topics visualization in embedded space

In [None]:
# Map each component id (`comp` column) to the list of its paper ids.
comp_pids = analyzer.df[['id', 'comp']].groupby('comp')['id'].apply(list).to_dict()
# One row of per-token average counts for each component.
# NOTE(review): the number of rows comes from len(words2show) while rows are indexed by
# `comp` below — this assumes component ids are 0..len(words2show)-1; confirm.
terms_freqs_per_comp = np.zeros(shape=(len(words2show), analyzer.corpus_counts.shape[1]), dtype=float)
for comp, pids in comp_pids.items():
    # Sum the counts over the rows of this component's papers (rows are located by their
    # position in analyzer.df) and divide by the number of papers in the component.
    terms_freqs_per_comp[comp, :] = np.sum(analyzer.corpus_counts[np.flatnonzero(analyzer.df['id'].isin(pids)), :],
                                           axis=0) / len(pids)
print(terms_freqs_per_comp.shape)

In [None]:
import math

# Number of most frequent tokens drawn per component
# (rebinds the notebook-global `n` set in an earlier cell).
n = 500

# Grid of panels, one per component, four panels per row.
ncols = 4
nrows = int(math.ceil(len(comp_pids) / ncols))
plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, int(15 * nrows / ncols)))
for comp, _ in comp_pids.items():
    freqs = terms_freqs_per_comp[comp, :]
    # Indices of the n largest frequencies, in descending order.
    freq_keywords_indx = freqs.argsort()[-n:][::-1]
    # NOTE(review): panel position comp + 1 assumes component ids are consecutive integers from 0.
    ax = plt.subplot(nrows, ncols, comp + 1)
    wxs = [xs[i] for i in freq_keywords_indx]
    wys = [ys[i] for i in freq_keywords_indx]
    # NOTE: `words` is only needed by the commented-out annotation code below, and it
    # rebinds the notebook-global `words` list built in the keywords cell.
    words = [analyzer.corpus_tokens[i] for i in freq_keywords_indx]
    sizes = [freqs[i] * 50 for i in freq_keywords_indx]
    # `cmap` is the colormap created in the main-keywords plot cell.
    ax.scatter(wxs, wys, marker='o', sizes=sizes, color=cmap(comp), alpha=0.5)
#     plt.xlabel('tSNE1')
#     plt.ylabel('tSNE2')
#     plt.title(f'Topic {comp + 1} papers frequent keywords in global word embedding space')
#     for word, x, y in zip(words, wxs, wys):
#         plt.annotate(word, xy=(x, y-0.1))
plt.show()

# Authors analysis

In [None]:
from pysrc.papers.utils import cut_authors_list, color_to_rgb


def plot_embeddings(df, clusters):
    """Show an interactive Bokeh scatter plot of papers coloured by cluster.

    Note that ``df`` is modified in place: a ``size`` column is added and the
    ``authors`` column is replaced by the result of ``cut_authors_list``.

    Args:
        df: DataFrame with ``x``, ``y``, ``total``, ``authors`` and ``comp`` columns;
            the tooltips additionally reference ``journal``, ``year`` and ``type``.
        clusters: Cluster label for every row of ``df``, in row order.
    """
    labels = sorted(set(clusters))
    cmap = factors_colormap(len(labels))
    palette = {label: color_to_rgb(cmap(i)).to_hex() for i, label in enumerate(labels)}

    # Marker size grows linearly with `total`, from 5 up to 25 for the largest value.
    df['size'] = 5 + df['total'] / df['total'].max() * 20

    # Split authors
    df['authors'] = df['authors'].apply(cut_authors_list)

    ds = ColumnDataSource(df)
    # Add clusters coloring
    ds.add([palette[c] for c in clusters], 'color')

    p = figure(width=600, height=600,
               tools="hover,pan,tap,wheel_zoom,box_zoom,reset,save")
    p.sizing_mode = 'stretch_width'
    p.xaxis.axis_label = 'x'
    p.yaxis.axis_label = 'y'

    tooltip_fields = [
        ("Author(s)", '@authors'),
        ("Journal", '@journal'),
        ("Year", '@year'),
        ("Type", '@type'),
        ("Cited by", '@total paper(s) total'),
        ("Topic", '@comp'),
    ]
    p.hover.tooltips = plotter._paper_html_tooltips(tooltip_fields)

    p.scatter(x='x', y='y', fill_alpha=0.8, source=ds, size='size',
              line_color='black', fill_color='color', legend_field='comp')
    p.legend.visible = False
    show(p)

In [None]:
def fa(authors, first_last_only=True):
    """Select the authors to consider: all of them, or only the first and the last.

    Args:
        authors: Sequence of author names.
        first_last_only: When true and there are more than two authors, keep only
            the first and the last one.

    Returns:
        ``authors`` itself when it has at most two entries or ``first_last_only``
        is false; otherwise a new two-element list ``[first, last]``.
    """
    if len(authors) > 2 and first_last_only:
        return [authors[0], authors[-1]]
    return authors

## Authors graph

In [None]:
def compute_authors_citations_and_papers(df):
    """Aggregate citation totals and paper ids per author.

    Only the authors selected by ``fa`` (by default the first and the last author
    of a paper) are credited.

    Args:
        df: DataFrame with ``id``, ``title``, ``authors`` (names joined by ``', '``)
            and ``total`` columns.

    Returns:
        Tuple ``(author_citations, author_papers)``: a dict mapping author to the
        sum of ``total`` over their papers, and a dict mapping author to the list
        of their paper ids.
    """
    logger.debug('Compute author citations')
    author_citations = {}
    for _, row in tqdm(df[['authors', 'total']].iterrows()):
        cited = row['total']
        for author in fa(row['authors'].split(', ')):
            author_citations[author] = author_citations.get(author, 0) + cited

    logger.debug('Compute number of papers per author')
    author_papers = {}
    for _, row in df[['id', 'title', 'authors']].iterrows():
        pmid = row['id']
        for author in fa(row['authors'].split(', ')):
            author_papers.setdefault(author, []).append(pmid)

    return author_citations, author_papers

In [None]:
# Count papers per author (first/last authors only, see `fa`) and use the 80th
# percentile of those counts as the minimum number of papers an author must have.
logging.info("Analyzing top 20% of authors")
authors_citations, authors_papers = compute_authors_citations_and_papers(analyzer.df)
logging.info(f"Total first and last authors {len(authors_papers)}")
min_threshold = np.percentile([len(ps) for ps in authors_papers.values()], 80)
logging.info(f'Min papers for author {min_threshold}')
# Number of authors that pass the threshold.
logging.info(f'Filtered authors: {sum(len(v) >= min_threshold for v in authors_papers.values())}')

In [None]:
import networkx as nx


def build_authors_similarity_graph(df,
                                   cocit_grouped_df, bibcoupling_df, cit_df,
                                   check_author_func=lambda a: True):
    """Build an undirected graph of authors linked through their papers.

    Each edge accumulates up to four attributes (see ``update_edge``):
    ``authorship`` (+1 per shared paper), ``cocitation`` and ``bibcoupling``
    (sum of the ``total`` column of the respective frames) and ``citation``
    (+1 per citation between papers of the two authors). Only the authors
    selected by ``fa`` are considered for every paper.

    Args:
        df: Papers DataFrame with ``id`` and ``authors`` (names joined by ``', '``).
        cocit_grouped_df: DataFrame with ``cited_1``, ``cited_2``, ``total`` columns.
        bibcoupling_df: DataFrame with ``citing_1``, ``citing_2``, ``total`` columns;
            may be empty.
        cit_df: DataFrame with ``id_out`` and ``id_in`` columns.
        check_author_func: Predicate deciding whether an author may appear in the
            graph; an edge is added only when both endpoints pass.

    Returns:
        A ``networkx.Graph`` whose nodes are author names.

    Raises:
        KeyError: If a paper id referenced by one of the relation frames is
            missing from ``df``.
    """
    # `itertools` is otherwise imported only in a later notebook cell; importing it
    # here removes the dependency on cell execution order.
    import itertools

    result = nx.Graph()
    # Paper id -> selected authors. Built once so that each relation below is a
    # dictionary lookup instead of a scan over the whole DataFrame per edge.
    paper_authors = {}

    def link_papers(start, end, name, value):
        """Add `value` to attribute `name` for every allowed author pair of two papers."""
        for a1, a2 in itertools.product(paper_authors[start], paper_authors[end]):
            if check_author_func(a1) and check_author_func(a2):
                update_edge(result, a1, a2, name, value)

    logger.debug('Processing papers')
    for pid, authors_str in tqdm(zip(df['id'], df['authors']), total=len(df)):
        authors = fa(authors_str.split(', '))
        # setdefault keeps the first row for a duplicated id, matching the
        # previous `.values[0]` lookup.
        paper_authors.setdefault(pid, authors)
        for a1, a2 in itertools.combinations(authors, 2):
            if check_author_func(a1) and check_author_func(a2):
                update_edge(result, a1, a2, 'authorship', 1)

    logger.debug('Processing co-citations')
    for el in tqdm(cocit_grouped_df[['cited_1', 'cited_2', 'total']].values):
        link_papers(str(el[0]), str(el[1]), 'cocitation', float(el[2]))

    logger.debug('Bibliographic coupling')
    if len(bibcoupling_df) > 0:
        for el in tqdm(bibcoupling_df[['citing_1', 'citing_2', 'total']].values):
            link_papers(str(el[0]), str(el[1]), 'bibcoupling', float(el[2]))

    logger.debug('Citations')
    for start, end in zip(cit_df['id_out'], cit_df['id_in']):
        link_papers(start, end, 'citation', 1)

    return result


def update_edge(graph, a1, a2, name, value):
    """Add ``value`` to attribute ``name`` of the edge between ``a1`` and ``a2``.

    Self-loops are ignored. The endpoints are ordered before the edge is
    created or looked up, and the edge is created on first use with the
    attribute starting from 0.

    Args:
        graph: Graph offering ``has_edge``, ``add_edge`` and ``graph[u][v]`` access.
        a1: First endpoint.
        a2: Second endpoint.
        name: Edge attribute to accumulate into.
        value: Amount to add.
    """
    if a1 == a2:
        return
    first, second = (a2, a1) if a1 > a2 else (a1, a2)
    if not graph.has_edge(first, second):
        graph.add_edge(first, second)
    attrs = graph[first][second]
    attrs[name] = attrs.get(name, 0) + value

In [None]:
import itertools

# NOTE: rebinds the notebook-global `logger` used by the helper functions to a logger named 'Test'.
logger = logging.getLogger('Test')

# Only authors whose number of papers reaches `min_threshold` are allowed into the graph.
authors_similarity_graph = build_authors_similarity_graph(
    analyzer.df, analyzer.cocit_grouped_df,
    analyzer.bibliographic_coupling_df,
    analyzer.cit_df,
    check_author_func=lambda a: len(authors_papers[a]) >= min_threshold
)

logging.info(f'Built authors graph - '
             f'{len(authors_similarity_graph.nodes())} nodes and {len(authors_similarity_graph.edges())} edges')

## Node2vec embeddings for authors graph

In [None]:
from pysrc.papers.analysis.node2vec import node2vec
from pysrc.papers.analysis.graph import sparse_graph, similarity

def to_weighted_graph(graph, weight_func, key='weight'):
    """Collapse the attributes of every edge into a single numeric weight.

    Args:
        graph: Source graph; its edge attribute dicts are passed to ``weight_func``.
        weight_func: Callable mapping an edge attribute dict to a number.
        key: Name of the edge attribute that stores the weight in the result.

    Returns:
        A new ``networkx.Graph`` containing every node of ``graph`` and only the
        edges whose weight is non-zero.

    Raises:
        Exception: If a computed weight is NaN or negative.
    """
    logger.debug('Creating weighted graph')
    weighted = nx.Graph()
    for source, target, attrs in graph.edges(data=True):
        weight = weight_func(attrs)
        if np.isnan(weight):
            raise Exception(f'Weight is NaN {weight}')
        if weight < 0:
            raise Exception(f'Weight is < 0 {weight}')
        if weight != 0:
            weighted.add_edge(source, target, **{key: weight})
    # Nodes left without any edge would otherwise be missing from the result
    for node in graph.nodes:
        if node not in weighted:
            weighted.add_node(node)
    return weighted


logger.debug('Compute aggregated similarity using co-authorship')
# Edge weight: co-authorship count multiplied by 100, plus the similarity() score
# computed from the edge attributes.
ga = to_weighted_graph(authors_similarity_graph,
                       weight_func=lambda d: 100 * d.get('authorship', 0) + similarity(d))
# NOTE(review): the meaning of the second argument (10) is defined by sparse_graph — confirm there.
gs = sparse_graph(ga, 10)
authors_node_ids = list(authors_similarity_graph.nodes)
# NOTE(review): rows of the result are assumed to follow the order of authors_node_ids — confirm in node2vec.
authors_weighted_node_embeddings = node2vec(authors_node_ids, gs)
print(authors_weighted_node_embeddings.shape)

## Authors papers embeddings

In [None]:
from pysrc.papers.analysis.text import texts_embeddings
# Text embeddings computed from the counts matrix and the analyzer's token embeddings.
# NOTE(review): this reads analyzer.corpus_tokens_embedding, not the `embeddings` variable
# computed in the tokens cell — confirm the attribute is populated by the analysis.
papers_text_embeddings = texts_embeddings(
    analyzer.corpus_counts, analyzer.corpus_tokens_embedding
)
print(papers_text_embeddings.shape)

In [None]:
# Represent every author by the mean text embedding of their papers.
authors_node_ids = list(authors_similarity_graph.nodes)
# Paper id -> row position in analyzer.df.
# NOTE(review): assumes rows of papers_text_embeddings follow the order of analyzer.df — confirm.
papers_idx = {pmid: idx for idx, pmid in enumerate(analyzer.df['id'])}
authors_papers_embeddings = np.zeros((len(authors_node_ids), papers_text_embeddings.shape[1]))
for i, a in enumerate(authors_node_ids):
    # Sum the embeddings of the author's papers, then divide by their number.
    for pmid in authors_papers[a]:
        authors_papers_embeddings[i, :] += papers_text_embeddings[papers_idx[pmid], :]
    authors_papers_embeddings[i, :] /= len(authors_papers[a])
authors_papers_embeddings.shape

## Merge embeddings

In [None]:
from pysrc.config import *
# Weighted average of the graph-based and the text-based author embeddings; the two
# factors come from the star import of pysrc.config.
# NOTE(review): the element-wise sum requires both matrices to have compatible shapes — confirm.
authors_embeddings = (authors_weighted_node_embeddings * GRAPH_EMBEDDINGS_FACTOR +
                      authors_papers_embeddings * TEXT_EMBEDDINGS_FACTOR
                      ) / (GRAPH_EMBEDDINGS_FACTOR + TEXT_EMBEDDINGS_FACTOR)

## Plot

In [None]:
# Authors drawn with a thicker outline in the plots below; the values are placeholders.
# NOTE: the name contains a typo ("HIGHLIGTHS"); it is kept because later cells reference it.
AUTHORS_HIGHLIGTHS = ['A1', 'A2']

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Standardise the author embeddings, reduce them with PCA, then project to 2D with t-SNE.
logger.debug('Computing PCA projection')
# Cap the number of components by the number of authors; PCA_COMPONENTS comes from the
# star import of pysrc.config.
pca = PCA(n_components=min(len(authors_embeddings), PCA_COMPONENTS))
# StandardScaler is imported in the tokens embeddings cell.
t = StandardScaler().fit_transform(authors_embeddings)
authors_pca_coords = pca.fit_transform(t)
logger.debug(f'Explained variation {int(np.sum(pca.explained_variance_ratio_) * 100)}%')

logger.debug('Apply t-SNE transformation on node embeddings')
# Keep the perplexity below the number of samples.
authors_tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, authors_pca_coords.shape[0] - 1))
authors_weighted_node_embeddings_2d = authors_tsne.fit_transform(authors_pca_coords)

In [None]:
# Build dataframe combining information about authors and projected coordinates
authors_df = pd.DataFrame(dict(author=authors_node_ids,
                               x=authors_weighted_node_embeddings_2d[:, 0],
                               y=authors_weighted_node_embeddings_2d[:, 1]))
authors_df['cited'] = [authors_citations[a] for a in authors_df['author']]
authors_df['papers'] = [authors_papers[a] for a in authors_df['author']]
authors_df['size'] = [1 + 10 * np.log1p(authors_citations[a]) for a in authors_df['author']]
# Limit max size
authors_df['size'] = authors_df['size'] * 10 / authors_df['size'].max() + 3

In [None]:
from bokeh.models import LabelSet, Label


def plot_authors(authors_df, highlights=(), groups_topics=None, width=600, height=600):
    """Build an interactive Bokeh scatter plot of authors coloured by cluster.

    Every cluster gets a ``#<cluster + 1>`` label at the mean position of its
    members and an entry in a legend drawn in the top-left corner of the data
    range. ``authors_df`` is not modified: the styling columns are added to a
    copy that is handed to Bokeh.

    Args:
        authors_df: DataFrame with ``author``, ``x``, ``y``, ``size`` and ``cluster``
            columns; the tooltips additionally reference ``papers``, ``cited`` and
            ``tags``. Cluster labels must support ``c + 1``.
        highlights: Authors to draw with a thicker outline.
        groups_topics: Optional mapping from cluster to a sequence of
            ``(term, score)`` pairs; the first five terms are shown in the legend.
        width: Figure width.
        height: Figure height.

    Returns:
        The configured Bokeh figure (not shown).
    """
    clusters = sorted(set(authors_df['cluster']))
    cmap = factors_colormap(len(clusters))
    palette = {c: color_to_rgb(cmap(i)).to_hex() for i, c in enumerate(clusters)}

    highlighted = set(highlights)
    # `assign` returns a copy, so the caller's frame never carries the temporary
    # styling columns and pre-existing `color`/`line_width` columns are not dropped.
    ds = ColumnDataSource(authors_df.assign(
        color=[palette[c] for c in authors_df['cluster']],
        line_width=[3 if a in highlighted else 1 for a in authors_df['author']],
    ))

    x = authors_df['x']
    y = authors_df['y']
    x_min, x_max = min(x), max(x)
    y_min, y_max = min(y), max(y)
    # 5% padding around the data on every side.
    x_pad = 0.05 * (x_max - x_min)
    y_pad = 0.05 * (y_max - y_min)
    p = figure(width=width, height=height,
               x_range=(x_min - x_pad, x_max + x_pad),
               y_range=(y_min - y_pad, y_max + y_pad),
               tools="hover,pan,tap,wheel_zoom,box_zoom,reset,save",
               tooltips=[("Author", '@author'),
                         ("Papers", '@papers'),
                         ("Cited by", '@cited'),
                         ("Cluster", '@cluster'),
                         ("Tags", '@tags')])

    # Hide ticks, tick labels, grid and outline: the axes carry no meaning here.
    p.xaxis.major_tick_line_color = None
    p.xaxis.minor_tick_line_color = None
    p.yaxis.major_tick_line_color = None
    p.yaxis.minor_tick_line_color = None
    p.xaxis.major_label_text_font_size = '0pt'  # preferred method for removing tick labels
    p.yaxis.major_label_text_font_size = '0pt'  # preferred method for removing tick labels
    p.grid.grid_line_color = None
    p.outline_line_color = None
    p.sizing_mode = 'stretch_width'

    p.scatter(source=ds, x='x', y='y', fill_alpha=0.8, size='size', line_width='line_width',
              line_color='black', fill_color='color')

    # Cluster labels placed at the mean position of the cluster members.
    members = [authors_df.loc[authors_df['cluster'] == c] for c in clusters]
    source = ColumnDataSource({'x': [m['x'].mean() for m in members],
                               'y': [m['y'].mean() for m in members],
                               'name': [f'#{c + 1}' for c in clusters]})
    labels = LabelSet(x='x', y='y', text='name', source=source,
                      background_fill_color='white', text_font_size='11px', background_fill_alpha=.9)
    p.renderers.append(labels)

    # Hand-made legend: a coloured box plus a text label per cluster, stacked
    # downwards from the top-left corner in data coordinates.
    for i, c in enumerate(clusters):
        p.rect(x=x_min, y=y_max - i * 2, width=1, height=2, fill_color=palette[c], line_color=None)
        if groups_topics is not None:
            text = f"#{c + 1} {', '.join(t for t, _ in groups_topics[c][:5])}"
        else:
            text = f"#{c + 1}"
        p.add_layout(Label(
            x=x_min + 1, y=y_max - 1 - i * 2,
            text=text,
            text_font_size='11px',
            text_align="left",
            background_fill_color="white",
            background_fill_alpha=0.7,
        ))
    return p

In [None]:
# No clustering yet: put every author into a single cluster 0 with a dummy tag so that
# the columns referenced by plot_authors exist.
authors_df['cluster'] = 0
authors_df['tags'] = 'n/a'
show(plot_authors(authors_df, highlights=AUTHORS_HIGHLIGTHS))

## Authors clustering

In [None]:
from pysrc.papers.analysis.topics import cluster_and_sort

# Cluster authors in the PCA space.
# NOTE(review): the meaning of the arguments 10 and 5 is defined by cluster_and_sort — confirm there.
author_clusters, _ = cluster_and_sort(authors_pca_coords, 10, 5)
authors_df['cluster'] = author_clusters

print('Cluster sizes')
# Count members per cluster by summing a column of ones (rebinds the notebook-global `t`).
t = pd.DataFrame({'cluster': author_clusters,
                  'size': np.ones(len(author_clusters))}).groupby(['cluster']).sum().astype(int).reset_index()
sns.barplot(data=t, x='cluster', y='size')
plt.tight_layout()
plt.show()

In [None]:
# Same plot as before, now coloured by the computed clusters.
show(plot_authors(authors_df, highlights=AUTHORS_HIGHLIGTHS))

## Analyze authors group topics

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer


def compute_tfidf(corpus_counts):
    """Convert a matrix of token counts into TF-IDF weights.

    Args:
        corpus_counts: Count matrix accepted by ``TfidfTransformer.fit_transform``.

    Returns:
        The matrix returned by ``TfidfTransformer().fit_transform``.
    """
    logger.debug('Compute TF-IDF on tokens counts')
    weights = TfidfTransformer().fit_transform(corpus_counts)
    logger.debug(f'TFIDF shape {weights.shape}')
    return weights


def compute_groups_topics(authors_df):
    """Describe every cluster of authors by its highest-scoring TF-IDF terms.

    Token counts of each paper are added to the cluster of every author of that
    paper who is present in ``authors_df``, divided by the cluster size. TF-IDF
    is then computed over the per-cluster rows. Reads the notebook-global
    ``analyzer`` (``df``, ``corpus_counts``, ``corpus_tokens``).

    Args:
        authors_df: DataFrame with ``author`` and ``cluster`` columns. Cluster
            labels are used as row indices and are therefore assumed to be
            integers ``0..k-1``.

    Returns:
        Dict mapping cluster to a list of up to ten ``(token, tfidf)`` pairs
        with a positive score, ordered by decreasing score.
    """
    logging.info('Computing groups of authors topics')
    groups_counts = np.zeros(shape=(len(set(authors_df['cluster'])), analyzer.corpus_counts.shape[1]), dtype=np.float64)

    part_sizes = Counter(authors_df['cluster'])
    authors_clusters_map = dict(zip(authors_df['author'], authors_df['cluster']))

    # Rows of corpus_counts are addressed by position, so enumerate the papers
    # instead of relying on the DataFrame index labels being 0..n-1.
    for i, authors in tqdm(enumerate(analyzer.df['authors']), total=len(analyzer.df)):
        for a in authors.split(', '):
            if a in authors_clusters_map:
                group = authors_clusters_map[a]
                groups_counts[group, :] += analyzer.corpus_counts[i, :] / part_sizes[group]

    tfidf = compute_tfidf(groups_counts)

    logging.info('Take terms with the largest tfidf for topics')
    result = {}
    for g in range(groups_counts.shape[0]):
        counter = Counter()
        for j, t in enumerate(analyzer.corpus_tokens):
            counter[t] += tfidf[g, j]
        # Ignore terms with insignificant frequencies
        result[g] = [(t, f) for t, f in counter.most_common(10) if f > 0]
    return result



groups_topics = compute_groups_topics(authors_df)
# One comma-separated "token:score" string per group, used for the log message below.
kwds = [(g, ','.join(f'{t}:{v:.3f}' for t, v in vs)) for g, vs in groups_topics.items()]
logging.info('Description\n' + '\n'.join(f'{g}: {kwd}' for g, kwd in kwds))


In [None]:
# Summary table: one row per cluster with its authors and its topic terms.
groups_df = pd.DataFrame(columns=['group', 'authors', 'keywords'], dtype=object)
for g in sorted(set(authors_df['cluster'])):
    authors = ', '.join(authors_df.loc[authors_df['cluster'] == g]['author'])
    # Append a row under the next integer label.
    groups_df.loc[len(groups_df)] = (g, authors, ','.join(t for t, _ in groups_topics[g]))

# `display` is provided by the IPython/Jupyter environment.
display(groups_df)

In [None]:
# Tag every author with the first five "token:score" terms of their cluster, then
# plot with the cluster terms shown in the legend.
authors_df['tags'] = [', '.join(f'{t}:{v:.3f}' for t, v in groups_topics[c][:5]) for c in authors_df['cluster']]
show(plot_authors(authors_df, highlights=AUTHORS_HIGHLIGTHS, groups_topics=groups_topics))

In [None]:
#logging.info('Saving author groups graph for bokeh')
#output_file(filename=os.path.expanduser("~/authors.html"), title="Authors similarity graph")
#save(plot_authors(authors_df, width=1600, height=1200))
#reset_output()
#output_notebook()

### Use ego-splitting to compute possible overlapping groups of authors
Taken from https://github.com/benedekrozemberczki/EgoSplitting

In [None]:
import community
from tqdm import tqdm


class EgoNetSplitter(object):
    """Overlapping clustering of graph nodes with the "Ego-Splitting" framework.

    See the KDD '17 paper "Ego-Splitting Framework: from Non-Overlapping to
    Overlapping Clusters":
    https://www.eecs.yorku.ca/course_archive/2017-18/F/6412/reading/kdd17p145.pdf

    The egonet of every node is split into connected components, each of which
    becomes a persona of that node. The resulting persona graph is clustered
    with the Louvain method, and the persona clusters are mapped back to the
    original nodes, which may therefore belong to several clusters.

    Args:
        resolution (float): Resolution parameter of Python Louvain. Default 1.0.
    """

    def __init__(self, resolution=1.0):
        self.resolution = resolution

    def _create_egonet(self, node):
        """
        Split the neighbourhood of a node into personas.

        Every connected component of the egonet without the ego node gets a
        fresh persona id taken from the running ``self.index`` counter.

        Args:
            node: Node ID for egonet (ego node).
        """
        neighbourhood = self.graph.subgraph(self.graph.neighbors(node))
        persona_of_neighbour = {}
        persona_ids = []
        for component in nx.connected_components(neighbourhood):
            persona_id = self.index
            persona_ids.append(persona_id)
            for neighbour in component:
                persona_of_neighbour[neighbour] = persona_id
            self.index = persona_id + 1
        self.components[node] = persona_of_neighbour
        self.personalities[node] = persona_ids

    def _create_egonets(self):
        """
        Reset the bookkeeping and create the egonet of every node.
        """
        self.components = {}
        self.personalities = {}
        self.index = 0
        print("Creating egonets.")
        for node in tqdm(self.graph.nodes()):
            self._create_egonet(node)

    def _map_personalities(self):
        """
        Build the reverse mapping from persona id to its original node.
        """
        persona_to_node = {}
        for node in self.graph.nodes():
            for persona in self.personalities[node]:
                persona_to_node[persona] = node
        self.personality_map = persona_to_node

    def _get_new_edge_ids(self, edge):
        """
        Translate an edge of the original graph into an edge between personas.

        Args:
            edge: Edge being mapped to the new identifiers.
        """
        source, target = edge[0], edge[1]
        return self.components[source][target], self.components[target][source]

    def _create_persona_graph(self):
        """
        Create the persona graph from the translated edges of the original graph.
        """
        print("Creating the persona graph.")
        self.persona_graph_edges = [self._get_new_edge_ids(edge) for edge in tqdm(self.graph.edges())]
        self.persona_graph = nx.from_edgelist(self.persona_graph_edges)

    def _create_partitions(self):
        """
        Cluster the persona graph and collect the clusters of every original node.
        """
        print("Clustering the persona graph.")
        self.partitions = community.best_partition(self.persona_graph, resolution=self.resolution)
        memberships = {node: [] for node in self.graph.nodes()}
        self.overlapping_partitions = memberships
        for persona, cluster in self.partitions.items():
            memberships[self.personality_map[persona]].append(cluster)

    def fit(self, graph):
        """
        Fit an Ego-Splitter clustering model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be clustered.
        """
        self.graph = graph
        self._create_egonets()
        self._map_personalities()
        self._create_persona_graph()
        self._create_partitions()

    def get_memberships(self):
        r"""Get the cluster memberships of the nodes.

        Return types:
            * **memberships** *(dictionary of lists)* - Cluster memberships.
        """
        return self.overlapping_partitions


In [None]:
# Fit the ego-splitting model on the authors graph.
splitter = EgoNetSplitter(0.8)
splitter.fit(authors_similarity_graph)

# Flatten the per-author cluster lists; an author appears once per cluster they belong to.
ego_clusters = [c for cs in splitter.overlapping_partitions.values() for c in cs]
print('Total clusters', len(set(ego_clusters)))
print('Clusters', Counter(ego_clusters))
# Number of authors versus the total number of memberships.
print(len(authors_similarity_graph.nodes()))
print(len(ego_clusters))