# Reanalyze exported file


In [None]:
import gzip
import json
import logging
import os

from bokeh.plotting import output_notebook

from pysrc.config import PubtrendsConfig
from pysrc.papers.utils import SORT_MOST_CITED

# Search parameters kept as notebook-level constants.
SEARCH_PAPERS = 10_000
SEARCH_SORT = SORT_MOST_CITED

# Configure root logging at DEBUG so the logger.debug(...) progress messages
# emitted by the cells of this notebook are shown.
logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger('notebook')

# Direct Bokeh output to the notebook so show(...) renders figures inline.
output_notebook()

# IPython magic: display matplotlib figures inline in the notebook output.
%matplotlib inline
# IPython magic: set the inline backend's figure format to 'retina'.
%config InlineBackend.figure_format='retina'

# Load data

In [None]:
from pysrc.papers.data import AnalysisData

# Exported analysis to reload; swap in the commented path to use the other export.
# path_data = os.path.expanduser('~/Downloads/pubmed-human-aging.json.gz')
path_data = os.path.expanduser('~/Downloads/pubmed-human-aging-semantic.json.gz')
# Text mode lets gzip handle the UTF-8 decoding, so json parses straight from the stream.
with gzip.open(path_data, 'rt', encoding='utf-8') as export_file:
    data = AnalysisData.from_json(json.load(export_file))
# Bare expression: the notebook displays the loaded object as the cell output.
data


# Analysis

In [None]:
from pysrc.papers.analysis.graph import build_papers_graph, similarity

# Non-test configuration; it is handed to the Plotter in a later cell.
config = PubtrendsConfig(test=False)

# Build the papers graph from the papers table plus the citation,
# co-citation and bibliographic-coupling tables of the loaded export.
bibliographic_graph = build_papers_graph(
    data.df,
    data.cit_df,
    data.cocit_grouped_df,
    data.bibliographic_coupling_df,
)
# Edge/node ratio as a quick density indicator for the freshly built graph.
logger.debug(f'Bibliographic edges/nodes={bibliographic_graph.number_of_edges() / bibliographic_graph.number_of_nodes()}')

In [None]:
from pysrc.papers.analysis.text import chunks_to_text_embeddings

# Derive text embeddings from the stored chunk embeddings and chunk index
# (presumably one embedding per paper in data.df — confirm against
# chunks_to_text_embeddings).
papers_text_embeddings = chunks_to_text_embeddings(
    data.df,
    data.chunks_embeddings,
    data.chunks_idx,
)

In [None]:
from pysrc.papers.analysis.node2vec import node2vec
from pysrc.config import NODE2VEC_GRAPH_EDGES, GRAPH_TEXT_SIMILARITY_EDGES
from pysrc.papers.analysis.graph import sparse_graph, add_text_similarities_edges


# Extend the bibliographic graph with text-similarity edges. The return value
# is not used, so the helper presumably mutates the graph in place — confirm.
logger.debug('Adding text similarities edges')
add_text_similarities_edges(
    data.df['id'],
    papers_text_embeddings,
    bibliographic_graph,
    GRAPH_TEXT_SIMILARITY_EDGES,
)
logger.debug(f'Bibliographic+text edges/nodes={bibliographic_graph.number_of_edges() / bibliographic_graph.number_of_nodes()}')

# Store a single 'similarity' value on every edge, computed from that edge's
# attributes, then sparsify the graph before it is fed to node2vec.
logger.debug('Prepare sparse graph of text + bibliographic edges for node2vec')
for source, target in bibliographic_graph.edges():
    edge_attrs = bibliographic_graph.get_edge_data(source, target)
    bibliographic_graph[source][target]['similarity'] = similarity(edge_attrs)
analysis_graph = sparse_graph(bibliographic_graph, NODE2VEC_GRAPH_EDGES)


In [None]:
# Run node2vec over the sparse analysis graph for the paper ids; the per-edge
# 'similarity' attribute is passed as the key (presumably the edge weight — confirm).
graph_embeddings = node2vec(data.df['id'], analysis_graph, key='similarity')

In [None]:
from pysrc.papers.analysis.clustering import cluster_and_sort

# Cluster the graph embeddings and keep both results on the analysis data.
# NOTE(review): the second argument (10) is presumably the number of
# clusters/topics — confirm against cluster_and_sort.
data.clusters, data.dendrogram = cluster_and_sort(graph_embeddings, 10)
# Store the cluster assignment in the 'comp' column of the papers table.
data.df['comp'] = data.clusters

In [None]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from pysrc.config import PCA_VARIANCE

# Standardize the graph embeddings, then reduce them with PCA
# (n_components=PCA_VARIANCE; presumably a variance fraction — confirm).
scaled_embeddings = StandardScaler().fit_transform(graph_embeddings)
pca = PCA(n_components=PCA_VARIANCE, svd_solver="full")
pca_coords = pca.fit_transform(scaled_embeddings)

# Project the PCA output to 2-D with t-SNE. Perplexity is capped at
# len(data.df) - 1 so that it stays below the number of papers.
logger.debug('Apply transformation')
tsne = TSNE(
    n_components=2,
    random_state=42,
    metric="cosine",
    perplexity=min(30, len(data.df) - 1),
)
coords = tsne.fit_transform(pca_coords)

# Store the two t-SNE coordinates as the 'x' and 'y' columns of the papers table.
data.df['x'], data.df['y'] = coords[:, 0], coords[:, 1]

In [None]:
from pysrc.papers.plot.plotter import Plotter
from bokeh.io import show

# Plotter over the configuration and the re-analyzed data; the following
# cells call its plot methods.
plotter = Plotter(config, data)

In [None]:
# Display the figure returned by plotter.plot_papers_graph().
show(plotter.plot_papers_graph())

In [None]:
# Display the figure returned by plotter.topics_hierarchy_with_keywords().
show(plotter.topics_hierarchy_with_keywords())

In [None]:
# Display the figure returned by plotter.plot_papers_by_year().
show(plotter.plot_papers_by_year())

In [None]:
# Display the figure returned by plotter.plot_top_cited_papers().
show(plotter.plot_top_cited_papers())

In [None]:
# Display the figure returned by plotter.plot_most_cited_per_year_papers().
show(plotter.plot_most_cited_per_year_papers())

In [None]:
# Display the figure returned by plotter.plot_fastest_growth_per_year_papers().
show(plotter.plot_fastest_growth_per_year_papers())

In [None]:
from pysrc.papers.analysis.text import get_frequent_tokens
from itertools import chain

# plotter.data.corpus is nested two levels deep; flatten it lazily into one
# stream of items and pass that to get_frequent_tokens.
freq_kwds = get_frequent_tokens(
    chain.from_iterable(chain.from_iterable(plotter.data.corpus))
)
# Display the keyword-frequency figure built from the frequent tokens.
show(plotter.plot_keywords_frequencies(freq_kwds))