# Pubtrends-analysis

Notebook with default analysis

# Config

In [None]:
import logging
from collections import Counter

import pandas as pd
import seaborn as sns
from bokeh.models import ColumnDataSource
from bokeh.plotting import show, figure, output_notebook
from matplotlib import pyplot as plt
from tqdm.auto import tqdm

from pysrc.config import *
from pysrc.papers.db.pm_postgres_loader import PubmedPostgresLoader
from pysrc.papers.utils import SORT_MOST_CITED

# Search defaults.
# NOTE(review): neither constant is referenced by the visible cells below
# (the commented-out regular search passes literal values) — confirm they are still needed.
SEARCH_SORT = SORT_MOST_CITED
SEARCH_PAPERS = 10_000

# DEBUG-level logging with timestamps; the 'notebook' logger is used by helper functions in later cells.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s: %(message)s')
logger = logging.getLogger('notebook')

# Avoid info message about compilation flags
# tf.get_logger().setLevel('ERROR')

# Configure bokeh so that show() renders figures inline in the notebook.
output_notebook()

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Papers lookup

In [None]:
from pysrc.papers.analyzer import PapersAnalyzer

# Objects shared by the whole notebook: configuration, Pubmed Postgres loader and the analyzer built on them.
# NOTE(review): test=False presumably selects the non-test configuration — confirm in PubtrendsConfig.
config = PubtrendsConfig(test=False)
loader = PubmedPostgresLoader(config)
analyzer = PapersAnalyzer(loader, config)

## By titles

In [None]:
# titles = ['Title1', 'Title2']

In [None]:
# import re
# from pysrc.papers.db.postgres_utils import preprocess_quotes, preprocess_search_query_for_postgres
# from pysrc.papers.utils import SORT_MOST_RECENT
#
# pmids = []
# for title in tqdm(titles):
#     paperids = loader.search_key_value('title', title)
#     if paperids:
#         pmids.extend(paperids)
#     else:
#         print(f'NOT FOUND: {title}')
#
# print('Found papers', len(pmids), 'of', len(titles))

## By DOI

In [None]:
# from pysrc.papers.utils import cut_authors_list, crc32, \
#     preprocess_doi, preprocess_search_title, rgb2hex
# dois = ['DOI1', 'DOI2']
# dois = [preprocess_doi(d) for d in dois]
# pmids = []
# for doi in tqdm(dois):
#    paperids = loader.find('doi', doi)
#    if paperids:
#        pmids.extend(paperids)
#    else:
#        print(doi)

## With Pubmed syntax

In [None]:
# import os
# from Bio import Entrez
# Entrez.email = 'os@jetbrains.com'
# QUERY = '((Aging) NOT (Review[Publication Type])) AND (("2015"[Date - Publication] : "2025"[Date - Publication]))'
# handle = Entrez.esearch(db='pubmed', retmax='1000', retmode='xml', term=QUERY)
# pmids = Entrez.read(handle)['IdList']
# print(f'Found {len(pmids)} papers')

## By Ids

In [None]:
import os

# Load the paper ids to analyze from a tab-separated file with a 'pmid' column.
path_data = os.path.expanduser('~/Downloads/immunology_paper_ids_fixed.tsv')
df = pd.read_csv(path_data, sep='\t')
# Keep only rows that actually have a PMID (notna() instead of comparing isna() with False).
df = df[df['pmid'].notna()].copy()
# Cast to int so the ids are plain integers (the column may have been read as float
# when some values were missing).
df['pmid'] = df['pmid'].astype(int)
pmids = df['pmid'].tolist()
# Displayed as the cell output: number of ids loaded.
len(pmids)

## Regular search

In [None]:
# try:
#     pmids = analyzer.search_terms('Human Immune Aging', 1000, SORT_MOST_CITED)
# finally:
#     loader.close_connection()
#     analyzer.teardown()

# Analysis

In [None]:
# Run the analysis for the selected ids and release resources afterwards.
# NOTE(review): the meaning of the second positional argument (100) is not visible
# in this notebook — confirm against PapersAnalyzer.analyze_papers.
try:
    # Use cache to be able to load all the intermediate results
    analyzer.analyze_papers(pmids, 100, cache=True)
finally:
    # Executed even when the analysis raises; later cells still read results from `analyzer`.
    loader.close_connection()
    analyzer.teardown()

# Result

In [None]:
# Directory that receives every exported artifact (CSV, HTML pages, keywords).
OUTPUT = os.path.expanduser('~/Result')
# Create the directory with the standard library rather than a shell `mkdir -p`:
# this works on any platform and is safe for paths that contain spaces.
os.makedirs(OUTPUT, exist_ok=True)

In [None]:
# Export the results table as gzip-compressed CSV, leaving out every column
# whose label is a plain int.
export_columns = [label for label in analyzer.df.columns if type(label) != int]
analyzer.df[export_columns].to_csv(
    f'{OUTPUT}/result.csv.gz',
    index=None,
    compression='gzip',
)

# Standard plots

In [None]:
from pysrc.papers.plot.plotter import Plotter
# Record the analyzed ids on the analyzer before building the plotter.
# NOTE(review): presumably Plotter reads analyzer.search_ids — confirm.
analyzer.search_ids = pmids
plotter = Plotter(config, analyzer)

In [None]:
# Number of rows in the analysis results dataframe.
print(len(analyzer.df))

In [None]:
# Each of the following cells displays one figure produced by the plotter.
show(plotter.plot_papers_by_year())

In [None]:
show(plotter.plot_top_cited_papers())

In [None]:
show(plotter.plot_most_cited_per_year_papers())

In [None]:
show(plotter.plot_fastest_growth_per_year_papers())

In [None]:
from pysrc.papers.analysis.text import get_frequent_tokens
from itertools import chain

# The corpus is flattened by two nesting levels into a single token stream
# before the frequent tokens are computed and plotted.
corpus_tokens = chain.from_iterable(chain.from_iterable(plotter.data.corpus))
freq_kwds = get_frequent_tokens(corpus_tokens)
keywords_figure = plotter.plot_keywords_frequencies(freq_kwds)
show(keywords_figure)

In [None]:
# Non-interactive overview of the papers graph with keywords, boundaries and centroids switched off.
show(plotter.plot_papers_graph(keywords=False, boundaries=False, centroids=False, interactive=False, scale=0.2))

# Save graphs to html

In [None]:
# Jinja2 template of a standalone page for the papers graph: it loads jQuery, jQuery UI and
# BokehJS from CDNs and emits every (script, div) pair of `papers_graph` unescaped.
# NOTE(review): BokehJS is pinned to 3.6.3 here — confirm it matches the installed bokeh version.
graph_html = """
<head>
    <script src="https://cdn.jsdelivr.net/npm/jquery@3.5.1/dist/jquery.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/jquery-ui@1.14.1/dist/jquery-ui.min.js"></script>
    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-3.6.3.min.js"></script>
    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.6.3.min.js"></script>
</head>
<body>
<div id="network">
    {% if papers_graph is defined %}
    {% for script, div in papers_graph %}
    {{ script|safe }}
    {{ div|safe }}
    {% endfor %}
    {% endif %}
</div>
</body>
"""

In [None]:
from pysrc.papers.analysis.graph import sparse_graph
from pysrc.papers.plot.plotter import Plotter, components_list
import jinja2
import os


# Export the whole papers graph as a standalone interactive HTML page.
print('Prepare sparse graph to visualize with reduced number of edges')
visualize_graph = sparse_graph(plotter.data.papers_graph, VISUALIZATION_GRAPH_EDGES)

print('Plotting graph')
# The plotting routine is called on the class with None in place of the instance.
papers_graph = Plotter._plot_papers_graph(
    None, 'Pubmed', visualize_graph, plotter.data.df, TOPIC_DESCRIPTION_WORDS,
    topics_tags=plotter.topics_description,
    width=1500, height=1000,
    keywords=False, boundaries=False, centroids=False, interactive=True, callbacks=True, scale=0.3
)

path = os.path.expanduser(f'{OUTPUT}/graph.html')

html = jinja2.Environment(loader=jinja2.BaseLoader()).from_string(graph_html).render(
    papers_graph=components_list(papers_graph)
)

# Write with an explicit UTF-8 encoding so the page does not depend on the platform
# default (the keywords export in this notebook is also written as UTF-8).
with open(path, 'w', encoding='utf-8') as f:
    f.write(html)
print(f'Saved to {path}')

In [None]:
import networkx as nx

def topic_graph(graph, df, comp):
    """Build the part of `graph` that belongs to a single component.

    :param graph: graph to filter; only `nodes()` and `edges(data=True)` are used
    :param df: dataframe with 'id' and 'comp' columns
    :param comp: value of df['comp'] selecting the component
    :return: a new nx.Graph holding the nodes whose id is in the component and the
        edges (with their data) whose both ends are in the component;
        node attributes are not copied
    """
    comp_ids = set(df[df['comp'] == comp]['id'])
    logger.debug('Filtering graph to component %s', comp)
    result = nx.Graph()
    # Nodes are added explicitly so that isolated papers of the component are kept.
    for n in graph.nodes():
        if n in comp_ids:
            result.add_node(n)
    for (s, e, d) in graph.edges(data=True):
        if s in comp_ids and e in comp_ids:
            result.add_edge(s, e, **d)
    logger.debug(
        'Filtered graph nodes=%s edges=%s',
        result.number_of_nodes(), result.number_of_edges()
    )
    return result

In [None]:
# Export one interactive graph page per component.
# The template does not change between iterations, so it is compiled once.
graph_template = jinja2.Environment(loader=jinja2.BaseLoader()).from_string(graph_html)

for comp in sorted(analyzer.df['comp'].unique()):
    # File names use component number + 1.
    path = os.path.expanduser(f'{OUTPUT}/graph_{comp + 1}.html')
    graph_comp = topic_graph(plotter.data.papers_graph, plotter.data.df, comp)
    visualize_graph = sparse_graph(graph_comp, VISUALIZATION_GRAPH_EDGES)
    # The plotting routine is called on the class with None in place of the instance;
    # only the rows of the current component are passed as the dataframe.
    papers_graph = Plotter._plot_papers_graph(
        None, 'Pubmed', visualize_graph, plotter.data.df[plotter.data.df['comp'] == comp],
        TOPIC_DESCRIPTION_WORDS,
        topics_tags=plotter.topics_description,
        width=1500, height=1000,
        keywords=False, boundaries=False, centroids=False, interactive=True, callbacks=True, scale=0.3
    )
    html = graph_template.render(papers_graph=components_list(papers_graph))
    # Explicit UTF-8 keeps the output independent of the platform default encoding.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(html)
    print(f'Saved to {path}')

# Keywords to file

In [None]:
import gzip
import json

# Dump plotter.topics_description as gzip-compressed UTF-8 JSON into the output directory.
path = os.path.expanduser(f'{OUTPUT}/keywords.json.gz')
with gzip.open(path, 'wt', encoding='utf-8') as f:
    json.dump(plotter.topics_description, f)
print(f'Saved to {path}')

# Prepare topics html

In [None]:
import base64
from io import BytesIO
from bokeh.embed import components

def wordcloud_data_uri(wc) -> str:
    """Encode a word cloud as an inline PNG data URI.

    :param wc: object whose `to_image()` returns an image that supports
        `save(file, format=...)`
    :return: string of the form 'data:image/png;base64,<payload>'
    """
    with BytesIO() as png_buffer:
        # Render the image into memory instead of a file on disk.
        wc.to_image().save(png_buffer, format="PNG")
        payload = base64.b64encode(png_buffer.getvalue())
    return "data:image/png;base64," + payload.decode("ascii")

# For every (plot, word cloud) pair from the plotter keep the embeddable bokeh
# components of the plot together with the word cloud encoded as a data URI.
topic_figures = plotter.plot_topics_info_and_word_cloud()
wctis_to_plot = [
    (components(topic_plot), wordcloud_data_uri(topic_cloud))
    for topic_plot, topic_cloud in topic_figures
]

In [None]:
# Jinja2 template of the topics page: for every ((script, div), word_cloud) item it renders
# a numbered heading, the paper count taken from `component_sizes`, the word cloud image
# and the embedded plot.
# NOTE(review): BokehJS is pinned to 3.6.3 here — confirm it matches the installed bokeh version.
topics_html = """
<head>
    <script src="https://cdn.jsdelivr.net/npm/jquery@3.5.1/dist/jquery.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/jquery-ui@1.14.1/dist/jquery-ui.min.js"></script>
    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-3.6.3.min.js"></script>
    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.6.3.min.js"></script>
</head>
<body>
    <div>
    {% for (script, div), word_cloud in topics_info_and_word_cloud %}
        <div>
            <h1>
                Topic {{ loop.index }}
            </h1> 
            Number of papers: {{ component_sizes[loop.index - 1] }}
            <div style="display: flex; align-items: flex-start; gap: 16px; width: 100%;">
            <div style="flex: 0 0 auto;">
                    <img src="{{ word_cloud }}" alt="Wordcloud">
            </div>

            <div style="flex: 1 1 auto; min-width: 0;">
                <small><strong>Publications</strong></small>
                {{ script|safe }}
                {{ div|safe }}
            </div>
        </div>
    </div>
    {% endfor %}
    </div>
</body>
"""

In [None]:
from pysrc.papers.plot.plot_preprocessor import PlotPreprocessor

# Render the topics page from the prepared plots/word clouds and the component sizes.
path = os.path.expanduser(f'{OUTPUT}/topics.html')

html = jinja2.Environment(loader=jinja2.BaseLoader()).from_string(topics_html).render(
    component_sizes=PlotPreprocessor.component_sizes(analyzer.df),
    topics_info_and_word_cloud=wctis_to_plot
)

# Explicit UTF-8 keeps the output independent of the platform default encoding.
with open(path, 'w', encoding='utf-8') as f:
    f.write(html)
print(f'Saved to {path}')

# Paper tables

In [None]:
# Jinja2 template of a page with a searchable DataTables table of papers.
# Expects `papers` as an iterable of 9-item rows
# (id, title, authors, url, journal, year, total, doi, topic) and `source` for the link title.
# The <head> element is closed explicitly before <body>.
topic_papers_html = """
<head>
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.css">
    <link rel="stylesheet" href="/static/style.css?v=3">
    
    <!-- Keep DataTables using Bootstrap 4 skin to avoid regressions -->
    <link href="https://cdn.datatables.net/2.3.3/css/dataTables.bootstrap4.min.css" rel="stylesheet" type="text/css">
    <link rel="stylesheet" href="/static/feedback.css?v=1">

    <script src="https://cdn.jsdelivr.net/npm/jquery@3.5.1/dist/jquery.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/jquery-ui@1.14.1/dist/jquery-ui.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.bundle.min.js"></script>
    
    <!-- DataTables with search (deferred by base) -->
    <script src="https://cdn.datatables.net/2.3.3/js/dataTables.min.js"></script>
    <script src="https://cdn.datatables.net/2.3.3/js/dataTables.bootstrap4.min.js"></script>
    <script src="https://cdn.jsdelivr.net/g/mark.js(jquery.mark.min.js)"></script>
    <script src="https://cdn.datatables.net/plug-ins/2.3.3/features/mark.js/datatables.mark.js"></script>
    <script>
    "use strict";
    document.addEventListener('DOMContentLoaded', function () {
        // Mark search by default in datatables
        if ($ && $.fn && $.fn.dataTable) {
            $.extend(true, $.fn.dataTable.defaults, { mark: true });
        }
        // Initialize DataTable and ensure it uses full width
        if ($ && $.fn && $.fn.DataTable) {
            window.papers_table = $('#papers').DataTable({
                autoWidth: false
            });
            // Recalculate column widths now that the table is visible and has 100% width
            window.papers_table.columns.adjust();
        }
    });
    </script>
</head>
<body>
<div class="table-responsive">
    <table id="papers" class="table table-sm table-bordered table-striped w-100" width="100%" aria-describedby="papers table">
        <thead>
            <tr>
                <th scope="col">#</th>
                <th scope="col">Title</th>
                <th scope="col">Authors</th>
                <th scope="col">Journal</th>
                <th scope="col">Year</th>
                <th scope="col">Id</th>
                <th scope="col">Doi</th>
                <th scope="col">Cited</th>
                <th scope="col">Topic</th>
            </tr>
        </thead>
        <tbody>
        {% for id, title, authors, url, journal, year, total, doi, topic in papers %}
            <tr>
                <th scope="row">{{ loop.index }}</th>
                <td>{{ title }}</td>
                <td>{{ authors }}</td>
                <td>{{ journal }}</td>
                <td>{{ year }}</td>
                <td><a class="link" href="{{ url }}" title="Open in {{ source }}">{{ id }}</a></td>
                <td>
                    {% if doi != "" %}
                    <a class="link" href="https://doi.org/{{ doi }}">{{ doi }}</a>
                    {% endif %}
                </td>
                <td>{{ total }}</td>
                <td>{{ topic }}</td>
            </tr>
        {% endfor %}
        </tbody>
    </table>
</div>
</body>
"""

In [None]:
from pysrc.papers.plot.plot_app import prepare_papers_data

# Export one page with the table of papers per component.
# The template does not change between iterations, so it is compiled once.
papers_template = jinja2.Environment(loader=jinja2.BaseLoader()).from_string(topic_papers_html)

for comp in sorted(analyzer.df['comp'].unique()):
    # File names use component number + 1.
    path = os.path.expanduser(f'{OUTPUT}/papers_{comp + 1}.html')

    html = papers_template.render(
        # The template renders "Open in {{ source }}" as the link title; without this
        # value it would come out as "Open in ". 'Pubmed' matches the source name
        # passed to the graph plots in this notebook.
        source='Pubmed',
        papers=prepare_papers_data(plotter.data, comp, None, None, None, None)
    )
    # Explicit UTF-8 keeps the output independent of the platform default encoding.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(html)
    print(f'Saved to {path}')