# Pubtrends-analysis

Notebook with the default Pubtrends analysis: look up papers, analyze them, and plot the results.

# Config

In [None]:
import logging
from collections import Counter

import pandas as pd
import seaborn as sns
from bokeh.models import ColumnDataSource
from bokeh.plotting import show, figure, output_notebook
from matplotlib import pyplot as plt
from tqdm.auto import tqdm

from pysrc.config import PubtrendsConfig
from pysrc.papers.db.pm_postgres_loader import PubmedPostgresLoader
from pysrc.papers.utils import SORT_MOST_CITED

# Search defaults for this notebook: result ordering and the paper cap.
SEARCH_PAPERS = 10_000
SEARCH_SORT = SORT_MOST_CITED

# Root logging at DEBUG level with timestamped records; `logger` is the
# notebook's own named logger.
logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger('notebook')

# Avoid info message about compilation flags
# tf.get_logger().setLevel('ERROR')

# Send Bokeh `show()` output to the notebook cells.
output_notebook()

# IPython magics: these lines are only valid inside a Jupyter/IPython session.
# Draw matplotlib figures inline in the cell output.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figure output.
%config InlineBackend.figure_format='retina'

# Papers lookup

In [None]:
from pysrc.papers.analyzer import PapersAnalyzer

# Shared objects used by every later cell: the configuration, the Postgres
# loader built from it, and the analyzer built from both.
# NOTE(review): test=False presumably selects the real (non-test) settings —
# confirm against PubtrendsConfig.
config = PubtrendsConfig(test=False)
loader = PubmedPostgresLoader(config)
analyzer = PapersAnalyzer(loader, config)

## By titles

In [None]:
# titles = ['Title1', 'Title2']

In [None]:
# import re
# from pysrc.papers.db.postgres_utils import preprocess_quotes, preprocess_search_query_for_postgres
# from pysrc.papers.utils import SORT_MOST_RECENT
#
# pmids = []
# for title in tqdm(titles):
#     paperids = loader.search_key_value('title', title)
#     if paperids:
#         pmids.extend(paperids)
#     else:
#         print(f'NOT FOUND: {title}')
#
# print('Found papers', len(pmids), 'of', len(titles))

## By DOI

In [None]:
# from pysrc.papers.utils import cut_authors_list, crc32, \
#     preprocess_doi, preprocess_search_title, rgb2hex
# dois = ['doi1', 'doi2']  # placeholder: fill in the DOIs to look up
# dois = [preprocess_doi(d) for d in dois]
# pmids = []
# for doi in tqdm(dois):
#    paperids = loader.find('doi', doi)
#    if paperids:
#        pmids.extend(paperids)
#    else:
#        print(doi)

## With PubMed query syntax

In [None]:
# import os
# from Bio import Entrez
# Entrez.email = 'os@jetbrains.com'
# QUERY = '((Aging) NOT (Review[Publication Type])) AND (("2015"[Date - Publication] : "2025"[Date - Publication]))'
# handle = Entrez.esearch(db='pubmed', retmax='1000', retmode='xml', term=QUERY)
# pmids = Entrez.read(handle)['IdList']
# print(f'Found {len(pmids)} papers')

## By IDs

In [None]:
import os

# Tab-separated table of paper ids; only its `pmid` column is used here.
path_data = os.path.expanduser('~/Downloads/immunology_paper_ids_fixed.tsv')
df = pd.read_csv(path_data, sep='\t')

# Keep only rows that have a PubMed id. `notna()` is the direct form of the
# former `isna() == False` comparison and selects exactly the same rows.
df = df[df['pmid'].notna()].copy()

# With the missing values gone, the column can be cast to plain integers.
df['pmid'] = df['pmid'].astype(int)
pmids = df['pmid'].tolist()

# Last expression of the cell: the notebook displays the number of ids loaded.
len(pmids)

## Regular search

In [None]:
# try:
#     pmids = analyzer.search_terms('Human Immune Aging', 1000, SORT_MOST_CITED)
# finally:
#     loader.close_connection()
#     analyzer.teardown()

# Analysis

In [None]:
try:
    # Use cache to be able to load all the intermediate results
    analyzer.analyze_papers(pmids, 100, cache=True)
finally:
    # Release resources whether or not the analysis succeeded. The nested
    # try/finally ensures the analyzer is still torn down when closing the
    # loader connection itself raises.
    try:
        loader.close_connection()
    finally:
        analyzer.teardown()

In [None]:
from pysrc.papers.plot.plotter import Plotter
# Record the analysed ids on the analyzer before handing it to the plotter.
# NOTE(review): presumably Plotter reads `analyzer.search_ids` — confirm.
analyzer.search_ids = pmids
# `plotter` is the entry point for all figures in the cells below.
plotter = Plotter(config, analyzer)

In [None]:
# Build the papers-by-year figure, then render it in the cell output.
papers_by_year_fig = plotter.plot_papers_by_year()
show(papers_by_year_fig)

In [None]:
# Build the top-cited-papers figure, then render it in the cell output.
top_cited_fig = plotter.plot_top_cited_papers()
show(top_cited_fig)

In [None]:
# Build the most-cited-per-year figure, then render it in the cell output.
most_cited_per_year_fig = plotter.plot_most_cited_per_year_papers()
show(most_cited_per_year_fig)

In [None]:
# Build the fastest-growth-per-year figure, then render it in the cell output.
fastest_growth_fig = plotter.plot_fastest_growth_per_year_papers()
show(fastest_growth_fig)

In [None]:
from itertools import chain

from pysrc.papers.analysis.text import get_frequent_tokens

# Flatten the corpus by two nesting levels into a single stream of items.
# `chain.from_iterable` flattens lazily, whereas `chain(*...)` first unpacks
# the whole corpus into call arguments; the resulting sequence is identical.
corpus_tokens = chain.from_iterable(chain.from_iterable(plotter.data.corpus))
freq_kwds = get_frequent_tokens(corpus_tokens)

# Plot the frequencies computed above.
show(plotter.plot_keywords_frequencies(freq_kwds))

In [None]:
# Papers graph with the keywords, boundaries, centroids and interactive
# options all switched off, at scale 0.2.
papers_graph_fig = plotter.plot_papers_graph(
    keywords=False,
    boundaries=False,
    centroids=False,
    interactive=False,
    scale=0.2,
)
show(papers_graph_fig)