# Dump publications by year

In [7]:
import logging
import pandas as pd
from tqdm.auto import tqdm
import os
import psycopg2

# Notebook-wide logging: records at INFO level and above, each one prefixed
# with its timestamp and level name.
logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.INFO,
)
# Named logger for use by the notebook's cells.
logger = logging.getLogger('notebook')

# IPython magic: draw matplotlib figures inline, in the cell output.
%matplotlib inline
# IPython magic: have the inline backend emit figures in 'retina' format.
%config InlineBackend.figure_format='retina'

In [8]:
# PostgreSQL connection settings used by the cells below.
# Each value can be overridden through the matching standard libpq
# environment variable (PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD);
# the literals are only fallbacks, so real credentials do not have to be
# stored in the notebook itself.
postgres_host = os.environ.get('PGHOST', 'localhost')
postgres_port = int(os.environ.get('PGPORT', 5432))  # always an int, as before
postgres_database = os.environ.get('PGDATABASE', 'pubtrends')
postgres_username = os.environ.get('PGUSER', 'biolabs')
postgres_password = os.environ.get('PGPASSWORD', 'mysecretpassword')

In [9]:
# Dump id, title and abstract of the publications of each year, from 2025
# down to 1970, into one gzip-compressed TSV file per year. A year whose file
# already exists is skipped, so an interrupted run can simply be restarted.
dump_dir = os.path.expanduser('~/pubtrends_dump_tsv')
os.makedirs(dump_dir, exist_ok=True)  # same effect as `mkdir -p`, without a shell

# Built once, before the loop: the value does not depend on the year, and
# `connection_string` is defined even when every year is skipped.
connection_string = (
    f'host={postgres_host} '
    f'port={postgres_port} '
    f'dbname={postgres_database} '
    f'user={postgres_username} '
    f'password={postgres_password}'
)

# The year is bound as a query parameter instead of being formatted into SQL.
query = '''
        SELECT P.pmid as id, title, abstract
        FROM PMPublications P
        WHERE year = %s;
        '''

for year in tqdm(range(2025, 1969, -1)):
    print(f'Processing year {year}')
    filename = os.path.join(dump_dir, f'{year}.tsv.gz')
    if os.path.exists(filename):
        continue

    # psycopg2's `with connection:` only ends the transaction; the connection
    # itself has to be closed explicitly, otherwise one leaks per year.
    connection = psycopg2.connect(connection_string)
    try:
        connection.set_session(readonly=True)
        with connection, connection.cursor() as cursor:
            cursor.execute(query, (year,))
            rows = cursor.fetchall()
    finally:
        connection.close()

    df = pd.DataFrame(rows,
                      columns=['id', 'title', 'abstract'],
                      dtype=object)
    print(len(df))

    # Write under a temporary name and rename afterwards, so that a crash in
    # the middle of writing never leaves a truncated file that the
    # `os.path.exists` check above would mistake for a finished dump.
    partial_filename = filename + '.part'
    df.to_csv(partial_filename, index=False, sep='\t', compression='gzip')
    os.replace(partial_filename, filename)

  0%|          | 0/56 [00:00<?, ?it/s]

Processing year 2025
Processing year 2024
Processing year 2023
Processing year 2022
Processing year 2021
Processing year 2020
Processing year 2019
Processing year 2018
Processing year 2017
Processing year 2016
Processing year 2015
Processing year 2014
Processing year 2013
Processing year 2012
Processing year 2011
Processing year 2010
Processing year 2009
Processing year 2008
Processing year 2007
Processing year 2006
Processing year 2005
Processing year 2004
Processing year 2003
Processing year 2002
Processing year 2001
Processing year 2000
Processing year 1999
Processing year 1998
Processing year 1997
Processing year 1996
Processing year 1995
Processing year 1994
Processing year 1993
Processing year 1992
Processing year 1991
Processing year 1990
Processing year 1989
Processing year 1988
Processing year 1987
Processing year 1986
Processing year 1985
Processing year 1984
Processing year 1983
Processing year 1982
Processing year 1981
Processing year 1980
Processing year 1979
Processing ye

# Most cited Nature review papers (2015 onwards)

In [13]:
# Find the 100 most cited Nature review papers published in 2015 or later and
# save them, together with their citation counts, as a TSV file on the Desktop.
# Papers without a row in matview_pmcitations get a NULL count and sort last.
# Relies on `connection_string` assigned earlier in the notebook.
# NOTE(review): the CTE's `ORDER BY random() LIMIT 1000000` looks like a
# leftover from a sampling query - the outer query re-sorts the rows anyway.
# Confirm before removing; the SQL is intentionally left untouched here.
query = '''
    WITH X AS
         (SELECT pmid as pmid, title, abstract, year
          FROM PMPublications P
          WHERE type = 'Review' AND
                aux -> 'journal' @> '{"name": "Nature"}' AND
                year >= 2015
          ORDER BY random()
          LIMIT 1000000)
    SELECT X.pmid as pmid, X.year, count, X.title, X.abstract
    FROM X
         LEFT JOIN matview_pmcitations C
                   ON X.pmid = C.pmid
    ORDER BY count DESC NULLS LAST, X.pmid
    LIMIT 100;
                '''

# psycopg2's `with connection:` only ends the transaction; close the
# connection explicitly so it is not leaked.
connection = psycopg2.connect(connection_string)
try:
    connection.set_session(readonly=True)
    with connection, connection.cursor() as cursor:
        cursor.execute(query)
        rows = cursor.fetchall()
finally:
    connection.close()

# Column names follow the SELECT order: pmid, year, count, title, abstract.
df = pd.DataFrame(rows,
                  columns=['id', 'year', 'cited', 'title', 'abstract'],
                  dtype=object)
df.to_csv(os.path.expanduser('~/Desktop/most_cited_nature_review_papers_after_2015.tsv'), index=False, sep='\t')