# Download new lab publication information from PubMed
We use this notebook to periodically search for, and download information about, new papers by either Dr. Laird or Dr. Sutherland. This notebook uses Biopython's PubMed search tool (`Bio.Entrez`) to grab information from PubMed based on search criteria. Then, we build a publication-specific Markdown file for each new paper. Most elements of the file are set up automatically. The only thing you generally have to check is that the journal cover image that the Markdown file automatically points to exists. If the image doesn't exist, search online for a good one, export it to PNG, and reduce its size to ~150px by 300px.

You might also want to check new papers for relevant info, like a link to a GitHub repository or OpenNeuro collection, that might be found in the text.

Unfortunately, this notebook cannot find new preprints, so the associated website files must be created manually. Once a preprint is published, we also have to merge its file by hand with the version grabbed from PubMed.

## Steps

1. Run this notebook.
2. If any new papers were grabbed, check the following:
    1. The journal image exists.
    2. The paper has either of the lab PIs as an author. Ensure that it isn't by *another* AR Laird or MT Sutherland.
    3. The paper is not a duplicate of a preprint or another version of the paper. If it is a duplicate, merge the two versions.
3. Save the changes to the notebook.
4. Push changes to the notebook and affected files to GitHub.
5. Open a pull request to NBCLab/NBCLab.github.io.

In [1]:
import re
from glob import glob

from Bio import Entrez
from Bio import Medline
from datetime import datetime
from dateutil import parser
import pandas as pd
import ast

In [2]:
# Only grab papers from after the lab PIs came to FIU.
pi_author_names = ['Laird AR', 'Sutherland MT']
date_filter = '("2012/01/01"[PDAT] : "3000/12/31"[PDAT])'

# One PubMed query per PI: author match restricted to the date window above.
searches = ['"{0}"[AUTH] AND {1}'.format(author_name, date_filter)
            for author_name in pi_author_names]

# Contact e-mail that Biopython's Entrez module attaches to its NCBI requests.
Entrez.email = 'tsalo006@fiu.edu'

In [3]:
# Publication types that get a page on the website. Compared against the
# lower-cased entries of each record's 'PT' field.
acceptable_formats = ['journal article', 'comparative study', 'editorial',
                      'introductory journal article']

# PubMed's 'DP' field often lacks a day (or a month). dateutil fills missing
# fields from `default`, which would otherwise be today's date and make the
# generated file names depend on when the notebook is run. Pin the gaps to
# the 1st (of January) instead.
DEFAULT_PUB_DATE = datetime(2000, 1, 1)

rows = []
# PMIDs collected so far. A paper co-authored by both PIs matches both
# searches and must only be recorded once.
seen_pmids = set()

for TERM in searches:
    # The first query is only used for its hit count, so that the second
    # query can request every matching ID.
    h = Entrez.esearch(db='pubmed', retmax='2', term=TERM)
    try:
        result = Entrez.read(h)
    finally:
        h.close()
    print('Total number of publications containing {0}: {1}'.format(TERM, result['Count']))

    h_all = Entrez.esearch(db='pubmed', term=TERM, retmax=result['Count'])
    try:
        result_all = Entrez.read(h_all)
    finally:
        h_all.close()
    ids_all = result_all['IdList']
    if not ids_all:
        # Nothing matched this search, so there is nothing to fetch.
        continue

    h = Entrez.efetch(db='pubmed', id=ids_all, rettype='medline', retmode='text')
    try:
        # The records are read from the handle while iterating, so the
        # handle is only closed once the loop is done.
        records = Medline.parse(h)
        for record in records:
            # Records without a 'PT' field are treated as having no
            # acceptable publication type.
            pub_types = [type_.lower() for type_ in record.get('PT', [])]
            if not any(type_ in acceptable_formats for type_ in pub_types):
                continue

            pmid = record.get('PMID')
            if pmid in seen_pmids:
                # Already collected through an earlier search.
                continue
            seen_pmids.add(pmid)

            pmcid = record.get('PMC', '')

            # Article IDs look like '<value> [doi]'; keep the first DOI, if any.
            doi = next((aid.replace(' [doi]', '') for aid in record.get('AID', [])
                        if aid.endswith(' [doi]')), '')

            title = record.get('TI', '').rstrip('.')
            authors = record.get('AU', [])

            pub_date = parser.parse(record.get('DP'), default=DEFAULT_PUB_DATE)
            year = pub_date.year
            month = pub_date.month
            day = pub_date.day

            journal = record.get('TA')
            volume = record.get('VI', '')
            issue = record.get('IP', '')
            pages = record.get('PG', '')

            abstract = record.get('AB', '')

            row = [pmid, pmcid, doi, title, authors, year, month,
                   day, journal, volume, issue, pages, abstract]
            rows.append(row)
    finally:
        h.close()

# Save all relevant info from articles to a csv.
# The csv is written before missing values are replaced with empty strings.
df = pd.DataFrame(columns=['pmid', 'pmcid', 'doi', 'title', 'authors',
                           'year', 'month', 'day',
                           'journal', 'volume', 'issue', 'pages',
                           'abstract'],
                  data=rows)
df = df.sort_values(by=['pmid'])
df.to_csv('articles.csv', index=False)
df = df.fillna('')

Total number of publications containing "Laird AR"[AUTH] AND ("2012/01/01"[PDAT] : "3000/12/31"[PDAT]): 114
Total number of publications containing "Sutherland MT"[AUTH] AND ("2012/01/01"[PDAT] : "3000/12/31"[PDAT]): 37


In [4]:
# Grab our markdown file template.
# The encoding is pinned so the template is decoded the same way on every
# platform instead of with the locale's default encoding.
with open('papers/_posts/template_with_stuff.md', 'r', encoding='utf-8') as fo:
    template = fo.read()

In [5]:
old_papers = sorted(glob('papers/_posts/20*.md'))

# One paper is by another MT Sutherland.
# Something to do with mouse teeth.
skip_pmids = ['28650075']

# Add papers we already have pages for.
# Start from a copy so that appending below does not also grow skip_pmids.
old_pmids = list(skip_pmids)
for pap in old_papers:
    # Grab each existing article's PMID.
    # The encoding is pinned so pages containing non-ASCII text are read the
    # same way on every platform.
    with open(pap, 'r', encoding='utf-8') as fo:
        pmid_lines = [line for line in fo if line.startswith('pmid:')]
    if not pmid_lines:
        # Page without a 'pmid:' field; there is nothing to record for it.
        continue
    pmid = pmid_lines[0].replace('pmid:', '').strip()
    # Pages with an empty 'pmid:' field are left out of the list.
    if pmid:
        old_pmids.append(pmid)
print("{} articles found.".format(len(old_papers)))
# Note: this count also includes the entries of skip_pmids.
print("{} articles with PubMed IDs found.".format(len(old_pmids)))

133 articles found.
127 articles with PubMed IDs found.


In [6]:
# Optional sanity check: list every distinct journal abbreviation, lower-cased.
# Nothing else in the notebook needs this cell.
journal_names = df['journal'].str.lower()
journals = journal_names.unique()
print(journals)

['hum brain mapp' 'neuroinformatics' 'brain struct funct' 'neurol res int'
 'neuroimage' 'cogn affect behav neurosci' 'plos one' 'cereb cortex'
 'neurodegener dis' 'front neuroinform'
 'j am acad child adolesc psychiatry' 'psychopharmacology (berl)'
 'biol psychiatry' 'j pain' 'front hum neurosci' 'j neurosci'
 'obesity (silver spring)' 'neuroimage clin' 'front aging neurosci'
 'front neurosci' 'addict biol' 'brain lang' 'cortex' 'annu rev neurosci'
 'biol psychol' 'j biomed semantics' 'jama psychiatry' 'j addict'
 'neuropsychopharmacology' 'dev sci' 'neurosci biobehav rev'
 'behav brain funct' 'front neuroendocrinol' 'j bone miner res'
 'front behav neurosci' 'j psychopharmacol' 'trends mol med'
 'dev cogn neurosci' 'mol psychiatry' 'netw neurosci' 'j sex med'
 'sleep med rev' 'front ict' 'nat hum behav' 'exp clin psychopharmacol'
 'sci adv' 'npj sci learn' 'front neurol' 'drug alcohol depend'
 'biol psychiatry cogn neurosci neuroimaging' 'nature' 'am j psychiatry'
 'res synth methods

In [7]:
# Create files for new articles
# PMIDs that already have a page, plus the ones written during this run.
known_pmids = set(old_pmids)
for _, row in df.iterrows():
    pmid = str(row['pmid'])
    if pmid in known_pmids:
        continue
    # Remember the PMID so that a second row for the same paper does not
    # rewrite the file and report it twice.
    known_pmids.add(pmid)

    # 'authors' is already a list of strings here, so it needs no parsing.
    authors = row['authors']
    if not authors:
        # The file name is built from the first author, so a record without
        # authors cannot be turned into a page automatically.
        print('Skipping {0}: no authors listed'.format(pmid))
        continue

    # File name: <year>-<month>-<day>-<first token of first author>-<first three title words>
    nick = [re.sub(r'\W+', '', w) for w in row['title'].lower().split(' ')[:3]]
    nickname = '{0}-{1}-{2}-{3}-{4}'.format(row['year'],
                                            '{0:02d}'.format(int(row['month'])),
                                            '{0:02d}'.format(int(row['day'])),
                                            authors[0].split(' ')[0].lower(),
                                            '-'.join(nick))
    nickname = nickname.replace(':', '')
    journal = row['journal']
    # Journal cover image path derived from the journal abbreviation; the
    # image itself is not checked for existence here.
    image = '/assets/images/papers/{0}.png'.format('-'.join(journal.lower().split(' ')))
    # Double quotes in the title are swapped for single quotes before the
    # title is inserted into the template.
    title = row['title'].replace('"', "'")
    completed = template.format(title=title, nickname=nickname,
                                authors=', '.join(authors), year=int(row['year']),
                                journal=journal, volume=row['volume'],
                                image=image,
                                issue=row['issue'], pages=row['pages'],
                                pmcid=row['pmcid'], doi=row['doi'], pmid=row['pmid'],
                                abstract=row['abstract'])
    # Abstracts and author names can contain non-ASCII characters, so the
    # encoding is pinned instead of relying on the locale's default.
    with open('papers/_posts/{0}.md'.format(nickname), 'w', encoding='utf-8') as fo:
        fo.write(completed)

    print('New file created for {0}'.format(pmid))

New file created for 33075692
