# Search PubMed

## Define Search Criteria

In [6]:
from Bio import Entrez
import numpy as np

def search(query, retmax=5000, reldate=7):
    """Run a PubMed ESearch for ``query`` and return the parsed result.

    Parameters
    ----------
    query : str
        PubMed search term (the same syntax accepted by PubMed Advanced Search).
    retmax : int or str, optional
        Maximum number of record IDs to request. Defaults to 5000, the value
        that was previously hard-coded.
    reldate : int, optional
        Only return records from within this many days from now. Defaults to 7.

    Returns
    -------
    The object produced by ``Entrez.read`` for the ESearch response; callers
    in this notebook read its ``'IdList'`` entry.
    """
    Entrez.email = 'your.email@example.com'
    # NOTE(review): confirm 'most recent' is still an accepted ESearch sort value.
    handle = Entrez.esearch(db='pubmed',
                            sort='most recent',
                            retmax=str(retmax),
                            retmode='xml',
                            reldate=reldate,
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying connection even when parsing fails.
        handle.close()

# Query used for this literature update. The string can be tried out first
# in PubMed Advanced Search.
search_results = search(
    '(Biomech*[Title/Abstract] '
    'OR locomot*[Title/Abstract])'
)

## Perform Search and Save Paper Titles

In [7]:


def fetch_details(id_list):
    """Fetch the full PubMed records for a list of PubMed IDs.

    Parameters
    ----------
    id_list : sequence of str
        PubMed IDs, e.g. the ``'IdList'`` entry of a search result.

    Returns
    -------
    The object produced by ``Entrez.read`` for the EFetch response; callers
    in this notebook iterate over its ``'PubmedArticle'`` entry. When
    ``id_list`` is empty no request is made and ``{'PubmedArticle': []}`` is
    returned, so downstream loops simply see zero papers.
    """
    if not id_list:
        # Nothing to fetch: avoid sending a request with an empty id parameter.
        return {'PubmedArticle': []}

    ids = ','.join(id_list)
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying connection even when parsing fails.
        handle.close()

# Download the full record for every ID returned by the search.
id_list = search_results['IdList']
papers = fetch_details(id_list)
print("")

# One slot per paper; the non-title lists are filled in by a later cell.
titles = [0] * len(papers['PubmedArticle'])
keywords = [''] * len(papers['PubmedArticle'])
authors = [''] * len(papers['PubmedArticle'])
links = [''] * len(papers['PubmedArticle'])
journals = [''] * len(papers['PubmedArticle'])

# Store each paper's title at the position matching its record.
for i, paper in enumerate(papers['PubmedArticle']):
    titles[i] = paper['MedlineCitation']['Article']['ArticleTitle']








## (Optional) Print Paper Info in GitHub Markdown Format

In [8]:
# Fill links / authors / journals / keywords (pre-allocated earlier) with the
# GitHub-markdown strings for every fetched paper.
for i, paper in enumerate(papers['PubmedArticle']):
    citation = paper['MedlineCitation']
    article = citation['Article']

    # Markdown bullet linking the title to the paper's PubMed page.
    links[i] = "* [%s](https://www.ncbi.nlm.nih.gov/pubmed/%s)" % (article['ArticleTitle'], citation['PMID'])

    # Authors are written as "LastName Initials," and joined with spaces.
    auths = []
    # .get(): a record without an AuthorList yields an empty author string
    # instead of aborting the whole loop with a KeyError.
    for auth in article.get('AuthorList', []):
        if 'LastName' in auth and 'Initials' in auth:
            auths.append(' '.join([auth['LastName'], auth['Initials'] + ',']))
        elif 'LastName' in auth:
            # Personal author listed without initials.
            auths.append(auth['LastName'] + ',')
        elif 'CollectiveName' in auth:
            # Group/consortium authors carry a CollectiveName rather than a LastName.
            auths.append(auth['CollectiveName'] + ',')
        else:
            # Still best-effort: report the unusable entry and keep going.
            print('NAME ERROR')
    authors[i] = ' '.join(auths)

    # Journal title in markdown italics.
    journals[i] = '*%s*' % (article['Journal']['Title'])

    # Keywords: space-join the first keyword list when the record has one;
    # otherwise the pre-allocated empty string is left in place.
    keyword_lists = citation.get('KeywordList', [])
    if keyword_lists:
        keywords[i] = ' '.join(kw[:] for kw in keyword_lists[0])

NAME ERROR


### Example:

* [Skeletal muscles of hibernating black bears show minimal atrophy and phenotype shifting despite prolonged physical inactivity and starvation.](https://www.ncbi.nlm.nih.gov/pubmed/30998788)
Miyazaki M,
Shimozuru M,
Tsubota T,
*PloS one*
<br>  

* [Phase space methods for non-linear analysis of pedalling forces in cycling.](https://www.ncbi.nlm.nih.gov/pubmed/30998746)
Kunert A,
Ott M,
Reuter T,
Koska D,
Maiwald C,
*PloS one*
<br>  

# Clean up title and keyword strings

In [9]:
# Report how many papers were fetched (the search requests at most 5000 IDs via retmax).
print('Number of Papers: ', len(titles))
import re

# Normalise titles: lower-case, strip the <sub>/<i> markup (opening tags become
# a space, closing tags vanish) and drop the square brackets left by the parser.
titles = [
    t.lower()
     .replace('<sub>', ' ').replace('</sub>', '')
     .replace('<i>', ' ').replace('</i>', '')
     .replace('[', '').replace(']', '')
    for t in titles
]

# Keywords only need to be brought to the same case as the titles.
keywords = list(map(str.lower, keywords))


Number of Papers:  146


# Load Top-performing Model

In [10]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# prefer the standalone joblib package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files from a trusted source.
model = joblib.load('Models/LogReg.pkl')
print(model)
print("\nLoaded model from disk")
 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)

Loaded model from disk


# Load Associated Vectorizer

In [11]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# prefer the standalone joblib package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the fitted vectorizer and rebuild the label encoder from its saved classes.
# NOTE: joblib.load unpickles the file -- only load files from a trusted source.
vect = joblib.load('Models/test_many_ML_vectorizer.pkl')
le = LabelEncoder()
# NOTE(review): if the saved classes array has object dtype, newer NumPy
# versions need allow_pickle=True here -- confirm how the file was written.
le.classes_ = np.load('Models/test_many_ML_Label_Encoder.npy')
print('\nLoaded Vectorizer/Encoder')



Loaded Vectorizer/Encoder


# Vectorize Strings

In [12]:
# Collect this week's cleaned titles and keywords in one table.
import pandas as pd

# 'everything' is the text handed to the vectorizer. It currently holds the
# title only; to include keywords, build it as title + ' ' + keywords instead.
papers_df = pd.DataFrame({'title': titles, 'keywords': keywords}).assign(
    everything=lambda frame: frame['title']
)

# Turn the text into the feature matrix expected by the loaded model.
titles_vec = vect.transform(papers_df['everything'])

# Predict Topics For Each Paper

In [13]:
# Run the classifier once and reuse the result; the second, identical
# model.predict call only repeated the same inference.
prediction_vec = model.predict(titles_vec)
topics = list(prediction_vec)
papers_df['topic'] = topics
papers_df[['title','topic']].head()

Unnamed: 0,title,topic
0,mu opioid receptors in the medial habenula con...,NEURAL
1,a state-of-the-art review on badminton lunge a...,SPORT/EXERCISE
2,asymmetric biomechanical characteristics of th...,ORTHOPAEDICS/SPINE
3,effects of hypothyroidism on the mesenteric an...,TISSUE/BIOMATERIAL
4,the effects of short-term and long-term enviro...,NEURAL


# Save Paper Titles and Topics

### Store everything in DataFrame and sort by Topic

In [14]:
# Attach the markdown-ready strings built earlier to the table.
papers_df['authors'] = authors
papers_df['journal'] = journals
papers_df['links'] = links

# Derive today's output names from one shared "<year>-<month>-<day>-litupdate"
# stem (month and day are not zero-padded).
import datetime
now = datetime.datetime.now()
urlname = '{}-{}-{}-litupdate'.format(now.year, now.month, now.day)
fname = 'Literature_Updates/' + urlname + '.csv'
mdname = 'Literature_Updates/' + urlname + '.md'

print('Filename: ',fname)

papers_df.head()

Filename:  Literature_Updates/2019-4-21-litupdate.csv


Unnamed: 0,title,keywords,everything,topic,authors,journal,links
0,mu opioid receptors in the medial habenula con...,,mu opioid receptors in the medial habenula con...,NEURAL,"Boulos LJ, Ben Hamida S, Bailly J, Maitra M, E...",*Neuropsychopharmacology : official publicatio...,* [Mu opioid receptors in the medial habenula ...
1,a state-of-the-art review on badminton lunge a...,attributes badminton kinematics kinetics lunge...,a state-of-the-art review on badminton lunge a...,SPORT/EXERCISE,"Lee JJJ, Loh WP,",*Computers in biology and medicine*,* [A state-of-the-art review on badminton lung...
2,asymmetric biomechanical characteristics of th...,adolescent idiopathic scoliosis biomechanics p...,asymmetric biomechanical characteristics of th...,ORTHOPAEDICS/SPINE,"Liu Y, Pan A, Hai Y, Li W, Yin L, Guo R,","*Clinical biomechanics (Bristol, Avon)*",* [Asymmetric biomechanical characteristics of...
3,effects of hypothyroidism on the mesenteric an...,adipokines adipose tissue energy balance hormo...,effects of hypothyroidism on the mesenteric an...,TISSUE/BIOMATERIAL,"López Fontana CM, Pennachio G, Zyla L, Toneatt...",*Molecular and cellular endocrinology*,* [Effects of hypothyroidism on the mesenteric...
4,the effects of short-term and long-term enviro...,anxiety cognition depression environmental enr...,the effects of short-term and long-term enviro...,NEURAL,"Singhal G, Morgan J, Jawahar MC, Corrigan F, J...",*Behavioural brain research*,* [The effects of short-term and long-term env...


### Save as .csv 

In [15]:
# header = ['title','topic']
# papers_df.sort_values('topic').to_csv(fname, index = False, columns = header)
# print('\nLiterature Update Exported as .csv')


# Compile papers grouped by topic

In [16]:
# Write the literature update as a markdown page for the
# alcantarar.github.io literature update site.

# Map each display heading to the raw topic labels it covers. Labels
# containing 'UNIQUE' / 'IMPACT' are shown under a tidier heading, but the
# papers must still be selected by their raw label -- filtering on the
# renamed heading would silently drop them.
topic_groups = {}
for raw_topic in np.unique(papers_df['topic']):
    raw_topic = str(raw_topic)
    if 'IMPACT' in raw_topic:
        heading = 'TRAUMA/IMPACT'
    elif 'UNIQUE' in raw_topic:
        heading = 'UNIQUE TOPIC'
    else:
        heading = raw_topic
    topic_groups.setdefault(heading, []).append(raw_topic)
topic_list = list(topic_groups)  # headings, in sorted raw-label order

# Open the file once and close it deterministically; UTF-8 keeps non-ASCII
# author names intact regardless of the platform's default encoding.
with open(mdname, 'w', encoding='utf-8') as md_file:
    # Front matter for the site.
    print('---', file=md_file)
    print('title: Biomechanics Literature Update', file=md_file)
    print('collection: literature', file=md_file)
    print('permalink: /literature/'+urlname, file=md_file)
    print('excerpt: <br>', file=md_file)
    print('---', file=md_file)

    print('### Created by: [Ryan Alcantara](https://twitter.com/Ryan_Alcantara_) & [Gary Bruening](https://twitter.com/garebearbru) - University of Colorado Boulder', file=md_file)

    # Table of contents: one anchor link per heading (anchor = lower-cased
    # heading with '/' and spaces removed).
    print('### Table Of Contents: ', file=md_file)
    for topic in topic_list:
        print('['+topic+']'+'(#'+str.lower(topic).replace('/','').replace(' ','')+')  ', file=md_file)
    print('', file=md_file)

    # One section per heading, listing every paper predicted for it.
    for topic in topic_list:
        print('----', file=md_file)
        print('#',topic, file=md_file)
        print('----', file=md_file)
        print('', file=md_file)
        print('[Back to top](#table-of-contents)', file=md_file)
        print('', file=md_file)
        papers_subset = papers_df[papers_df['topic'].isin(topic_groups[topic])].reset_index(drop = True)
        for link, author_line, journal in zip(papers_subset['links'],
                                              papers_subset['authors'],
                                              papers_subset['journal']):
            print(link, file=md_file)
            print(author_line, file=md_file)
            print(journal+'.  ', file=md_file)
            print('', file=md_file)

print('Literature Update Exported as Markdown')

Literature Update Exported as Markdown
