# Week 2.2: Parallelisation
Author: Juana Karina Diaz Barba
  

### Step 1: install and use the biopython package

In [52]:
from Bio import Entrez
import os
import multiprocessing as mp
import time 

# The next two lines are needed to create an environment in which ssl
# doesn't complain about non-existing public keys.
# WARNING: this replaces the default HTTPS context factory with the
# unverified one, so server certificates are not checked for HTTPS requests
# that rely on that default, for the rest of this kernel session.
# NOTE(review): prefer fixing the local certificate store and removing this
# workaround before sharing or deploying the notebook.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context


In [53]:
# Enter your email here; the one you used to create an api key in step 0
Entrez.email = 'j.k.diaz.barba@st.hanze.nl'

# The API key is read from the NCBI_API_KEY environment variable so that the
# secret is not stored in the notebook. If the variable is unset, None is
# passed and the request is sent without a key.
file = Entrez.elink(dbfrom="pubmed",
                    db="pmc",
                    LinkName="pubmed_pmc_refs",
                    id="30049270",
                    api_key=os.environ.get("NCBI_API_KEY"))
try:
    # Parse the XML answer into nested Python lists/dictionaries
    results = Entrez.read(file)
finally:
    # Release the network handle even when parsing fails
    file.close()
print(results)

[{'ERROR': [], 'LinkSetDbHistory': [], 'LinkSetDb': [{'Link': [{'Id': '10862343'}, {'Id': '10824403'}, {'Id': '10810535'}, {'Id': '10712917'}, {'Id': '10668443'}, {'Id': '10617710'}, {'Id': '10604399'}, {'Id': '10590010'}, {'Id': '10546998'}, {'Id': '10486119'}, {'Id': '10449449'}, {'Id': '10413913'}, {'Id': '10387313'}, {'Id': '10354302'}, {'Id': '10296546'}, {'Id': '10281569'}, {'Id': '10240642'}, {'Id': '10224495'}, {'Id': '10195063'}, {'Id': '10189984'}, {'Id': '10029973'}, {'Id': '10023427'}, {'Id': '10013231'}, {'Id': '9983048'}, {'Id': '9970664'}, {'Id': '9892353'}, {'Id': '9884887'}, {'Id': '9843110'}, {'Id': '9821373'}, {'Id': '9788541'}, {'Id': '9640424'}, {'Id': '9553271'}, {'Id': '9550025'}, {'Id': '9544891'}, {'Id': '9522633'}, {'Id': '9518073'}, {'Id': '9517012'}, {'Id': '9511241'}, {'Id': '9510112'}, {'Id': '9505332'}, {'Id': '9274363'}, {'Id': '9274335'}, {'Id': '9259961'}, {'Id': '9255267'}, {'Id': '9241475'}, {'Id': '9233012'}, {'Id': '9216484'}, {'Id': '9047029'}, {'

In [54]:
# Collect the "Id" of every link in the first LinkSetDb entry of the elink
# result; format() with no spec renders each Id exactly as an f-string would
references = list(
    map(format, (entry["Id"] for entry in results[0]["LinkSetDb"][0]["Link"]))
)
print(references)

['10862343', '10824403', '10810535', '10712917', '10668443', '10617710', '10604399', '10590010', '10546998', '10486119', '10449449', '10413913', '10387313', '10354302', '10296546', '10281569', '10240642', '10224495', '10195063', '10189984', '10029973', '10023427', '10013231', '9983048', '9970664', '9892353', '9884887', '9843110', '9821373', '9788541', '9640424', '9553271', '9550025', '9544891', '9522633', '9518073', '9517012', '9511241', '9510112', '9505332', '9274363', '9274335', '9259961', '9255267', '9241475', '9233012', '9216484', '9047029', '9007843', '8944180', '8900724', '8900470', '8886176', '8860903', '8832600', '8822421', '8767730', '8764740', '8700835', '8671734', '8665940', '8663079', '8636824', '8600343', '8355211', '8351482', '8287384', '8285468', '8274250', '8159085', '7982513', '7820374', '7785626', '7661370', '7661268', '7559120', '7557385', '7525587', '7435551', '7418773', '7259862', '7216302', '7201176', '7142778', '6965168']


In [61]:
# Obtain the article with authors and abstract.
# The API key is read from the NCBI_API_KEY environment variable so that the
# secret is not stored in the notebook; when it is unset, None is passed.
handle = Entrez.efetch(db="pubmed",
                       id='30049270',
                       retmode="xml",
                       api_key=os.environ.get("NCBI_API_KEY"))
# The handle is intentionally left open so it can still be read afterwards.
# Uncomment the next line to inspect the raw XML:
#print(handle.read())

### Step 2: Download the referenced articles (extra challenge: compare the execution times)

Obtain the first ten references of an article

In [56]:
def get_article_references(pubmed_id='30049270'):
    '''Obtain the first ten references of an article.

    Queries Entrez elink (dbfrom="pubmed", db="pmc",
    LinkName="pubmed_pmc_refs") for the given id and returns the ids found
    in the first link set.

    Parameters:
        pubmed_id: id of the article to look up (default '30049270').

    Returns:
        A list with at most ten ids as strings; an empty list when the
        answer contains no link set.

    The API key is read from the NCBI_API_KEY environment variable so the
    secret is not stored in the code; when it is unset, None is passed.

    NOTE(review): the query targets db="pmc", so the returned ids are
    presumably PMC ids - confirm they are valid input for
    Entrez.efetch(db="pubmed", ...) as used elsewhere in this notebook.
    '''
    file = Entrez.elink(dbfrom="pubmed",
                        db="pmc",
                        LinkName="pubmed_pmc_refs",
                        id=pubmed_id,
                        api_key=os.environ.get("NCBI_API_KEY"))
    try:
        # Storing the results obtained from biopython package
        results = Entrez.read(file)
    finally:
        # Release the network handle even when parsing fails
        file.close()

    # Guard against an answer without any link set, which would otherwise
    # raise an IndexError on the [0] lookups below
    link_sets = results[0]["LinkSetDb"] if results else []
    if not link_sets:
        return []

    # Get the ids of all the linked articles as plain strings
    references = [str(link["Id"]) for link in link_sets[0]["Link"]]
    # Return only the first 10 references
    return references[:10]

In [46]:
def download_article(pubmed_id):
    '''Download one article and save it as article_<pubmed_id>.xml.

    Fetches the record with Entrez.efetch (db="pubmed", retmode="xml") and
    writes the raw response to a file in the current working directory,
    overwriting any existing file with the same name.

    Parameters:
        pubmed_id: id of the article to download.

    Returns:
        None. Progress messages are printed to stdout.

    The API key is read from the NCBI_API_KEY environment variable so the
    secret is not stored in the code; when it is unset, None is passed.
    '''
    filename = f'article_{pubmed_id}.xml'
    print(f'File name: {filename}')

    # Obtain the article with authors and abstract
    net_handle = Entrez.efetch(db="pubmed",
                               id=pubmed_id,
                               retmode="xml",
                               api_key=os.environ.get("NCBI_API_KEY"))
    try:
        article_data = net_handle.read()
    finally:
        # Always release the network connection, even if reading fails
        net_handle.close()

    # Write the article; the data is binary so the 'b' option is needed
    with open(filename, 'wb') as file:
        file.write(article_data)

    print('------ Article downloaded ------')


In [57]:
def download_article_forloop(pubmed_references_list):
    '''Download every article in the list one after another, NOT using
    parallelisation.

    Parameters:
        pubmed_references_list: iterable of ids, each passed to
            download_article.

    Returns:
        The elapsed time in seconds as a float.
    '''
    # perf_counter is a monotonic clock, so the measured interval cannot be
    # distorted by system clock adjustments
    start_time = time.perf_counter()
    # A plain loop: the calls are made for their side effect only, so no
    # throwaway list of None values is built
    for reference in pubmed_references_list:
        download_article(reference)
    return time.perf_counter() - start_time

In [58]:
def download_article_mp(pubmed_references_list):
    '''Download every article in the list using parallelisation
    (a multiprocessing pool).

    Parameters:
        pubmed_references_list: iterable of ids, each passed to
            download_article in a worker process.

    Returns:
        The elapsed time in seconds as a float, including the time needed
        to start the pool.
    '''
    # perf_counter is a monotonic clock, so the measured interval cannot be
    # distorted by system clock adjustments
    start_time = time.perf_counter()
    with mp.Pool() as pool:
        # map blocks until every download has finished; download_article
        # returns nothing useful, so the result list is not kept
        pool.map(download_article, pubmed_references_list)
    return time.perf_counter() - start_time


In [59]:
# Look up the first ten ids linked to article 30049270 and display them
references_list = get_article_references(pubmed_id='30049270')
references_list

['10862343',
 '10824403',
 '10810535',
 '10712917',
 '10668443',
 '10617710',
 '10604399',
 '10590010',
 '10546998',
 '10486119']

In [60]:
# Downloading the files with both approaches and reporting the time each took.
# Both runs write the same article_<id>.xml files, so the second run
# overwrites the files produced by the first.

print(f'Time For loop {download_article_forloop(references_list):.3f} seconds')
print(f'Time Multiprocessing {download_article_mp(references_list):.3f} seconds')



File name: article_10862343.xml
------ Article downloaded ------
File name: article_10824403.xml
------ Article downloaded ------
File name: article_10810535.xml
------ Article downloaded ------
File name: article_10712917.xml
------ Article downloaded ------
File name: article_10668443.xml
------ Article downloaded ------
File name: article_10617710.xml
------ Article downloaded ------
File name: article_10604399.xml
------ Article downloaded ------
File name: article_10590010.xml
------ Article downloaded ------
File name: article_10546998.xml
------ Article downloaded ------
File name: article_10486119.xml
------ Article downloaded ------
Time Foor loop 6.385 seconds
File name: article_10862343.xmlFile name: article_10712917.xml
File name: article_10604399.xmlFile name: article_10546998.xmlFile name: article_10486119.xmlFile name: article_10810535.xmlFile name: article_10668443.xmlFile name: article_10617710.xmlFile name: article_10590010.xmlFile name: article_10824403.xml








-