# Title
[]()

In [1]:

# import pandas as pd
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
from silvhua import *

# From previous note book `2023-07-05 citation API`

In [3]:
import requests
import json

def search_article(title):
    base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    params = {
        'db': 'pubmed',
        'term': title,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            for article_id in id_list:
                result = retrieve_citation(article_id).decode('utf-8')
                if title.lower() in result.lower():
                    print(f'Match found for {title}: PMID = {article_id}.')
                    return result
            print('Article title not found in PMIDs.')
            return id_list        
    except:
        print('Article not found.')
        return None
    
def retrieve_citation(article_id):
    base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def format_citation(article):
    medline_citation = article['MedlineCitation']
    article_title = medline_citation['Article']['ArticleTitle']
    journal_title = medline_citation['Article']['Journal']['Title']
    publication_date = medline_citation['Article']['Journal']['JournalIssue']['PubDate']
    volume = medline_citation['Article']['Journal']['JournalIssue']['Volume']
    issue = medline_citation['Article']['Journal']['JournalIssue']['Issue']
    pages = medline_citation['Article']['Pagination']['MedlinePgn']
    
    citation = f"{article_title}. {journal_title}. {publication_date}; {volume}({issue}): {pages}."
    print('APA Citation:')
    print(citation)

# iteration = 3.1
# # # Example usage
# article_title = 'Daily Energy Expenditure through the Human Life Course'
## reference pubmed page: https://pubmed.ncbi.nlm.nih.gov/34385400/

# search_dict[iteration] =  search_article(article_title)
# search_dict[iteration]

# citation_dict[iteration] = retrieve_citation('34385400')
# citation_dict[iteration].content


# Load previously saved citation data

In [8]:
filename = 'pubmed_result_2023-07-09_1151.sav'
path = '../output/'
search_dict = {}
search_dict[0] = loadpickle(filename, path)
print(search_dict[0])

	Time completed: 2023-07-10 17:07:11.150296
<?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2023//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_230101.dtd">
<PubmedArticleSet>
<PubmedArticle><MedlineCitation Status="MEDLINE" Owner="NLM"><PMID Version="1">34385400</PMID><DateCompleted><Year>2021</Year><Month>08</Month><Day>20</Day></DateCompleted><DateRevised><Year>2022</Year><Month>08</Month><Day>14</Day></DateRevised><Article PubModel="Print"><Journal><ISSN IssnType="Electronic">1095-9203</ISSN><JournalIssue CitedMedium="Internet"><Volume>373</Volume><Issue>6556</Issue><PubDate><Year>2021</Year><Month>Aug</Month><Day>13</Day></PubDate></JournalIssue><Title>Science (New York, N.Y.)</Title><ISOAbbreviation>Science</ISOAbbreviation></Journal><ArticleTitle>Daily energy expenditure through the human life course.</ArticleTitle><Pagination><StartPage>808</StartPage><EndPage>812</EndPage><MedlinePgn>808-812</MedlinePgn></Pagination><ELoca

In [9]:
from pprint import pprint
pprint(search_dict[0])

('<?xml version="1.0" ?>\n'
 '<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January '
 '2023//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_230101.dtd">\n'
 '<PubmedArticleSet>\n'
 '<PubmedArticle><MedlineCitation Status="MEDLINE" Owner="NLM"><PMID '
 'Version="1">34385400</PMID><DateCompleted><Year>2021</Year><Month>08</Month><Day>20</Day></DateCompleted><DateRevised><Year>2022</Year><Month>08</Month><Day>14</Day></DateRevised><Article '
 'PubModel="Print"><Journal><ISSN '
 'IssnType="Electronic">1095-9203</ISSN><JournalIssue '
 'CitedMedium="Internet"><Volume>373</Volume><Issue>6556</Issue><PubDate><Year>2021</Year><Month>Aug</Month><Day>13</Day></PubDate></JournalIssue><Title>Science '
 '(New York, '
 'N.Y.)</Title><ISOAbbreviation>Science</ISOAbbreviation></Journal><ArticleTitle>Daily '
 'energy expenditure through the human life '
 'course.</ArticleTitle><Pagination><StartPage>808</StartPage><EndPage>812</EndPage><MedlinePgn>808-812</MedlinePgn></Paginati

# Extract key details

Here is an example of a record from the PubMed API ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'), which has been converted to a string.

Write a function in Python that extracts the following details:
- Authors using APA citation format. The number of authors is variable.
- Publication year.
- ArticleTitle.
- Journal title.
- Journal volume.
- Journal issue.
- Start page.
- End page.
- ELocationID.

In [10]:
import re

def extract_pubmed_details(record_string):
    # Extract authors using APA citation format
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else '[]'

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else '[]'

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else '[]'

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else '[]'

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else '[]'

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else '[]'

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else '[]'

    # Extract ELocationID
    elocation_id = re.search(r'<ELocationID.*?>(.*?)</ELocationID>', record_string)
    elocation_id = elocation_id.group(1) if elocation_id else '[]'

    return {
        'Authors': formatted_authors,
        'Publication Year': publication_year,
        'Article Title': article_title,
        'Journal Title': journal_title,
        'Journal Volume': journal_volume,
        'Journal Issue': journal_issue,
        'Start Page': start_page,
        'End Page': end_page,
        'ELocationID': elocation_id
    }


record_string = search_dict[0]
details = extract_pubmed_details(record_string)
print(details)


{'Authors': 'Herman Pontzer, Yosuke Yamada, Hiroyuki Sagayama, Philip N Ainslie, Lene F Andersen, Liam J Anderson, Lenore Arab, Issaad Baddou, Kweku Bedu-Addo, Ellen E Blaak, Stephane Blanc, Alberto G Bonomi, Carlijn V C Bouten, Pascal Bovet, Maciej S Buchowski, Nancy F Butte, Stefan G Camps, Graeme L Close, Jamie A Cooper, Richard Cooper, Sai Krupa Das, Lara R Dugas, Ulf Ekelund, Sonja Entringer, Terrence Forrester, Barry W Fudge, Annelies H Goris, Michael Gurven, Catherine Hambly, Asmaa El Hamdouchi, Marjije B Hoos, Sumei Hu, Noorjehan Joonas, Annemiek M Joosen, Peter Katzmarzyk, Kitty P Kempen, Misaka Kimura, William E Kraus, Robert F Kushner, Estelle V Lambert, William R Leonard, Nader Lessan, Corby Martin, Anine C Medin, Erwin P Meijer, James C Morehen, James P Morton, Marian L Neuhouser, Teresa A Nicklas, Robert M Ojiambo, Kirsi H Pietil&#xe4;inen, Yannis P Pitsiladis, Jacob Plange-Rhule, Guy Plasqui, Ross L Prentice, Roberto A Rabinovich, Susan B Racette, David A Raichlen, Eric 

In [11]:
details

{'Authors': 'Herman Pontzer, Yosuke Yamada, Hiroyuki Sagayama, Philip N Ainslie, Lene F Andersen, Liam J Anderson, Lenore Arab, Issaad Baddou, Kweku Bedu-Addo, Ellen E Blaak, Stephane Blanc, Alberto G Bonomi, Carlijn V C Bouten, Pascal Bovet, Maciej S Buchowski, Nancy F Butte, Stefan G Camps, Graeme L Close, Jamie A Cooper, Richard Cooper, Sai Krupa Das, Lara R Dugas, Ulf Ekelund, Sonja Entringer, Terrence Forrester, Barry W Fudge, Annelies H Goris, Michael Gurven, Catherine Hambly, Asmaa El Hamdouchi, Marjije B Hoos, Sumei Hu, Noorjehan Joonas, Annemiek M Joosen, Peter Katzmarzyk, Kitty P Kempen, Misaka Kimura, William E Kraus, Robert F Kushner, Estelle V Lambert, William R Leonard, Nader Lessan, Corby Martin, Anine C Medin, Erwin P Meijer, James C Morehen, James P Morton, Marian L Neuhouser, Teresa A Nicklas, Robert M Ojiambo, Kirsi H Pietil&#xe4;inen, Yannis P Pitsiladis, Jacob Plange-Rhule, Guy Plasqui, Ross L Prentice, Roberto A Rabinovich, Susan B Racette, David A Raichlen, Eric 

## Try with another journal title

In [12]:
iteration = 1
# # Example usage
article_title = 'High doses of anti-inflammatory drugs compromise muscle strength and hypertrophic adaptations to resistance training in young adults'
# reference pubmed page: https://pubmed.ncbi.nlm.nih.gov/28834248/

search_dict[iteration] =  search_article(article_title)
search_dict[iteration]

Match found for High doses of anti-inflammatory drugs compromise muscle strength and hypertrophic adaptations to resistance training in young adults: PMID = 28834248.


'<?xml version="1.0" ?>\n<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2023//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_230101.dtd">\n<PubmedArticleSet>\n<PubmedArticle><MedlineCitation Status="MEDLINE" Owner="NLM"><PMID Version="1">28834248</PMID><DateCompleted><Year>2019</Year><Month>07</Month><Day>16</Day></DateCompleted><DateRevised><Year>2019</Year><Month>07</Month><Day>16</Day></DateRevised><Article PubModel="Print-Electronic"><Journal><ISSN IssnType="Electronic">1748-1716</ISSN><JournalIssue CitedMedium="Internet"><Volume>222</Volume><Issue>2</Issue><PubDate><Year>2018</Year><Month>Feb</Month></PubDate></JournalIssue><Title>Acta physiologica (Oxford, England)</Title><ISOAbbreviation>Acta Physiol (Oxf)</ISOAbbreviation></Journal><ArticleTitle>High doses of anti-inflammatory drugs compromise muscle strength and hypertrophic adaptations to resistance training in young adults.</ArticleTitle><ELocationID EIdType="doi" ValidYN="Y">10.1111/apha.1294

### Extracted details
- Wrong year extracted
- This pubmed record has no page numbers.

In [15]:
details_dict = dict()
record_string = search_dict[iteration]
details_dict[iteration] = extract_pubmed_details(record_string)
details_dict[iteration]

{'Authors': 'M Lilja, M Mandi&#x107;, W Apr&#xf3;, M Melin, K Olsson, S Rosenborg, T Gustafsson, T R Lundberg',
 'Publication Year': '2019',
 'Article Title': 'High doses of anti-inflammatory drugs compromise muscle strength and hypertrophic adaptations to resistance training in young adults.',
 'Journal Title': 'Acta physiologica (Oxford, England)',
 'Journal Volume': '222',
 'Journal Issue': '2',
 'Start Page': '[]',
 'End Page': '[]',
 'ELocationID': '10.1111/apha.12948'}

# *End of Page*