Парсим по API вместо самого pubmed'а

In [1]:
# https://gist.github.com/bonzanini/5a4c39e4c02502a8451d -- пример кода
# https://biopython.org/docs/latest/api/Bio.Entrez.html?highlight=pubmed -- документация

from Bio import Entrez

In [2]:
# это просто на всякий случай

def search(query):
    Entrez.email = 'your.email@example.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='20',
                            retmode='xml',
                            term=query)
    results = Entrez.read(handle)
    return results

In [19]:
results = search('Dat-ko')
results['IdList']

['30784211', '23912772']

In [4]:
def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'your.email@example.com'  # не забудь поменять на свой мэйл
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results

In [None]:
# разбор всех элементов можно найти здесь: https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html

In [66]:
paper = fetch_details(['5660041'])

Результат для статьи с PMID = 5660041 должен выглядеть так:

Huston R.B. Activation of Skeletal Muscle Phosphorylase Kinase by Ca2+. II. Identification of the Kinase Activating Factor as a Proteolytic Enzyme / Huston R.B., Krebs E.G. // Biochemistry – 1968. – Т. 7 – № 6 – С.2116–2122.

In [103]:
# для нахождения нужных ключей словаря

import json
# print(json.dumps(paper['PubmedArticle'], indent=2))

In [72]:
paper['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']

DictElement({'Volume': '7', 'Issue': '6', 'PubDate': {'Year': '1968', 'Month': 'Jun'}}, attributes={'CitedMedium': 'Print'})

In [152]:
# get journal info

journal_info = paper['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']
journal_title = journal_info['Title']
journal_volume = 'T. ' + journal_info['JournalIssue']['Volume']                                              # (e.x. "T. 7")
journal_issue = '№ ' + journal_info['JournalIssue']['Issue']                                                 # (e.x. "№ 6")
journal_year = journal_info['JournalIssue']['PubDate']['Year'] + '.'                                         # (e.x. "1968.")

journal_pages = 'C.' + paper['PubmedArticle'][0]['MedlineCitation']['Article']['Pagination']['MedlinePgn']  # (e.x. "С.2116–2122.")

In [153]:
# get all authors from article

authors = []

for author in paper['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList']:
    initials = author['ForeName']                            # get author's initials          (e.x. "R B")
    initials_w_dots = '.'.join(initials.split())+'.'         # add dots                       (e.x. "R. B.")
    last_name = author['LastName']                           # extract author's last name     (e.x. "Huston")
    
    full_name = ' '.join([last_name, initials_w_dots])       # combine last name and initials (e.x. "Huston R.B.")
    authors.append(full_name)
    
first_author = authors[0]

In [154]:
# get article title

article_title = paper['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']

In [155]:
# cobmine all info to get citation

citation = ' '.join([first_author, article_title, '/', 
                     ', '.join(authors), '//',
                     journal_title, '–', journal_year, '–', journal_volume, '–', journal_issue, '–', journal_pages])
citation

'Huston R.B. Activation of skeletal muscle phosphorylase kinase by Ca2+. II. Identification of the kinase activating factor as a proteolytic enzyme. / Huston R.B., Krebs E.G. // Biochemistry – 1968. – T. 7 – № 6 – C.2116-22'

Получили:

Huston R.B. Activation of skeletal muscle phosphorylase kinase by Ca2+. II. Identification of the kinase activating factor as a proteolytic enzyme. / Huston R.B., Krebs E.G. // Biochemistry – 1968. – T. 7 – № 6 – C.2116-22

Результат для статьи с PMID = 5660041 должен выглядеть так:

Huston R.B. Activation of Skeletal Muscle Phosphorylase Kinase by Ca2+. II. Identification of the Kinase Activating Factor as a Proteolytic Enzyme / Huston R.B., Krebs E.G. // Biochemistry – 1968. – Т. 7 – № 6 – С.2116–2122.

In [144]:
journal_pages_copy = journal_pages
journal_pages_copy

'C. 2116-22'

In [145]:
# если номер END страницы меньше, чем у START,
# то мы делаем 2116 -> (//100) -> 2100 -> (+22) -> 2122

splitted_pages = list(map(int, journal_pages_copy[3:].split('-')))

if splitted_pages[1] < splitted_pages[0]:
    splitted_pages[1] = splitted_pages[0] // 10**len(str(splitted_pages[1])) * 10**len(str(splitted_pages[1])) + splitted_pages[1]

In [146]:
splitted_pages

[2116, 2122]

In [157]:
pages = 'C.' + '–'.join(list(map(str, splitted_pages)))
pages

'C.2116–2122'

In [122]:
# ну и дальше склеиваем. Как-нибудь так можно переформатировать номера страниц, если нужно

[2116, 22]

In [None]:
# наверное можно было сделать и проще
# а ещё можно подавать список из PMID в id у Entrez.esummary и в функцию fetch_details
# короче читай мануал по библиотеке

In [162]:
handle = Entrez.esummary(db="pubmed", id="5660041", retmode="xml")
records = Entrez.parse(handle)
for record in records:
    print(record)
    
handle.close()  # не забывать

{'Item': [], 'Id': '5660041', 'PubDate': '1968 Jun', 'EPubDate': '', 'Source': 'Biochemistry', 'AuthorList': ['Huston RB', 'Krebs EG'], 'LastAuthor': 'Krebs EG', 'Title': 'Activation of skeletal muscle phosphorylase kinase by Ca2+. II. Identification of the kinase activating factor as a proteolytic enzyme.', 'Volume': '7', 'Issue': '6', 'Pages': '2116-22', 'LangList': ['English'], 'NlmUniqueID': '0370623', 'ISSN': '0006-2960', 'ESSN': '1520-4995', 'PubTypeList': ['Journal Article'], 'RecordStatus': 'PubMed - indexed for MEDLINE', 'PubStatus': 'ppublish', 'ArticleIds': {'pubmed': ['5660041'], 'medline': [], 'doi': '10.1021/bi00846a014', 'rid': '5660041', 'eid': '5660041'}, 'DOI': '10.1021/bi00846a014', 'History': {'pubmed': ['1968/06/01 00:00'], 'medline': ['1968/06/01 00:01'], 'entrez': '1968/06/01 00:00'}, 'References': [], 'HasAbstract': IntegerElement(0, attributes={}), 'PmcRefCount': IntegerElement(23, attributes={}), 'FullJournalName': 'Biochemistry', 'ELocationID': '', 'SO': '196