In [1]:
from Bio import Entrez
import pandas as pd
import time

In [34]:
def search(query, retstart, retmax):
    Entrez.email = "sandra_friebolin@proton.me"
    handle = Entrez.esearch(db='pubmed', 
                            retstart=retstart, 
                            retmax=retmax, 
                            retmode='xml', 
                            term=query)
    results = Entrez.read(handle)
    return results


def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = "sandra_friebolin@proton.me"
    handle = Entrez.efetch(db='pubmed',
                            retmode='xml',
                            id=ids)
    results = Entrez.read(handle)
    return results


# def get_pub_date(article, medline_citation):
#     # Check various fields for publication date
#     if 'ArticleDate' in article and article['ArticleDate']:
#         return format_date(article['ArticleDate'])
#     elif 'PubDate' in article and article['PubDate']:
#         return format_date(article['PubDate'])
#     elif 'DateCompleted' in medline_citation and medline_citation['DateCompleted']:
#         return format_date(medline_citation['DateCompleted'])
#     elif 'DateRevised' in medline_citation and medline_citation['DateRevised']:
#         return format_date(medline_citation['DateRevised'])
#     else:
#         # Log or print when a date is not found
#         print(f"Date not found for PMID: {article.get('PMID')}")
#         return ""
    

# def format_date(date_field):
#     if isinstance(date_field, list):
#         date_field = date_field[0] if date_field else {}

#     year = date_field.get('Year', '')
#     month = date_field.get('Month', '')
#     day = date_field.get('Day', '')

#     date_parts = [part for part in [year, month, day] if part]
#     formatted_date = '-'.join(date_parts)

#     return formatted_date if formatted_date else "Unknown"

def get_pub_date(paper):
    # Fetching the nested PubDate field
    journal_issue = paper.get('MedlineCitation', {}).get('Article', {}).get('Journal', {}).get('JournalIssue', {})
    pub_date = journal_issue.get('PubDate', {})

    year = pub_date.get('Year', '')
    month = pub_date.get('Month', '')
    day = pub_date.get('Day', '')

    # Formatting the date
    if year:
        formatted_date = year
        if month:
            formatted_date += f"-{month}"
            if day:
                formatted_date += f"-{day}"
        return formatted_date
    else:
        return "Unknown"

In [35]:
def save_data_to_lists(papers, pmid_list, title_list, abstract_list, author_list, date_list, doi_list):
    for paper in papers['PubmedArticle']:
        article = paper['MedlineCitation']['Article']
        medline_citation = paper['MedlineCitation']

        # Check if Abstract is present
        if article.get("Abstract") is not None and paper['MedlineCitation']['PMID'] not in pmid_list:
            abstract_texts = article['Abstract']['AbstractText']
            full_abstract = ' '.join([str(text) for text in abstract_texts])

            title_list.append(article['ArticleTitle'])
            pmid_list.append(paper['MedlineCitation']['PMID'])

            # Fetch authors
            if 'AuthorList' in article:
                authors = article['AuthorList']
                author_names = [author.get('ForeName') + " " + author.get('LastName') \
                                    if author.get('ForeName') else author.get('LastName') \
                                for author in authors if 'LastName' in author]
                author_list.append("; ".join(author_names))
            else:
                author_list.append("")

            # # Fetch Publication Date
            # medline_citation = paper.get('MedlineCitation', {})
            # article = medline_citation.get('Article', {})
            # pub_date = None

            # # Check various fields for publication date
            # if 'ArticleDate' in article:
            #     pub_date = article['ArticleDate']
            # elif 'PubDate' in article:
            #     pub_date = article['PubDate']
            # elif 'DateCompleted' in medline_citation:
            #     pub_date = medline_citation['DateCompleted']
            # elif 'DateRevised' in medline_citation:
            #     pub_date = medline_citation['DateRevised']

            # # Format the publication date
            # if pub_date:
            #     date_str = f"{pub_date[0]['Year']}-{pub_date[0].get('Month', '01')}-{pub_date[0].get('Day', '01')}"
            # else:
            #     date_str = ""

            # date_list.append(date_str)
            
            # Fetch and format Publication Date
            # pub_date = get_pub_date(article, medline_citation)
            pub_date = get_pub_date(paper)
            date_list.append(pub_date)

            # Fetch DOI
            article_id_list = paper.get('PubmedData', {}).get('ArticleIdList', [])
            doi = next((id_ for id_ in article_id_list if id_.attributes.get('IdType') == 'doi'), None)
            doi_list.append(doi if doi is not None else "")

            # Append Abstract
            abstract_list.append(full_abstract)

In [41]:
pmid_list = []
title_list = []
abstract_list =[]
author_list = []
date_list = []
doi_list = []
saved_data_cnt = 0
total_to_fetch = 1000

for year in range(2013, 2024):
    # if len(pmid_list) >= total_to_fetch:
    #     break  # Stop if we have fetched enough records
    
    for quartal in range(4):
        # if len(pmid_list) >= total_to_fetch:
        #     break       

        month_start, month_end = (quartal) * 3 + 1, ((quartal)) * 3 + 3
        query = f"intelligence[Title/Abstract] AND (\"{year}/{month_start}\"[Date - Publication] : \"{year}/{month_end}\"[Date - Publication])"
        handle = Entrez.esearch(db='pubmed', retmax=10000, retmode='xml', term=query)
        studies = Entrez.read(handle)

        print(f"{studies['Count']} data for {month_start}/{year}-{month_end}/{year}")

        studiesIdList = studies['IdList']
        papers = fetch_details(studiesIdList)
        save_data_to_lists(papers, pmid_list, title_list, abstract_list, author_list, date_list, doi_list)

        print(f"Newly saved data: {len(pmid_list)-saved_data_cnt}; Total saved data: {len(pmid_list)}")
        
        saved_data_cnt = len(pmid_list)

642 data for 1/2013-3/2013
Newly saved data: 625; Total saved data: 625
448 data for 4/2013-6/2013
Newly saved data: 349; Total saved data: 974
513 data for 7/2013-9/2013
Newly saved data: 361; Total saved data: 1335
513 data for 10/2013-12/2013
Newly saved data: 285; Total saved data: 1620
789 data for 1/2014-3/2014
Newly saved data: 599; Total saved data: 2219
469 data for 4/2014-6/2014
Newly saved data: 295; Total saved data: 2514
484 data for 7/2014-9/2014
Newly saved data: 302; Total saved data: 2816
512 data for 10/2014-12/2014
Newly saved data: 307; Total saved data: 3123
857 data for 1/2015-3/2015
Newly saved data: 658; Total saved data: 3781
539 data for 4/2015-6/2015
Newly saved data: 314; Total saved data: 4095
546 data for 7/2015-9/2015
Newly saved data: 323; Total saved data: 4418
524 data for 10/2015-12/2015
Newly saved data: 320; Total saved data: 4738
827 data for 1/2016-3/2016
Newly saved data: 620; Total saved data: 5358
544 data for 4/2016-6/2016
Newly saved data: 31

In [44]:
# Create DataFrame
df = pd.DataFrame({
    'PMID': pmid_list,
    'Title': title_list,
    'Abstract': abstract_list,
    'Authors': author_list,
    'Publication Date': date_list,
    'DOI': doi_list
})

In [46]:
pd.set_option('display.max_colwidth', None)
df.tail()

Unnamed: 0,PMID,Title,Abstract,Authors,Publication Date,DOI
58845,35235526,Efficient Architecture Search for Continual Learning.,"Continual learning with neural networks, which aims to learn a sequence of tasks, is an important learning framework in artificial intelligence (AI). However, it often confronts three challenges: 1) overcome the catastrophic forgetting problem; 2) adapt the current network to new tasks; and 3) control its model complexity. To reach these goals, we propose a novel approach named continual learning with efficient architecture search (CLEAS). CLEAS works closely with neural architecture search (NAS), which leverages reinforcement learning techniques to search for the best neural architecture that fits a new task. In particular, we design a neuron-level NAS controller that decides which old neurons from previous tasks should be reused (knowledge transfer) and which new neurons should be added (to learn new knowledge). Such a fine-grained controller allows finding a very concise architecture that can fit each new task well. Meanwhile, since we do not alter the weights of the reused neurons, we perfectly memorize the knowledge learned from the previous tasks. We evaluate CLEAS on numerous sequential classification tasks, and the results demonstrate that CLEAS outperforms other state-of-the-art alternative methods, achieving higher classification accuracy while using simpler neural architectures.",Qiang Gao; Zhipeng Luo; Diego Klabjan; Fengli Zhang,2023-Nov,10.1109/TNNLS.2022.3151511
58846,35230953,QTT-DLSTM: A Cloud-Edge-Aided Distributed LSTM for Cyber-Physical-Social Big Data.,"Cyber-physical-social systems (CPSS), an emerging cross-disciplinary research area, combines cyber-physical systems (CPS) with social networking for the purpose of providing personalized services for humans. CPSS big data, recording various aspects of human lives, should be processed to mine valuable information for CPSS services. To efficiently deal with CPSS big data, artificial intelligence (AI), an increasingly important technology, is used for CPSS data processing and analysis. Meanwhile, the rapid development of edge devices with fast processors and large memories allows local edge computing to be a powerful real-time complement to global cloud computing. Therefore, to facilitate the processing and analysis of CPSS big data from the perspective of multi-attributes, a cloud-edge-aided quantized tensor-train distributed long short-term memory (QTT-DLSTM) method is presented in this article. First, a tensor is used to represent the multi-attributes CPSS big data, which will be decomposed into the QTT form to facilitate distributed training and computing. Second, a distributed cloud-edge computing model is used to systematically process the CPSS data, including global large-scale data processing in the cloud, and local small-scale data processed at the edge. Third, a distributed computing strategy is used to improve the efficiency of training via partitioning the weight matrix and large amounts of input data in the QTT form. Finally, the performance of the proposed QTT-DLSTM method is evaluated using experiments on a public discrete manufacturing process dataset, the Li-ion battery dataset, and a public social dataset.",Xiaokang Wang; Lei Ren; Ruixue Yuan; Laurence T Yang; M Jamal Deen,2023-Oct,10.1109/TNNLS.2022.3140238
58847,35130174,A Review of Recurrent Neural Network-Based Methods in Computational Physiology.,"Artificial intelligence and machine learning techniques have progressed dramatically and become powerful tools required to solve complicated tasks, such as computer vision, speech recognition, and natural language processing. Since these techniques have provided promising and evident results in these fields, they emerged as valuable methods for applications in human physiology and healthcare. General physiological recordings are time-related expressions of bodily processes associated with health or morbidity. Sequence classification, anomaly detection, decision making, and future status prediction drive the learning algorithms to focus on the temporal pattern and model the nonstationary dynamics of the human body. These practical requirements give birth to the use of recurrent neural networks (RNNs), which offer a tractable solution in dealing with physiological time series and provide a way to understand complex time variations and dependencies. The primary objective of this article is to provide an overview of current applications of RNNs in the area of human physiology for automated prediction and diagnosis within different fields. Finally, we highlight some pathways of future RNN developments for human physiology.",Shitong Mao; Ervin Sejdic,2023-Oct,10.1109/TNNLS.2022.3145365
58848,35108210,Online Intention Recognition With Incomplete Information Based on a Weighted Contrastive Predictive Coding Model in Wargame.,"The incomplete and imperfect essence of the battlefield situation results in a challenge to the efficiency, stability, and reliability of traditional intention recognition methods. For this problem, we propose a deep learning architecture that consists of a contrastive predictive coding (CPC) model, a variable-length long short-term memory network (LSTM) model, and an attention weight allocator for online intention recognition with incomplete information in wargame (W-CPCLSTM). First, based on the typical characteristics of intelligence data, a CPC model is designed to capture more global structures from limited battlefield information. Then, a variable-length LSTM model is employed to classify the learned representations into predefined intention categories. Next, a weighted approach to the training attention of CPC and LSTM is introduced to allow for the stability of the model. Finally, performance evaluation and application analysis of the proposed model for the online intention recognition task were carried out based on four different degrees of detection information and a perfect situation of ideal conditions in a wargame. Besides, we explored the effect of different lengths of intelligence data on recognition performance and gave application examples of the proposed model to a wargame platform. The simulation results demonstrate that our method not only contributes to the growth of recognition stability, but it also improves recognition accuracy by 7%-11%, 3%-7%, 3%-13%, and 3%-7%, the recognition speed by 6- 32× , 4- 18× , 13-* × , and 1- 6× compared with the traditional LSTM, classical FCN, OctConv, and OctFCN models, respectively, which characterizes it as a promising reference tool for command decision-making.",Li Chen; Xingxing Liang; Yanghe Feng; Longfei Zhang; Jing Yang; Zhong Liu,2023-Oct,10.1109/TNNLS.2022.3144171
58849,34670834,A Nietzschean critique of liberal eugenics.,"Ethical debates about liberal eugenics frequently focus on the supposed unnaturalness of its means and possible harm to autonomy. I present a Nietzsche-inspired critique focusing on intention rather than means and harm to abilities rather than to autonomy. I first critique subjective eugenics, the selection of extrinsically valuable traits, drawing on Nietzsche's notion of 'slavish' values reducible to the negation of another's good. Subjective eugenics slavishly evaluates traits relative to a negatively evaluated norm (eg, above-average intelligence), disguising a harmful intention to diminish the relative value of that norm. I then argue there is no objective form of eugenics on the Nietzschean ground that abilities are not valuable intrinsically; they are valuable only if one possesses the relative power to exercise them. Abilities frustrated by conflict with other abilities or environment are harmful, while disabilities that empower one's other abilities are beneficial. Consequently, all forms of eugenics are subject to the prior ethical critique of subjective eugenics.",Donovan Tateshi Miyasaki,2023-Dec-14,10.1136/medethics-2021-107414


In [47]:
# Save the data to a CSV file
# the sizes becomes more than 100 MB which can't be pushed
# df.to_csv('pubmed_data.csv', index=False)

split_index = df.shape[0] // 2

# Split the DataFrame into two parts
df_part1 = df.iloc[:split_index]
df_part2 = df.iloc[split_index:]

# Save each part to a CSV file
df_part1.to_csv('pubmed_data_part1.csv', index=False)
df_part2.to_csv('pubmed_data_part2.csv', index=False)

In [48]:
with open('pubmed_data_part1.csv', 'r', encoding='utf-8') as file:
    content = file.read()

# Replace unusual line terminators with standard newline character
content = content.replace('\u2028', '\n').replace('\u2029', '\n')

# Write the corrected content back to the file
with open('pubmed_data_part1.csv', 'w', encoding='utf-8') as file:
    file.write(content)


In [49]:
# Read the two CSV files
df_part1 = pd.read_csv('pubmed_data_part1.csv')
df_part2 = pd.read_csv('pubmed_data_part2.csv')

# Concatenate the two DataFrames
df = pd.concat([df_part1, df_part2], ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58854 entries, 0 to 58853
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   PMID              58854 non-null  object
 1   Title             58854 non-null  object
 2   Abstract          58850 non-null  object
 3   Authors           58766 non-null  object
 4   Publication Date  58850 non-null  object
 5   DOI               57986 non-null  object
dtypes: object(6)
memory usage: 2.7+ MB
