In [1]:
# Get all the annotated, Ensembl and RefSeq matched genes' names
from Bio import Entrez
from urllib.error import HTTPError
import re

# Contact address sent to NCBI with every E-utilities request.
Entrez.email = "18810602991@163.com"

# Retrieve the Entrez Gene IDs of all live, annotated human genes that have an
# Ensembl match and a RefSeq source record.
gene_search_retmax = 40000
handle = Entrez.esearch(db='gene', term='"Homo sapiens"[Organism] AND ("annotated genes"[Filter] AND ("matches ensembl"[Properties] AND "srcdb refseq"[Properties]) AND alive[prop])',
                       retmax=gene_search_retmax)
try:
    record = Entrez.read(handle)
finally:
    # Release the HTTP connection even if parsing the response fails.
    handle.close()
gene_id_list = record['IdList']

# ESearch reports the total hit count separately from the returned ID list;
# if they differ, the list was truncated and the gene set is incomplete.
if int(record['Count']) > len(gene_id_list):
    print(f"WARNING: gene search matched {record['Count']} records but only "
          f"{len(gene_id_list)} IDs were returned; raise retmax or page with retstart.")

In [2]:
import gzip

# Build an Entrez-ID -> gene-symbol lookup from the local gene_info.gz dump
# (downloaded from the NCBI database).
with gzip.open('gene_info.gz', 'rt') as f:
    entrez_to_symbol = {}

    for line in f:
        # Lines beginning with '#' are header lines; only data rows are used.
        if not line.startswith('#'):
            # Rows are tab-delimited: field 1 is the Entrez ID and field 2
            # is the gene symbol.
            columns = line.strip().split('\t')
            entrez_id, gene_symbol = columns[1], columns[2]
            entrez_to_symbol[entrez_id] = gene_symbol

In [3]:
# Convert Entrez IDs to gene symbols.
# IDs that are absent from the local gene_info.gz lookup are collected and
# reported instead of aborting the whole conversion with a bare KeyError.
missing_gene_ids = [gene_id for gene_id in gene_id_list if gene_id not in entrez_to_symbol]
gene_symbols = [entrez_to_symbol[gene_id] for gene_id in gene_id_list if gene_id in entrez_to_symbol]

if missing_gene_ids:
    print(f'WARNING: {len(missing_gene_ids)} Entrez IDs have no entry in gene_info.gz '
          f'(first few: {missing_gene_ids[:5]})')

# Number of symbols that will be searched for in the abstracts.
print(len(gene_symbols))

36406


In [11]:
# Spot check: is the symbol TP53 present in the converted symbol list?
"TP53" in gene_symbols

True

In [41]:
## 检索关键词
#search_query = '("fertility"[Title/Abstract] OR "infertility"[Title/Abstract] OR "sterility"[Title/Abstract] OR "subfertility"[Title/Abstract] OR "infecundity"[Title/Abstract]) AND "female"[Title/Abstract] AND 2020/01/01:2023/12/31[Date - Publication]'

# 根据16年文献方式检索2020年后的文献, 共1104个
#search_query = '("infertility, female"[MeSH Terms] AND ("ovarian neoplasms"[MeSH Terms] OR (("cancer s"[All Fields] OR "cancerated"[All Fields] OR "canceration"[All Fields] OR "cancerization"[All Fields] OR "cancerized"[All Fields] OR "cancerous"[All Fields] OR "neoplasms"[MeSH Terms] OR "neoplasms"[All Fields] OR "cancer"[All Fields] OR "cancers"[All Fields]) AND "risk"[MeSH Terms]) OR "fertility agents"[MeSH Terms] OR ("breast neoplasms"[MeSH Terms] OR ("breast"[All Fields] AND "neoplasms"[All Fields]) OR "breast neoplasms"[All Fields]) OR "breast neoplasms"[MeSH Terms] OR "breast neoplasms"[MeSH Terms] OR (("fertiles"[All Fields] OR "fertility"[MeSH Terms] OR "fertility"[All Fields] OR "fertile"[All Fields] OR "fertilities"[All Fields]) AND "therapeutics"[MeSH Terms]) OR (("fertiles"[All Fields] OR "fertility"[MeSH Terms] OR "fertility"[All Fields] OR "fertile"[All Fields] OR "fertilities"[All Fields]) AND "therapeutics"[MeSH Terms]) OR "polycystic ovary syndrome"[MeSH Terms] OR "non alcoholic fatty liver disease"[MeSH Terms] OR "endometriosis"[MeSH Terms] OR (("major"[All Fields] OR "majored"[All Fields] OR "majoring"[All Fields] OR "majorities"[All Fields] OR "majority"[All Fields] OR "majors"[All Fields]) AND "chronic disease"[MeSH Terms]) OR "thyroid diseases"[MeSH Terms] OR "thyroid diseases"[MeSH Terms] OR "thyroid gland"[MeSH Terms] OR "lipoproteins"[MeSH Terms] OR "insulin resistance"[MeSH Terms] OR "coronary disease"[MeSH Terms] OR "coronary disease"[MeSH Terms] OR "genes, brca1"[MeSH Terms] OR "genes, brca2"[MeSH Terms] OR "germ line mutation"[MeSH Terms] OR "anti mullerian hormone"[MeSH Terms] OR "endometrial neoplasms"[MeSH Terms] OR "endometrial neoplasms"[MeSH Terms] OR "fertility agents"[MeSH Terms] OR "mental disorders"[MeSH Terms] OR "anxiety"[MeSH Terms] OR ("depressive disorder"[MeSH Terms] OR "depression"[MeSH Terms]))) AND (2020:2023[pdat])'

# 根据16年文献方式检索2010-2019年的文献, 共4157个
#search_query = '("infertility, female"[MeSH Terms] AND ("ovarian neoplasms"[MeSH Terms] OR (("cancer s"[All Fields] OR "cancerated"[All Fields] OR "canceration"[All Fields] OR "cancerization"[All Fields] OR "cancerized"[All Fields] OR "cancerous"[All Fields] OR "neoplasms"[MeSH Terms] OR "neoplasms"[All Fields] OR "cancer"[All Fields] OR "cancers"[All Fields]) AND "risk"[MeSH Terms]) OR "fertility agents"[MeSH Terms] OR ("breast neoplasms"[MeSH Terms] OR ("breast"[All Fields] AND "neoplasms"[All Fields]) OR "breast neoplasms"[All Fields]) OR "breast neoplasms"[MeSH Terms] OR "breast neoplasms"[MeSH Terms] OR (("fertiles"[All Fields] OR "fertility"[MeSH Terms] OR "fertility"[All Fields] OR "fertile"[All Fields] OR "fertilities"[All Fields]) AND "therapeutics"[MeSH Terms]) OR (("fertiles"[All Fields] OR "fertility"[MeSH Terms] OR "fertility"[All Fields] OR "fertile"[All Fields] OR "fertilities"[All Fields]) AND "therapeutics"[MeSH Terms]) OR "polycystic ovary syndrome"[MeSH Terms] OR "non alcoholic fatty liver disease"[MeSH Terms] OR "endometriosis"[MeSH Terms] OR (("major"[All Fields] OR "majored"[All Fields] OR "majoring"[All Fields] OR "majorities"[All Fields] OR "majority"[All Fields] OR "majors"[All Fields]) AND "chronic disease"[MeSH Terms]) OR "thyroid diseases"[MeSH Terms] OR "thyroid diseases"[MeSH Terms] OR "thyroid gland"[MeSH Terms] OR "lipoproteins"[MeSH Terms] OR "insulin resistance"[MeSH Terms] OR "coronary disease"[MeSH Terms] OR "coronary disease"[MeSH Terms] OR "genes, brca1"[MeSH Terms] OR "genes, brca2"[MeSH Terms] OR "germ line mutation"[MeSH Terms] OR "anti mullerian hormone"[MeSH Terms] OR "endometrial neoplasms"[MeSH Terms] OR "endometrial neoplasms"[MeSH Terms] OR "fertility agents"[MeSH Terms] OR "mental disorders"[MeSH Terms] OR "anxiety"[MeSH Terms] OR ("depressive disorder"[MeSH Terms] OR "depression"[MeSH Terms]))) AND (2010:2019[pdat])'

# 使用infertility, female和mutation这两个MeSH Terms进行检索，共937个
#search_query = '"infertility, female"[MeSH Terms] AND "mutation"[MeSH Terms]'

# 使用infertility, female和（mutation或polymorphism, single nucleotide）这3个MeSH Terms进行检索，共1066个
#search_query = '"infertility, female"[MeSH Terms] AND ("mutation"[MeSH Terms] OR "polymorphism, single nucleotide"[MeSH Terms])'

# 严格使用mutation与女性不孕相关表型进行搜索, 共360个
#search_query = '"mutation"[MeSH Terms] AND ("reproductive endocrine disorders"[All Fields] OR "oocyte maturation arrest"[All Fields] OR "menopause, premature"[MeSH Terms] OR "Zona Pellucida"[MeSH Terms] OR "fertilization defects"[All Fields] OR "Polycystic Ovary Syndrome"[MeSH Terms] OR "endrometriosis"[All Fields]) NOT "review"[Publication Type]'

# 使用和女性不孕相关疾病或分子机制为关键词进行mutation搜索，共2740个
#search_query = '("mutation"[MeSH Terms] AND ("reproductive endocrine disorders"[All Fields] OR "oocyte maturation arrest"[All Fields] OR "menopause, premature"[MeSH Terms] OR "Zona Pellucida"[MeSH Terms] OR "fertilization defects"[All Fields] OR "Polycystic Ovary Syndrome"[MeSH Terms] OR "endrometriosis"[All Fields] OR "meiosis/genetics"[MeSH Terms])) NOT "review"[Publication Type]'

# 使用和女性不孕相关疾病或分子机制为关键词进行SNP搜索，共574个
#search_query = '("polymorphism, single nucleotide"[MeSH Terms] AND ("reproductive endocrine disorders"[All Fields] OR "oocyte maturation arrest"[All Fields] OR "menopause, premature"[MeSH Terms] OR "fertilization defects"[All Fields] OR "Polycystic Ovary Syndrome"[MeSH Terms] OR "endrometriosis"[All Fields] OR "meiosis/genetics"[MeSH Terms])) NOT "review"[Publication Type]'

# 使用mutation, meiosis, oocytes作为关键词，2020-2023共384篇
#search_query = '(("mutation"[MeSH Terms] AND ("meiosis/genetics"[MeSH Terms] OR "Oocytes"[MeSH Terms])) NOT "review"[Publication Type]) AND (2020:2023[pdat])'

# 检索和POI相关的变异，大部分都在Chen Zijiang 2023 Nat Med中报道了
#search_query = '("mutation"[MeSH Terms] AND "Primary Ovarian Insufficiency"[MeSH Terms]) NOT "review"[Publication Type]'

# Earlier query: "oocyte maturation arrest" combined with the mutation MeSH term
# (161 articles, one gene kept after screening). Commented out like the other
# superseded queries above; it used to be assigned and then immediately overwritten.
#search_query = '(oocyte maturation arrest) AND (mutation[MeSH Terms])'

# Active query: "failure of fertilization" + mutation, restricted by female[MeSH Terms].
# NOTE(review): the previous note on this query recorded 405 hits but named
# human[MeSH Terms] rather than female[MeSH Terms] - confirm which filter is intended.
search_query = '(failure of fertilization) AND (mutation[MeSH Terms]) AND (female[MeSH Terms])'

# Run the PubMed search; retmax caps how many PMIDs are returned.
pubmed_retmax = 10000
handle = Entrez.esearch(db='pubmed',
                        retmax=pubmed_retmax,
                        term=search_query)
try:
    # Parse the search response.
    search_results = Entrez.read(handle)
finally:
    # Release the HTTP connection even if parsing the response fails.
    handle.close()
article_id_list = search_results['IdList']

# The total hit count is reported separately from the returned ID list; a
# mismatch means the list was truncated by retmax.
if int(search_results['Count']) > len(article_id_list):
    print(f"WARNING: query matched {search_results['Count']} articles but only "
          f"{len(article_id_list)} PMIDs were returned; raise retmax or page with retstart.")

In [42]:
# Number of PMIDs returned by the PubMed search.
len(article_id_list)

405

In [43]:
# Timing note from an earlier run: ~3000 articles took about 3 hours on 16 cores.

import time
from http.client import IncompleteRead

# How many times one article is requested before it is given up on.
MAX_FETCH_ATTEMPTS = 3

# Build the gene-symbol pattern once, not once per article.
#  - re.escape: symbols are literal text, so regex metacharacters in a symbol
#    must not be interpreted as pattern syntax.
#  - longest first: alternation takes the first alternative that matches, so a
#    short symbol would otherwise pre-empt a longer hyphenated symbol that
#    starts with it.
#  - an empty symbol list would produce a pattern matching the empty string,
#    so no pattern is built in that case.
gene_pattern = None
if gene_symbols:
    ordered_symbols = sorted(set(gene_symbols), key=lambda symbol: (-len(symbol), symbol))
    gene_pattern = re.compile(
        r'\b(' + '|'.join(re.escape(symbol) for symbol in ordered_symbols) + r')\b'
    )


def _fetch_abstract(pmid, attempts=MAX_FETCH_ATTEMPTS):
    """Fetch the text abstract record for one PMID.

    Args:
        pmid: PubMed ID to request.
        attempts: Maximum number of requests made before giving up.

    Returns:
        The text returned by EFetch, or None if every attempt raised
        IncompleteRead or HTTPError.
    """
    for attempt in range(1, attempts + 1):
        try:
            handle = Entrez.efetch(db='pubmed', id=pmid, rettype='abstract', retmode='text')
            try:
                return handle.read()
            finally:
                # Always release the connection, including after a partial read.
                handle.close()
        except (IncompleteRead, HTTPError) as err:
            print(f'PMID {pmid}: fetch attempt {attempt}/{attempts} failed ({err!r})')
            if attempt < attempts:
                # Short, growing pause before retrying a transient failure.
                time.sleep(attempt)
    return None


# gene symbol -> list of PMIDs whose abstract text mentions it
result_dict = {}
# PMIDs that could not be downloaded, kept so that they are not lost silently
failed_article_ids = []

for pmid in article_id_list:
    abstract = _fetch_abstract(pmid)
    if abstract is None:
        failed_article_ids.append(pmid)
        continue

    # Find every known gene symbol that occurs in the abstract text.
    matched_genes = set(gene_pattern.findall(abstract)) if gene_pattern else set()

    # Record the article under each gene it mentions.
    for gene in matched_genes:
        result_dict.setdefault(gene, []).append(pmid)

    # Report the genes found in this article.
    if matched_genes:
        print(f'PMID {pmid} matched genes: {matched_genes}')

if failed_article_ids:
    print(f'WARNING: {len(failed_article_ids)} articles could not be fetched '
          f'and were skipped: {failed_article_ids}')

PMID 36529831 matched genes: {'PLCZ1'}
PMID 36471203 matched genes: {'BTG4', 'PABPN1L'}
PMID 36463079 matched genes: {'TUBB8'}
PMID 36422765 matched genes: {'CP'}
PMID 36379263 matched genes: {'MAP1S', 'HAUS1', 'PIWIL1', 'IL9R', 'TRIP13', 'XRN1', 'KIF1C', 'KIF4A', 'ADAM3A', 'ADAM15', 'SUPT5H', 'SPACA1', 'PLK4', 'NLRP7', 'TP53', 'DAB2IP', 'PLCZ1', 'MARK4'}
PMID 36088419 matched genes: {'PADI6'}
PMID 35946397 matched genes: {'ZBED3', 'KHDC3L', 'NLRP2', 'TLE6', 'NLRP5', 'OOEP', 'PADI6'}
PMID 35900055 matched genes: {'TRIP13', 'FBXO43', 'MEI1', 'TBPL2', 'WEE2', 'REC114', 'TUBB8', 'BTG4', 'ZP1', 'CDC20', 'NLRP2', 'MOS', 'TLE6', 'PANX1', 'PADI6'}
PMID 35354490 matched genes: {'TUBB8'}
PMID 35347416 matched genes: {'DDX60L', 'FGGY', 'MCM5'}
PMID 34647228 matched genes: {'BTG4', 'CNOT7'}
PMID 34476630 matched genes: {'WEE2'}
PMID 34264011 matched genes: {'TLE6'}
PMID 34237786 matched genes: {'NHS'}
PMID 34160777 matched genes: {'TUBB8'}
PMID 34053384 matched genes: {'MTHFR'}
PMID 33939064 matc

In [31]:
import pandas as pd
# Flatten the gene -> PMID-list mapping into (gene, pmids) pairs.
dict_items = result_dict.items()
dict_list = [*dict_items]

# One row per gene: the symbol and the list of PMIDs that mention it.
df = pd.DataFrame(data=dict_list, columns=["Gene", "PMID"])

In [32]:
# Save the gene/PMID table as an Excel workbook, without the index column.
df.to_excel('Keyword_failureofFertilization_mutation_genelist_all_5thSearch_0302.xlsx', index=False)

In [33]:
# Display the gene / PMID table built from result_dict.
df

Unnamed: 0,Gene,PMID
0,DNAH17,"[36589837, 34126833]"
1,WEE2,"[36589837, 35900055, 34476630, 33895934, 33148..."
2,TLE6,"[36589837, 35946397, 35900055, 34264011, 33895..."
3,ACTL7A,"[36589837, 36574082]"
4,ZP2,"[36589837, 33895934, 30810869, 29895852, 28886..."
5,TUBB8,"[36589837, 36463079, 35900055, 35354490, 34160..."
6,PLCZ1,"[36589837, 36574082, 36529831, 36379263, 34126..."
7,ACTL9,[36589837]
8,NLRP5,"[36589837, 35946397, 33895934, 33073652, 32712..."
9,BTG4,"[36471203, 35900055, 34647228, 33895934, 32502..."


In [34]:
import pandas as pd

# Genes to be filtered out of the search results.
redundant_list = pd.read_excel(io='redundantGenes.xlsx')

# Gene / PMID table exported from the literature search.
search_result = pd.read_excel(
    io='Keyword_failureofFertilization_mutation_genelist_all_5thSearch_0302.xlsx'
)

In [35]:
# Split the search results by whether the gene appears in the redundant list.
picked_gene = redundant_list.loc[:, "GeneName"].tolist()
in_picked = search_result["Gene"].isin(picked_gene)
search_result_matched = search_result.loc[in_picked]
search_result_unmatched = search_result.loc[~in_picked]

In [36]:
# Lift pandas' display limits so that frames are shown without truncation.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)


In [37]:
# Spot check: is TUBB8 among the genes that are not in the redundant list?
"TUBB8" in search_result_unmatched['Gene'].tolist()

False

In [40]:
# Save the genes that are not in the redundant list, without the index column.
search_result_unmatched.to_excel('unmatched_68genelist_5thSearch.xlsx', index=False)

In [39]:
# Number of genes remaining after removing those in the redundant list.
len(search_result_unmatched)

68