# In [3]:
from Bio import Entrez, SeqIO, SeqRecord, Seq
from Bio.Entrez.Parser import DictionaryElement
Entrez.email = "3303652975@qq.com"

def search_nucleotide_transcript_ref_seq(animal: str, gene: str) -> "DictionaryElement | None":
    """Find the most suitable NCBI nucleotide entry for a species/gene pair.

    Runs an ``esearch`` on the ``nucleotide`` database with the term
    ``<animal>[Organism] AND <gene>[Gene]``, fetches the ``esummary`` of the
    returned ids and picks one summary by inspecting its ``Title``.

    Titles are tried against the following rules, in order of preference;
    within a rule the first summary in result order wins:

    1. contains ``transcript`` and ends with ``mRNA``
    2. ends with ``mRNA``
    3. ends with ``complete cds``
    4. ends with ``partial cds``

    Args:
        animal: Organism name used in the ``[Organism]`` search field.
        gene: Gene symbol used in the ``[Gene]`` search field.

    Returns:
        The chosen summary element (its ``'Id'`` key can be passed to
        ``efetch``), or ``None`` when the search returns no ids or no title
        matches any rule.
    """
    search_term = '{}[Organism] AND {}[Gene]'.format(animal, gene)

    # No retmax is passed, so only the server's default first page of ids is
    # considered (NOTE(review): presumably 20 entries - confirm if more are needed).
    search_handle = Entrez.esearch(db="nucleotide", term=search_term)
    try:
        search_result = Entrez.read(search_handle)
    finally:
        search_handle.close()

    id_list = search_result['IdList']
    if not id_list:
        # Asking esummary for an empty id list is an error on the NCBI side;
        # "nothing found" is reported to the caller as None instead.
        return None

    summary_handle = Entrez.esummary(db="nucleotide", id=",".join(id_list))
    try:
        search_summary = Entrez.read(summary_handle)
    finally:
        summary_handle.close()

    titled_items = [(str(item['Title']), item) for item in search_summary]
    title_rules = (
        lambda title: "transcript" in title and title.endswith('mRNA'),
        lambda title: title.endswith('mRNA'),
        lambda title: title.endswith('complete cds'),
        lambda title: title.endswith('partial cds'),
    )
    # Stop at the first rule that matches anything; less preferred rules are
    # only evaluated when the better ones found nothing.
    for rule in title_rules:
        for title, item in titled_items:
            if rule(title):
                return item
    return None

def get_seq_record_cds(seq_record: "SeqRecord.SeqRecord | None") -> "Seq.Seq | None":
    """Extract the sequence of the first CDS feature of a sequence record.

    The sequence is obtained with the feature's own ``extract`` method rather
    than by slicing ``start:end``: slicing would include the gaps of a
    compound (``join(...)``) location and would not reverse-complement a
    feature on the minus strand.

    Args:
        seq_record: Record whose ``features`` are searched; ``None`` is
            accepted and yields ``None``.

    Returns:
        The extracted CDS sequence, or ``None`` when ``seq_record`` is
        ``None`` or carries no feature of type ``'CDS'``.
    """
    if seq_record is None:
        return None
    cds_feature = next(
        (feature for feature in seq_record.features if feature.type == 'CDS'),
        None,
    )
    if cds_feature is None:
        return None
    return cds_feature.extract(seq_record.seq)

# In [60]:
def get_nucleotide_transcript_cds_seq(animal, gene):
    """Return the CDS of a species/gene pair as a FASTA-formatted string.

    Looks up a nucleotide entry with ``search_nucleotide_transcript_ref_seq``,
    downloads it in GenBank format and extracts its CDS with
    ``get_seq_record_cds``.

    Args:
        animal: Organism name passed to the search.
        gene: Gene symbol passed to the search.

    Returns:
        A string of the form ``'>{animal}_{gene}\\n{cds}\\n'``, or ``None``
        when no entry is found or the fetched record has no CDS feature.
    """
    target_summary = search_nucleotide_transcript_ref_seq(animal, gene)
    if target_summary is None:
        # Nothing to fetch; report "not found" instead of failing on a
        # subscript of None.
        return None

    fetch_handle = Entrez.efetch(db="nucleotide", id=target_summary['Id'], rettype="gb", retmode="text")
    try:
        seq_record = SeqIO.read(fetch_handle, "genbank")
    finally:
        # Release the network handle even if parsing fails.
        fetch_handle.close()

    cds_seq = get_seq_record_cds(seq_record)
    if cds_seq is None:
        return None
    return '>{}_{}'.format(animal, gene) + '\n' + str(cds_seq) + '\n'

def print_all_cds_seq(animal_list: "list[str]", gene_list: "list[str]"):
    """Print the FASTA-formatted CDS for every animal/gene combination.

    Each successful lookup is printed followed by a blank line. Combinations
    for which the lookup raises an exception or yields no CDS are collected as
    ``'{animal}_{gene}'`` and printed as a list once all lookups are done.

    Args:
        animal_list: Organism names to query.
        gene_list: Gene symbols to query for each organism.
    """
    failure_list = []
    for animal in animal_list:
        for gene in gene_list:
            try:
                result = get_nucleotide_transcript_cds_seq(animal, gene)
            except Exception:
                # Best effort: one failing lookup must not abort the batch.
                # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
                result = None
            if result is None:
                failure_list.append('{}_{}'.format(animal, gene))
            else:
                print(result + '\n')

    print('failure_list: \n', failure_list)

# In [60]:
# Organisms and gene symbols to look up; every organism is combined with
# every gene.
animals = ['Homo_sapiens']
genes = ['Tmbim1']

# Print each CDS in FASTA form, followed by the list of failed combinations.
print_all_cds_seq(animal_list=animals, gene_list=genes)