In [88]:
from Bio import SeqIO

def remove_specific_records(input_file, keywords=("SYN", "PAT", "ENV")):
    """Remove FASTA records whose header contains any of *keywords*, in place.

    The file at *input_file* is parsed as FASTA, every record whose
    description contains one of the keywords is dropped, and the remaining
    records are written back to the same path. Each surviving record is
    written as its original description line followed by the whole sequence
    on a single line.

    Args:
        input_file: Path of the FASTA file to filter; it is overwritten.
        keywords: Substrings that mark a record for removal. A single string
            is treated as one keyword. Matching is a case-sensitive substring
            test against the whole description, so a keyword that happens to
            appear inside another word also matches.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(keywords, str):
        keywords = (keywords,)
    else:
        keywords = tuple(keywords)

    # Collect the surviving records completely before reopening the file:
    # opening with "w" truncates it, so nothing may still be read lazily.
    kept_records = [
        record
        for record in SeqIO.parse(input_file, "fasta")
        if not any(keyword in record.description for keyword in keywords)
    ]

    with open(input_file, "w") as output_handle:
        for record in kept_records:
            # Reuse the original header text unchanged.
            output_handle.write(f">{record.description}\n")
            # Emit the sequence unwrapped, on exactly one line.
            output_handle.write(str(record.seq) + "\n")

# Example usage
input_file = "../data/5utr_ncbi.fasta"  # replace with your FASTA file name
remove_specific_records(input_file)

# The message names the keywords that are actually filtered ("PAT", not "PLT").
print("SYN, PAT, ENV sequences removed.")

SYN, PLT, ENV sequences removed.


In [90]:
def format_and_merge_files(input_files, output_file, species_dict=None):
    """Merge several FASTA files into one, rewriting every header.

    Each record is written to *output_file* as a header line of the form
    ``> <sequence length> <species label>`` followed by the sequence on a
    single line. Original record identifiers are not carried over.

    Args:
        input_files: Either a mapping of FASTA path -> species label, or a
            plain iterable of FASTA paths.
        output_file: Path of the merged FASTA file; it is overwritten.
        species_dict: Mapping of FASTA path -> species label. Only consulted
            when *input_files* is not itself a mapping; when *input_files* is
            a mapping its own values are used and this argument is ignored.

    Raises:
        ValueError: If *input_files* is not a mapping and no *species_dict*
            was supplied.
        KeyError: If a path in *input_files* has no entry in *species_dict*.
    """
    # Resolve every (path, label) pair before opening the output: opening with
    # "w" truncates the file, so invalid arguments must fail first.
    if hasattr(input_files, "items"):
        labelled_files = list(input_files.items())
    elif species_dict is None:
        raise ValueError(
            "species_dict is required when input_files is not a mapping"
        )
    else:
        labelled_files = [(path, species_dict[path]) for path in input_files]

    with open(output_file, "w") as output_handle:
        for input_file, species in labelled_files:
            # Records are streamed one at a time rather than loaded in bulk.
            for record in SeqIO.parse(input_file, "fasta"):
                seq_length = len(record.seq)
                output_handle.write(f"> {seq_length} {species}\n")
                output_handle.write(str(record.seq) + "\n")

# Input files and the species category assigned to each
input_files = {
    "../data/ensembl_5species/Fivespecies_chicken_energy_structure_31577sequence.fasta": "VRT",
    "../data/ensembl_5species/Fivespecies_human_energy_structure_77835sequence.fasta": "PRI",
    "../data/ensembl_5species/Fivespecies_mouse_energy_structure_48378sequence.fasta": "ROD",
    "../data/ensembl_5species/Fivespecies_rat_energy_structure_27740sequence.fasta": "ROD",
    "../data/ensembl_5species/Fivespecies_zebrafish_energy_structure_28819sequence.fasta": "VRT"
}

# Output file
output_file = "../5utr_ensembl.fasta"

# Run the reformat-and-merge step; the same mapping supplies both the file
# list and the species labels.
format_and_merge_files(input_files, output_file, input_files)

print("ensembl data merged.")

essembl data merged.
