In [1]:
def fasta2dict(inf):
    """Parse FASTA-formatted lines into a ``{header: sequence}`` dictionary.

    Author: Xu Shifen (徐诗芬)
    Date: 2021-01-09

    Args:
        inf: Iterable of text lines, e.g. an open file handle or the list
            returned by ``readlines()``.

    Returns:
        dict: Maps each full header line (leading ``>`` included, surrounding
        whitespace stripped) to its sequence with all line breaks removed.
        If the same header occurs more than once, the last record wins.

    Raises:
        ValueError: If sequence data appears before the first header line.
    """
    # Sequence pieces are collected per header and joined once at the end;
    # repeated string ``+=`` would re-copy long sequences on every line.
    chunks_by_header = {}
    # Chunk list of the record currently being read. Kept local so parsing
    # never depends on (or leaks into) notebook-level state.
    current = None
    for raw_line in inf:
        line = raw_line.strip()
        if not line:
            # Blank lines carry no sequence data.
            continue
        if line.startswith('>'):
            current = chunks_by_header[line] = []
        elif current is None:
            raise ValueError(
                "FASTA input contains sequence data before the first '>' header"
            )
        else:
            current.append(line)
    return {header: ''.join(chunks) for header, chunks in chunks_by_header.items()}

In [2]:
# Working directory that holds the input files and receives the outputs.
# Set the GENE_FAMILY_WD environment variable (e.g. "/your/working/directory/")
# to point the notebook elsewhere without editing it; when the variable is
# unset, the original hard-coded location is used.
import os

wd = os.environ.get(
    "GENE_FAMILY_WD",
    "/Users/sherlock/Documents/bioinformatics/基因家族/玉米基因家族分析/",
)
# Other cells build paths as `wd + filename`, so make sure the directory
# string ends with a path separator (a no-op when it already does).
wd = os.path.join(wd, "")

In [3]:
# Load the protein FASTA file of the target species as a list of raw lines
# (line endings are kept; they are stripped later during parsing).
filepath = wd + "B73_protein.faa"
with open(filepath, mode="r", encoding="utf-8") as f:
    prot_seq = list(f)

In [4]:
# 预览
# prot_seq

In [5]:
# Parse the raw FASTA lines into a {header line: sequence} dictionary
prot_dict = fasta2dict(prot_seq)

In [6]:
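# Inspect one record (uncomment to run; keys are the full header lines)
# prot_dict[">NP_001104839.1 squalene synthase 1 [Zea mays]"]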

In [7]:
# Turn the dictionary into a two-column DataFrame: the header lines end up in
# column "index" and the sequences in column 0.
import pandas as pd
# Building a Series from the dict and resetting its index gives that layout
# directly, instead of creating a one-row frame with one column per protein
# and transposing it.
prot_df = pd.Series(prot_dict).reset_index()
prot_df.head()

Unnamed: 0,index,0
0,">NP_001104837.2 ferredoxin-6, chloroplastic [Z...",MRVPGTAQHPSSIDPPNHLLRPALRLGTGVRRAPSRPGTGRAVTHP...
1,>NP_001104839.1 squalene synthase 1 [Zea mays],MGALSRPEEVLALVKLRVAAGQIKRQIPPEEHWAFAYSMLQKVSRS...
2,>NP_001104840.2 sucrose transporter 1 [Zea mays],MARGDGELELSVGVRGTGGAAAAAADHVAPISLGRLILAGMVAGGV...
3,>NP_001104843.2 MAP kinase 2 [Zea mays],MDGGGQPPDTEMSEAGAGGGGQPPQQPLPPVGGGVMLDNIQATLSH...
4,">NP_001104844.1 ferredoxin-2, chloroplastic [Z...",MAATALSMSILRAPPPCFSSPLRLRVAVAKPLAAPMRRQLLRAQAT...


In [8]:
# Label the two columns: the header line becomes "name", the residues "sequence"
prot_df.columns = ["name","sequence"]
prot_df.head()

Unnamed: 0,name,sequence
0,">NP_001104837.2 ferredoxin-6, chloroplastic [Z...",MRVPGTAQHPSSIDPPNHLLRPALRLGTGVRRAPSRPGTGRAVTHP...
1,>NP_001104839.1 squalene synthase 1 [Zea mays],MGALSRPEEVLALVKLRVAAGQIKRQIPPEEHWAFAYSMLQKVSRS...
2,>NP_001104840.2 sucrose transporter 1 [Zea mays],MARGDGELELSVGVRGTGGAAAAAADHVAPISLGRLILAGMVAGGV...
3,>NP_001104843.2 MAP kinase 2 [Zea mays],MDGGGQPPDTEMSEAGAGGGGQPPQQPLPPVGGGVMLDNIQATLSH...
4,">NP_001104844.1 ferredoxin-2, chloroplastic [Z...",MAATALSMSILRAPPPCFSSPLRLRVAVAKPLAAPMRRQLLRAQAT...


In [9]:
# Split each header line on single spaces, one token per column.
# Column 0 then holds the first token of the header (">" plus the accession).
name_df = prot_df["name"].str.split(" ",expand=True)
name_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,>NP_001104837.2,"ferredoxin-6,",chloroplastic,[Zea,mays],,,,,,,,,,,
1,>NP_001104839.1,squalene,synthase,1,[Zea,mays],,,,,,,,,,
2,>NP_001104840.2,sucrose,transporter,1,[Zea,mays],,,,,,,,,,
3,>NP_001104843.2,MAP,kinase,2,[Zea,mays],,,,,,,,,,
4,>NP_001104844.1,"ferredoxin-2,",chloroplastic,[Zea,mays],,,,,,,,,,,


In [10]:
# Remove the FASTA ">" marker from the accession token in column 0.
# The pattern is anchored so that only the leading marker is dropped (an
# unanchored replace would also delete any ">" inside the identifier), and
# regex=True is passed explicitly because the default of `regex` is not the
# same in every pandas version.
name_df[0] = name_df[0].str.replace(r"^>", "", regex=True)
name_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,NP_001104837.2,"ferredoxin-6,",chloroplastic,[Zea,mays],,,,,,,,,,,
1,NP_001104839.1,squalene,synthase,1,[Zea,mays],,,,,,,,,,
2,NP_001104840.2,sucrose,transporter,1,[Zea,mays],,,,,,,,,,
3,NP_001104843.2,MAP,kinase,2,[Zea,mays],,,,,,,,,,
4,NP_001104844.1,"ferredoxin-2,",chloroplastic,[Zea,mays],,,,,,,,,,,


In [11]:
# Put the accession column (name_df[0]) side by side with the full
# header/sequence table; rows are aligned on the shared index.
prot_shortname_df = pd.concat([name_df[0],prot_df],axis=1)
prot_shortname_df.head()

Unnamed: 0,0,name,sequence
0,NP_001104837.2,">NP_001104837.2 ferredoxin-6, chloroplastic [Z...",MRVPGTAQHPSSIDPPNHLLRPALRLGTGVRRAPSRPGTGRAVTHP...
1,NP_001104839.1,>NP_001104839.1 squalene synthase 1 [Zea mays],MGALSRPEEVLALVKLRVAAGQIKRQIPPEEHWAFAYSMLQKVSRS...
2,NP_001104840.2,>NP_001104840.2 sucrose transporter 1 [Zea mays],MARGDGELELSVGVRGTGGAAAAAADHVAPISLGRLILAGMVAGGV...
3,NP_001104843.2,>NP_001104843.2 MAP kinase 2 [Zea mays],MDGGGQPPDTEMSEAGAGGGGQPPQQPLPPVGGGVMLDNIQATLSH...
4,NP_001104844.1,">NP_001104844.1 ferredoxin-2, chloroplastic [Z...",MAATALSMSILRAPPPCFSSPLRLRVAVAKPLAAPMRRQLLRAQAT...


In [12]:
# Rename the accession column from 0 to "NAME". It stays a regular column
# (not the index) and is the key used for the merge with the candidate list.
prot_shortname_df = prot_shortname_df.rename(columns={0:"NAME"})
prot_shortname_df.head()

Unnamed: 0,NAME,name,sequence
0,NP_001104837.2,">NP_001104837.2 ferredoxin-6, chloroplastic [Z...",MRVPGTAQHPSSIDPPNHLLRPALRLGTGVRRAPSRPGTGRAVTHP...
1,NP_001104839.1,>NP_001104839.1 squalene synthase 1 [Zea mays],MGALSRPEEVLALVKLRVAAGQIKRQIPPEEHWAFAYSMLQKVSRS...
2,NP_001104840.2,>NP_001104840.2 sucrose transporter 1 [Zea mays],MARGDGELELSVGVRGTGGAAAAAADHVAPISLGRLILAGMVAGGV...
3,NP_001104843.2,>NP_001104843.2 MAP kinase 2 [Zea mays],MDGGGQPPDTEMSEAGAGGGGQPPQQPLPPVGGGVMLDNIQATLSH...
4,NP_001104844.1,">NP_001104844.1 ferredoxin-2, chloroplastic [Z...",MAATALSMSILRAPPPCFSSPLRLRVAVAKPLAAPMRRQLLRAQAT...


In [13]:
# Load the candidate gene identifiers obtained from the hmmsearch step.
# The file has no header row, so the single column is initially labelled 0.
filepath = wd + "candidate.txt"
candi = pd.read_csv(filepath,header=None)
candi.head()

Unnamed: 0,0
0,XP_008647807.1
1,NP_001354713.1
2,NP_001354955.1
3,XP_035819724.1
4,NP_001354299.1


In [14]:
# Name the identifier column "NAME" so it matches the key column of the
# protein table
candi.columns = ["NAME"]
candi.head()

Unnamed: 0,NAME
0,XP_008647807.1
1,NP_001354713.1
2,NP_001354955.1
3,XP_035819724.1
4,NP_001354299.1


In [15]:
# Look up the protein record of every candidate by joining on "NAME".
# pd.merge defaults to an inner join, so candidates without a matching
# protein record are dropped from the result.
candi_seq = pd.merge(candi,prot_shortname_df,on="NAME")
candi_seq.head()

Unnamed: 0,NAME,name,sequence
0,XP_008647807.1,>XP_008647807.1 uncharacterized protein LOC100...,MASSTGSLEHGGFTFTPPPFITSFTELLSGAAADMVGAAGADHQER...
1,NP_001354713.1,>NP_001354713.1 uncharacterized protein LOC100...,MASSTGSLEHGGFTFTPPPFITSFTELLSSAGDMLGAGADQERSSP...
2,NP_001354955.1,>NP_001354955.1 uncharacterized protein LOC100...,MASSTGSLEHGGFTFTPPPFITSFTELLSGAAADMVGAAGADHQER...
3,XP_035819724.1,>XP_035819724.1 WRKY transcription factor WRKY...,MTTSSSGSIEAPANSRPGSFSFASTSTSFTNMLGGSADAAGGASRY...
4,NP_001354299.1,>NP_001354299.1 uncharacterized protein LOC100...,MTTSSSGSIEAPANSRPGSFSFASTSTSFTNMLGGSADAAGGASRY...


In [16]:
# (rows, columns) of the matched table: one row per candidate/protein match
candi_seq.shape

(311, 3)

In [17]:
# Independent copy of the matched table, used by the FASTA export cells
df = candi_seq.copy()

In [18]:
# Write the candidate protein sequences to a FASTA file whose headers carry
# only the short accession (column "NAME").
filepath = wd + "candidate_short.faa"

# Open with "w" rather than "a": re-running this cell must regenerate the
# file, not append a second copy of every record to it.
with open(filepath, "w", encoding="utf-8") as f:
    for name, seq in zip(df["NAME"], df["sequence"]):
        # The ">" marker was stripped from NAME in an earlier cell, so it has
        # to be added back here for the line to be a valid FASTA header.
        f.write(">" + name + "\n")
        f.write(seq + "\n")

In [19]:
# Write the candidate protein sequences to a FASTA file that keeps the full
# original header line (column "name", which still starts with ">").
filepath = wd + "candidate_long.faa"

# Open with "w" rather than "a": re-running this cell must regenerate the
# file, not append a second copy of every record to it.
with open(filepath, "w", encoding="utf-8") as f:
    for name, seq in zip(df["name"], df["sequence"]):
        f.write(name + "\n")
        f.write(seq + "\n")