In [3]:
from Bio import SeqIO
import os,sys
import pandas as pd
import phylopandas as ph

In [4]:
def _has_annotation(qualifiers, anno_list):
    """Return True if any qualifier value equals one of the labels in anno_list."""
    for val in qualifiers.values():
        # Qualifier values are normally lists of strings; wrap a bare string so
        # that `label in values` stays an exact match, not a substring test.
        values = [val] if isinstance(val, str) else val
        if any(label in values for label in anno_list):
            return True
    return False


def getgene(fasta_file, anno_list, genbank_file="BetaCoV.gb"):
    """Write the sequence of every matching CDS feature to a FASTA file.

    Each record of ``genbank_file`` is scanned; a CDS feature is kept when
    any of its qualifier values equals one of the labels in ``anno_list``.
    A kept feature is written exactly once, as a ``>record.id`` header line
    followed by the extracted sequence on a single line.

    Parameters
    ----------
    fasta_file : str
        Path of the FASTA file to create (an existing file is overwritten).
    anno_list : collection of str
        Accepted annotation labels.
    genbank_file : str, optional
        GenBank file to read; defaults to ``"BetaCoV.gb"``.
    """
    # `with` guarantees the output handle is closed even if parsing fails.
    with open(fasta_file, "w") as out:
        for rec in SeqIO.parse(genbank_file, "gb"):
            for feature in rec.features:
                # Test the feature type once, before looking at qualifiers.
                if feature.type != "CDS":
                    continue
                # One write per feature, even when several qualifiers match.
                if _has_annotation(feature.qualifiers, anno_list):
                    out.write(">" + rec.id + "\n")
                    out.write(str(feature.location.extract(rec).seq) + "\n")

In [12]:
def duplicate_remover(fasta_file):
    """Drop duplicate (id, sequence) records from a FASTA file.

    The unique records are written to a new FASTA file whose *file name* is
    the original name prefixed with ``rm``, in the same directory as
    ``fasta_file``. A one-line summary is printed.

    Parameters
    ----------
    fasta_file : str
        Path of the FASTA file to de-duplicate.
    """
    import tempfile  # local import keeps this function self-contained

    df = ph.read_fasta(fasta_file)
    df = df.filter(['id', 'sequence'], axis=1)
    df = df.drop_duplicates()

    # Prefix the file name only, so "Data/x.fasta" becomes "Data/rmx.fasta"
    # rather than the non-existent "rmData/x.fasta".
    folder, name = os.path.split(fasta_file)
    out_path = os.path.join(folder, "rm" + name)

    # A unique scratch file avoids clobbering an unrelated "temp.tab" and is
    # removed even when the conversion fails.
    fd, tmp_path = tempfile.mkstemp(suffix=".tab")
    os.close(fd)
    try:
        df.to_csv(tmp_path, sep="\t", index=False, header=False)
        SeqIO.write(SeqIO.parse(tmp_path, "tab"), out_path, "fasta")
    finally:
        os.remove(tmp_path)

    print("duplicate records removed!\n" + str(len(df.index)) + " unique records saved from " + fasta_file)


Reference (ViralZone): <https://viralzone.expasy.org/764?outline=all_by_species>

### ORF1ab

In [130]:
### get ORF1ab
# Read one annotation label per line; surrounding whitespace (including the
# trailing newline) is stripped. The `with` block closes the file, so no
# explicit close() is needed.
with open('orf1ab_v2.txt') as f1:
    ORF1ab = [x.strip() for x in f1.readlines()]

ORF1ab

['ORF1a/b polyprotein',
 'ORF1ab',
 'ORF1ab polyprotein',
 'ORF1ab protein',
 '1',
 '1ab',
 'putative orf1ab polyprotein',
 'putative polyprotein',
 'non-structural polyprotein 1a',
 'non-structural polyprotein 1ab',
 'orf1ab',
 'orf1ab polyprotein',
 'polyprotein 1ab',
 'polyprotein',
 'polyprotein ORF1ab',
 'polyprotein orf1a',
 'polyprotein orf1ab',
 'replicase 1AB',
 'replicase',
 'replicase p1AB',
 'replicase polyprotein 1ab',
 'replicase polyprotein',
 'replicase polyprotein ORF 1ab',
 'replicase protein',
 'ORF 1ab',
 'PP1ab',
 'Pp1ab',
 'rep']

In [61]:
# Annotation labels accepted for ORF1ab; passed to getgene as anno_list.
ORF1ab = {
    '1ab',
    'ORF 1ab',
    'ORF1',
    'ORF1a/b polyprotein',
    'ORF1ab',
    'ORF1ab polyprotein',
    'ORF1ab protein',
    'PP1ab',
    'Pp1ab',
    'non-structural polyprotein 1ab',
    'nonstructural polyprotein',
    'nonstructural polyprotein pp1ab',
    'orf1ab',
    'orf1ab polyprotein',
    'polymerase',
    'polyprotein',
    'polyprotein 1ab',
    'polyprotein ORF1ab',
    'polyprotein orf1ab',
    'putative orf1ab polyprotein',
    'putative polyprotein',
    'rep',
    'replicase',
    'replicase 1AB',
    'replicase p1AB',
    'replicase polyprotein',
    'replicase polyprotein 1ab',
    'replicase polyprotein ORF 1ab',
    'replicase protein',
}

In [62]:
# Extract every CDS carrying one of the ORF1ab labels into ORF1ab.fasta.
getgene(fasta_file="ORF1ab.fasta", anno_list=ORF1ab)

In [63]:
# Drop repeated (id, sequence) records from ORF1ab.fasta.
duplicate_remover(fasta_file="ORF1ab.fasta")

duplicate records removed!
5486 unique records saved from ORF1ab.fasta


In [101]:
## Duplicate ORF1a records still need to be removed:
## load the ORF1ab FASTA into a dataframe and keep, for each id, only the
## longest sequence.
df = (
    ph.read_fasta("Data/rmORF1ab.fasta")
    .assign(seq_length=lambda frame: frame['sequence'].str.len())
    .sort_values('seq_length')  # ascending, so the longest entry per id comes last
    .drop_duplicates(subset='id', keep="last")
    .filter(['id', 'sequence'], axis=1)
)
## Round-trip through a tab-separated scratch file so SeqIO can rewrite the FASTA.
df.to_csv("temp.tab", sep="\t", index=False, header=False)
df2 = SeqIO.parse("temp.tab", "tab")
SeqIO.write(df2, "Data/rmORF1ab.fasta", "fasta")
os.remove("temp.tab")

### S

In [67]:
# Annotation labels accepted for the S gene; passed to getgene as anno_list.
S = {
    'S',
    'S glycoprotein',
    'S glycoprotein S1B portion',
    'S protein',
    'Spike',
    'Spike protein',
    'putative E2 glycoprotein precursor',
    'putative spike glycoprotein',
    'spike',
    'spike glycoprotein',
    'spike glycoprotein S',
    'spike glycoprotein precursor',
    'spike protein',
    'spike protein S1 subunit',
    'spike protein subunit 1',
    'spike surface glycoprotein',
    'surface glycoprotein',
    'surface protein',
}


In [68]:
# Extract every CDS carrying one of the S labels into S.fasta.
getgene(fasta_file="S.fasta", anno_list=S)

In [69]:
# Drop repeated (id, sequence) records from S.fasta.
duplicate_remover(fasta_file="S.fasta")

duplicate records removed!
5291 unique records saved from S.fasta


### ORF3a 

In [71]:
# Annotation labels accepted for ORF3a; passed to getgene as anno_list.
ORF3a = {
    'ORF 3a',
    'ORF3a',
    'ORF3a protein',
    'hypothetical protein ORF3a',
    'orf3a',
    'orf3a protein',
}

In [72]:
# Extract every CDS carrying one of the ORF3a labels into ORF3a.fasta.
getgene(fasta_file="ORF3a.fasta", anno_list=ORF3a)

In [73]:
# Drop repeated (id, sequence) records from ORF3a.fasta.
duplicate_remover(fasta_file="ORF3a.fasta")

duplicate records removed!
1899 unique records saved from ORF3a.fasta


### E gene

In [160]:
# Annotation labels accepted for the E gene; passed to getgene as anno_list.
E = {
    'E',
    'E protein',
    'E small membrane protein',
    'envelope',
    'envelope protein',
    'envelope protein E',
    'envelope small membrane protein',
    'putative envelope protein E',
    'putative small envelope protein',
    'small envelope E protein',
    'small envelope protein',
}

In [161]:
# Extract every CDS carrying one of the E labels into E.fasta.
getgene(fasta_file="E.fasta", anno_list=E)

In [162]:
# Drop repeated (id, sequence) records from E.fasta.
duplicate_remover(fasta_file="E.fasta")

duplicate records removed!
3193 unique records saved from E.fasta


### M gene

In [164]:
# Annotation labels accepted for the M gene; passed to getgene as anno_list.
# NOTE(review): 'membrance glycoprotein' looks like a misspelling of
#   'membrane glycoprotein'; presumably kept to match records annotated that
#   way - confirm before "fixing" it.
# NOTE(review): 'sM' and 'small membrane protein' are names commonly used for
#   the coronavirus E protein - confirm they belong in the M label set.
M = {
    'M',
    'M protein',
    'Membrane protein',
    'matrix protein',
    'membrance glycoprotein',
    'membrane',
    'membrane glycoprotein',
    'membrane glycoprotein M',
    'membrane protein',
    'membrane protein M',
    'putative M protein',
    'putative envelope protein M',
    'sM',
    'small membrane protein',
}

In [165]:
# Extract every CDS carrying one of the M labels into M.fasta.
getgene(fasta_file="M.fasta", anno_list=M)

In [166]:
# Drop repeated (id, sequence) records from M.fasta.
duplicate_remover(fasta_file="M.fasta")

duplicate records removed!
3324 unique records saved from M.fasta


### ORF6a

In [75]:
# Annotation labels accepted for ORF6a; passed to getgene as anno_list.
ORF6a = {
    'NS6',
    'ORF6',
    'ORF6 protein',
}

In [77]:
# Extract every CDS carrying one of the ORF6a labels into ORF6a.fasta.
getgene(fasta_file="ORF6a.fasta", anno_list=ORF6a)

In [78]:
# Drop repeated (id, sequence) records from ORF6a.fasta.
duplicate_remover(fasta_file="ORF6a.fasta")

duplicate records removed!
1863 unique records saved from ORF6a.fasta


### ORF7a 

In [79]:
# Annotation labels accepted for ORF7a; passed to getgene as anno_list.
ORF7a = {
    'NS7a',
    'ORF7a',
    'ORF7a protein',
}

In [83]:
# Extract every CDS carrying one of the ORF7a labels into ORF7a.fasta.
getgene(fasta_file="ORF7a.fasta", anno_list=ORF7a)

In [84]:
# Drop repeated (id, sequence) records from ORF7a.fasta.
duplicate_remover(fasta_file="ORF7a.fasta")

duplicate records removed!
1895 unique records saved from ORF7a.fasta


### ORF7b

In [86]:
# Annotation labels accepted for ORF7b; passed to getgene as anno_list.
ORF7b = {
    'NS7b',
    'ORF7b',
    'ORF7b protein',
}

In [87]:
# Extract every CDS carrying one of the ORF7b labels into ORF7b.fasta.
getgene(fasta_file="ORF7b.fasta", anno_list=ORF7b)

In [88]:
# Drop repeated (id, sequence) records from ORF7b.fasta.
duplicate_remover(fasta_file="ORF7b.fasta")

duplicate records removed!
1726 unique records saved from ORF7b.fasta


### ORF8

In [90]:
# Annotation labels accepted for ORF8; passed to getgene as anno_list.
ORF8 = {
    'NS8',
    'ORF8',
    'ORF8 protein',
}

In [91]:
# Extract every CDS carrying one of the ORF8 labels into ORF8.fasta.
getgene(fasta_file="ORF8.fasta", anno_list=ORF8)

In [92]:
# Drop repeated (id, sequence) records from ORF8.fasta.
duplicate_remover(fasta_file="ORF8.fasta")

duplicate records removed!
1817 unique records saved from ORF8.fasta


### N gene

In [168]:
# Annotation labels accepted for the N gene; passed to getgene as anno_list.
N = {
    'N',
    'N protein',
    'nucleocapsid',
    'nucleocapsid phosphoprotein',
    'nucleocapsid protein',
    'nucleocapsid protein N',
    'nucleoprotein',
    'putative nucleocapsid protein',
    'putative nucleocapsid protein N',
}

In [169]:
# Extract every CDS carrying one of the N labels into N.fasta.
getgene(fasta_file="N.fasta", anno_list=N)

In [170]:
# Drop repeated (id, sequence) records from N.fasta.
duplicate_remover(fasta_file="N.fasta")

duplicate records removed!
3977 unique records saved from N.fasta


### ORF10

In [93]:
# Annotation labels accepted for ORF10; passed to getgene as anno_list.
ORF10 = {
    'ORF10',
    'ORF10 protein',
    'orf10',
    'p10',
    'p10 protein',
}

In [94]:
# Extract every CDS carrying one of the ORF10 labels into ORF10.fasta.
getgene(fasta_file="ORF10.fasta", anno_list=ORF10)

In [95]:
# Drop repeated (id, sequence) records from ORF10.fasta.
duplicate_remover(fasta_file="ORF10.fasta")

duplicate records removed!
1907 unique records saved from ORF10.fasta
