In [1]:
import modelseedpy
from modelseedpy.core.msgenome import MSGenome, parse_fasta_str
from modelseedpy.core.rast_client import RastClient

In [19]:
from modelseedpy.core.msgenome import MSFeature
def parse_fasta_str(faa_str, split='|'):
    """Parse FASTA-formatted text into a list of MSFeature objects.

    Notebook-local version: defining it here shadows the ``parse_fasta_str``
    imported from ``modelseedpy.core.msgenome`` in the first cell.

    Args:
        faa_str: FASTA text. Records start on lines whose first character
            is ``>``; every other line is appended to the current record's
            sequence. Lines before the first header are ignored.
        split: Delimiter separating the record id from its description in
            the header. Only the first occurrence is used. A falsy value
            keeps the whole header as the id.

    Returns:
        A list of MSFeature objects, in input order, each built as
        ``MSFeature(seq_id, "", desc)`` with the sequence accumulated on
        its ``seq`` attribute. ``desc`` is ``""`` when the header carries
        no description.
    """
    features = []
    seq = None
    for line in faa_str.split('\n'):
        if line.startswith('>'):
            # Flush the previous record before starting a new one.
            # NOTE(review): this relies on MSFeature instances being truthy
            # (no custom __bool__/__len__ -- TODO confirm). If so, a
            # non-final record with an empty sequence is still kept, unlike
            # the final record below.
            if seq:
                features.append(seq)
            # rstrip drops a trailing '\r' (CRLF input) or padding so it
            # does not leak into the id or description.
            header = line[1:].rstrip()
            seq_id = header
            desc = ""
            if split:
                header_data = header.split(split, 1)
                seq_id = header_data[0]
                # A header without the delimiter has no description; the
                # original indexed header_data[1] unconditionally and
                # raised IndexError here.
                if len(header_data) > 1:
                    desc = header_data[1]

            seq = MSFeature(seq_id, "", desc)
        elif seq:
            # Strip surrounding whitespace so line endings and indentation
            # never become part of the sequence.
            seq.seq += line.strip()
    # The last record is only kept when it actually has sequence data.
    if seq and seq.seq:
        features.append(seq)
    return features

In [4]:
    # Smoke test for parse_fasta_str: two protein FASTA records whose headers
    # are split at the first space into an id and a description.
    # NOTE(review): the FASTA text is indented together with the cell, and the
    # parse_fasta_str defined in this notebook only treats lines that start
    # with '>' as headers -- so this relies on the cell being dedented before
    # execution. Confirm on a fresh kernel.
    faa_str = """
    >NP_414543.1 fused aspartate kinase/homoserine dehydrogenase 1
    MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNISDAERIFAELLTGLAA
    AQPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALICRGEKMSIAIMAGVLEARGHNVTVIDPVEKLLAVGHY
    LESTVDIAESTRRIAASRIPADHMVLMAGFTAGNEKGELVVLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQV
    PDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSV
    SGPGMKGMVGMAARVFAAMSRARISVVLITQSSSEYSISFCVPQSDCVRAERAMQEEFYLELKEGLLEPLAVTERLAIIS
    VVGDGMRTLRGISAKFFAALARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGAL
    LEQLKRQQSWLKNKHIDLRVCGVANSKALLTNVHGLNLENWQEELAQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAV
    ADQYADFLREGFHVVTPNKKANTSSMDYYHQLRYAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELMKFSGILSGSL
    SYIFGKLDEGMSFSEATTLAREMGYTEPDPRDDLSGMDVARKLLILARETGRELELADIEIEPVLPAEFNAEGDVAAFMA
    NLSQLDDLFAARVAKARDEGKVLRYVGNIDEDGVCRVKIAEVDGNDPLFKVKNGENALAFYSHYYQPLPLVLRGYGAGND
    VTAAGVFADLLRTLSWKLGV
    >NP_414544.1 homoserine kinase
    MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWERFCQELGKQI
    PVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAPCFLGGMQLMIEENDI
    ISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAGFIHACYSRQPELAAKLMKDVIAEPYRERLLP
    GFRQARQAVAEIGAVASGISGSGPTLFALCDKPETAQRVADWLGKNYLQNQEGFVHICRLDTAGARVLEN
    """
    # Split headers on the first space: id before it, description after it.
    features = parse_fasta_str(faa_str, ' ')
    # One feature is expected per '>' header in the literal above.
    assert len(features) == 2

In [9]:
# Sequence length of the first parsed record; the multi-line sequence should
# have been concatenated into a single string.
len(features[0].seq)

820

In [2]:
# Client object whose annotate_genome method is called on the loaded genome
# further down.
# NOTE(review): whether construction already contacts a remote service is not
# visible here -- confirm before running offline.
rast = RastClient()

In [18]:
# Load a protein FASTA file into an MSGenome. The relative path means the file
# must sit in the notebook's working directory.
# split=' ' presumably separates id from description at the first space, as in
# the local parse_fasta_str -- TODO confirm against MSGenome.from_fasta.
genome = MSGenome.from_fasta('GCF_000005845.2.faa', split=' ')

In [19]:
# Sanity check: how many features were read from the FASTA file.
print('Number of features:', len(genome.features))

Number of features: 3


In [20]:
# List every loaded feature: id, sequence length and header description.
for f in genome.features:
    print(f.id, len(f.seq), f.description)

NP_414543.1 820 fused aspartate kinase/homoserine dehydrogenase 1 [Escherichia coli str. K-12 substr. MG1655]
NP_414544.1 310 homoserine kinase [Escherichia coli str. K-12 substr. MG1655]
NP_414545.1 428 threonine synthase [Escherichia coli str. K-12 substr. MG1655]


In [14]:
# Annotate the loaded genome through the RAST client. Needs both `rast` and
# `genome` from earlier cells, so run those first on a fresh kernel.
# NOTE(review): presumably this also fills each feature's ontology_terms (a
# 'RAST' entry is inspected later in the notebook) -- confirm in RastClient.
rast.annotate_genome(genome)

[{'execution_time': 1622756127.36331,
  'tool_name': 'kmer_search',
  'hostname': 'pear',
  'parameters': ['-a',
   '-g',
   200,
   '-m',
   5,
   '-d',
   '/opt/patric-common/data/kmer_metadata_v2',
   '-u',
   'http://pear.mcs.anl.gov:6100/query'],
  'id': '9CCA6D20-C4B3-11EB-A893-36A8BEF382BD'},
 {'parameters': ['annotate_hypothetical_only=1',
   'dataset_name=Release70',
   'kmer_size=8'],
  'hostname': 'pear',
  'tool_name': 'KmerAnnotationByFigfam',
  'id': '9CE3769E-C4B3-11EB-A893-36A8BEF382BD',
  'execution_time': 1622756127.52738},
 {'execute_time': 1622756127.88296,
  'hostname': 'pear',
  'parameters': [],
  'tool_name': 'annotate_proteins_similarity',
  'id': '9D19B7EA-C4B3-11EB-9714-71B3BDF382BD'}]

### Equivalent call from the client itself

In [15]:
# Disabled alternative: load and annotate in a single client call.
# NOTE(review): the file name below differs from the one passed to
# MSGenome.from_fasta earlier in the notebook -- check which file is intended
# before re-enabling this.
#genome, res = rast.annotate_genome_from_fasta('GCF_000005845.2_ASM584v2_protein.faa', split=' ')
#res

In [34]:
# Look up a single feature by id.
# NOTE(review): 'YP_588478.1' is not one of the three features listed by the
# loop above, so this lookup needs a genome loaded from a larger FASTA file --
# verify on a fresh kernel.
feature = genome.features.get_by_id('YP_588478.1')

In [36]:
# Show the ontology terms currently attached to the selected feature.
feature.ontology_terms

{'RAST': 'DUF1435 domain-containing protein YjjZ [Escherichia coli str. K-12 substr. MG1655]'}

In [None]:
# NOTE(review): unexecuted scratch cell. The empty-string argument looks like
# a placeholder; check add_ontology_term's expected arguments before running,
# or remove the cell.
feature.add_ontology_term('')