In [1]:
from Bio import SeqIO

## Atributos Generales

In [42]:
for record in SeqIO.parse('./data/virus.gb', format='genbank'):
    atributos = record.__dict__
    print(f'Estos son todos los atributos del objeto record: {list(atributos.keys())}')
    print(f'Atributo ID ({type(record.id)}): {record.id}')
    print(f'Atributo secuencia ({type(record.seq)}): {str(record.seq)[0:30]}...')
    print(f'Atributo annotations ({type(record.annotations)}): {record.annotations}')
    print(f'Atributo features ({type(record.features)}): {record.features}')
    print(f'Longitud ({type(record)}): {len(record)}')

Estos son todos los atributos del objeto record: ['_seq', 'id', 'name', 'description', 'dbxrefs', 'annotations', '_per_letter_annotations', 'features']
Atributo ID (<class 'str'>): NC_020806.1
Atributo secuencia (<class 'Bio.Seq.Seq'>): ACGGAGAAAAACAAACCAATTCACGCATTA...
Atributo annotations (<class 'dict'>): {'molecule_type': 'RNA', 'topology': 'linear', 'data_file_division': 'VRL', 'date': '13-AUG-2018', 'accessions': ['NC_020806'], 'sequence_version': 1, 'keywords': ['RefSeq', 'G gene', 'glycoprotein', 'L gene', 'large polymerase protein', 'M gene', 'matrix protein', 'N gene', 'nucleocapsid protein', 'P gene', 'phosphoprotein'], 'source': 'Isfahan virus (ISFV)', 'organism': 'Isfahan virus', 'taxonomy': ['Viruses', 'Riboviria', 'Orthornavirae', 'Negarnaviricota', 'Haploviricotina', 'Monjiviricetes', 'Mononegavirales', 'Rhabdoviridae', 'Alpharhabdovirinae', 'Vesiculovirus'], 'references': [Reference(title='Complete genome sequences of Chandipura and Isfahan vesiculoviruses', ...), Refe

## Anotaciones (contiene ejercicio 4)

In [18]:
for record in SeqIO.parse('./data/virus.gb', format='genbank'):
    print(f"Date: {record.annotations['date']}")
    print(f"Organism: {record.annotations['organism']}")
    print(f"Taxonomy: {record.annotations['taxonomy']}")
    print(f"Molecule Type: {record.annotations['molecule_type']}")
    print(f"References: {record.annotations['references']}")

Date: 13-AUG-2018
Organism: Isfahan virus
Taxonomy: ['Viruses', 'Riboviria', 'Orthornavirae', 'Negarnaviricota', 'Haploviricotina', 'Monjiviricetes', 'Mononegavirales', 'Rhabdoviridae', 'Alpharhabdovirinae', 'Vesiculovirus']
Molecule Type: RNA
References: [Reference(title='Complete genome sequences of Chandipura and Isfahan vesiculoviruses', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]


In [21]:
for annotation, value in record.annotations.items():
    print(annotation, value)

molecule_type RNA
topology linear
data_file_division VRL
date 13-AUG-2018
accessions ['NC_020806']
sequence_version 1
keywords ['RefSeq', 'G gene', 'glycoprotein', 'L gene', 'large polymerase protein', 'M gene', 'matrix protein', 'N gene', 'nucleocapsid protein', 'P gene', 'phosphoprotein']
source Isfahan virus (ISFV)
organism Isfahan virus
taxonomy ['Viruses', 'Riboviria', 'Orthornavirae', 'Negarnaviricota', 'Haploviricotina', 'Monjiviricetes', 'Mononegavirales', 'Rhabdoviridae', 'Alpharhabdovirinae', 'Vesiculovirus']
references [Reference(title='Complete genome sequences of Chandipura and Isfahan vesiculoviruses', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
comment PROVISIONAL REFSEQ: This record has not yet been subject to final
NCBI review. The reference sequence is identical to AJ810084.
COMPLETENESS: full length.


#### Ejercicio 4

In [113]:
def get_annotations(file : str, anotations_wanted : list = ['all']):
    anot = {}
    for record in SeqIO.parse(file, format='genbank'):
        for anotation, value in record.annotations.items():
            if anotations_wanted[0] == 'all':
                anot[anotation] = value
            elif anotation in anotations_wanted:
                anot[anotation] = value
    return anot

x = get_annotations(
    file= './data/virus.gb',
    anotations_wanted= ['date', 'organism', 'country']
)
x

{'date': '13-AUG-2018', 'organism': 'Isfahan virus'}

## Features (contiene ejercicio 5)

In [54]:
for record in SeqIO.parse('./data/virus.gb', format='genbank'):
    for feat in record.features:
        print(type(feat), feat, sep='\n')

<class 'Bio.SeqFeature.SeqFeature'>
type: source
location: [0:11088](+)
qualifiers:
    Key: country, Value: ['Iran:Isfahan province']
    Key: db_xref, Value: ['taxon:290008']
    Key: isolation_source, Value: ['Phlebotomus papatasi']
    Key: mol_type, Value: ['genomic RNA']
    Key: organism, Value: ['Isfahan virus']

<class 'Bio.SeqFeature.SeqFeature'>
type: gene
location: [65:1337](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:14857918']
    Key: gene, Value: ['N']
    Key: locus_tag, Value: ['J427_gp2']

<class 'Bio.SeqFeature.SeqFeature'>
type: CDS
location: [65:1337](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: db_xref, Value: ['GOA:Q5K2K7', 'InterPro:IPR000448', 'UniProtKB/Swiss-Prot:Q5K2K7', 'GeneID:14857918']
    Key: function, Value: ['involved in genome encapsidation']
    Key: gene, Value: ['N']
    Key: locus_tag, Value: ['J427_gp2']
    Key: product, Value: ['nucleocapsid protein']
    Key: protein_id, Value: ['YP_007641382.1']
    Key: translation, Val

#### Ejercicio 5

In [67]:
try:
    print(record.features[0].qualifiers['country'])
    print(record.features[0].qualifiers['isolate'])
except KeyError:
    raise KeyError('Atributo no encontrado')

['Iran:Isfahan province']


KeyError: 'Atributo no encontrado'

## Origin

In [68]:
record.seq

Seq('ACGGAGAAAAACAAACCAATTCACGCATTAGAAGATTCCAGAGGAAAGTGCTAA...CGT')

In [77]:
type = (record.features[1].type)
start = record.features[1].location.nofuzzy_start
end = record.features[1].location.nofuzzy_end

nueva_seq = record.seq[start:end]

print(f'Para el {type} en la posición 1 de los features, su seq es: {nueva_seq}\nCodifica para: {nueva_seq.translate()}')

Para el gene en la posición 1 de los features, su seq es: ATGACTTCTGTAGTAAAGAGGATTGCTACTGGCTCAAGTGTGTTGGCAGTACTACCCGCCAATGAAGACCCTGTAGAGTTTCCAGGGGACTATTTTTTGCAAAATCCGGGAAAAATAAGGGTGTGCATCAACAGAAAATTAGACGTTGCCACACTCCGCCAATACGTCTACGAGGGACTAAAAAATGGGGATGTCCATGTGTGTCACATCAATTCATACCTGTACCAGGTGCTCAAAGACACCAGAGATGAGGCCCAAAGTGATTGGATATCTTTCGGAGTGTCCCTTGCAGTCAAAGGTGGCATTGTCAGTGTATTTGACACCCTTATGATTGAGGATTACAGGGGAGAGGCTCCGGATGGGAGGAAATGCGATGGAAGAACCATCGACGATGACAAATGGCTGCCAATGTTAATCCTCGGCCTATACAGGGTGTCGAGAGCGACACAAGAAGACTACAAAAAGTCACTACTGCAGAAACTCTACGCTCAGTGCAAGCTGAGGAGTCCTCAAGCTGAAGAATTAGTTGAAGACGCAGCAGAATTTTATGAAGTTTGGTCCAATGATTCAAACTTCCTAAAATTGGTTGCAGCAATTGACATGTTCTTTCACAAATTCAAAAACCATGCAGATGCAGGCTTGAGATGGGGAACCATTGTGTCACGATTTAAAGATTGTGCCGCCTTGGCAACATTGTCTCATGTACAGAAAGTGACTGGCCTGTCAATCAAAGAAGTGTTCACCTGGGTTCTGAACAAATCAGTTGAAGATGAGTTGTGCAGAATGATGAAAGAAAGACAAGAAGTGGACAAAGCTGATTCCTACATGCCTTACCTGATTGATTTCGGGATCTCAACAAAATCCCCCTATTCATCAGTAAAGAACCCGTGTTTTCATTTCTGGGGACAACTGACAGCACTGCTGGTCCACTCTCACAGAGCT

#### Ejercicio 6

In [111]:
for feat in record.features:
    if feat.type == 'gene' and feat.qualifiers['gene'][0] == 'L':
        seq = record.seq[
            feat.location.nofuzzy_start 
            :
            feat.location.nofuzzy_end
        ]
        print(f'Para el gen L:\nSeq: {seq},\nTranscript: {seq.transcribe()},\nTranslation: {seq.translate()}')

Para el gen L:
Seq: ATGGATGAGTACTCTGAAGAAAAGTGGGGCGATTCTGATGAAGAATCTTTTGGCACAGGGAAATATTCTGACGAGTCTAGAATAAGAGGATTAAATTCTGTTGACTATAATCTAAACTCTCCCTTAATTCAAGATGATCTGTACTATCTAATGGAACGAGTGCGTGGAAGACCGGTACCTCCCATTTGGAAAGCAAAAAATTGGACCGAAACTATACATCTGGTTCAAGAAAGTAGATTAGATTATTTACCAACACAGAAGCTACACAGTTGGTATGCGGAATGGCTCATGGAGGAGAGTCATGACTCCTCTCAAGGACTAGCATTCTTGAAGGAAGTGGACAAAGACAGTTTGGAAACATACGAAGTTGTTATGTCATTCCTAAGGGGCTGGTGTGGTGGTGCTCCGGCGTATAAAAAGAAAGAAGGGCGACACATAGCAAAGATAGGATCATTATGCCAGAAATTCTTGGATCTCCACCGAGTCATACTTATAATGAATGCTTCTACCCAGATGGAGTTGTCAAATTTGGCAGAGACATTTCAGGCCTCTTCTGTGTCAAAGAAAATTATTACAACACCCTCAATGGGAAAGATGGAGATGAGTGGACAATTTGCACTTGCATACCAGCAAAAAGTCATACTTGATAGAAACTTCTTATTAATGATGAAAGATGTTGTGATTGGAAGGATGCAAACATTGTTGTCCATGGTTTCTCGAACAGATGACAAGTTCTCTGATGGGGACATTAGTTACTTAATCAAGATTTATCAATTGGGTGATAAAATCATTCAATCGCTAGGAAATGATGGATATGAGCTGATTAAAACAATAGAGCCCATGTGCAACTTGAGACTGTCTGATTTAGCCAGAGAATATCGGCCTCTCATACCGGAGTTCCCTCACTTTCGCCAGCATATCGAGGGAACCGTGTCAGAGCTCAGAAAAAAGACTGCGTTGATTGTAGACATGTTCAAGAT