In [27]:
from Bio.Seq import MutableSeq, Seq
import re

### Seq

In [30]:
# Example sequence; its 21 letters split cleanly into seven triplets.
b = Seq('TAGCAAATCATGGCTUAGGAT')

# Scan the sequence text for consecutive, non-overlapping groups of three
# characters and print each captured group on its own line.
triplet_pattern = re.compile(r'(.{3})')
for hit in triplet_pattern.finditer(str(b)):
    print(hit.group(1))

TAG
CAA
ATC
ATG
GCT
UAG
GAT


### SeqUtils

In [37]:
from Bio import SeqUtils

sequence = Seq("ATGCGCGACGGCGTGATCAGCTTATAGCCGTACGACTGCTGC")
pattern = Seq("ACG")

# Look for the motif as given and then for its reverse complement.
# Each printed list starts with the searched motif, followed by the
# positions where it was found in the sequence text.
haystack = str(sequence)
for motif in (pattern, pattern.reverse_complement()):
    print(SeqUtils.nt_search(haystack, motif))

# GC content of the sequence, printed as a percentage.
# NOTE(review): newer Biopython releases appear to replace `SeqUtils.GC`
# with `SeqUtils.gc_fraction` (a 0-1 fraction) — confirm against the
# installed version before upgrading.
print(SeqUtils.GC(sequence))

['ACG', 7, 31]
['CGT', 11, 28]
59.523809523809526




### Ejercicio

In [62]:
sequence = Seq('AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG')

# Translate the three reading frames of the sequence and then the three
# reading frames of its reverse complement. The reverse complement is
# computed once and reused for its three frames.
strands = (
    ('Original', sequence),
    ('\nReverse compliment', sequence.reverse_complement()),
)
for heading, strand in strands:
    print(heading)
    report = []
    for offset, ordinal in enumerate(('Primer', 'Segundo', 'Tercer')):
        frame = strand[offset:]
        # Keep only whole codons: translating a sequence whose length is not
        # a multiple of three makes Biopython emit a partial-codon warning.
        # The trailing 1-2 bases never contribute an amino acid, so the
        # printed proteins are unchanged.
        frame = frame[:len(frame) - len(frame) % 3]
        report.append(f'{ordinal} ORF: {frame.translate()}')
    print('\n'.join(report))


Original
Primer ORF: SHVANSGYMGMTPRLGLESLLE*A*MIRVASQ
Segundo ORF: AM*LTQVTWG*PRDLD*SLFWNKPE*SE*HL
Tercer ORF: PCS*LRLHGDDPATWIRVSFGISLNDPSSIS

Reverse compliment
Primer ORF: LRCYSDHSGLFQKRL*SKSRGHPHVT*VSYMA
Segundo ORF: *DATRIIQAYSKRDSNPSRGVIPM*PELATW
Tercer ORF: EMLLGSFRLIPKETLIQVAGSSPCNLS*LHG


### Automatizado considerando ORFs diferentes

In [75]:
# Preview both strands from index 4 onwards (on the forward strand this is
# where the first ATG begins); the resulting pair is the cell's display value.
tuple(strand[4:] for strand in (sequence, sequence.reverse_complement()))

(Seq('ATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTC...CAG'),
 Seq('GATGCTACTCGGATCATTCAGGCTTATTCCAAAAGAGACTCTAATCCAAGTCGC...GCT'))

In [74]:
def _translate_three_frames(strand):
    """Return the 'Primer/Segundo/Tercer ORF' report lines for one strand.

    Each of the three reading frames (offsets 0, 1 and 2) is trimmed to a
    whole number of codons and translated up to the first stop codon.
    Trimming avoids Biopython's partial-codon warning without changing the
    resulting protein.
    """
    lines = []
    for offset, ordinal in enumerate(('Primer', 'Segundo', 'Tercer')):
        frame = strand[offset:]
        frame = frame[:len(frame) - len(frame) % 3]
        lines.append(f'{ordinal} ORF: {frame.translate(to_stop=True)}')
    return '\n'.join(lines)


# nt_search returns [pattern, pos1, pos2, ...]; drop the leading pattern so
# only the ATG positions remain.
starts = SeqUtils.nt_search(str(sequence), 'ATG')[1:]
# ATG positions on the reverse complement. They are computed for reference
# but deliberately NOT zipped with `starts`: zipping silently stops at the
# shorter list and would skip forward start codons.
rev_starts = SeqUtils.nt_search(str(sequence.reverse_complement()), 'ATG')[1:]

# For every forward ATG, report the frames of the sub-sequence that begins
# at that ATG and the frames of that sub-sequence's reverse complement.
for i in starts:
    new_seq = sequence[i:]
    print(f'Para: {new_seq}')
    print(f'Original\n{_translate_three_frames(new_seq)}')
    print(f'Reverse compliment\n{_translate_three_frames(new_seq.reverse_complement())}\n\n')

Para: ATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG
Original
Primer ORF: M
Segundo ORF: CS
Tercer ORF: VANSGYMGMTPRLGLESLLE
Reverse compliment
Primer ORF: LRCYSDHSGLFQKRL
Segundo ORF: 
Tercer ORF: EMLLGSFRLIPKETLIQVAGSSPCNLS


Para: ATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG
Original
Primer ORF: MGMTPRLGLESLLE
Segundo ORF: WG
Tercer ORF: GDDPATWIRVSFGISLNDPSSIS
Reverse compliment
Primer ORF: LRCYSDHSGLFQKRL
Segundo ORF: 
Tercer ORF: EMLLGSFRLIPKETLIQVAGSSP


Para: ATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG
Original
Primer ORF: MTPRLGLESLLE
Segundo ORF: 
Tercer ORF: DPATWIRVSFGISLNDPSSIS
Reverse compliment
Primer ORF: LRCYSDHSGLFQKRL
Segundo ORF: 
Tercer ORF: EMLLGSFRLIPKETLIQVAGS


Para: ATGATCCGAGTAGCATCTCAG
Original
Primer ORF: MIRVASQ
Segundo ORF: 
Tercer ORF: DPSSIS
Reverse compliment
Primer ORF: LRCYSDH
Segundo ORF: 
Tercer ORF: EMLLGS






In [8]:
from Bio import SeqIO
def _describe(record):
    """Lazily yield the three summary lines printed for one FASTA record."""
    yield 'ID {}'.format(record.id)
    yield 'len {}'.format(len(record))
    # Translation stops at the first stop codon of the record's sequence.
    yield 'Traducción {}'.format(record.seq.translate(to_stop=True))


# Parse the FASTA file and print a short summary for every record in it.
secuencias = SeqIO.parse('./../data/seq.nt.fa', format= 'fasta')
for record in secuencias:
    for line in _describe(record):
        print(line)

# The loop above has consumed the iterator, so this only displays its repr.
secuencias

ID seq1
len 180
Traducción KSSSR
ID seq2
len 180
Traducción ATEPRTPT
ID seq3
len 98
Traducción MKVT
ID seq4
len 209
Traducción MLTKVSVRTCR




<Bio.SeqIO.FastaIO.FastaIterator at 0x1e3ba4d4a90>

### Tarea: manejo de secuencias

In [25]:
import numpy as np
from Bio import SeqIO

def poor_qc(file: str, threshold: int = 30) -> dict:
    """Collect the FASTQ reads whose mean Phred quality is below a cutoff.

    Args:
        file: FASTQ source handed to ``SeqIO.parse`` (the notebook passes a
            file path).
        threshold: A read is reported when the mean of its per-base Phred
            scores is strictly lower than this value.

    Returns:
        Dictionary mapping ``record.id`` to ``record.seq`` for every read
        that falls below the threshold.
    """
    poor_seqs = {}
    for record in SeqIO.parse(file, format='fastq'):
        phred = np.asarray(record.letter_annotations['phred_quality'])
        # A read without quality scores has no defined mean; skip it rather
        # than let numpy compute NaN (and warn) on an empty array.
        if phred.size and phred.mean() < threshold:
            poor_seqs[record.id] = record.seq
    return poor_seqs

# Reads of the sample FASTQ whose mean Phred score is below 30.
poor_seqs = poor_qc(
    file='./../tareas/ManejoSecuencias/data/sample.fastq',
    threshold=30,
)

In [26]:
# Number of reads that fell below the quality threshold.
len(poor_seqs)

5