# Basic Sequence Processing

In [6]:
from Bio import Entrez, Seq, SeqIO


In [7]:
Entrez.email = 'mariadiazcsga@gmail.com'
handle = Entrez.efetch(db='nucleotide', id = ['NM_002299'], rettype='fasta') #Lactase. gene
seq = SeqIO.read(handle, 'fasta')

In [9]:
type(seq)

Bio.SeqRecord.SeqRecord

In [12]:
seq.description #check description from fasta Seq Record

'NM_002299.4 Homo sapiens lactase (LCT), mRNA'

In [15]:
seq.id

'NM_002299.4'

In [16]:
seq.seq

Seq('AACAGTTCCTAGAAAATGGAGCTGTCTTGGCATGTAGTCTTTATTGCCCTGCTA...GTC')

Now that we've confirmed we have the correct file we can perform basic manipulations. Calling the SeqRecord object provides poorly formatted data.

In [17]:
seq

SeqRecord(seq=Seq('AACAGTTCCTAGAAAATGGAGCTGTCTTGGCATGTAGTCTTTATTGCCCTGCTA...GTC'), id='NM_002299.4', name='NM_002299.4', description='NM_002299.4 Homo sapiens lactase (LCT), mRNA', dbxrefs=[])

We can adjust the format in which the fasta data is reported out to improve readability.

In [20]:
prec_hdl = open('example.fasta', 'w')
SeqIO.write([seq], prec_hdl, 'fasta')
prec_hdl.close()

In [22]:
recs = SeqIO.parse('example.fasta', 'fasta')
for rec in recs:
    print(type(rec))
    seq = rec.seq
    print(rec.description)
    print(seq[:10]) #first 10 bases of sequence


<class 'Bio.SeqRecord.SeqRecord'>
NM_002299.4 Homo sapiens lactase (LCT), mRNA
AACAGTTCCT


In [23]:
#extract sequence from record and convert to sequence object
seq = Seq.Seq(str(seq))
seq

Seq('AACAGTTCCTAGAAAATGGAGCTGTCTTGGCATGTAGTCTTTATTGCCCTGCTA...GTC')

In [24]:
type(seq)

Bio.Seq.Seq

In [25]:
RNA = seq.transcribe() #create RNA product
RNA

Seq('AACAGUUCCUAGAAAAUGGAGCUGUCUUGGCAUGUAGUCUUUAUUGCCCUGCUA...GUC')

In [26]:
print(RNA[-20:]) #print last 20 bases of RNA

AAUAAAAACAGCAGACUGUC


In [30]:
#create protein product of RNA
prot = RNA.translate()
prot

Seq('NSS*KMELSWHVVFIALLSFSCWGSDWESDRNFISTAGPLTNDLLHNLSGLLGD...QTV')