In [None]:
import os

# Folder the notebook reads its example input files from and writes its output files to.
# Every later cell builds file paths with os.path.join(DIR, ...); adjust it for your machine.
DIR = r'c://downloads'

# Parsing FASTA via SeqIO

In [None]:
from Bio import SeqIO

# The `with` block closes the file even if parsing or printing raises part-way through.
with open(os.path.join(DIR, '4 example.fasta'), 'r') as f:
    # SeqIO.parse returns an iterator of records
    # it is a "lazy" iterator - loads into memory only the current "record" (big file friendly)
    for record in SeqIO.parse(f, 'fasta'):
        print('ID: ' + record.id)
        print('Description: ' + record.description)
        print('Sequence:')
        print(record.seq)
        print('*' * 20)
        # keep the sequence of the most recent record around for the following cells
        last_seq = record.seq

In [None]:
# Not sure how to handle an unknown object? Type <object>. and press <TAB> to see what it can offer.
# NOTE(review): the line below is presumably left incomplete on purpose as a tab-completion
# prompt; running the cell as-is is a SyntaxError.
record.

In [None]:
# Inspect the sequence kept from the last record of the parsing loop
print(type(last_seq), len(last_seq)) # biopython has its own objects
print(str(last_seq)) # you can turn them into standard ones, if you prefer

In [None]:
# SeqIO.parse also accepts the file path itself (a string) instead of an open file handle
fasta_path = os.path.join(DIR, '4 example.fasta')
records = SeqIO.parse(fasta_path, 'fasta')

# Building the dictionary consumes the whole iterator, so (unlike the lazy generator)
# every record of the file ends up in memory at once (FYI)
records_dict = SeqIO.to_dict(records)

separator = '*' * 20
print(records_dict)
print(separator)
print(records_dict['MCHU'])

##### Exercise

In [None]:
# Use the SeqIO parser to print every id in the fasta file, together with its GC percentage


# Seq and SeqRecord objects

In [None]:
# We can also build our own sequence object directly from a plain string
from Bio.Seq import Seq

seq = Seq('ATGGAGTGTTAGCAT')

# show the sequence together with its length
print(seq, len(seq))

In [None]:
# Seq objects act similar to strings

print(seq[2:10:3])  # slicing with start:stop:step
print(seq * 3)  # repetition

seq2 = Seq('TTACCA')
print(seq + seq2)  # concatenation

print(seq.count('G'))  # counting occurrences
print('CAT' in seq)  # membership test
print(seq2 == 'TTACCA')  # comparison against a plain str

print(str(seq))  # conversion to a plain str

In [None]:
# Although there are some differences, lacking certain functions...
# NOTE(review): this call is presumably expected to fail; newer Biopython releases may
# provide Seq.join, in which case the cell no longer demonstrates a missing method - confirm.
seq.join(['AT', 'TA'])

In [None]:
# But also having some additional functions...
print(seq.transcribe())  # DNA -> RNA
print(seq.transcribe().back_transcribe())  # RNA -> back to DNA

In [None]:
# Translation into amino acids: once straight from the DNA, once from its transcript
print(seq.translate())
print(seq.transcribe().translate())

In [None]:
# Complement, reverse complement, and translation of the reverse complement
print(seq.complement())
print(seq.reverse_complement())
print(seq.reverse_complement().translate())
# NOTE(review): complementing a translated (protein) sequence has no biological meaning;
# presumably shown as a misuse example - confirm how the installed Biopython reacts.
print(seq.translate().reverse_complement())

In [None]:
# SeqRecord object contains/requires some additional info (compared to seq object)
from Bio.SeqRecord import SeqRecord

description1 = 'A random sequence'
description2 = 'Another random sequence'

# Each SeqRecord pairs a Seq with the id and description that end up in the FASTA header
records = [
    SeqRecord(Seq('ATGGAGTGTTAGCAT'), id = 'random-seq-1', description = description1),
    SeqRecord(Seq('ATGAATAGCCGTATC'), id = 'random-seq-2', description = description2),
]

# The `with` block flushes and closes the output file even if writing raises
with open(os.path.join(DIR, 'output.fasta'), 'w') as f:
    SeqIO.write(records, f, 'fasta')

##### Exercise

In [None]:
# open the output.fasta file with SeqIO parser
# and compare newly read descriptions to description 1 and 2
# what is going on here?

# Parsing GenBank files

In [None]:
# Download GenBank full format from NCBI RefSeq at:
# https://www.ncbi.nlm.nih.gov/nuccore/NM_007294.3?report=gbwithparts&log$=seqview
# The `with` block closes the file even if parsing or the unpacking below raises.
with open(os.path.join(DIR, 'sequence.gb'), 'r') as f:
    # using "," we specify that we want only the element from the generator, not the generator itself
    # we could use multiple variables if we have multiple elements
    # (the unpacking raises ValueError unless the number of records matches the number of variables)
    record, = SeqIO.parse(f, 'genbank')

In [None]:
# we still get the SeqRecord object, but with the additional info
print(type(record))
print('*' * 20)

# identifier and free-text description of the record
print(record.id)
print(record.description)
print('*' * 20)

# the sequence itself; only the first 100 positions are printed to keep the output short
print(type(record.seq))
print(record.seq[:100])

In [None]:
# we can also access all annotations...
print(type(record.annotations))
print(record.annotations.keys())  # names of the available annotation entries
print(record.annotations['taxonomy'])  # a single entry looked up by its key

In [None]:
# and all features...
# (only the first three are printed to keep the output short)
for feature in record.features[:3]:
    print(feature)

In [None]:
# as well as extract individual features
# collect the (start, end) coordinates of every feature whose type is 'exon'
exons = [
    (int(feat.location.start), int(feat.location.end))
    for feat in record.features
    if feat.type == 'exon'
]

print('%d exons:' % len(exons))
print(exons)

In [None]:
# Converting to FASTA
# SeqIO.write is given a list of records here, hence the one-element list around `record`;
# the `with` block flushes and closes the output file even if writing raises
with open(os.path.join(DIR, 'output2.fasta'), 'w') as f:
    SeqIO.write([record], f, 'fasta')

There are many other formats supported by __SeqIO.parse__ (http://biopython.org/wiki/SeqIO), including:
- FASTQ 
- UniProt XML
- PDB (sequence only; for structure use Bio.PDB)

##### Exercise

In [None]:
# go through the sequence.gb file once more
# compare the length of the "gene" feature.type to all of the "exon" feature.type(s) combined
# A reminder: We've talked about human genes being mostly "empty" DNA,
# with exons making up only a small percentage of the length...

# CodonTable & GC

In [None]:
# Print the whole codon table in its human-readable layout
from Bio.Data.CodonTable import standard_dna_table
print(standard_dna_table)

In [None]:
# we can also get it in a "more friendly" format
print(standard_dna_table.forward_table)
print('*' * 20)
print(standard_dna_table.forward_table['TTG'])  # look up a single codon
print('*' * 20)
# we have to explicitly ask for stop codons
print(standard_dna_table.stop_codons)

In [None]:
# There are a bunch of other functions as well, e.g. GC content from SeqUtils
# (as well as many other functions in SeqUtils)
# Newer Biopython releases replace GC() (a percentage) with gc_fraction() (a 0-1 fraction),
# so prefer gc_fraction and fall back to GC only where gc_fraction does not exist yet.
try:
    from Bio.SeqUtils import gc_fraction
except ImportError:
    from Bio.SeqUtils import GC
    print(GC('ACTGN'))
else:
    # ambiguous='ignore' counts only G/C/S and keeps the N in the sequence length,
    # which matches what GC() used to compute; * 100 turns the fraction into a percentage
    print(100 * gc_fraction('ACTGN', ambiguous='ignore'))

# StringIO

In [None]:
# Two FASTA records held in an ordinary Python string instead of a file
fasta_string = '''
>seq1
MAGTQEDVW
>seq2
MCPIYTRKRAVCSFR
'''

# Won't work, SeqIO thinks it's a file path...
# (a plain str argument is taken as the name of a file to open, not as the content to parse)
print(list(SeqIO.parse(fasta_string, 'fasta')))

In [None]:
from io import StringIO # behaves like a file, but lives in RAM rather than on the disk

f = StringIO(fasta_string)
print(f)  # the StringIO object itself
print(f.read())  # its text content, read just like a file

In [None]:
# Wrapping the string in StringIO hands SeqIO a file-like object instead of a str
print(list(SeqIO.parse(StringIO(fasta_string), 'fasta')))

In [None]:
# we can even write to it
f = StringIO()
f.write('text')
f.seek(0)  # rewind to the start so read() returns what was just written
print(f.read())

In [None]:
# we can combine StringIO with other modules we've already learned

import csv

# rows mixing ints, strings and booleans; csv converts every value to text on writing
numbers = [
    [1, 'One', 'N/A'],
    [2, 'Two', True],
    [3, 'Three', True],
    [4, 'Four', False],
    [5, 'Five', True],
]

# write the rows one at a time into an in-memory "file" instead of one on disk
f = StringIO()
csv_writer = csv.writer(f)
for row in numbers:
    csv_writer.writerow(row)

# rewind, then show everything that was written
f.seek(0)
print(f.read())