In [2]:
#I decided to do Biopython
#Biopython is a package that can be used for sequence and structure analysis, phylogenetics,
#and genomic data retrieval
pip install biopython

Note: you may need to restart the kernel to use updated packages.


In [2]:

from Bio.Seq import Seq

In [3]:
#In Biopython, sequences act as strings but have a few other properties
#This sequence is mitochondrial DNA from a king cobra
my_seq = Seq('ATAATCACTACAATCCTACTAAACATTATTAACCCCCTGCTCTACATTCTACCTATCCTAATCGCCGTTGCATTCCTAACCCTACTAGAACGAAAACTTTTAGGATATATACAACTTCGAAAAGGGCCAAACCTAGTAGGCCCCATAGGCCTCCTACAACCTATCGCTGACGGATTAAAACTAATCTCCAAAGAACCAACCAAACCCACCATATCATCCCCTATCCTATTTACCATCTCTCCAATCATAGCCCTCACCCTAGCACTAATCTCCTGAACTCCAATACCAATACCATCACCACTAATCAATATAAACTTAGGTCTCCTCTTCATTATAGCTATATCTGGGATATTCACCTACACCATTCTATGATCCGGATGATCATCCAACTCAAAATACCCCCTAATAGGAGCAATACGCGCTGTTGCACAAATCATCTCATATGAGGTTACCCTAGGATTAATCATCATTTCCATAGCCACACTAACAGGCGGATACTCCCTACTAACATTCACAGAAACACAAGAACGCCTATGGCTCCTCCTACCATCATGGCCCCTCGCTATAATATGATTTACTTCAACTCTGGCTGAAACCAACCGCTCCCCCTTCGATCTCACCGAAGGTGAATCAGAACTGGTTTCAGGCTTTAATGTAGAATCCTCAGCCGGCCCATTCGCACTCCTATTCTTAGCCGAATACACTAATATCCTACTGATAAACACACTATCAACTACAATATTCTTAAACCCAGGACCAACAAACCCACAACTATTAATCGTCGACCTGATAGCAAACACAATAATCCTAACCACCCTATTCCTATGAACTCGAGCTTCATACCCTCGATTCCGATATGACCAACTCATACACCTCCTGTGAAAACAATACCTACCACTAACCCTAGCCATGTGCCTACTCAACCTCTCAACCTCAACAACACTCATAGGAACTCCCCCACAAT')
my_seq

Seq('ATAATCACTACAATCCTACTAAACATTATTAACCCCCTGCTCTACATTCTACCT...AAT')

In [4]:
#specific elements at different indexes can be called exactly like a normal string
print(my_seq[0])
print(my_seq[5])
print(my_seq[-1])

A
C
T


In [5]:
#len() works the same as well
len(my_seq)

964

In [6]:
#.count is also a viable command, same as a normal string
print(my_seq.count('A'))
print(my_seq.count('G'))
print(my_seq.count('C'))
print(my_seq.count('T'))

311
103
311
239


In [7]:
#Sequences can be concatenated with simple addition and the .join method
seq_1 = 'ACGTCTGCTA'
seq_2 = 'GCTATCTGATCGAT'
seq_3 = seq_1 + seq_2
print(seq_3)

ACGTCTGCTAGCTATCTGATCGAT


In [8]:
#The complementary sequence can be found using .complement()
my_seq.complement()

Seq('TATTAGTGATGTTAGGATGATTTGTAATAATTGGGGGACGAGATGTAAGATGGA...TTA')

In [9]:
#.reverse_complement finds the reverse complementary sequence of a given sequence
my_seq.reverse_complement()

Seq('ATTGTGGGGGAGTTCCTATGAGTGTTGTTGAGGTTGAGAGGTTGAGTAGGCACA...TAT')

In [10]:
#When using a sequence in the fasta format, the following commands will show the info about the sequence
from Bio import SeqIO

for seq_record in SeqIO.parse('O. Hannah seq.fasta.txt', "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record.seq))

NC_011394.1:2542-3505
Seq('ATAATCACTACAATCCTACTAAACATTATTAACCCCCTGCTCTACATTCTACCT...AAT')
964


In [11]:
#Biopython can translate from DNA to mRNA and back using .translate
#my sequence length isn't divisable by three so I get a codon error
my_seq_rna = my_seq.translate()
print('RNA:', my_seq_rna)

RNA: IITTILLNIINPLLYILPILIAVAFLTLLERKLLGYIQLRKGPNLVGPIGLLQPIADGLKLISKEPTKPTISSPILFTISPIIALTLALIS*TPIPIPSPLININLGLLFIIAISGIFTYTIL*SG*SSNSKYPLIGAIRAVAQIISYEVTLGLIIISIATLTGGYSLLTFTETQERLWLLLPSWPLAII*FTSTLAETNRSPFDLTEGESELVSGFNVESSAGPFALLFLAEYTNILLINTLSTTIFLNPGPTNPQLLIVDLIANTIILTTLFL*TRASYPRFRYDQLIHLL*KQYLPLTLAMCLLNLSTSTTLIGTPPQ




In [3]:
#Most importantly, Biopython can be used to call data from the NCBI database online
from Bio import Entrez

#Have to provide email
Entrez.email = "kolbybray@gmail.com"

# Search for a specific gene, I used the NADH dehydrogenase subunit 2 gene in King cobra
gene_name = "ND2"
search_term = f"{gene_name}[NADH dehydrogenase subunit 2] AND Ophiophagus hannah[Organism]"

#Search the nucleotide database
handle = Entrez.esearch(db="nucleotide", term=search_term, retmax=5)
record = Entrez.read(handle)

#Gets a list of matching IDs
matching_ids = record["IdList"]

#Gets information for each match
for gene_id in matching_ids:
    handle = Entrez.efetch(db="nucleotide", id=gene_id, rettype="gb", retmode="text")
    gene_record = handle.read()
    print(gene_record)

# Close the handle
handle.close()

LOCUS       NC_011394              17267 bp    DNA     circular VRT 03-APR-2023
DEFINITION  Ophiophagus hannah mitochondrion, complete genome.
ACCESSION   NC_011394
VERSION     NC_011394.1
DBLINK      BioProject: PRJNA927338
KEYWORDS    RefSeq.
SOURCE      mitochondrion Ophiophagus hannah (king cobra)
  ORGANISM  Ophiophagus hannah
            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
            Lepidosauria; Squamata; Bifurcata; Unidentata; Episquamata;
            Toxicofera; Serpentes; Colubroidea; Elapidae; Elapinae;
            Ophiophagus.
REFERENCE   1  (bases 1 to 17267)
  AUTHORS   Chen,N. and Lai,X.P.
  TITLE     [Sequencing and analysis of the complete mitochondrial genome of
            the King Cobra, Ophiophagus hannah (Serpents: Elapidae)]
  JOURNAL   Yi Chuan 32 (7), 719-725 (2010)
   PUBMED   20650853
REFERENCE   2  (bases 1 to 17267)
  CONSRTM   NCBI Genome Project
  TITLE     Direct Submission
  JOURNAL   Submitted (22-OCT-2008) National Cent

LOCUS       EU921899               17267 bp    DNA     circular VRT 30-JUN-2015
DEFINITION  Ophiophagus hannah mitochondrion, complete genome.
ACCESSION   EU921899
VERSION     EU921899.1
KEYWORDS    .
SOURCE      mitochondrion Ophiophagus hannah (king cobra)
  ORGANISM  Ophiophagus hannah
            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
            Lepidosauria; Squamata; Bifurcata; Unidentata; Episquamata;
            Toxicofera; Serpentes; Colubroidea; Elapidae; Elapinae;
            Ophiophagus.
REFERENCE   1  (bases 1 to 17267)
  AUTHORS   Chen,N. and Lai,X.P.
  TITLE     [Sequencing and analysis of the complete mitochondrial genome of
            the King Cobra, Ophiophagus hannah (Serpents: Elapidae)]
  JOURNAL   Yi Chuan 32 (7), 719-725 (2010)
   PUBMED   20650853
REFERENCE   2  (bases 1 to 17267)
  AUTHORS   Chen,N. and Fu,X.Y.
  TITLE     Direct Submission
  JOURNAL   Submitted (24-JUL-2008) Department of Pharmacology, General
            Hospital 

In [15]:
#Biopython can a do a whole lot of other stuff that would be too much to go into here,
#so I decided to focus on a particular subject that I enjoy: snake phylogenetics

from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio import AlignIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment


#5 of my favorite snake species
snake_species = [
    "Ophiophagus hannah",
    "Laticauda laticaudata",
    "Naja naja",
    "Micrurus fulvius",
    "Agkistrodon piscivorus",
]

# I used the gene that code for the NADH dehydrogenase subunit 2, this gene is not evolutionaryily significant
# so my tree will probably be all sorts of messed up, also I'm only using 5 snakes, so the tree's gonna be pretty whack
snake_sequences = [
    ("Ophiophagus hannah", "ATTAACCTAACATCTTGACTAATAATTACAACAAGCATCGCCCTAAGCACTATCTTAACCACCACAGCACACACTGACTCATAGTATGAACCTGCTTAGAAATCAACACCCTATCTATAATCCCAATCATCTCGAAACCACACCACCCACGAGCAACAGAAGCTGCTACCAAATATTACCTTACACAAACCATAGCCTCCACAACACTACTGTTCGCAGCAACAACAAATGCCCTAAATACCTCAAACTGAGAAACAAATATTACAACAGATCCAGCAACAATAACAATCATTACCATGGCCCTAATAATAAAAATAGCAGCCGCACCATTCCACTTCTGACTGCCAGAAGTAGCACAAGGCACTACTACCATAACAACCCTAACCATTTTGACATGACAAAAAATTGCACCCCTAGCAATTATATTAACCATCCATAAAAACATAAACCAGACATTTTTATTATCATCTGCAATCCTGTCTGTTATAGTCGGCGGACTAGGCAGCCTCAACCAGACTCAGCTACGAAAACTATTAGCCTTTTCATCCATCGCCCACACAGGCTGAATTCTCGCCACAATAACCACCGCACCAGAAATCTCTGCCCTCACATTCATAATTTATGCCATAACTACAACCCCAATCTTCCTATCTATTAACCACACACTAACAACAACAATCAAAGACTTAGGAACTATTTGAACCACAACACCACACCTAATAATAGTCCTAACTCTGACAACCCTATCTCTGGGAGGATTACCTCCACTCACAGGATTTATACCCAAATGACTAATCCTTAACAAAATAACCTCCATAAATATAGTTATTGAAGCCACCACAATAGCTGTATCTTCCATAATAAGCCTATACGTCTACCTACGACTAACCTACACACTATCCATAACATTACCACCCCACACAACCCCAATACTAATAAAATGACGAACCCCACACAAAAAACATCCAATAACAACATCTCTACTAACAATAATAACTACCCTGCTCCTCCCCATATCACCAAGCATGT"),
    ("Laticauda laticaudata", "ATTAACATCACAACCTGACTAACAATTTCAACAAGCATTATTATTAGTACAATTTTAGTAACCATAACAACCCACTGACTCATAGTATGAGCATGCCTAGAAATCAACACCCTATCTATAATCCCTATTATCTCCAAGCCACACCATCCCCGAGCAACAGAAGCCGCCACCAAATACTACTTAACACAAGCCATAGCCTCTTCTACCCTTTTATTTGCGGCAACAATAAATGCCATAAACACATCCAACTGAGAAACCAACACCACATCAGAACCAACAGCAACCACAATAATTACCCTAGCCCTAATAATAAAAATAGCATCAGCACCATTCCACTTCTGACTTCCAGAAGTGGCACAAGGCGCCACCACCATAACAACCCTAACTATTCTCACATGACAAAAAATTGCACCCCTAACAATCATACTAACCACCTATAACAAAACAAACCAAACCTTATTATTAATATCAGCAACCCTATCCATTATTGTAGGAGGCCTCGGTAGCCTAAACCAAACCCAATTACGGAAACTAATAGCTTTCTCATCCATCGCCCACACCGGATGAATCCTAGCTACAATTACTACCGCACCCAAAATCTCAATACTTACTTTCATGGTTTATACCATAGCCACAACCCCAATTTTCCTCTCTATTAACCACACACCAACAATTACCATTAAAGACATCGGAACAATATGAATAACCTCACCCTGTCTTATTATAATCATTACATTAACCACCCTCTCCCTAGGGGGACTACCTCCACTTACAGGGTTTATACCCAAATGACTAATCCTTAATAAAATAATCACCAAAAATATAGCTATTGAAGCCACTACTATAGCTGTATCCTCTATACTAAGCCTATTCGTATACATCCGACTAATATACATTCTATCTATAACCATAACACCCCATACGACCACAATAACAATAAAATGACGAACACCACACAAAAAACACCCTATAACAACCGCCCTACTTACTATATTAACAACTTTCCTACTTCCACTAACACCAGACATGT"),
    ("Naja naja", "ATCAACCTAATATCTTGACTAGTAATCTCAACAAGCATTATCACCAGCACGCTACTAGTCACTATAGCAACACACTGACTTATAGTCTGAGCATGCCTAGAAATTAATACCCTATCTATAATCCCAATTATCTCTAAACCCCACCACCCACGGGCAACAGAAGCCGCTACCAAATACTACCTTACACAAACTATAGCCTCTACAACCCTTATATTCGCAACAACAACAAACGCCATAAACACATCAAACTGAGAAACGCACATTACAACAGACCCAACAACAACTACAATCATCACCCTAACATTAATAATAAAAATAGCTGCTGCACCATTCCACTTCTGACTACCAGAAGTCGCACAAGGCTCAACCACCATAACAACCCTAACCATTCTAACATGACAGAAAATTGCACCATTAGCAGTTATACTAACCACACACAACAAAATAAACCAAACACTGTTACTATTATCAGCAATACTATCTATTATCATTGGCGGACTAGGCAGCCTAAACCAAACCCAACTTCGAAAGCTAATAGCCTTCTCATCTATTGCCCACACAGGCTGAATTATAGCCACAATAACCATTGCGCCAAAAATCTCAATATTAACCTTTATAGTCTACACTATAACTACTACCCCTATATTCCTATCCATAAATCACACCACAATAACCACAATTAAAGATATGGGAACCGCTTGAACCACCTCACCACACCTAATAATAGTCGTAACACTGACAATACTTTCCCTAGGGGGCTTACCCCCACTCACAGGATTTATACCAAAATGACTAATTCTTAATAAAATAACCGCCCTTAACCTGACCACAGAAGCCACCCTCATAGCAATATCCTCCTTACCAGGCCTATATGTCTACATCCGACTTACTTACATCCTGTCCATAACAATACCCGCCCACACATCCACCACACAAATAAAATGACGATCACCACACAAGAAATTCCCACTATCCTCAATCACACTAGCAACCATAATAACACTACTTCTACCCCTCTCACCAAACCTCT"),
    ("Micrurus fulvius", "ATCAACCTAACATCCTGATCAGTAATCACTACTAGCATTATTATAAGCACACTATTAACTACTATTGCCACCCATTGGTTAATAGTATGAGTCTGCCTAGAAATCAACACCCTCTCCATAATCCCAGTAATCTCCAAACCCTATCACCCACGAGCAACAGAGGCTGCTACTAAATACTACCTAACACAAATTACAGCATCCACAACCCTACTCTTTGCAACTACAGTAAATGCCATAAACACATCCAACTGAGAAACCCACATCACAACAGATCCAATAACAACAACAATTATTACCCTAACCCTAATAATAAAAATAGCAGCTGCACCCTTCCACTTCTGATTACCAGAAGTAACACAAGGCACAACTACCCTAACAGCCCTAGCCATCCTTACATGACAAAAAATTGCACCCCTGACAGTTATACTAACCACCCATAACAAGATAAACCAAACACTCCTACTTATGTCAGCAATCTTGTCGGTCATTACAGGGGGGCTGGGTAGCCTAAACCAAACCCAACTCCGAAAACTAATAGCCTTCTCATCTATTGCCCACACAGGCTGAATCATCGCCACAATAACCACATCACCAAAAATTTCAGCCCTAACCTTTATAATCTATACCATAGCTACAGCCCCTATCTTCCTAGCTATCAACCACGCATCAACAACAACAATTAAAGATATAAGCACAATACGAACCACTTCACCCCACCTAATGTTAGTAATAGTATTAACCATCCTTTCCCTGGGGGGACTGCCACCCCTCACAGGATTTATACCAAAATGGTTAATCCTTAACAAACTCACCTCTACTAATATAATTATCGAAGCCACCACAATAGCCATAGCATCTATACTAAGCCTATTCATTTACTTAAAATTAACTTACATACTAGCCATAACACTTCCACCTCACACCACCCCCATACTAATAAAATGACGGACACCTCATAAAAAGTACCCCATACCAATAGCGATTTTAACAGTAATAACCGCCCTCCTACTCCCACTATCACCAAATATGT"),
    ("Agkistrodon piscivorus", "ATCAACCCAACATCCCTAGTAACCATCATGACCAGCATTATCCTAAGCACTGCTCTAATTACCACAACAACGCACTGACTAATAGCCTGAGTCTGCTTAGAAATTAATACCCTATCAATAGTACCAATTATCTCAAAACCACACCACCCCCGAGCAACAGAAGCAACAACAAAATACTTCCTAACACAGACTATCGCCTCCACAGCCATCCTATTCGCAGCAACAATAAATGCACTAAATACCTCAAACTGAGAAATTACCCTCACAACAGAAACCACAACCATAAAAATCATTACACTAGCTCTAATAATAAAAATAGCTGCGGCCCCATTCCACTTCTGATTACCAGAAGTAGTACAAGGAGCTACAACCCTAACAGCCCTAACAATCCTAACTTGACAGAAAATTGCGCCCCTCAGTATCCTTCTTACCAGCCACAACAACACCAACCTAACAATTCTTAGCTCGTCAGCAATCCTATCCGTACTAATTGGCGGAATTGGAGGATTAAACCAAACCCAACTACGAAAACTTATAGCCTTCTCATCCATCACACACACAGGATGAGTCCTCGCAACCATCACCCTAGCACCAAATATCTCCATCCTAACCTTCTTAATCTACACAATAACTACCACCCCAATCTTCATCACACTTAATACATCATCAGCAACAACCATTAAAGACCTAGGAATTATATGAACCATCTCCCCCCACCTAATACTTATTATATTAATAACCATCCTATCCCTAACCGGCCTGCCCCCCCTCACAGGGTTTATACCAAAATGACTAATTCTTAATAAAATAACCGCCCTTAACCTGACCACAGAAGCCACCCTCATAGCAATATCCTCCTTACCAGGCCTATATGTCTACATCCGACTTACTTACATCCTGTCCATAACAATACCCGCCCACACATCCACCACACAAATAAAATGACGATCACCACACAAGAAATTCCCACTATCCTCAATCACACTAGCAACCATAATAACACTACTTCTACCCCTCTCACCAAACCTCT"),
]

#sequences have to be the same length so this code here adds some padding
max_length = max(len(sequence) for _, sequence in snake_sequences)
padded_sequences = [(species, sequence.ljust(max_length, '-')) for species, sequence in snake_sequences]

sequences = [SeqRecord(Seq(sequence), id=species) for species, sequence in padded_sequences]

#Sequence alignment
alignment = MultipleSeqAlignment(sequences)

#This code calculates distance and constructs a tree
calculator = DistanceCalculator("identity")
constructor = DistanceTreeConstructor(calculator)

#Code here creates a distance matrix
distance_matrix = calculator.get_distance(alignment)

#create a tree from the distance matrix
tree = constructor.upgma(distance_matrix)

#Draw the tree, baby
Phylo.draw_ascii(tree)

                                         ______________ Agkistrodon piscivorus
                                    ____|
                                   |    |______________ Naja naja
  _________________________________|
 |                                 |  _________________ Micrurus fulvius
_|                                 |_|
 |                                   |_________________ Laticauda laticaudata
 |
 |___________________________________________________ Ophiophagus hannah



In [None]:
#Yeah, so that tree is not evolutionarily significant.
#But it is a phylogenetic tree and I'm happy with it.
#Anyway, Biopython can do a heck of a lot more than just draw trees,
#But I hope that this was a sufficient demonstration of being able to code from documentation.