In [1]:
from Bio import SeqIO
from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceMatrix
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

In [2]:
# load the reference breeds and the mystery sample from their FASTA files,
# mapping each record id to its sequence as a plain string
database = dict(
    (record.id, str(record.seq))
    for record in SeqIO.parse("dog_breeds.fa", "fasta")
)
unknown = dict(
    (record.id, str(record.seq))
    for record in SeqIO.parse("unknown.fa", "fasta")
)

In [3]:
def hamming_distance(seq1, seq2):
    """
    Count the positions at which two sequences differ (the hamming distance),
    i.e. the number of point mutations between them.

    The sequences are walked in parallel with zip, so when their lengths
    differ only the leading positions of the longer one (up to the length of
    the shorter one) are compared.
    """
    mismatches = 0
    for base1, base2 in zip(seq1, seq2):
        if base1 != base2:
            mismatches += 1
    return mismatches

In [4]:
# take the mystery sequence from unknown dict
mystery = unknown['gb|KM061522.1|']
# hamming distance from every database sequence to the mystery sequence,
# keyed by (record id, sequence)
# NOTE(review): hamming_distance compares position by position, so this
# presumably expects the sequences to be aligned -- confirm
pairs = {
    (Id, seq): hamming_distance(seq, mystery)
    for Id, seq in database.items()
}

# find out the closest breed: min() scans the entries once and returns the
# first one with the smallest distance, so no full sort is needed
closest_breed = min(pairs.items(), key=lambda entry: entry[1])
# closest_breed is ((record id, sequence), distance)
(Id, seq), mut = closest_breed
# print 90 bases of the closest breed sequence
print(f"The closest breed (mutation={mut}) to unknown sequence in the database is:\n{seq[:90]}...\t{Id}")

The closest breed (mutation=24) to unknown sequence in the database is:
GTTAATGTAGCTTAATTAATAAAGCAAGGCACTGAAAATGCCAAGATGAGTCGCACGACTCCATAAACATAAAGGTTTGGTCCTAGCCTT...	gb|AY656744.1|


In [5]:
# collect record ids and sequences in matching order:
# database entries first, then the unknown ones
ids = [*database, *unknown]
sequences = [*database.values(), *unknown.values()]

# construct the lower-triangular distance matrix used to draw the
# phylogenetic tree with the Bio.Phylo module: row i holds the hamming
# distances to sequences 0..i-1 followed by a 0 for the diagonal
distM = []
for i, x in enumerate(sequences):
    row = [hamming_distance(x, y) for y in sequences[:i]]
    row.append(0)
    distM.append(row)

In [6]:
# wrap the ids and lower-triangular distances in a Bio.Phylo DistanceMatrix
dm = DistanceMatrix(names=ids, matrix=distM)
# build the tree with the UPGMA method and render it as ASCII art
constructor = DistanceTreeConstructor()
tree = constructor.upgma(dm)
Phylo.draw_ascii(tree)

             ______________________ gb|KM061522.1|
  __________|
 |          |   ___________________ gb|DQ480496.1|
 |          |__|
 |             |     ______________ gb|AY656744.1|
 |             |____|
 |                  |______________ gb|MH105046.1|
 |
 |                          _____________ gb|DQ480498.1|
 |      ___________________|
 |     |                   |_____________ gb|MW916030.1|
 |     |
 |     |       __________________________ gb|MW916075.1|
 |     |      |
_|     |      |                           __ gb|MW916065.1|
 |     |      |   _______________________|
 |     |      |  |                       |__ gb|MW916066.1|
 |     |      |  |
 |     |      |  |       ___________________ gb|KU290523.1|
 |     |      |  |      |
 |     |      |  |      |                               __ gb|KU290610.1|
 |     |      |  |      |                          ____|
 |     |      |  |      |                         |    | _ gb|KU290953.1|
 |     |      |  |      |                 