In [79]:

from Bio.Seq import Seq
from Bio import pairwise2 as pw2
from Bio.pairwise2 import format_alignment
from Levenshtein import distance

In [80]:
dna_1 = Seq('GCATGCTGATCAGCTAGTCGACACGAGCTCAGCA')
dna_2 = Seq('GTACATCTAGCGCATGCATCAGCGATGCA')

In [81]:
# GLOBAL ALIGNMENT
alignments = pw2.align.globalxx(dna_1, dna_2)
for a in alignments:
    print(format_alignment(*a))

G--CATGCTGAT-CAGCTAGTCGACA-CGAGCTC-A-GCA
|  ||| || |  | || | | | || | ||  | | |||
GTACAT-CT-A-GC-GC-A-T-G-CATC-AG--CGATGCA
  Score=23

G--CATGCTGATCAGCTAGTCGACA-CGAGCTC-A-GCA
|  ||| || |.| || | | | || | ||  | | |||
GTACAT-CT-AGC-GC-A-T-G-CATC-AG--CGATGCA
  Score=23

G--CATGCT-GATCAGCTAGTCGACA-CGAGCTC-A-GCA
|  ||| || |  | || | | | || | ||  | | |||
GTACAT-CTAG--C-GC-A-T-G-CATC-AG--CGATGCA
  Score=23

G--CATGCTGAT-CAGCTAGTCGACA-CGAGCTC-A-GCA
|  ||| || |  | || | | | || | |||   | |||
GTACAT-CT-A-GC-GC-A-T-G-CATC-AGC--GATGCA
  Score=23

G--CATGCTGATCAGCTAGTCGACA-CGAGCTC-A-GCA
|  ||| || |.| || | | | || | |||   | |||
GTACAT-CT-AGC-GC-A-T-G-CATC-AGC--GATGCA
  Score=23

G--CATGCT-GATCAGCTAGTCGACA-CGAGCTC-A-GCA
|  ||| || |  | || | | | || | |||   | |||
GTACAT-CTAG--C-GC-A-T-G-CATC-AGC--GATGCA
  Score=23

G--CATGCTGAT-CAGCTAGTCGACA-CGAGCTCA-GCA
|  ||| || |  | || | | | || | ||| .| |||
GTACAT-CT-A-GC-GC-A-T-G-CATC-AGC-GATGCA
  Score=23

G--CATGCTGATCAGCTAGTCGACA-CGAGCTCA-GCA
|  ||| || |.| || | | | ||

In [82]:
# global alignment 
# match, mismatch, gap start, gap extend
alignments = pw2.align.globalms(dna_1, dna_2, 2, 1, -0.5, -0.3)
for a in alignments:
    print(format_alignment(*a))

GCATGCTGATC-AGCTAGTCGACACGAGCTCAGCA
|  |.|  ||| |||...| | ||..|||...|||
G--TAC--ATCTAGCGCAT-G-CATCAGCGATGCA
  Score=43.9

GCATGCTGATC-AGCTAGTCGACACGAGCTCAGCA
|.|  |  ||| |||...| | ||..|||...|||
GTA--C--ATCTAGCGCAT-G-CATCAGCGATGCA
  Score=43.9



In [83]:
# local alignment
alignments = pw2.align.localxx(dna_1, dna_2);
for a in alignments:
    print(format_alignment(*a))

G--CATGCTGAT-CAGCTAGTCGACA-CGAGCTC-A-GCA
|  ||| || |  | || | | | || | ||  | | |||
GTACAT-CT-A-GC-GC-A-T-G-CATC-AG--CGATGCA
  Score=23

G--CATGCTGATCAGCTAGTCGACA-CGAGCTC-A-GCA
|  ||| || |.| || | | | || | ||  | | |||
GTACAT-CT-AGC-GC-A-T-G-CATC-AG--CGATGCA
  Score=23

G--CATGCT-GATCAGCTAGTCGACA-CGAGCTC-A-GCA
|  ||| || |  | || | | | || | ||  | | |||
GTACAT-CTAG--C-GC-A-T-G-CATC-AG--CGATGCA
  Score=23

G--CATGCTGAT-CAGCTAGTCGACA-CGAGCTC-A-GCA
|  ||| || |  | || | | | || | |||   | |||
GTACAT-CT-A-GC-GC-A-T-G-CATC-AGC--GATGCA
  Score=23

G--CATGCTGATCAGCTAGTCGACA-CGAGCTC-A-GCA
|  ||| || |.| || | | | || | |||   | |||
GTACAT-CT-AGC-GC-A-T-G-CATC-AGC--GATGCA
  Score=23

G--CATGCT-GATCAGCTAGTCGACA-CGAGCTC-A-GCA
|  ||| || |  | || | | | || | |||   | |||
GTACAT-CTAG--C-GC-A-T-G-CATC-AGC--GATGCA
  Score=23

G--CATGCTGAT-CAGCTAGTCGACA-CGAGCTCA-GCA
|  ||| || |  | || | | | || | ||| .| |||
GTACAT-CT-A-GC-GC-A-T-G-CATC-AGC-GATGCA
  Score=23

G--CATGCTGATCAGCTAGTCGACA-CGAGCTCA-GCA
|  ||| || |.| || | | | ||

In [84]:
# hamming distance
def hamming_distance(dna_1, dna_2):
    result = 0
    for x, y in zip(dna_1, dna_2):
        if x != y: result += 1
    return result

h_distance = hamming_distance(dna_1, dna_2)
print(f'Hamming: {h_distance}')

Hamming: 21


In [85]:
# levenshtein distance
levenshtein_distance = distance(dna_1, dna_2)
print(f'Levenshtein: {levenshtein_distance}')

Levenshtein: 15


In [100]:
# plot
def dot_plot(dna_1, dna_2):
    print(' |', end='')
    for x in dna_1:
        print(f' {x}', end='')
    print('')
    print('-'*(3+len(dna_1)*2))
    for nuc in dna_2:
        print(f'{nuc}|', end='')
        for x in dna_1:
            if x == nuc: print('  ', end='')
            else: print(' X', end='')
        print("")

dot_plot(dna_1, dna_2)

 | G C A T G C T G A T C A G C T A G T C G A C A C G A G C T C A G C A
-----------------------------------------------------------------------
G|   X X X   X X   X X X X   X X X   X X   X X X X   X   X X X X   X X
T| X X X   X X   X X   X X X X   X X   X X X X X X X X X X   X X X X X
A| X X   X X X X X   X X   X X X   X X X X   X   X X   X X X X   X X  
C| X   X X X   X X X X   X X   X X X X   X X   X   X X X   X   X X   X
A| X X   X X X X X   X X   X X X   X X X X   X   X X   X X X X   X X  
T| X X X   X X   X X   X X X X   X X   X X X X X X X X X X   X X X X X
C| X   X X X   X X X X   X X   X X X X   X X   X   X X X   X   X X   X
T| X X X   X X   X X   X X X X   X X   X X X X X X X X X X   X X X X X
A| X X   X X X X X   X X   X X X   X X X X   X   X X   X X X X   X X  
G|   X X X   X X   X X X X   X X X   X X   X X X X   X   X X X X   X X
C| X   X X X   X X X X   X X   X X X X   X X   X   X X X   X   X X   X
G|   X X X   X X   X X X X   X X X   X X   X X X X   X   X X X X   X X
C| X 