In [1]:
import textdistance
from collections import Counter

### Create strings

In [2]:
# Two sample sentences (written without spaces) used as inputs by the cells below.
s1, s2 = 'SometimeImadeWrongDecisions', 'MosttimeImadeTerribleDecisions'

### Dice coefficient similarity based on unigrams

$\text{Dice}(s_1, s_2) = \frac{2 \times \left| S_1 \cap S_2 \right|}{\left| S_1 \right| + \left| S_2 \right|}$, where $S_1$ and $S_2$ are the sets of distinct characters (unigrams) of $s_1$ and $s_2$.

In [3]:
def dice_coefficient_unigrams(s1, s2):
    """Return the Dice coefficient of two strings over their distinct characters.

    Each string is reduced to the set of characters it contains, so repeated
    characters and character order do not influence the result.

    Args:
        s1: First string (any iterable of hashable items works).
        s2: Second string.

    Returns:
        float: ``2 * |S1 & S2| / (|S1| + |S2|)`` in the range [0.0, 1.0].
        Two empty inputs are treated as identical and yield 1.0.
    """
    chars1, chars2 = set(s1), set(s2)
    total = len(chars1) + len(chars2)

    # Both inputs empty: the formula would divide by zero, so report a
    # perfect match instead of raising ZeroDivisionError.
    if total == 0:
        return 1.0

    return (2. * len(chars1 & chars2)) / total

In [4]:
# Dice coefficient of the current s1 / s2 (displayed as the cell result).
dice_coefficient_unigrams(s1=s1, s2=s2)

0.7878787878787878

### Jaccard similarity based on bigrams

In [5]:
def Jaccard_similarity_bigrams(s1, s2, verbose=True):
    """Return the Jaccard similarity of two strings over their distinct bigrams.

    A bigram is every pair of adjacent characters ``s[i:i+2]``; duplicates are
    collapsed because the bigrams are stored in sets.

    Args:
        s1: First string.
        s2: Second string.
        verbose: When True (the default) print both bigram sets and the size
            of their intersection. Pass False to compute silently.

    Returns:
        float: ``|B1 & B2| / |B1 | B2|`` in the range [0.0, 1.0]. When neither
        string has a bigram (both shorter than two characters) the ratio is
        undefined, so the result falls back to exact equality: 1.0 if the
        strings are equal, otherwise 0.0.
    """
    bigrams1 = {s1[i:i+2] for i in range(len(s1) - 1)}
    bigrams2 = {s2[i:i+2] for i in range(len(s2) - 1)}

    if verbose:
        print(f"s1 is {bigrams1}, s2 is {bigrams2}")

    intersection = len(bigrams1 & bigrams2)

    if verbose:
        print(f"intersection is {intersection}")

    # |A | B| = |A| + |B| - |A & B|
    union = len(bigrams1) + len(bigrams2) - intersection

    # No bigrams on either side: avoid ZeroDivisionError.
    if union == 0:
        return 1.0 if s1 == s2 else 0.0

    return intersection / union

In [6]:
# Bigram Jaccard similarity of the current s1 / s2 (displayed as the cell result).
Jaccard_similarity_bigrams(s1=s1, s2=s2)

s1 is {'ma', 'ro', 'de', 'on', 'im', 'io', 'om', 'De', 'et', 'ad', 'eW', 'ns', 'ti', 'eI', 'gD', 'me', 'ec', 'Im', 'is', 'Wr', 'So', 'si', 'ng', 'ci'}, s2 is {'ma', 'er', 'de', 'on', 'Mo', 'im', 'io', 'rr', 'ib', 'st', 'De', 'ad', 'bl', 'os', 'ri', 'ns', 'ti', 'Te', 'eI', 'eD', 'le', 'me', 'ec', 'Im', 'is', 'eT', 'si', 'ci', 'tt'}
intersection is 16


0.43243243243243246

### Levenshtein edit distance

In [7]:
def levenshtein_distance(s1, s2, cost_sub=2, cost_ins=1, cost_del=1):
    """Weighted edit distance needed to turn ``s1`` into ``s2``.

    Args:
        s1: Source string.
        s2: Target string.
        cost_sub: Cost of replacing one character with a different one.
        cost_ins: Cost of inserting one character of ``s2``.
        cost_del: Cost of deleting one character of ``s1``.

    Returns:
        The minimal total cost. When one input is empty the cost is returned
        directly; otherwise the full cost table is printed before returning.
    """
    # Trivial cases: only insertions or only deletions are required.
    if not s1:
        return len(s2) * cost_ins
    if not s2:
        return len(s1) * cost_del

    rows, cols = len(s1) + 1, len(s2) + 1

    # table[r][c] = cost of turning s1[:r] into s2[:c].
    # Row 0 is "insert everything"; column 0 is "delete everything".
    table = [[col * cost_ins for col in range(cols)]]
    for row in range(1, rows):
        table.append([row * cost_del] + [0] * (cols - 1))

    for row, ch1 in enumerate(s1, start=1):
        above, current = table[row - 1], table[row]
        for col, ch2 in enumerate(s2, start=1):
            diagonal_step = 0 if ch1 == ch2 else cost_sub
            current[col] = min(
                above[col] + cost_del,            # drop ch1
                current[col - 1] + cost_ins,      # add ch2
                above[col - 1] + diagonal_step,   # keep or replace
            )

    print(table)

    return table[-1][-1]

In [8]:
# Weighted edit distance from s1 to s2 with the default costs (displayed as the cell result).
levenshtein_distance(s1=s1, s2=s2)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], [3, 4, 3, 4, 5, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], [4, 5, 4, 5, 6, 7, 8, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [5, 6, 5, 6, 5, 6, 7, 8, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], [6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [7, 8, 7, 8, 7, 8, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], [8, 9, 8, 9, 8, 9, 8, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19

17

In [9]:
def levenshtein_distance_with_operations(s1, s2, cost_sub=2, cost_ins=1, cost_del=1):
    """Weighted edit distance from ``s1`` to ``s2`` plus one optimal edit script.

    Args:
        s1: Source string.
        s2: Target string.
        cost_sub: Cost of replacing one character with a different one.
        cost_ins: Cost of inserting one character of ``s2``.
        cost_del: Cost of deleting one character of ``s1``.

    Returns:
        tuple: ``(distance, operations)``. ``operations`` is a list of tuples
        in the order they are applied: ``('Delete', ch)``, ``('Insert', ch)``
        or ``('Substitute', old, new)``. Matching characters are not recorded.

    Notes:
        When several candidates have the same cost the first one wins, in the
        order Delete, Insert, Substitute. With the default costs a
        substitution (2) never strictly beats a delete + insert pair (1 + 1),
        so the default script contains only Delete and Insert steps.
    """
    len_str1 = len(s1) + 1
    len_str2 = len(s2) + 1

    # dp[i][j]: cost of turning s1[:i] into s2[:j];
    # operations[i][j]: the edit script that achieves that cost.
    dp = [[0 for n in range(len_str2)] for m in range(len_str1)]
    operations = [[[] for n in range(len_str2)] for m in range(len_str1)]

    # First column: delete every character of s1. First row: insert every character of s2.
    for i in range(len_str1):
        dp[i][0] = i * cost_del
        if i > 0:
            operations[i][0] = operations[i-1][0] + [('Delete', s1[i-1])]
    for j in range(len_str2):
        dp[0][j] = j * cost_ins
        if j > 0:
            operations[0][j] = operations[0][j-1] + [('Insert', s2[j-1])]

    for i in range(1, len_str1):
        for j in range(1, len_str2):
            src, dst = s1[i-1], s2[j-1]

            # Decide on character equality, not on the cost value, so a
            # substitution is still recorded when cost_sub is 0.
            if src == dst:
                diag_cost, diag_step = 0, []
            else:
                # One tuple per step, same shape as Delete / Insert entries.
                diag_cost, diag_step = cost_sub, [('Substitute', src, dst)]

            candidates = (
                (dp[i-1][j] + cost_del, operations[i-1][j], [('Delete', src)]),
                (dp[i][j-1] + cost_ins, operations[i][j-1], [('Insert', dst)]),
                (dp[i-1][j-1] + diag_cost, operations[i-1][j-1], diag_step),
            )
            # min() returns the first minimal candidate, which fixes the tie order.
            best_cost, prefix, step = min(candidates, key=lambda c: c[0])
            dp[i][j] = best_cost
            # Build only the winning script instead of all three.
            operations[i][j] = prefix + step

    return dp[-1][-1], operations[-1][-1]

# Demo on two short digit strings.
# NOTE: this rebinds the notebook-level s1 / s2, so every cell executed after
# this one sees the digit strings rather than the sentences assigned earlier.
s1, s2 = '1236523', '1233462'

# Edit script that turns s1 into s2.
distance, ops = levenshtein_distance_with_operations(s1=s1, s2=s2)
print(f"Distance: {distance}")
for operation in ops:
    print(operation)

# Reverse direction: the edit script that turns s2 into s1.
distance, ops = levenshtein_distance_with_operations(s1=s2, s2=s1)
print(f"\nDistance: {distance}")
for operation in ops:
    print(operation)

Distance: 4
('Insert', '3')
('Insert', '4')
('Delete', '5')
('Delete', '3')

Distance: 4
('Delete', '3')
('Delete', '4')
('Insert', '5')
('Insert', '3')


### Jaro similarity

In [10]:
from math import floor, ceil

# Jaro similarity of two strings.
# NOTE: despite the name, the returned value is a similarity
# (1.0 = identical, 0.0 = no matching characters), not a distance.
def jaro_distance(s1, s2):
    """Compute the Jaro similarity between ``s1`` and ``s2``.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        float: 1.0 when the strings are equal, 0.0 when no characters match,
        otherwise ``(m/len(s1) + m/len(s2) + (m - t)/m) / 3`` where ``m`` is
        the number of matched characters and ``t`` is half the number of
        matched characters that appear in a different order.
    """
    # Equal strings (two empty strings included) are a perfect match.
    if s1 == s2:
        return 1.0

    len1, len2 = len(s1), len(s2)

    # Two equal characters only count as a match when their positions are
    # at most this far apart.
    max_dist = max(len1, len2) // 2 - 1

    # Flags marking which positions of each string have been matched.
    matched1 = [False] * len1
    matched2 = [False] * len2
    match = 0

    for i, ch in enumerate(s1):
        window_start = max(0, i - max_dist)
        window_end = min(len2, i + max_dist + 1)
        # Take the first still-unmatched equal character inside the window.
        for j in range(window_start, window_end):
            if not matched2[j] and s2[j] == ch:
                matched1[i] = True
                matched2[j] = True
                match += 1
                break

    # Nothing in common.
    if match == 0:
        return 0.0

    # Walk the matched characters of both strings in their own order and
    # count the positions where they disagree; two such positions make one
    # transposition.
    seq1 = (s1[i] for i in range(len1) if matched1[i])
    seq2 = (s2[j] for j in range(len2) if matched2[j])
    out_of_order = sum(1 for a, b in zip(seq1, seq2) if a != b)
    t = out_of_order // 2

    return (match / len1 + match / len2 +
            (match - t) / match) / 3.0

# s1 / s2 are the digit strings assigned in the preceding cell, not the sentences from the top.
jaro_score = jaro_distance(s1, s2)
print(round(jaro_score, 6))

0.809524
