In [1]:
import textdistance
from collections import Counter

### Create strings

In [2]:
# Sample strings compared by the Dice, Jaccard and bag similarity cells below.
s1 = 'SometimeImadeWrongDecisions'
s2 = 'MosttimeImadeTerribleDecisions'

### 1. Dice coefficient similarity based on unigrams

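$\text{Dice}(s_1, s_2) = \frac{2 \times \left| s_1 \cap s_2 \right|}{\left| s_1 \right| + \left| s_2 \right|}$, where $s_1$ and $s_2$ denote the sets of distinct characters (unigrams) of the two strings.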

In [3]:
def dice_coefficient_unigrams(s1, s2):
    """Return the Dice coefficient of two strings over their distinct characters.

    Each string is reduced to the set of characters it contains, so repeated
    characters and character order are ignored.

    Args:
        s1: First string (any iterable of hashable items works).
        s2: Second string.

    Returns:
        float: ``2 * |A & B| / (|A| + |B|)`` where ``A`` and ``B`` are the
        character sets. When both inputs are empty the sets are identical, so
        ``1.0`` is returned instead of dividing by zero.
    """
    set1 = set(s1)
    set2 = set(s2)
    total = len(set1) + len(set2)
    if total == 0:
        # Two empty inputs: the formula is 0/0; identical inputs score 1.0.
        return 1.0
    return (2.0 * len(set1 & set2)) / total

In [4]:
# Unigram Dice coefficient of the two sample strings.
dice_coefficient_unigrams(s1,s2)

0.7878787878787878

### 2. Jaccard similarity based on bigrams

In [5]:
def Jaccard_similarity_bigrams(s1,s2):
    """Return the Jaccard similarity of two strings over their character bigrams.

    Every pair of adjacent characters forms a bigram; each string is reduced
    to the set of distinct bigrams it contains. The two sets and the size of
    their intersection are printed as a trace.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        float: ``|A & B| / |A | B|`` where ``A`` and ``B`` are the bigram
        sets. If neither string has a bigram (both are shorter than two
        characters) the ratio is undefined, so the result falls back to exact
        equality: ``1.0`` when ``s1 == s2``, otherwise ``0.0``.
    """
    set1 = {s1[i:i+2] for i in range(len(s1)-1)}
    set2 = {s2[i:i+2] for i in range(len(s2)-1)}

    print(f"s1 is {set1}, s2 is {set2}")

    intersection = len(set1.intersection(set2))

    print(f"intersection is {intersection}")

    # |A | B| computed by inclusion-exclusion.
    union = len(set1) + len(set2) - intersection
    if union == 0:
        # No bigrams on either side: avoid 0/0 and compare the strings directly.
        return 1.0 if s1 == s2 else 0.0
    return intersection / union

In [6]:
# Bigram Jaccard similarity of the two sample strings; also prints the bigram sets.
Jaccard_similarity_bigrams(s1,s2)

s1 is {'ro', 'on', 'gD', 'ti', 'eI', 'is', 'ng', 'Im', 'im', 'ns', 'ad', 'et', 'om', 'So', 'ma', 'me', 'io', 'de', 'Wr', 'ci', 'si', 'De', 'eW', 'ec'}, s2 is {'on', 'os', 'ti', 'le', 'eI', 'rr', 'ri', 'bl', 'tt', 'is', 'Im', 'ib', 'im', 'eT', 'ns', 'ad', 'Mo', 'er', 'Te', 'ma', 'me', 'io', 'de', 'ci', 'eD', 'si', 'De', 'st', 'ec'}
intersection is 16


0.43243243243243246

### 3. Bag (multiset) Dice similarity

In [7]:
def bag_dice_similarity(s1, s2):
    """Return the Dice coefficient of two strings treated as bags of characters.

    Unlike the set-based variant, each character counts as many times as it
    occurs. The intersection size and both bags are printed as a trace.

    Args:
        s1: First string (any iterable of hashable items works).
        s2: Second string.

    Returns:
        float: ``2 * |A & B| / (|A| + |B|)`` where ``A`` and ``B`` are the
        character multisets and ``&`` keeps the minimum count per character.
        When both inputs are empty, ``1.0`` is returned instead of dividing
        by zero.
    """
    bag_s1 = Counter(s1)
    bag_s2 = Counter(s2)

    # Counter & Counter keeps min(count1, count2) for every shared key.
    intersection = sum((bag_s1 & bag_s2).values())

    print(f"intersection is {intersection}, bags are {bag_s1},{bag_s2}")

    total = sum(bag_s1.values()) + sum(bag_s2.values())
    if total == 0:
        # Two empty inputs: the formula is 0/0; identical inputs score 1.0.
        return 1.0
    return (2.0 * intersection) / total

In [8]:
# Bag-based Dice similarity of the two sample strings; also prints both character bags.
bag_dice_similarity(s1, s2)

intersection is 21, bags are Counter({'e': 4, 'o': 3, 'm': 3, 'i': 3, 'n': 2, 's': 2, 'S': 1, 't': 1, 'I': 1, 'a': 1, 'd': 1, 'W': 1, 'r': 1, 'g': 1, 'D': 1, 'c': 1}),Counter({'e': 5, 'i': 4, 's': 3, 'o': 2, 't': 2, 'm': 2, 'r': 2, 'M': 1, 'I': 1, 'a': 1, 'd': 1, 'T': 1, 'b': 1, 'l': 1, 'D': 1, 'c': 1, 'n': 1})


0.7368421052631579

### 4. Levenshtein edit distance

In [9]:
def levenshtein_distance(s1, s2, cost_sub=2, cost_ins=1, cost_del=1):
    """Compute the weighted edit distance that turns ``s1`` into ``s2``.

    Args:
        s1: Source sequence.
        s2: Target sequence.
        cost_sub: Cost of replacing one character by a different one.
        cost_ins: Cost of inserting one character of ``s2``.
        cost_del: Cost of deleting one character of ``s1``.

    Returns:
        The minimal total cost. When either input is empty the answer is
        returned directly (pure insertions or pure deletions) and nothing is
        printed; otherwise the full cost table is printed before returning.
    """
    if not s1:
        return len(s2) * cost_ins
    if not s2:
        return len(s1) * cost_del

    n_rows = len(s1) + 1
    n_cols = len(s2) + 1

    # table[r][c] = cost of converting the first r chars of s1 into the first
    # c chars of s2. Row 0 is insert-only; column 0 is delete-only.
    table = [[col * cost_ins for col in range(n_cols)]]
    for row in range(1, n_rows):
        table.append([row * cost_del] + [0] * (n_cols - 1))

    for row, ch1 in enumerate(s1, start=1):
        above = table[row - 1]
        current = table[row]
        for col, ch2 in enumerate(s2, start=1):
            step = 0 if ch1 == ch2 else cost_sub
            current[col] = min(
                above[col] + cost_del,        # drop ch1
                current[col - 1] + cost_ins,  # add ch2
                above[col - 1] + step,        # keep or replace
            )

    print(table)

    return table[-1][-1]

# NOTE: rebinds the notebook-level s1/s2 to short digit strings for the
# Levenshtein demo, replacing the sample strings assigned earlier.
s1 = '1236523'
s2 = '1233462'

In [10]:
# Edit distance between the current s1 and s2; the function also prints its DP matrix.
levenshtein_distance(s1, s2)

[[0, 1, 2, 3, 4, 5, 6, 7], [1, 0, 1, 2, 3, 4, 5, 6], [2, 1, 0, 1, 2, 3, 4, 5], [3, 2, 1, 0, 1, 2, 3, 4], [4, 3, 2, 1, 2, 3, 2, 3], [5, 4, 3, 2, 3, 4, 3, 4], [6, 5, 4, 3, 4, 5, 4, 3], [7, 6, 5, 4, 3, 4, 5, 4]]


4

In [11]:
def levenshtein_distance_with_operations(s1, s2, cost_sub=2, cost_ins=1, cost_del=1):
    """Compute the weighted edit distance from ``s1`` to ``s2`` and one optimal edit script.

    Args:
        s1: Source sequence.
        s2: Target sequence.
        cost_sub: Cost of replacing one character by a different one.
        cost_ins: Cost of inserting one character of ``s2``.
        cost_del: Cost of deleting one character of ``s1``.

    Returns:
        tuple: ``(distance, operations)``. ``operations`` is a list of tuples
        in application order: ``('Delete', ch)``, ``('Insert', ch)`` or
        ``('Substitute', old, new)``. Matching characters add no entry. When
        several scripts have the same cost, deletion is preferred over
        insertion, and insertion over substitution/match.
    """
    len_str1 = len(s1) + 1
    len_str2 = len(s2) + 1

    # dp[i][j]: minimal cost of turning s1[:i] into s2[:j].
    # operations[i][j]: the edit script that achieves dp[i][j].
    dp = [[0 for n in range(len_str2)] for m in range(len_str1)]
    operations = [[[] for n in range(len_str2)] for m in range(len_str1)]

    # First column: delete every character of the s1 prefix.
    for i in range(len_str1):
        dp[i][0] = i * cost_del
        if i > 0:
            operations[i][0] = operations[i-1][0] + [('Delete', s1[i-1])]
    # First row: insert every character of the s2 prefix.
    for j in range(len_str2):
        dp[0][j] = j * cost_ins
        if j > 0:
            operations[0][j] = operations[0][j-1] + [('Insert', s2[j-1])]

    for i in range(1, len_str1):
        for j in range(1, len_str2):
            same = s1[i-1] == s2[j-1]
            cost = 0 if same else cost_sub
            # Record a substitution as ONE tuple, like the other operations,
            # and whenever the characters differ (even if cost_sub is 0).
            diagonal_step = [] if same else [('Substitute', s1[i-1], s2[j-1])]
            choices = [
                (dp[i-1][j] + cost_del, operations[i-1][j] + [('Delete', s1[i-1])]),
                (dp[i][j-1] + cost_ins, operations[i][j-1] + [('Insert', s2[j-1])]),
                (dp[i-1][j-1] + cost, operations[i-1][j-1] + diagonal_step),
            ]
            # min() returns the first minimal entry, which fixes the tie-break order.
            dp[i][j], operations[i][j] = min(choices, key=lambda x: x[0])

    return dp[-1][-1], operations[-1][-1]

# Demo inputs. NOTE: this rebinds the notebook-level s1/s2.
s1 = '1236523'
s2 = '1233462'

# Forward direction: edit script that turns s1 into s2.
distance, ops = levenshtein_distance_with_operations(s1, s2)
print(f"Distance: {distance}")
for operation in ops:
    print(operation)

    
# Reverse direction: edit script that turns s2 into s1.
distance, ops = levenshtein_distance_with_operations(s2, s1)
print(f"\nDistance: {distance}")
for operation in ops:
    print(operation)

Distance: 4
('Insert', '3')
('Insert', '4')
('Delete', '5')
('Delete', '3')

Distance: 4
('Delete', '3')
('Delete', '4')
('Insert', '5')
('Insert', '3')
