In [1]:
import numpy as np

def lev_ratio_and_distance(s1, s2, ratio_calculation = False):
    """Compute the Levenshtein distance, or similarity ratio, between two strings.

    Levenshtein (1965) proposed measuring how similar two texts are by counting
    the edits (insertions, deletions and substitutions) needed to turn one text
    into the other.

    A (len(s1)+1) x (len(s2)+1) matrix is filled so that, for all i and j,
    distance[i, j] holds the edit cost between the first i characters of s1
    and the first j characters of s2. The bottom-right cell is the result.

    Parameters
    ----------
    s1, s2 : str
        The two strings to compare. Either (or both) may be empty.
    ratio_calculation : bool, default False
        If falsy, every substitution costs 1 and the plain edit distance is
        reported. If truthy, every substitution costs 2 (i.e. one deletion
        plus one insertion, to align with existing Levenshtein ratio
        implementations) and a similarity ratio is returned instead.

    Returns
    -------
    str
        When ratio_calculation is falsy: the message
        "The strings are N edits away", where N is the edit distance.
    float
        When ratio_calculation is truthy:
        (len(s1) + len(s2) - cost) / (len(s1) + len(s2)), a value in [0, 1].
        Two empty strings are identical, so their ratio is 1.0.
    """
    rows = len(s1) + 1
    cols = len(s2) + 1
    distance = np.zeros((rows, cols), dtype=int)

    # Border cells: turning a prefix of length i into the empty string takes i
    # deletions, and building a prefix of length k from nothing takes k
    # insertions. Assigned by slice so the borders are filled even when one of
    # the strings is empty.
    distance[:, 0] = np.arange(rows)
    distance[0, :] = np.arange(cols)

    # The substitution cost does not change during the scan, so pick it once.
    substitution_cost = 2 if ratio_calculation else 1

    for row in range(1, rows):
        for col in range(1, cols):
            # Matching characters cost nothing; otherwise pay for a substitution.
            cost = 0 if s1[row - 1] == s2[col - 1] else substitution_cost
            distance[row, col] = min(
                distance[row - 1, col] + 1,          # deletion
                distance[row, col - 1] + 1,          # insertion
                distance[row - 1, col - 1] + cost,   # match / substitution
            )

    # Debugging aid: print(distance) here to inspect the full cost matrix.

    # Read the answer from the bottom-right cell rather than from the loop
    # variables, which are never bound when either string is empty.
    edits = int(distance[-1, -1])

    if ratio_calculation:
        total_length = len(s1) + len(s2)
        if total_length == 0:
            # Two empty strings: identical, and avoids dividing by zero.
            return 1.0
        return (total_length - edits) / total_length

    # Minimum number of edits needed to convert s1 into s2.
    return "The strings are {} edits away".format(edits)

In [2]:
# Load the full text of sample 1. The context manager closes the file handle
# as soon as the text has been read, instead of leaving it open in the kernel.
with open("sample1.txt","r") as s1:
    String1 = s1.read()

In [4]:
# Load the full text of sample 2. The context manager closes the file handle
# as soon as the text has been read, instead of leaving it open in the kernel.
with open("sample2.txt","r") as s2:
    String2 = s2.read()

In [6]:
# Load the full text of sample 3. Opening and reading in the same cell keeps
# the read next to its open (it was previously executed out of order), and the
# context manager closes the file handle once the text has been read.
with open("sample3.txt","r") as s3:
    String3 = s3.read()

In [7]:
# Edit-distance message for sample 1 vs. sample 2; both texts are lower-cased
# first so the comparison ignores letter case.
d1d2 = lev_ratio_and_distance(String1.lower(), String2.lower())
print(d1d2)

The strings are 65 edits away


In [9]:
# Similarity ratio for sample 1 vs. sample 2; both texts are lower-cased
# first so the comparison ignores letter case.
ratiod1d2 = lev_ratio_and_distance(
    String1.lower(), String2.lower(), ratio_calculation=True
)
print(ratiod1d2)

0.8753541076487252


In [17]:
# Edit-distance message for sample 1 vs. sample 3; both texts are lower-cased
# first so the comparison ignores letter case.
d1d3 = lev_ratio_and_distance(String1.lower(), String3.lower())
print(d1d3)

The strings are 296 edits away


In [19]:
# Similarity ratio for sample 1 vs. sample 3; both texts are lower-cased
# first so the comparison ignores letter case.
ratiod1d3 = lev_ratio_and_distance(
    String1.lower(), String3.lower(), ratio_calculation=True
)
print(ratiod1d3)

0.4681404421326398


In [21]:
# Edit-distance message for sample 2 vs. sample 3; both texts are lower-cased
# first so the comparison ignores letter case.
d2d3 = lev_ratio_and_distance(String2.lower(), String3.lower())
print(d2d3)

The strings are 289 edits away


In [24]:
# Similarity ratio for sample 2 vs. sample 3; both texts are lower-cased
# first so the comparison ignores letter case.
ratiod2d3 = lev_ratio_and_distance(
    String2.lower(), String3.lower(), ratio_calculation=True
)
print(ratiod2d3)

0.4589308996088657
