In [None]:
class StringDistanceCalculator:
    def hamming_distance(self, str1, str2):
        if len(str1) != len(str2):
            raise ValueError("Strings must be of equal length")
        
        distance = 0
        for i in range(len(str1)):
            if str1[i] != str2[i]:
                distance += 1
        
        return distance

    def levenshtein_distance(self, str1, str2):
        len_str1 = len(str1)
        len_str2 = len(str2)

        matrix = [[0] * (len_str2 + 1) for _ in range(len_str1 + 1)]

        for i in range(len_str1 + 1):
            matrix[i][0] = i

        for j in range(len_str2 + 1):
            matrix[0][j] = j

        for i in range(1, len_str1 + 1):
            for j in range(1, len_str2 + 1):
                cost = 0 if str1[i - 1] == str2[j - 1] else 1
                matrix[i][j] = min(
                    matrix[i - 1][j] + 1,
                    matrix[i][j - 1] + 1,
                    matrix[i - 1][j - 1] + cost
                )

        return matrix[len_str1][len_str2]

# Example usage:
calculator = StringDistanceCalculator()
string1 = "karolin"
string2 = "kathrin"
distance = calculator.hamming_distance(string1, string2)
print(f"The Hamming distance between '{string1}' and '{string2}' is {distance}.")

string1 = "kitten"
string2 = "sitting"
distance = calculator.levenshtein_distance(string1, string2)
print(f"The Levenshtein distance between '{string1}' and '{string2}' is {distance}.")
```

This class `StringDistanceCalculator` now contains both the `hamming_distance` and `levenshtein_distance` methods. You can add the other distance calculation methods to this class in a similar manner. Remember to instantiate the class before calling its methods, as shown in the example usage.

Source: Conversation with Bing, 5/16/2024
(1) github.com. https://github.com/ouprince/basic/tree/2d3680f223d3ce3ae2e95755be054bbc9b8bdf13/basic%2Futils.py.
(2) github.com. https://github.com/t0ms0n00/Tekstowe/tree/8414f455758325b262f038d5ae6413455151e54b/Lab5.py.

In [71]:
class StringDistanceCalculator:
    def hamming_distance(self, str1, str2):
        if len(str1) != len(str2):
            raise ValueError("Strings must be of equal length")
        
        distance = 0
        for i in range(len(str1)):
            if str1[i] != str2[i]:
                distance += 1
        
        return distance

    def levenshtein_distance(self, str1, str2):
        len_str1 = len(str1)
        len_str2 = len(str2)

        matrix = [[0] * (len_str2 + 1) for _ in range(len_str1 + 1)]

        for i in range(len_str1 + 1):
            matrix[i][0] = i

        for j in range(len_str2 + 1):
            matrix[0][j] = j

        for i in range(1, len_str1 + 1):
            for j in range(1, len_str2 + 1):
                cost = 0 if str1[i - 1] == str2[j - 1] else 1
                matrix[i][j] = min(
                    matrix[i - 1][j] + 1,
                    matrix[i][j - 1] + 1,
                    matrix[i - 1][j - 1] + cost
                )

        return matrix[len_str1][len_str2]

    def damerau_levenshtein_distance(self, str1, str2):
        len_str1 = len(str1)
        len_str2 = len(str2)

        matrix = [[0] * (len_str2 + 1) for _ in range(len_str1 + 1)]

        for i in range(len_str1 + 1):
            matrix[i][0] = i

        for j in range(len_str2 + 1):
            matrix[0][j] = j

        for i in range(1, len_str1 + 1):
            for j in range(1, len_str2 + 1):
                cost = 0 if str1[i - 1] == str2[j - 1] else 1

                # Calculate transposition cost
                transposition_cost = 1
                if i > 1 and j > 1 and str1[i - 1] == str2[j - 2] and str1[i - 2] == str2[j - 1]:
                    transposition_cost = 0

                matrix[i][j] = min(
                    matrix[i - 1][j] + 1,
                    matrix[i][j - 1] + 1,
                    matrix[i - 1][j - 1] + cost,
                    matrix[i - 2][j - 2] + transposition_cost  # Transposition
                )

        return matrix[len_str1][len_str2]
            

    def jaro_winkler_distance(self, str1, str2, prefix_scale=0.1):
        
        print("inputs : ",str1, str2, prefix_scale)
        # Jaro similarity
        jaro_similarity = self.jaro_distance(str1, str2)
        print("jaro_similarity : ",jaro_similarity)

        # Length of common prefix (up to a maximum of 4 characters)
        prefix_length = 0
        max_prefix_length = min(4, min(len(str1), len(str2)))
        print("max_prefix_length,prefix_length : ",max_prefix_length,prefix_length)
        
        for i in range(max_prefix_length):
            if str1[i] == str2[i]:
                prefix_length += 1
            else:
                break

        print("after for prefix_length,jaro_similarity,prefix_scale : ",prefix_length,jaro_similarity,prefix_scale)
                
        # Before calculating Jaro-Winkler distance, check if prefix_scale and jaro_similarity are numbers
        if prefix_scale is None or jaro_similarity is None:
            raise ValueError("prefix_scale and jaro_similarity must be numbers")
            
        print("after for prefix_length : ",prefix_length)

        # Now you can safely calculate Jaro-Winkler distance
        jaro_winkler_distance = jaro_similarity + (prefix_length * prefix_scale * (1 - jaro_similarity))

#         # Calculate Jaro-Winkler distance
#         jaro_winkler_distance = jaro_similarity + (prefix_length * prefix_scale * (1 - jaro_similarity))

        return jaro_winkler_distance

    def jaro_distance(self, str1, str2):
        # Length of strings
        len_str1 = len(str1)
        len_str2 = len(str2)

        # Matching distance (maximum number of characters to match)
        match_distance = max(len_str1, len_str2) // 2 - 1
        if match_distance < 0:
            match_distance = 0

        # Arrays to store matching characters
        str1_matches = [False] * len_str1
        str2_matches = [False] * len_str2

        # Count of matching characters
        matches = 0

        # Count of transpositions
        transpositions = 0

        # Find matching characters
        for i in range(len_str1):
            start = max(0, i - match_distance)
            end = min(i + match_distance + 1, len_str2)

            for j in range(start, end):
                if not str2_matches[j] and str1[i] == str2[j]:
                    str1_matches[i] = True
                    str2_matches[j] = True
                    matches += 1
                    break

        # If there are no matches, return 0
        if matches == 0:
            return 0.0

        # Count transpositions
        k = 0
        for i in range(len_str1):
            if str1_matches[i]:
                while not str2_matches[k]:
                    k += 1
                if str1[i] != str2[k]:
                    transpositions += 1
                k += 1
        if not isinstance(transpositions, (int, float)):
            raise ValueError("transpositions must be a numeric value")

        print("transpositions : ",transpositions )
        print("transpositions : ",transpositions / 2)
        # Perform the division
        transpositions //= 2 # Divide by 2 as transpositions were counted twice

        # Calculate Jaro distance
        similarity = (
            matches / len_str1 +
            matches / len_str2 +
            (matches - transpositions) / matches
        ) / 3

        return similarity

# Example usage:
calculator = StringDistanceCalculator()
string1 = "karolin"
string2 = "kathrin"
hamming_dist = calculator.hamming_distance(string1, string2)
levenshtein_dist = calculator.levenshtein_distance(string1, string2)
damerau_levenshtein_dist = calculator.damerau_levenshtein_distance(string1, string2)
jaro_winkler_dist = calculator.jaro_winkler_distance(string1, string2)

print(f"Hamming distance: {hamming_dist}")
print(f"Levenshtein distance: {levenshtein_dist}")
print(f"Damerau-Levenshtein distance: {damerau_levenshtein_dist}")
print(f"Jaro-Winkler distance: {jaro_winkler_dist}")


inputs :  karolin kathrin 0.1
transpositions :  0
transpositions :  0.0
jaro_similarity :  0.8095238095238096
max_prefix_length,prefix_length :  4 0
after for prefix_length,jaro_similarity,prefix_scale :  2 0.8095238095238096 0.1
after for prefix_length :  2
Hamming distance: 3
Levenshtein distance: 3
Damerau-Levenshtein distance: 2
Jaro-Winkler distance: 0.8476190476190477


In [64]:
# def jaro_distance(transpositions):
#         # Length of strings
#         len_str1 = len(str1)
#         len_str2 = len(str2)

#         # Matching distance (maximum number of characters to match)
#         match_distance = max(len_str1, len_str2) // 2 - 1
#         if match_distance < 0:
#             match_distance = 0

#         # Arrays to store matching characters
#         str1_matches = [False] * len_str1
#         str2_matches = [False] * len_str2

#         # Count of matching characters
#         matches = 0

#         # Count of transpositions
#         transpositions = 0

#         # Find matching characters
#         for i in range(len_str1):
#             start = max(0, i - match_distance)
#             end = min(i + match_distance + 1, len_str2)

#             for j in range(start, end):
#                 if not str2_matches[j] and str1[i] == str2[j]:
#                     str1_matches[i] = True
#                     str2_matches[j] = True
#                     matches += 1
#                     break

#         # If there are no matches, return 0
#         if matches == 0:
#             return 0.0

#         # Count transpositions
#         k = 0
#         for i in range(len_str1):
#             if str1_matches[i]:
#                 while not str2_matches[k]:
#                     k += 1
#                 if str1[i] != str2[k]:
#                     transpositions += 1
#                 k += 1
#         if not isinstance(transpositions, (int, float)):
#             raise ValueError("transpositions must be a numeric value")

#         print("transpositions : ",transpositions )
#         print("transpositions : ",transpositions / 2)
#         # Perform the division
transpositions =22
transpositions //= 22  # Divide by 2 as transpositions were counted twice

#         # Calculate Jaro distance
#         similarity = (
#             matches / len_str1 +
#             matches / len_str2 +
#             (matches - transpositions) / matches ) / 3

#         return transpositions

SyntaxError: invalid decimal literal (3602885525.py, line 53)

In [60]:
transpositions =22
transpositions //= 22

In [1]:
x = 3
x

3

In [1]:
from src.pytance import CharacterBased
from src.pytance import PhoneticSensitive
from src.pytance import SequenceBased
from src.pytance import TokenBased

In [3]:
from src.pytance.TokenBased import QGrams



In [6]:
str1 = "apple"
str2 = "apricot"
q = 2
result = QGrams.qgram_similarity(str1, str2, q)
print(f"Q-Gram Similarity (q={q}): {result}")


Q-Gram Similarity (q=2): 0.16666666666666666
