In [1]:
# Announce the setup step; the message text is French for
# "Installing/importing the required libraries."
print("Installation/Importation des librairies necessaires.")

Installation/Importation des librairies necessaires.


In [2]:
%%capture capt
!pip install jellyfish
!pip install levenshtein

In [3]:
%%capture capt
import numpy as np
from jellyfish import jaro_winkler_similarity
import pandas as pd
from tqdm import tqdm
import Levenshtein
import math

# Formules de distance entre chaînes de caractères

##### Jaro-Winkler

In [4]:
def convert_to_sentence(arr):
    """Join the items of ``arr`` into one comma-separated string.

    Args:
        arr: Iterable of strings.

    Returns:
        The items concatenated with "," between them and no trailing
        comma; an empty string when ``arr`` is empty.

    Raises:
        TypeError: If an item is not a string.
    """
    # str.join builds the result in a single pass instead of growing a
    # string by repeated concatenation and then trimming the last comma.
    return ",".join(arr)

In [5]:
def string_distance(s1, s2):
    """Return the Jaro-Winkler distance between two strings.

    The distance is one minus the value returned by
    ``jaro_winkler_similarity(s1, s2)``.
    """
    similarity = jaro_winkler_similarity(s1, s2)
    return 1 - similarity

def jaro_winkler_distance(arr1, arr2):
    """Return the Jaro-Winkler distance between two sequences of strings.

    Each sequence is first flattened into one comma-separated string with
    ``convert_to_sentence``; the two strings are then compared with
    ``string_distance`` and the absolute value of the result is returned.
    """
    sentences = map(convert_to_sentence, (arr1, arr2))
    return abs(string_distance(*sentences))

##### Levenshtein 

In [6]:
def string_distance_levenshtein(s1, s2):
    """Return one minus ``Levenshtein.ratio(s1, s2)`` as a string distance."""
    ratio = Levenshtein.ratio(s1, s2)
    return 1 - ratio

def levenshtein_distance(arr1, arr2):
    """Return the Levenshtein-based distance between two sequences of strings.

    Each sequence is first flattened into one comma-separated string with
    ``convert_to_sentence``; the two strings are then compared with
    ``string_distance_levenshtein`` and the absolute value is returned.
    """
    sentences = map(convert_to_sentence, (arr1, arr2))
    return abs(string_distance_levenshtein(*sentences))

##### Custom

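Tous les codes GHM comportent 6 caractères : les fonctions ci-dessous comparent séparément les 2 premiers caractères, le 3e caractère et les caractères restants, chacun avec son propre poids.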

In [7]:
def custom_ghm_distance_norm(ghm1, ghm2, distance=string_distance_levenshtein, weights=[50,10,1]):
    """Weighted distance between two GHM codes, divided by the sum of the weights.

    The codes are compared on three segments: the first two characters,
    the third character, and everything from the fourth character onward.

    Args:
        ghm1: First code (must support indexing and slicing).
        ghm2: Second code (must support indexing and slicing).
        distance: Callable taking two segments and returning their distance.
        weights: Weights applied to the three segment distances, in order.

    Returns:
        The weighted sum of the three segment distances divided by
        ``np.sum(weights)``.
    """
    prefix_term = weights[0] * distance(ghm1[:2], ghm2[:2])
    third_char_term = weights[1] * distance(ghm1[2], ghm2[2])
    tail_term = weights[2] * distance(ghm1[3:], ghm2[3:])
    return (prefix_term + third_char_term + tail_term) / np.sum(weights)

def custom_distance_norm(arr1, arr2, distance_ghm=string_distance_levenshtein, weights=[50, 10, 1]):
    """Position-wise normalized distance between two sequences of GHM codes.

    Codes at the same index are compared with ``custom_ghm_distance_norm``;
    every position that exists in only one of the sequences adds 1 to the
    total. The total is divided by the length of the longer sequence.

    Args:
        arr1: First sequence of codes (must support ``len`` and indexing).
        arr2: Second sequence of codes (must support ``len`` and indexing).
        distance_ghm: Segment-level distance forwarded to
            ``custom_ghm_distance_norm``.
        weights: Segment weights forwarded to ``custom_ghm_distance_norm``.

    Returns:
        The averaged distance, or ``0.0`` when both sequences are empty.
    """
    min_length = min(len(arr1), len(arr2))
    max_length = max(len(arr1), len(arr2))
    if max_length == 0:
        # Two empty sequences are identical. Without this guard the final
        # division by a zero length yields NaN (with a NumPy RuntimeWarning).
        return 0.0
    paired = np.sum([
        custom_ghm_distance_norm(arr1[i], arr2[i], distance=distance_ghm, weights=weights)
        for i in range(min_length)
    ])
    # Each unmatched trailing position counts as a full distance of 1.
    unmatched = max_length - min_length
    return (paired + unmatched) / max_length

In [8]:
def custom_ghm_distance(ghm1, ghm2, distance=string_distance_levenshtein, weights=[50, 10, 1]):
    """Weighted (non-normalized) distance between two GHM codes.

    The codes are compared on three segments: the first two characters,
    the third character, and everything from the fourth character onward.

    Args:
        ghm1: First code (must support indexing and slicing).
        ghm2: Second code (must support indexing and slicing).
        distance: Callable taking two segments and returning their distance.
        weights: Weights applied to the three segment distances, in order.

    Returns:
        The weighted sum of the three segment distances.
    """
    prefix_term = weights[0] * distance(ghm1[:2], ghm2[:2])
    third_char_term = weights[1] * distance(ghm1[2], ghm2[2])
    tail_term = weights[2] * distance(ghm1[3:], ghm2[3:])
    return prefix_term + third_char_term + tail_term

def custom_distance(arr1, arr2, distance_ghm=string_distance_levenshtein,weights=[50,10,1]):
    """Position-wise weighted distance between two sequences of GHM codes.

    Codes at the same index are compared with ``custom_ghm_distance``;
    every position that exists in only one of the sequences adds
    ``sum(weights)`` to the total. The total is divided by the length of
    the longer sequence.

    Args:
        arr1: First sequence of codes (must support ``len`` and indexing).
        arr2: Second sequence of codes (must support ``len`` and indexing).
        distance_ghm: Segment-level distance forwarded to
            ``custom_ghm_distance``.
        weights: Segment weights forwarded to ``custom_ghm_distance``.

    Returns:
        The averaged distance, or ``0.0`` when both sequences are empty.
    """
    min_length = min(len(arr1), len(arr2))
    max_length = max(len(arr1), len(arr2))
    if max_length == 0:
        # Two empty sequences are identical. Without this guard the final
        # division by a zero length yields NaN (with a NumPy RuntimeWarning).
        return 0.0
    paired = np.sum([
        custom_ghm_distance(arr1[i], arr2[i], distance=distance_ghm, weights=weights)
        for i in range(min_length)
    ])
    # Each unmatched trailing position is charged the full weight total.
    unmatched_penalty = sum(weights) * (max_length - min_length)
    return (paired + unmatched_penalty) / max_length

### Filter distance

In [9]:
def custom_ghm_distance_4weights(ghm1, ghm2, distance=string_distance_levenshtein, weights=[1, 1, 1, 1]):
    """Weighted distance between two GHM codes using four segments.

    The codes are compared on: the first two characters, the third
    character, the fourth and fifth characters, and the last character.

    Args:
        ghm1: First code (must support indexing and slicing).
        ghm2: Second code (must support indexing and slicing).
        distance: Callable taking two segments and returning their distance.
        weights: Weights applied to the four segment distances, in order.

    Returns:
        The weighted sum of the four segment distances.
    """
    prefix_term = weights[0] * distance(ghm1[:2], ghm2[:2])
    third_char_term = weights[1] * distance(ghm1[2], ghm2[2])
    middle_term = weights[2] * distance(ghm1[3:5], ghm2[3:5])
    last_char_term = weights[3] * distance(ghm1[-1], ghm2[-1])
    return prefix_term + third_char_term + middle_term + last_char_term

In [13]:
def coor_in_array(i, arr):
    """Return True when ``i`` is a valid non-negative index into ``arr``."""
    return 0 <= i < len(arr)

def filter_distance_a_to_b__element_i(arr1, arr2, i, default_value, custom_dist, weights, distance_ghm=string_distance_levenshtein):
    """Smallest distance from ``arr1[i]`` to its neighbours in ``arr2``.

    ``arr1[i]`` is compared against ``arr2`` at positions ``i - 1``, ``i``
    and ``i + 1``; positions that fall outside ``arr2`` are skipped.

    Args:
        arr1: Sequence holding the element being matched.
        arr2: Sequence searched for a close neighbour.
        i: Index of the element in ``arr1``.
        default_value: Returned when no candidate yields a finite minimum.
        custom_dist: Callable invoked as
            ``custom_dist(a, b, distance_ghm, weights)``.
        weights: Weights forwarded to ``custom_dist``.
        distance_ghm: Segment-level distance forwarded to ``custom_dist``.

    Returns:
        The minimum of the candidate distances, or ``default_value`` when
        that minimum is infinite.
    """
    out_of_range = float('inf')
    # Out-of-range neighbours keep an infinite placeholder so the list
    # always has three entries in window order.
    candidates = [
        custom_dist(arr1[i], arr2[pos], distance_ghm, weights) if coor_in_array(pos, arr2) else out_of_range
        for pos in (i - 1, i, i + 1)
    ]
    closest = min(candidates)
    return default_value if closest == out_of_range else closest

def filter_distance_a_to_b(arr1, arr2, default_value, custom_dist, weights, distance_ghm=string_distance_levenshtein):
    """Sum, over every element of ``arr1``, its closest-neighbour distance in ``arr2``.

    Each per-element value comes from ``filter_distance_a_to_b__element_i``,
    which receives ``default_value``, ``custom_dist``, ``weights`` and
    ``distance_ghm`` unchanged.

    Returns:
        The total of the per-element distances (``0`` when ``arr1`` is empty).
    """
    return sum(
        filter_distance_a_to_b__element_i(
            arr1, arr2, idx, default_value, custom_dist,
            weights=weights, distance_ghm=distance_ghm,
        )
        for idx in range(len(arr1))
    )

In [None]:
def distance_filter(arr1, arr2, weights=[1,1,1], distance_ghm=string_distance_levenshtein):
    """Symmetric neighbour-window distance between two sequences of GHM codes.

    The distance is computed in both directions with
    ``filter_distance_a_to_b`` (using ``custom_ghm_distance`` on each pair
    of codes), the two totals are averaged, and the result is divided by
    the length of the longer sequence.

    Args:
        arr1: First sequence of codes (must support ``len`` and indexing).
        arr2: Second sequence of codes (must support ``len`` and indexing).
        weights: Three segment weights forwarded to ``custom_ghm_distance``.
        distance_ghm: Segment-level distance forwarded to
            ``custom_ghm_distance``.

    Returns:
        The averaged distance, or ``0.0`` when both sequences are empty.
    """
    min_length = min(len(arr1), len(arr2))
    max_length = max(len(arr1), len(arr2))
    if max_length == 0:
        # Two empty sequences are identical. Without this guard the final
        # division raises ZeroDivisionError.
        return 0.0
    # Value charged for an element that has no in-range neighbour in the
    # other sequence.
    default_value = sum(weights) * (max_length - min_length)
    forward = filter_distance_a_to_b(arr1, arr2, default_value, custom_ghm_distance, weights=weights, distance_ghm=distance_ghm)
    backward = filter_distance_a_to_b(arr2, arr1, default_value, custom_ghm_distance, weights=weights, distance_ghm=distance_ghm)
    return (forward + backward) / 2 / max_length

def distance_filter_4weights(arr1, arr2, weights=[1, 1, 1, 1], distance_ghm=string_distance_levenshtein):
    """Symmetric neighbour-window distance using the four-segment GHM distance.

    The distance is computed in both directions with
    ``filter_distance_a_to_b`` (using ``custom_ghm_distance_4weights`` on
    each pair of codes), the two totals are averaged, and the result is
    divided by the length of the longer sequence.

    Args:
        arr1: First sequence of codes (must support ``len`` and indexing).
        arr2: Second sequence of codes (must support ``len`` and indexing).
        weights: Four segment weights forwarded to
            ``custom_ghm_distance_4weights``.
        distance_ghm: Segment-level distance forwarded to
            ``custom_ghm_distance_4weights``.

    Returns:
        The averaged distance, or ``0.0`` when both sequences are empty.
    """
    min_length = min(len(arr1), len(arr2))
    max_length = max(len(arr1), len(arr2))
    if max_length == 0:
        # Two empty sequences are identical. Without this guard the final
        # division raises ZeroDivisionError.
        return 0.0
    # Value charged for an element that has no in-range neighbour in the
    # other sequence.
    default_value = sum(weights) * (max_length - min_length)
    forward = filter_distance_a_to_b(arr1, arr2, default_value, custom_ghm_distance_4weights, weights=weights, distance_ghm=distance_ghm)
    backward = filter_distance_a_to_b(arr2, arr1, default_value, custom_ghm_distance_4weights, weights=weights, distance_ghm=distance_ghm)
    return (forward + backward) / 2 / max_length