In [105]:
import re
import numpy as np
import pandas as pd
import Levenshtein
from metaphone import doublemetaphone

In [149]:
df = pd.read_csv("resources/export_full_dataset.csv")
df.head(3)

Unnamed: 0,NAME,METAPHONE_A,METAPHONE_B,REGEX_CV
0,MARIA,MR,MR,CACIA
1,CARMEN,KRMN,KRMN,CACEC
2,JOSEFA,JSF,HSF,COCECA


In [150]:
cREGEX_NOT_REPEATED = re.compile(r"(.)\1+")

def pre_processing_token(iToken):
    return cREGEX_NOT_REPEATED.sub(r"\1", iToken.upper())

vToken = "Jjuaaannn"
print(f"Token {vToken} is transformed to {pre_processing_token(vToken)}.")

Token Jjuaaannn is transformed to JUAN.


In [199]:
cCONSONANTS = "BCDFGHIJKLMNÑPQRSTVWXZ"

def get_semantic_token(iToken):
    vTokenRegex = iToken.upper()
    vTokenRegex = vTokenRegex.translate(str.maketrans(cCONSONANTS, "C" * len(cCONSONANTS)))
    vTokenRegex = cREGEX_NOT_REPEATED.sub(r"\1", vTokenRegex)
    return vTokenRegex

def get_correct_token(iToken):
    vData = df[df["NAME"] == iToken.upper()]["NAME"].values.tolist()
    if len(vData) > 0:
        return vData[0], 0
        #return df[df["NAME"] == iToken]
    vToken = pre_processing_token(iToken)
    vData = df[df["NAME"] == vToken]["NAME"].values.tolist()
    if len(vData) > 0:
        return vData[0], 0.3
        #return df[df["NAME"] == vToken]
    vData = df.copy()
    #print(f"Processing token: {vToken}.")
    vMetaphoneA, vMetaphoneB = doublemetaphone(vToken)
    vMetaphoneB = vMetaphoneB if len(vMetaphoneB) > 0 else vMetaphoneA
    #print(f"Metaphones: {vMetaphoneA} y {vMetaphoneB}.")
    vSemanticRegex = get_semantic_token(iToken)
    #print(f"The REGEX is: {vSemanticRegex}.")
    
    vData["LEVENSHTEIN_TOKEN"]  = vData["NAME"].apply(lambda x : Levenshtein.distance(x, iToken))
    vData["LEVENSHTEIN_META_A"] = vData["METAPHONE_A"].apply(lambda x : Levenshtein.distance(x, vMetaphoneA))
    vData["LEVENSHTEIN_META_B"] = vData["METAPHONE_B"].apply(lambda x : Levenshtein.distance(x, vMetaphoneB))
    vData["LEVENSHTEIN_REGEX"]  = vData["REGEX_CV"].apply(lambda x : Levenshtein.distance(x, vSemanticRegex))
    
    vData["LEVENSHTEIN_TOTAL"] = np.power(vData["LEVENSHTEIN_TOKEN"], 2) + \
        np.power(vData["LEVENSHTEIN_META_A"], 2) + \
        np.power(vData["LEVENSHTEIN_META_B"], 2) + \
        np.power(vData["LEVENSHTEIN_REGEX"], 2)
    vData["LEVENSHTEIN_TOTAL"] = np.sqrt(vData["LEVENSHTEIN_TOTAL"])
    vData = vData.sort_values(by = "LEVENSHTEIN_TOTAL", ascending=True)
    vList = vData[0:1][["NAME", "LEVENSHTEIN_TOTAL"]].values.tolist()[0]
    return vList[0], vList[1]

def verify_name(iMalformedName : str):
    vArrayTokens = iMalformedName.split(" ")
    vResult = []
    for vToken in vArrayTokens:
        vNewToken, vDistance = get_correct_token(vToken)
        vResult.append({
            "original_token": vToken,
            "new_token": vNewToken,
            "token_difference": vDistance 
        })
    vResult = {
        "original_text": iMalformedName,
        "new_text": " ".join([x["new_token"] for x in vResult]),
        "distances": vResult
    }
    return vResult

#get_correct_token("VALELO")[0:1]["NAME"].values.tolist()[0]
verify_name("NATALiy DJFISDFNSDF VAÑERO")

{'original_text': 'NATALiy DJFISDFNSDF VAÑERO',
 'new_text': 'NATALY DESMOND VALERO',
 'distances': [{'original_token': 'NATALiy',
   'new_token': 'NATALY',
   'token_difference': 2.0},
  {'original_token': 'DJFISDFNSDF',
   'new_token': 'DESMOND',
   'token_difference': 11.704699910719626},
  {'original_token': 'VAÑERO',
   'new_token': 'VALERO',
   'token_difference': 1.7320508075688772}]}