In [58]:
import pandas as pd
import textdistance
from unidecode import unidecode

In [14]:
df = pd.read_csv("Prenoms.csv", sep=";", encoding="latin-1")

In [59]:
def clean(s):
    return unidecode(s).capitalize().replace("-", "").replace(" ", "").split("(")[0]

In [60]:
prenoms = df[df['03_langage'].str.contains("french", na=False)]["01_prenom"].apply(lambda x: clean(x)).tolist()

In [79]:
import re
import wikipedia

In [88]:
wikipedia.set_lang("fr")
page = wikipedia.page("Liste_de_prénoms_en_français").content
prenoms2 = re.findall(r"(\(f\)|\(x\)) (([A-Z]{1}\S*){1,2})", page, flags=re.UNICODE)

In [126]:
prenoms2 = [clean(x[-1]) for x in prenoms2]
prenoms_mix = [x for x in set(prenoms).union(prenoms2) if len(x) > 1]

In [None]:
import phonetics
from abydos import phonetic
fonem = phonetic.FONEM()
phonex = phonetic.Phonex()

In [152]:
weights = {
    0.25,  # metaphone 
    0.2,  # nysiis
    0.1,  # soundex
    0.3,  # fonem
    0.15,  # phonex
}

def cast(s):
    return [
        phonetics.metaphone(s),
        phonetics.nysiis(s),
        phonetics.soundex(s),
        fonem.encode(s),
        phonex.encode(s)
    ]

def distance(s1, s2):
    total = 0
    try:
        l1, l2 = cast(s1), cast(s2)
    except:
        return 5
    for i, w in enumerate(weights):
        lev = textdistance.levenshtein(l1[i], l2[i])
        total += lev * w
    return total

def query(table, target):
    return sorted(table, key=lambda x: distance(x, target))[:10]

In [156]:
query(prenoms_mix, "")

['Xavier',
 'Jolie',
 'Julie',
 'Jeanne',
 'Josue',
 'Joelle',
 'Gautier',
 'Joel',
 'Jean',
 'Josee']