In [1]:
import re, unicodedata, math
from typing import Dict, List, Tuple

In [2]:
# -------------------------------
# NORMALISATION
# -------------------------------
def strip_accents(s):
    """Return ``s`` with its combining accent marks removed.

    The string is decomposed with Unicode NFD so that an accented letter
    becomes a base letter followed by combining marks; every character whose
    Unicode category is ``Mn`` (non-spacing mark) is then dropped.

    Args:
        s: Text to process.

    Returns:
        str: The NFD-decomposed text without ``Mn`` characters.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            # Combining mark (accent, cedilla, ...): leave it out.
            continue
        kept.append(ch)
    return ''.join(kept)

def normalize(s):
    """Reduce free text to a canonical lower-case ASCII form for comparison.

    Steps, in order: lower-case and trim the input, remove accents with
    ``strip_accents``, replace every character other than ``a-z``, ``0-9``,
    ``:``, ``/``, space and ``-`` with a space, collapse whitespace runs into
    a single space, and trim again.

    Args:
        s: Raw text.

    Returns:
        str: The normalised text.
    """
    lowered = s.lower().strip()
    unaccented = strip_accents(lowered)
    # Anything outside the allowed alphabet becomes a separator, not nothing,
    # so that "c'est" yields two tokens rather than one fused token.
    filtered = re.sub(r'[^a-z0-9:/ \-]', ' ', unaccented)
    collapsed = re.sub(r'\s+', ' ', filtered)
    return collapsed.strip()

In [3]:
# -------------------------------
# TOKENISATION LÉGÈRE
# -------------------------------
def tokenize(s):
    """Split ``s`` into whitespace-separated tokens after normalising it.

    Args:
        s: Raw text.

    Returns:
        list[str]: Tokens of ``normalize(s)``; empty when nothing remains.
    """
    cleaned = normalize(s)
    return cleaned.split()

# -------------------------------
# n-GRAMMES + COSINE
# -------------------------------
def char_ngrams(s, n=3):
    """Count the character n-grams of ``s`` padded with one space on each side.

    Args:
        s: Text to slice.
        n: Length of each n-gram (default 3).

    Returns:
        dict: Maps each n-gram to its number of occurrences. Empty when the
        padded text is shorter than ``n``.
    """
    # Padding lets the first and last characters form boundary n-grams.
    padded = " {} ".format(s)
    counts = {}
    window_count = len(padded) - n + 1
    start = 0
    while start < window_count:
        gram = padded[start:start + n]
        if gram in counts:
            counts[gram] += 1
        else:
            counts[gram] = 1
        start += 1
    return counts

def cosine(a, b):
    """Cosine similarity between two sparse vectors stored as dicts.

    Args:
        a: Mapping from a feature (for example an n-gram) to a numeric weight.
        b: Mapping of the same kind.

    Returns:
        float: ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has
        a zero norm (which includes an empty mapping).
    """
    # Only keys present in both mappings contribute to the dot product, so
    # walk the smaller mapping and look each key up in the larger one.
    small, large = (a, b) if len(a) <= len(b) else (b, a)
    dot = sum(weight * large[key] for key, weight in small.items() if key in large)
    na = math.sqrt(sum(v * v for v in a.values()))
    nb = math.sqrt(sum(v * v for v in b.values()))
    if na == 0 or nb == 0:
        # Always hand back a float so callers get a single return type.
        return 0.0
    return dot / (na * nb)

def ngram_score(u, v):
    """Average character n-gram cosine similarity of ``u`` and ``v``.

    The cosine similarity of the two strings' n-gram counts is computed for
    n = 3, 4 and 5, and the three values are averaged.

    Args:
        u: First string.
        v: Second string.

    Returns:
        The mean of the three cosine similarities.
    """
    total = 0
    for size in (3, 4, 5):
        total += cosine(char_ngrams(u, size), char_ngrams(v, size))
    return total / 3

In [4]:
# -------------------------------
# TOKENISATION LÉGÈRE
# -------------------------------
def tokenize(s):
    """Split ``s`` into whitespace-separated tokens after normalising it.

    Args:
        s: Raw text.

    Returns:
        list[str]: Tokens of ``normalize(s)``; empty when nothing remains.
    """
    # NOTE(review): a function with this name is also defined in the preceding
    # notebook cell; on a top-to-bottom run this later definition is the one
    # left in effect. Consider deleting one of the two copies.
    cleaned = normalize(s)
    return cleaned.split()

# -------------------------------
# n-GRAMMES + COSINE
# -------------------------------
def char_ngrams(s, n=3):
    """Count the character n-grams of ``s`` padded with one space on each side.

    Args:
        s: Text to slice.
        n: Length of each n-gram (default 3).

    Returns:
        dict: Maps each n-gram to its number of occurrences. Empty when the
        padded text is shorter than ``n``.
    """
    # NOTE(review): a function with this name is also defined in the preceding
    # notebook cell; on a top-to-bottom run this later definition is the one
    # left in effect. Consider deleting one of the two copies.
    # Padding lets the first and last characters form boundary n-grams.
    padded = " {} ".format(s)
    counts = {}
    window_count = len(padded) - n + 1
    start = 0
    while start < window_count:
        gram = padded[start:start + n]
        if gram in counts:
            counts[gram] += 1
        else:
            counts[gram] = 1
        start += 1
    return counts

def cosine(a, b):
    """Cosine similarity between two sparse vectors stored as dicts.

    Args:
        a: Mapping from a feature (for example an n-gram) to a numeric weight.
        b: Mapping of the same kind.

    Returns:
        float: ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has
        a zero norm (which includes an empty mapping).
    """
    # NOTE(review): a function with this name is also defined in the preceding
    # notebook cell; on a top-to-bottom run this later definition is the one
    # left in effect. Consider deleting one of the two copies.
    # Only keys present in both mappings contribute to the dot product, so
    # walk the smaller mapping and look each key up in the larger one.
    small, large = (a, b) if len(a) <= len(b) else (b, a)
    dot = sum(weight * large[key] for key, weight in small.items() if key in large)
    na = math.sqrt(sum(v * v for v in a.values()))
    nb = math.sqrt(sum(v * v for v in b.values()))
    if na == 0 or nb == 0:
        # Always hand back a float so callers get a single return type.
        return 0.0
    return dot / (na * nb)

def ngram_score(u, v):
    """Average character n-gram cosine similarity of ``u`` and ``v``.

    The cosine similarity of the two strings' n-gram counts is computed for
    n = 3, 4 and 5, and the three values are averaged.

    Args:
        u: First string.
        v: Second string.

    Returns:
        The mean of the three cosine similarities.
    """
    # NOTE(review): a function with this name is also defined in the preceding
    # notebook cell; on a top-to-bottom run this later definition is the one
    # left in effect. Consider deleting one of the two copies.
    total = 0
    for size in (3, 4, 5):
        total += cosine(char_ngrams(u, size), char_ngrams(v, size))
    return total / 3


In [5]:
# -------------------------------
# SYNONYMES & MOTS CLÉS
# -------------------------------
# Canonical keyword -> alternative tokens that are rewritten to that keyword.
# Tokens are compared as-is, so entries are written lower-case and unaccented.
SYNONYMS = {
    "patient": ["malade", "personne", "cas"],
    "chirurgien": ["docteur", "chirurgie", "operateur"],
    "anesthesiste": ["anap", "anesth"],
    "salle": ["bloc", "piece", "chambre"],
    "heure": ["temps", "moment"],
    "site": ["zone", "endroit", "localisation"],
}


def replace_synonyms(tokens):
    """Map every token that is a known synonym to its canonical keyword.

    Args:
        tokens: Iterable of tokens to rewrite.

    Returns:
        list: One entry per input token, in the same order. A token listed
        under a ``SYNONYMS`` key is replaced by that key (the first key in
        dict order wins); any other token is kept unchanged.
    """
    def canonical(token):
        # Scan the table in insertion order and stop at the first hit.
        for keyword, alternatives in SYNONYMS.items():
            if token in alternatives:
                return keyword
        return token

    return [canonical(token) for token in tokens]

def keyword_overlap(u_tokens, v_tokens):
    """Share of the candidate's distinct keywords that also occur in the utterance.

    Both token lists are first passed through ``replace_synonyms`` and
    de-duplicated.

    Args:
        u_tokens: Tokens of the utterance.
        v_tokens: Tokens of the candidate value.

    Returns:
        float: ``|U ∩ V| / |V|`` over the distinct rewritten tokens, or
        ``0.0`` when the candidate has no tokens.
    """
    utterance_terms = set(replace_synonyms(u_tokens))
    candidate_terms = set(replace_synonyms(v_tokens))
    shared = utterance_terms.intersection(candidate_terms)
    # Fall back to 1 so an empty candidate divides cleanly instead of raising.
    denominator = len(candidate_terms) or 1
    return len(shared) / denominator


In [6]:
# -------------------------------
# SCORE GLOBAL = pondération lexicale + contextuelle
# -------------------------------
def robust_similarity(utterance, candidate):
    """Blend character-level and keyword-level similarity into one score.

    Both inputs are normalised, then compared in two ways: the averaged
    character n-gram cosine (``ngram_score``) and the synonym-aware keyword
    overlap (``keyword_overlap``).

    Args:
        utterance: Free text to match.
        candidate: Reference value to compare against.

    Returns:
        float: ``0.7 * n-gram similarity + 0.3 * keyword overlap``.
    """
    u_norm = normalize(utterance)
    c_norm = normalize(candidate)

    # Character-level component.
    char_component = ngram_score(u_norm, c_norm)
    # Word / synonym-level component.
    word_component = keyword_overlap(tokenize(u_norm), tokenize(c_norm))

    return 0.7 * char_component + 0.3 * word_component


In [7]:
# -------------------------------
# GÉNÉRATION DES CANDIDATS
# -------------------------------
def variants_nom(full):
    """List the accepted spellings of a person's name.

    Args:
        full: The full name as written in the record.

    Returns:
        list[str]: The normalised name, followed by the same two words in
        swapped order when the normalised name has exactly two words.
    """
    cleaned = normalize(full)
    words = cleaned.split()
    if len(words) != 2:
        # Only a two-word name has an unambiguous swapped form.
        return [cleaned]
    first, second = words
    return [cleaned, f"{second} {first}"]

def variants_heure(h):
    """List the accepted spellings of a time string.

    Args:
        h: Time as written in the record, e.g. ``"10:30"``.

    Returns:
        list[str]: Two entries: ``h`` itself, then ``h`` with every ``:``
        replaced by ``h``. The two are equal when ``h`` has no colon.
    """
    spoken_form = h.replace(':', 'h')
    return [h, spoken_form]

def variants_salle(s):
    """List the accepted spellings of an operating-room label.

    Args:
        s: Room label as written in the record, e.g. ``"Salle 3"``.

    Returns:
        list[str]: ``[s]`` when the label contains no digits; otherwise
        ``s`` followed by ``"salle <n>"`` and ``"bloc <n>"``, where ``<n>``
        is the first run of digits found in ``s``.
    """
    first_number = re.search(r'\d+', s)
    if first_number is None:
        return [s]
    n = first_number.group(0)
    return [s, f"salle {n}", f"bloc {n}"]

In [8]:
# -------------------------------
# FONCTION PRINCIPALE
# -------------------------------
def match_json_field(utterance, fiche, ok_threshold=0.88, uncertain_threshold=0.70):
    """Find the record field whose value best matches an utterance.

    Candidate strings are generated for each known field of ``fiche`` and
    scored against ``utterance`` with ``robust_similarity``; the single
    highest-scoring candidate is reported.

    Args:
        utterance: Free text to match.
        fiche: Record with a ``"patient"`` mapping (keys ``"prenom"`` and
            ``"nom"``) and an ``"intervention"`` mapping (keys
            ``"chirurgien"``, ``"heure_prevue"``, ``"site_operatoire"`` and
            ``"type"``; ``"anesthesiste"`` and ``"bloc"`` are optional).
        ok_threshold: Minimum score for the decision ``"OK"``.
        uncertain_threshold: Minimum score for the decision ``"INCERTAIN"``;
            anything lower is ``"KO"``.

    Returns:
        dict: ``utterance``, ``field`` (best field name, ``None`` if no
        candidate scored above 0), ``value`` (the matching candidate string,
        ``""`` if none), ``score`` (rounded to 3 decimals) and ``decision``.

    Raises:
        KeyError: If a required key is missing from ``fiche``.
    """
    pat = fiche["patient"]
    inter = fiche["intervention"]

    candidates = {
        "PATIENT_IDENTITE": variants_nom(f"{pat['prenom']} {pat['nom']}"),
        "CHIRURGIEN": variants_nom(inter["chirurgien"]),
        "ANESTHESISTE": variants_nom(inter.get("anesthesiste","")),
        "HEURE_PREVUE": variants_heure(inter["heure_prevue"]),
        "SITE_OPERATOIRE": [inter["site_operatoire"]],
        "INTERVENTION_TYPE": [inter["type"]],
        "SALLE": variants_salle(inter.get("bloc",""))
    }

    best_field, best_score, best_value = None, 0, ""
    for field, vals in candidates.items():
        for v in vals:
            if not v:
                # Missing optional fields produce an empty placeholder; there
                # is nothing to match against, so do not score it.
                continue
            s = robust_similarity(utterance, v)
            # Strict comparison: on a tie the earliest candidate is kept.
            if s > best_score:
                best_field, best_score, best_value = field, s, v

    if best_score >= ok_threshold:
        decision = "OK"
    elif best_score >= uncertain_threshold:
        decision = "INCERTAIN"
    else:
        decision = "KO"

    return {
        "utterance": utterance,
        "field": best_field,
        "value": best_value,
        "score": round(best_score,3),
        "decision": decision
    }

In [12]:
# Sample record used to exercise match_json_field.
fiche = {
    "patient": {
        "prenom": "Paul",
        "nom": "Dupont",
        "date_naissance": "1975-03-12",
    },
    "intervention": {
        "type": "Appendicectomie",
        "heure_prevue": "10:30",
        "site_operatoire": "Fosse iliaque droite",
        "chirurgien": "Dr. Lefèvre",
        "anesthesiste": "Dr. Bernard",
        "bloc": "Salle 3",
    },
}

# Utterances to match against the record, one per field of interest.
tests = [
    "aujourd'hui on opère Paul Dupont",
    "le patient c'est monsieur Dupont Paul",
    "on démarre à 10h30",
    "bloc numéro 3 prêt",
    "le docteur lefevre est là",
    "site fosse droite",
    "appendice prévu",
]

# Print the best match found for each utterance.
for utterance in tests:
    print(match_json_field(utterance, fiche))


{'utterance': "aujourd'hui on opère Paul Dupont", 'field': 'PATIENT_IDENTITE', 'value': 'paul dupont', 'score': 0.697, 'decision': 'KO'}
{'utterance': "le patient c'est monsieur Dupont Paul", 'field': 'PATIENT_IDENTITE', 'value': 'dupont paul', 'score': 0.684, 'decision': 'KO'}
{'utterance': 'on démarre à 10h30', 'field': 'HEURE_PREVUE', 'value': '10h30', 'score': 0.637, 'decision': 'KO'}
{'utterance': 'bloc numéro 3 prêt', 'field': 'SALLE', 'value': 'bloc 3', 'score': 0.547, 'decision': 'KO'}
{'utterance': 'le docteur lefevre est là', 'field': 'CHIRURGIEN', 'value': 'dr lefevre', 'score': 0.492, 'decision': 'KO'}
{'utterance': 'site fosse droite', 'field': 'SITE_OPERATOIRE', 'value': 'Fosse iliaque droite', 'score': 0.612, 'decision': 'KO'}
{'utterance': 'appendice prévu', 'field': 'INTERVENTION_TYPE', 'value': 'Appendicectomie', 'score': 0.349, 'decision': 'KO'}
