In [1]:
# Cell 1 — imports & helpers
import os, re, unicodedata, random, math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.utils import shuffle

# Reproductibilité
SEED = 42
random.seed(SEED); np.random.seed(SEED); tf.random.set_seed(SEED)

# Normalisation texte
def strip_accents(s: str) -> str:
    """Return ``s`` with combining diacritical marks removed.

    The text is decomposed to Unicode NFD form, then every code point whose
    Unicode category is ``Mn`` is dropped. Characters that NFD does not
    decompose are left untouched.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def norm(s: str) -> str:
    """Canonicalise a piece of text.

    Steps, in order: lowercase and trim, strip accents, replace every
    character outside ``a-z``, ``0-9``, ``:``, ``/``, space and ``-`` by a
    space, then collapse runs of whitespace and trim again.
    """
    lowered = strip_accents(s.lower().strip())
    # Any character not in the allowed set is turned into a space rather than
    # deleted, so the words on either side of it stay separated.
    cleaned = re.sub(r"[^a-z0-9:/ \-]", " ", lowered)
    return re.sub(r"\s+", " ", cleaned).strip()


2025-11-06 15:05:44.660633: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Cell 2 — load TSV
TSV_PATH = "../data/pairs_checklist_180k.tsv"  # ⬅️ adjust the path if needed

# Read every field verbatim as text. keep_default_na=False stops pandas from
# turning tokens such as "NA", "null" or "nan" into missing values, which the
# .astype(str) below would otherwise convert into the literal string "nan".
df = pd.read_csv(TSV_PATH, sep="\t", header=None, names=["U","V","label"],
                 dtype=str, keep_default_na=False)
# Light clean-up: apply the same normalisation that vectorize_texts applies
# at inference time.
df["U"] = df["U"].astype(str).map(norm)
df["V"] = df["V"].astype(str).map(norm)
df["label"] = df["label"].astype(float)

# Fail early instead of feeding NaN targets to the loss: a missing label can
# still appear for rows that have fewer than three fields.
if df["label"].isna().any():
    raise ValueError(f"{int(df['label'].isna().sum())} row(s) in {TSV_PATH} have no label")

print(df.head())
print("Total pairs:", len(df), "| positives:", int((df['label']==1.0).sum()), "| negatives:", int((df['label']==0.0).sum()))


                             U             V  label
0  le patient est chloe dupont  chloe dupont    1.0
1  le patient est chloe dupont  dupont chloe    1.0
2  le patient est chloe dupont        dupont    1.0
3  le patient est chloe dupont         chloe    1.0
4  le patient est chloe dupont         18:30    0.0
Total pairs: 180000 | positives: 76000 | negatives: 104000


In [3]:
# Cell 3 — lists + shuffle
# Pull the three columns out of the DataFrame as plain Python lists:
# U and V are the two texts of each pair, y the float label.
U = df["U"].tolist()
V = df["V"].tolist()
y = df["label"].astype(float).tolist()

# Shuffle the three lists together (same permutation for each) with a fixed
# seed so that the order is reproducible.
U, V, y = shuffle(U, V, y, random_state=SEED)

# Sanity check: print the first five shuffled pairs with their label.
print("Sample:")
for i in range(5):
    print(U[i], "|", V[i], "=>", y[i])


Sample:
je repete lina petit | cholecystectomie => 0.0
le patient est ava da silva | nguyen => 0.0
a 18h00 | durand => 0.0
type prothese hanche | prothese hanche => 1.0
intervention arthroscopie genou | bloc 2 => 0.0


In [4]:
# Cell 4 — char vectorizer
# Fixed length of every encoded sequence (output_sequence_length below).
SEQLEN = 200
MAX_TOKENS = 300  # maximum vocabulary size (characters)

# Character-level vectorizer: no built-in standardisation (the text was
# already passed through norm when the TSV was loaded), one token per
# character, integer ids, every sequence brought to SEQLEN ids.
vectorizer = layers.TextVectorization(
    standardize=None,
    split="character",
    output_mode="int",
    output_sequence_length=SEQLEN,
    max_tokens=MAX_TOKENS
)
vectorizer.adapt(np.array(U + V))  # IMPORTANT: adapt on all the text (both U and V)

vocab = vectorizer.get_vocabulary()
print("Vocab size:", len(vocab))
# Saved for Android: one token per line, so the line index is the token id.
# The same file is read back by the TFLite inference cell.
with open("char_vocab_embed.txt","w",encoding="utf-8") as f:
    f.write("\n".join(vocab))


2025-11-06 13:33:30.144878: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Vocab size: 39


In [5]:
# ——— Cell 5 (Keras 3 fix): compact encoder (chars → 128-d, L2-normalized) ———
def build_encoder(vocab_size, seqlen, emb_dim=64, hid=64, out_dim=128):
    """Build the character encoder: int ids of length ``seqlen`` -> unit vector.

    Architecture: Embedding -> Conv1D(64, 5) -> Conv1D(64, 3) ->
    GlobalMaxPooling1D -> Dense(hid, relu) -> Dense(out_dim, no bias) ->
    UnitNormalization on the last axis.

    Args:
        vocab_size: number of rows of the embedding table.
        seqlen: length of the integer input sequence.
        emb_dim: embedding dimension.
        hid: width of the hidden dense layer.
        out_dim: dimension of the returned embedding.

    Returns:
        A ``keras.Model`` named ``"char_encoder"``.

    NOTE(review): the embedding is created with ``mask_zero=True`` but is
    followed by Conv1D / GlobalMaxPooling1D; these layers presumably do not
    consume the mask, so it likely has no effect on the output — confirm.
    """
    char_ids = keras.Input(shape=(seqlen,), dtype="int32")
    embedded = layers.Embedding(vocab_size, emb_dim, mask_zero=True)(char_ids)
    conv_wide = layers.Conv1D(64, 5, activation="relu")(embedded)
    conv_narrow = layers.Conv1D(64, 3, activation="relu")(conv_wide)
    pooled = layers.GlobalMaxPooling1D()(conv_narrow)
    hidden = layers.Dense(hid, activation="relu")(pooled)
    projected = layers.Dense(out_dim, use_bias=False)(hidden)
    # L2-normalise so that a dot product of two outputs is their cosine.
    unit_vec = layers.UnitNormalization(axis=-1)(projected)
    return keras.Model(char_ids, unit_vec, name="char_encoder")

encoder = build_encoder(len(vocab), SEQLEN)
encoder.summary()



In [6]:
# Cell 6 — TF datasets
def make_pairs_dataset(U, V, y, batch=256, val_split=0.1):
    """Vectorise the pairs and return ``(train_ds, val_ds)`` tf.data datasets.

    Args:
        U, V: lists of texts (same length); encoded with the global
            ``vectorizer``.
        y: list of labels, converted to float32.
        batch: batch size of both datasets.
        val_split: fraction of the pairs (taken from the end) used for
            validation; must be within [0, 1].

    Returns:
        Two datasets yielding ``((x1, x2), y)`` batches. Only the training
        dataset is shuffled: validation metrics do not depend on the order,
        so shuffling the validation set on every epoch is wasted work.

    Raises:
        ValueError: if ``val_split`` is outside [0, 1].

    NOTE(review): the split is positional over pairs, so the same utterance U
    can land on both sides (the head of the TSV shows one U paired with
    several V). Validation scores may therefore be optimistic — consider
    splitting by U.
    """
    if not 0.0 <= val_split <= 1.0:
        raise ValueError(f"val_split must be within [0, 1], got {val_split!r}")

    # Vectorise to numpy with the Keras TextVectorization layer (small CPU cost).
    X1 = vectorizer(np.array(U)).numpy()
    X2 = vectorizer(np.array(V)).numpy()
    y_arr = np.array(y, dtype=np.float32)

    n = len(y_arr)
    cut = int(n * (1 - val_split))

    def ds(x1, x2, yy, training):
        d = tf.data.Dataset.from_tensor_slices(((x1, x2), yy))
        if training:
            d = d.shuffle(8192, seed=SEED)
        return d.batch(batch).prefetch(tf.data.AUTOTUNE)

    train = ds(X1[:cut], X2[:cut], y_arr[:cut], training=True)
    val = ds(X1[cut:], X2[cut:], y_arr[cut:], training=False)
    return train, val

train_ds, val_ds = make_pairs_dataset(U, V, y, batch=256, val_split=0.1)
for (x1b, x2b), yb in train_ds.take(1):
    print(x1b.shape, x2b.shape, yb.shape)


(256, 200) (256, 200) (256,)


2025-11-06 13:34:15.331466: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [7]:
# Cell 7 — Siamese with cosine ≡ dot (vectors L2-normalized)
# Two integer inputs of length SEQLEN, one per side of the pair.
u_in = keras.Input(shape=(SEQLEN,), dtype="int32")
v_in = keras.Input(shape=(SEQLEN,), dtype="int32")

# The same encoder instance is applied to both inputs, so both towers share
# their weights.
u_vec = encoder(u_in)
v_vec = encoder(v_in)

# cos = dot because the encoder ends with UnitNormalization
cos = layers.Dot(axes=1, name="cosine_dot")([u_vec, v_vec])  # [B,1]

# Learnable scale α: a Dense(1) without bias whose single weight starts at 10,
# turning the cosine into a logit.
scale = layers.Dense(1, use_bias=False,
                     kernel_initializer=keras.initializers.Constant(10.0),
                     name="scale_alpha")
logits = scale(cos)                          # [B,1]
out = layers.Activation("sigmoid", name="prob")(logits)

siamese = keras.Model([u_in, v_in], out, name="siamese_cosine")

# Binary classification of the pair label, tracked with accuracy and AUC.
siamese.compile(optimizer=keras.optimizers.Adam(1e-3),
                loss="binary_crossentropy",
                metrics=["accuracy", keras.metrics.AUC(name="AUC")])

# Training updates `encoder` in place through the siamese model; the export
# cell converts that encoder only.
history = siamese.fit(train_ds, validation_data=val_ds, epochs=8)


Epoch 1/8
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 223ms/step - AUC: 0.9965 - accuracy: 0.9875 - loss: 0.0543 - val_AUC: 1.0000 - val_accuracy: 0.9988 - val_loss: 0.0072
Epoch 2/8
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 214ms/step - AUC: 1.0000 - accuracy: 0.9986 - loss: 0.0079 - val_AUC: 1.0000 - val_accuracy: 0.9983 - val_loss: 0.0067
Epoch 3/8
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 209ms/step - AUC: 1.0000 - accuracy: 0.9990 - loss: 0.0065 - val_AUC: 1.0000 - val_accuracy: 0.9996 - val_loss: 0.0054
Epoch 4/8
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 210ms/step - AUC: 1.0000 - accuracy: 0.9992 - loss: 0.0059 - val_AUC: 1.0000 - val_accuracy: 0.9988 - val_loss: 0.0058
Epoch 5/8
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 209ms/step - AUC: 1.0000 - accuracy: 0.9994 - loss: 0.0055 - val_AUC: 1.0000 - val_accuracy: 0.9996 - val_loss: 0.0051
Epoch 6/8
[1m6

In [8]:
# Cell 8 — Export TFLite (encoder only)
# Convert the encoder alone (not the siamese head) to a TFLite flatbuffer.
converter = tf.lite.TFLiteConverter.from_keras_model(encoder)
converter.optimizations = [tf.lite.Optimize.DEFAULT]  # dynamic quantization
tflite_model = converter.convert()

# Write the converted model next to the notebook.
with open("encoder_embed.tflite", "wb") as f:
    f.write(tflite_model)

# The vocabulary was already saved in Cell 4 -> char_vocab_embed.txt
print("✅ Export OK → encoder_embed.tflite | char_vocab_embed.txt")


INFO:tensorflow:Assets written to: /tmp/tmpdtna6iq4/assets


INFO:tensorflow:Assets written to: /tmp/tmpdtna6iq4/assets


Saved artifact at '/tmp/tmpdtna6iq4'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 200), dtype=tf.int32, name='keras_tensor')
Output Type:
  TensorSpec(shape=(None, 128), dtype=tf.float32, name=None)
Captures:
  140497700289808: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140497664737552: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140497664737360: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140497664740432: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140497664741584: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140497664741968: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140497664741008: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140497664742160: TensorSpec(shape=(), dtype=tf.resource, name=None)
✅ Export OK → encoder_embed.tflite | char_vocab_embed.txt


W0000 00:00:1762433685.226142   39141 tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
W0000 00:00:1762433685.226524   39141 tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2025-11-06 13:54:45.230083: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpdtna6iq4
2025-11-06 13:54:45.231996: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-11-06 13:54:45.232044: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmpdtna6iq4
I0000 00:00:1762433685.249934   39141 mlir_graph_optimization_pass.cc:437] MLIR V1 optimization pass is not enabled
2025-11-06 13:54:45.253074: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-11-06 13:54:45.317436: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmpdtna6iq4
2025-11-06 13:54:45.342139: I tensorflow/cc/saved_model/loader.cc:471] SavedModel 

In [3]:
# Cell 9 — TFLite inference helpers (batch & resize)
# Load the exported encoder and cache its input/output tensor descriptions.
interpreter = tf.lite.Interpreter(model_path="encoder_embed.tflite")
interpreter.allocate_tensors()
_in = interpreter.get_input_details()
_out = interpreter.get_output_details()
# Sequence length is read from the model's input shape (rebinds the global SEQLEN).
SEQLEN = int(_in[0]["shape"][1])  # expected to be 200

# Load the vocabulary: one token per line, the line index is the token id.
# Only the trailing newline is stripped, so a token that is a single space
# is preserved.
with open("char_vocab_embed.txt","r",encoding="utf-8") as f:
    vocab = [l.rstrip("\n") for l in f]
tok2id = {t:i for i,t in enumerate(vocab)}
# Id used for characters missing from the vocabulary; falls back to 1 when
# the file has no "[UNK]" entry.
UNK = tok2id.get("[UNK]", 1)

def vectorize_texts(texts, seqlen=SEQLEN):
    """Encode texts as an int32 matrix of character ids, shape (len(texts), seqlen).

    Each text is normalised with ``norm``, truncated to ``seqlen`` characters
    and mapped through ``tok2id``; unknown characters get ``UNK``. Positions
    beyond the end of a text keep the value 0.
    """
    ids = np.zeros((len(texts), seqlen), dtype=np.int32)
    for row, raw in enumerate(texts):
        codes = [tok2id.get(ch, UNK) for ch in norm(raw)[:seqlen]]
        if codes:
            ids[row, :len(codes)] = codes
    return ids

def embed_texts(texts):
    """Embed a batch of texts with the TFLite encoder.

    Args:
        texts: iterable of strings (materialised to a list, so generators
            are accepted too).

    Returns:
        Array of shape (N, D) with one embedding per text, where D is the
        last dimension of the model output. An empty input returns an
        array of shape (0, D) without invoking the interpreter.
    """
    texts = list(texts)
    if not texts:
        # Avoid resizing the interpreter input to a zero-sized batch.
        return np.zeros((0, int(_out[0]["shape"][-1])), dtype=_out[0]["dtype"])

    X = vectorize_texts(texts, SEQLEN)
    # Resize the input to [N, SEQLEN]; tensors must be re-allocated after a
    # resize, and the details are re-read afterwards.
    interpreter.resize_tensor_input(_in[0]['index'], [len(texts), SEQLEN])
    interpreter.allocate_tensors()
    in_d = interpreter.get_input_details()[0]
    out_d = interpreter.get_output_details()[0]
    interpreter.set_tensor(in_d['index'], X)
    interpreter.invoke()
    return interpreter.get_tensor(out_d['index'])  # [N, D]

def cosine(a, b):
    """Cosine similarity of two vectors as a Python float.

    A small epsilon (1e-9) is added to the product of the norms, so a zero
    vector yields 0.0 instead of a division by zero.
    """
    scale = np.linalg.norm(a) * np.linalg.norm(b) + 1e-9
    return float(np.dot(a, b) / scale)


    TF 2.20. Please use the LiteRT interpreter from the ai_edge_litert package.
    See the [migration guide](https://ai.google.dev/edge/litert/migration)
    for details.
    
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [8]:
# Cell 10 — span search to avoid dilution on long utterances
def word_windows(text, min_w=2, max_w=6):
    """List every contiguous word window of the normalised text.

    Windows are produced for each width from ``min_w`` up to
    ``min(max_w, number of words)``, shortest widths first and left to right
    within a width. When no window can be formed (empty text, or fewer words
    than ``min_w``), a one-element list holding the normalised text is
    returned.
    """
    normalized = norm(text)
    words = normalized.split()
    if not words:
        return [normalized]
    widest = min(max_w, len(words))
    windows = [
        ' '.join(words[start:start + width])
        for width in range(min_w, widest + 1)
        for start in range(len(words) - width + 1)
    ]
    return windows if windows else [normalized]

# def best_cosine_over_spans(utterance: str, candidate: str):
#     spans = word_windows(utterance, 2, 6)
#     E_spans = embed_texts(spans)       # batch
#     E_cand = embed_texts([candidate])[0]
#     # cosines
#     dots = (E_spans @ E_cand) / (np.linalg.norm(E_spans, axis=1) * np.linalg.norm(E_cand) + 1e-9)
#     j = int(np.argmax(dots))
#     return float(dots[j]), spans[j]

# # Demo
# utter = "Opération en salle trois"
# cands = ["Paul Dupont", "Dupont Paul", "10:30", "Salle 3", "Dr. Lefevre", "Appendicectomie"]

# for c in cands:
#     s, span = best_cosine_over_spans(utter, c)
#     print(f"{c:20s} → {s:.3f} | best span: '{span}'")


In [14]:
# # Cell 11 — decision thresholds
# def decide(score: float, ok=0.88, maybe=0.70):
#     return "OK" if score >= ok else ("INCERTAIN" if score >= maybe else "KO")

# def match_utterance_to_candidates(utterance: str, candidates: list):
#     results = []
#     for c in candidates:
#         s, span = best_cosine_over_spans(utterance, c)
#         results.append((c, s, span, decide(s)))
#     # tri par score décroissant
#     results.sort(key=lambda x: -x[1])
#     return results

# # Demo decision
# res = match_utterance_to_candidates(
#     "patient dupont paul confirmé, bloc trois ok, démarrage 10h30",
#     ["Paul Dupont", "Dupont Paul", "10:30", "Salle 3", "Dr. Bernard", "Appendicectomie"]
# )
# for c, s, span, dec in res:
#     print(f"{c:20s} | {s:.3f} | {dec} | span='{span}'")


Dupont Paul          | 1.000 | OK | span='dupont paul'
Salle 3              | 0.992 | OK | span='demarrage 10h30'
Paul Dupont          | 0.985 | OK | span='dupont paul'
10:30                | 0.963 | OK | span='demarrage 10h30'
Dr. Bernard          | -0.059 | KO | span='ok demarrage'
Appendicectomie      | -0.222 | KO | span='demarrage 10h30'


In [9]:
# --- utilitaires n-gram char ---
import math, numpy as np, re, unicodedata

def char_ngrams(s, n=3):
    """Count the character n-grams of ``s`` after padding it with one space on each side.

    Returns a dict mapping each n-gram to its number of occurrences, in order
    of first appearance. A padded string shorter than ``n`` gives an empty dict.
    """
    padded = " " + s + " "
    # A negative stop makes range() empty, which covers the too-short case.
    grams = (padded[k:k + n] for k in range(len(padded) - n + 1))
    counts = {}
    for gram in grams:
        counts[gram] = 1 + counts.get(gram, 0)
    return counts

def cosine_counts(a, b):
    """Cosine similarity between two count dictionaries (key -> count).

    Returns 0.0 when either dictionary has a zero norm (e.g. is empty).
    """
    norm_a = math.sqrt(sum(c * c for c in a.values()))
    norm_b = math.sqrt(sum(c * c for c in b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    # Only keys present on both sides contribute to the dot product.
    overlap = sum(a[k] * b[k] for k in a.keys() & b.keys())
    return overlap / (norm_a * norm_b)

def ngram_sim(u, v):
    """Average of the character 3-, 4- and 5-gram cosine similarities of two texts.

    Both texts are normalised with ``norm`` first. The sum is always divided
    by 3, so an n-gram size for which a text is too short to produce any
    n-gram contributes 0 to the average.
    """
    left, right = norm(u), norm(v)
    per_size = [
        cosine_counts(char_ngrams(left, size), char_ngrams(right, size))
        for size in (3, 4, 5)
    ]
    return sum(per_size) / 3.0


In [10]:
# --- spans qui doivent contenir au moins 1 token du candidat (optionnel mais recommandé pour NOMS) ---
def best_cosine_over_spans_with_overlap(utterance: str, candidate: str, require_overlap_tokens=True):
    """Find the utterance window that best matches ``candidate``.

    Word windows of 2 to 6 words are taken from the utterance. When
    ``require_overlap_tokens`` is true, only windows sharing at least one
    token with the normalised candidate are scored; if no window qualifies,
    all windows are scored instead.

    Returns:
        ``(final, best_span, embed_score, ng)`` where ``embed_score`` is the
        best embedding cosine, ``ng`` the n-gram similarity between that
        window and the candidate, and ``final = 0.7*embed_score + 0.3*ng``.
    """
    windows = word_windows(utterance, 2, 6)
    wanted = set(norm(candidate).split())
    if require_overlap_tokens:
        overlapping = [w for w in windows if wanted & set(w.split())]
    else:
        overlapping = []
    # Fall back to every window when nothing overlaps.
    pool = overlapping or windows

    span_vecs = embed_texts(pool)
    cand_vec = embed_texts([candidate])[0]
    denom = np.linalg.norm(span_vecs, axis=1)*np.linalg.norm(cand_vec)+1e-9
    sims = (span_vecs @ cand_vec) / denom
    top = int(np.argmax(sims))
    best_span = pool[top]
    embed_score = float(sims[top])

    # Hybrid score (embedding + character n-grams).
    ng = ngram_sim(best_span, candidate)
    final = 0.7*embed_score + 0.3*ng
    return final, best_span, embed_score, ng


In [11]:
# --- décision + wrapper multi-candidats ---
def decide(score: float, ok=0.88, maybe=0.70):
    """Map a score to a verdict.

    Returns ``"OK"`` when ``score >= ok``, ``"INCERTAIN"`` when
    ``score >= maybe``, and ``"KO"`` otherwise.
    """
    if score >= ok:
        return "OK"
    if score >= maybe:
        return "INCERTAIN"
    return "KO"

def match_utterance_to_candidates(utterance: str, candidates: list, require_overlap_for_names=True):
    """Score every candidate against the utterance, best score first.

    A candidate is treated as a person name when its normalised form has at
    least two tokens and the first two are purely alphabetic; for such
    candidates (and only when ``require_overlap_for_names`` is set) the span
    search is asked to require token overlap.

    Returns:
        A list of ``(candidate, score, span, verdict, embed_score, ngram_score)``
        tuples sorted by decreasing ``score``.
    """
    scored = []
    for cand in candidates:
        # Heuristic: does the candidate look like a person name?
        words = norm(cand).split()
        looks_like_name = len(words) >= 2 and words[0].isalpha() and words[1].isalpha()
        need_overlap = (require_overlap_for_names and looks_like_name)
        score, span, embed_part, ngram_part = best_cosine_over_spans_with_overlap(
            utterance, cand, require_overlap_tokens=need_overlap
        )
        scored.append((cand, score, span, decide(score), embed_part, ngram_part))
    return sorted(scored, key=lambda row: -row[1])


In [12]:
# Demo: score one utterance against a mix of name and non-name candidates
# using the hybrid (embedding + n-gram) matcher.
res = match_utterance_to_candidates(
    "Le patient est Paul Dupont, opération à dix heures trente en salle quatre avec le Dr. Bernard",
    ["Bruno Romuald", "Romuald Bruno", "Paul Dupont", "Dupont Paul", "10:30", "Salle 4", "Dr. Bernard", "Appendicectomie"]
)

# One line per candidate, best score first: final score, verdict, best span,
# then the two components of the final score.
for c, s, span, dec, s_embed, s_ng in res:
    print(f"{c:20s} | {s:.3f} | {dec} | span='{span}' | embed={s_embed:.3f} | ng={s_ng:.3f}")

Paul Dupont          | 1.000 | OK | span='paul dupont' | embed=1.000 | ng=1.000
Dr. Bernard          | 1.000 | OK | span='dr bernard' | embed=1.000 | ng=1.000
Dupont Paul          | 0.918 | OK | span='est paul dupont' | embed=0.988 | ng=0.755
Romuald Bruno        | 0.612 | KO | span='avec le' | embed=0.874 | ng=0.000
Appendicectomie      | 0.603 | KO | span='dr bernard' | embed=0.861 | ng=0.000
Bruno Romuald        | 0.597 | KO | span='heures trente' | embed=0.854 | ng=0.000
Salle 4              | 0.035 | KO | span='a dix' | embed=0.050 | ng=0.000
10:30                | -0.081 | KO | span='a dix' | embed=-0.116 | ng=0.000


In [None]:
# Cell X — use MatchEngine from src/match.py
import os, sys

# Make ../src importable so that match.py can be found. The entry is added
# only once, so re-running the cell does not keep growing sys.path.
SRC_DIR = os.path.join('..', 'src')
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)
from match import MatchEngine

# Try a few likely locations for the tflite model and vocab (notebook cwd, ../src)
tries = [
    ('encoder_embed.tflite', 'char_vocab_embed.txt'),
    (os.path.join('..','src','encoder_embed.tflite'), os.path.join('..','src','char_vocab_embed.txt')),
    (os.path.join('..','src','encoder_embed.tflite'), 'char_vocab_embed.txt'),
    ('encoder_embed.tflite', os.path.join('..','src','char_vocab_embed.txt'))
]
me = None
last_exc = None
# The loop variables are deliberately NOT called `vocab`: that name is the
# notebook-global vocabulary list used by earlier cells, and rebinding it to
# a path string would silently break them on re-run.
for tflite_path, vocab_path in tries:
    try:
        me = MatchEngine(tflite_path=tflite_path, vocab_path=vocab_path)
    except Exception as e:
        # Keep the failure visible instead of dropping it silently, then
        # move on to the next candidate location.
        last_exc = e
        print('Failed to load MatchEngine with', tflite_path, vocab_path, '->', repr(e))
        continue
    print('Loaded MatchEngine with', tflite_path, vocab_path)
    break

if me is None:
    # Every location failed: re-raise the last error.
    raise last_exc

# Sample utterance and candidates to match
utter = 'Le patient est Paul Dupont, opération à dix heures trente en salle quatre avec le Dr. Bernard'
candidates = ["Bruno Romuald", "Romuald Bruno", "Paul Dupont", "Dupont Paul", "10:30", "Salle 4", "Dr. Bernard", "Appendicectomie"]

res = me.match_utterance_to_candidates(utter, candidates)
for c, s, span, dec, s_embed, s_ng in res:
    print(f"{c:20s} | {s:.3f} | {dec} | span='{span}' | embed={s_embed:.3f} | ng={s_ng:.3f}")
