In [None]:
# Cell 1 — imports & helpers
import os, re, unicodedata, random, math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.utils import shuffle

# Reproductibilité
SEED = 42
random.seed(SEED); np.random.seed(SEED); tf.random.set_seed(SEED)

# Normalisation texte
def strip_accents(s: str) -> str:
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def norm(s: str) -> str:
    s = s.lower().strip()
    s = strip_accents(s)
    s = re.sub(r"[^a-z0-9:/ \-]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


2025-11-06 13:28:15.561517: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Cell 2 — load TSV
TSV_PATH = "../data/pairs_checklist_180k.tsv"  # ⬅️ adapte le chemin si besoin

df = pd.read_csv(TSV_PATH, sep="\t", header=None, names=["U","V","label"], dtype=str)
# Nettoyage doux
df["U"] = df["U"].astype(str).map(norm)
df["V"] = df["V"].astype(str).map(norm)
df["label"] = df["label"].astype(float)

print(df.head())
print("Total pairs:", len(df), "| positives:", int((df['label']==1.0).sum()), "| negatives:", int((df['label']==0.0).sum()))


                             U             V  label
0  le patient est chloe dupont  chloe dupont    1.0
1  le patient est chloe dupont  dupont chloe    1.0
2  le patient est chloe dupont        dupont    1.0
3  le patient est chloe dupont         chloe    1.0
4  le patient est chloe dupont         18:30    0.0
Total pairs: 180000 | positives: 76000 | negatives: 104000


In [3]:
# Cell 3 — lists + shuffle
U = df["U"].tolist()
V = df["V"].tolist()
y = df["label"].astype(float).tolist()

U, V, y = shuffle(U, V, y, random_state=SEED)

print("Sample:")
for i in range(5):
    print(U[i], "|", V[i], "=>", y[i])


NameError: name 'shuffle' is not defined

In [None]:
# ——— Cellule 5 : vectorisation char-level identique Python/Android ———
SEQLEN = 200
VOCAB_SZ = 300

vectorizer = layers.TextVectorization(
    standardize=None, split="character",
    output_mode="int", output_sequence_length=SEQLEN,
    max_tokens=VOCAB_SZ
)
vectorizer.adapt(np.array(U + V))
vocab = vectorizer.get_vocabulary()
with open("char_vocab_embed.txt","w",encoding="utf-8") as f:
    f.write("\n".join(vocab))



In [None]:
# ——— Cellule 6 (fix Keras 3) : encodeur compact (chars → 128d, L2-normalisé) ———
def build_encoder(vocab_size, seqlen, emb_dim=64, hid=64, out_dim=128):
    inp = keras.Input(shape=(seqlen,), dtype="int32")
    x = layers.Embedding(vocab_size, emb_dim, mask_zero=True)(inp)
    x = layers.Conv1D(64, 5, activation="relu")(x)
    x = layers.Conv1D(64, 3, activation="relu")(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dense(hid, activation="relu")(x)
    x = layers.Dense(out_dim, use_bias=False)(x)
    x = layers.UnitNormalization(axis=-1)(x)
    return keras.Model(inp, x, name="char_encoder")

encoder = build_encoder(len(vocab), SEQLEN)
encoder.summary()



In [None]:
# ——— Cellule 7 (fix Keras 3) : Siamese head (cosine ≡ dot + BCE) ———
def make_pairs_dataset(U, V, y, batch=256, val_split=0.1):
    X1 = vectorizer(np.array(U)).numpy()
    X2 = vectorizer(np.array(V)).numpy()
    idx = np.arange(len(y)); np.random.shuffle(idx)
    cut = int(len(y)*(1-val_split))
    tr, va = idx[:cut], idx[cut:]

    def ds(x1,x2,y):
        ds = tf.data.Dataset.from_tensor_slices(((x1,x2), y))
        return ds.shuffle(8192).batch(batch).prefetch(2)
    return ds(X1[tr],X2[tr],y[tr]), ds(X1[va],X2[va],y[va])

train_ds, val_ds = make_pairs_dataset(U,V,y)

# Inputs
u_in = keras.Input(shape=(SEQLEN,), dtype="int32")
v_in = keras.Input(shape=(SEQLEN,), dtype="int32")
u_vec = encoder(u_in)
v_vec = encoder(v_in)
cos = layers.Dot(axes=1, name="cosine_dot")([u_vec, v_vec])
scale = layers.Dense(1, use_bias=False,
                     kernel_initializer=keras.initializers.Constant(10.0))
logits = scale(cos)
out = layers.Activation("sigmoid")(logits)

siamese = keras.Model([u_in, v_in], out)
siamese.compile(optimizer=keras.optimizers.Adam(1e-3),
                loss="binary_crossentropy",
                metrics=["accuracy","AUC"])
history = siamese.fit(train_ds, validation_data=val_ds, epochs=8)


Epoch 1/8
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 187ms/step - AUC: 0.9489 - accuracy: 0.9160 - loss: 0.3191 - val_AUC: 0.9900 - val_accuracy: 0.9708 - val_loss: 0.1313
Epoch 2/8
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 176ms/step - AUC: 0.9932 - accuracy: 0.9787 - loss: 0.1142 - val_AUC: 0.9965 - val_accuracy: 0.9825 - val_loss: 0.1019
Epoch 3/8
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 176ms/step - AUC: 0.9962 - accuracy: 0.9834 - loss: 0.0956 - val_AUC: 0.9976 - val_accuracy: 0.9844 - val_loss: 0.0922
Epoch 4/8
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 180ms/step - AUC: 0.9979 - accuracy: 0.9864 - loss: 0.0848 - val_AUC: 0.9984 - val_accuracy: 0.9881 - val_loss: 0.0816
Epoch 5/8
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 175ms/step - AUC: 0.9987 - accuracy: 0.9881 - loss: 0.0794 - val_AUC: 0.9990 - val_accuracy: 0.9896 - val_loss: 0.0774
Epoch 6/8
[1m190/19

In [None]:
# ——— Cellule 8 : Export TFLite du modèle encodeur (quantized) ———

# On n’exporte que l’encodeur, pas le Siamese complet
converter = tf.lite.TFLiteConverter.from_keras_model(encoder)
converter.optimizations = [tf.lite.Optimize.DEFAULT]  # quantization dynamique (facultatif)
tflite_model = converter.convert()

# Sauvegarde du modèle TFLite
with open("encoder_embed.tflite", "wb") as f:
    f.write(tflite_model)

# Sauvegarde du vocab utilisé pour la vectorisation (à embarquer sur Android)
with open("char_vocab_embed.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(vocab))

print("Export OK → encoder_embed.tflite + char_vocab_embed.txt")




INFO:tensorflow:Assets written to: /tmp/tmpko9u17f8/assets


INFO:tensorflow:Assets written to: /tmp/tmpko9u17f8/assets


Saved artifact at '/tmp/tmpko9u17f8'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 200), dtype=tf.int32, name='keras_tensor_27')
Output Type:
  TensorSpec(shape=(None, 128), dtype=tf.float32, name=None)
Captures:
  140009229496976: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140010767950928: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140009229497360: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140009229498896: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140009229498128: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140009229494096: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140009229500240: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140009229500048: TensorSpec(shape=(), dtype=tf.resource, name=None)


W0000 00:00:1762427026.172773   16259 tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.


Export OK → encoder_embed.tflite + char_vocab_embed.txt


W0000 00:00:1762427026.173096   16259 tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2025-11-06 12:03:46.173748: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpko9u17f8
2025-11-06 12:03:46.174696: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-11-06 12:03:46.174721: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmpko9u17f8
2025-11-06 12:03:46.179561: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-11-06 12:03:46.208423: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmpko9u17f8
2025-11-06 12:03:46.216687: I tensorflow/cc/saved_model/loader.cc:471] SavedModel load for tags { serve }; Status: success: OK. Took 42945 microseconds.


In [None]:
import numpy as np, re, unicodedata, tensorflow as tf

# Charger modèle TFLite + vocab
interpreter = tf.lite.Interpreter(model_path="encoder_embed.tflite")
interpreter.allocate_tensors()
in_det = interpreter.get_input_details(); out_det = interpreter.get_output_details()
SEQLEN = int(in_det[0]['shape'][1])

with open("char_vocab_embed.txt","r",encoding="utf-8") as f:
    vocab=[l.strip() for l in f]
tok2id={t:i for i,t in enumerate(vocab)}; UNK=tok2id.get("[UNK]",1)

def normalize_text(s):
    s=s.lower().strip()
    s=''.join(c for c in unicodedata.normalize('NFD',s) if unicodedata.category(c)!='Mn')
    s=re.sub(r'[^a-z0-9:/ \-]',' ',s); s=re.sub(r'\s+',' ',s).strip()
    return s

def vectorize(texts):
    X=np.zeros((len(texts),SEQLEN),dtype=np.int32)
    for i,t in enumerate(texts):
        t=normalize_text(t)
        for j,ch in enumerate(t[:SEQLEN]):
            X[i,j]=tok2id.get(ch,UNK)
    return X

def embed_texts(texts):
    X=vectorize(texts)
    interpreter.resize_tensor_input(in_det[0]['index'], [len(texts),SEQLEN])
    interpreter.allocate_tensors()
    in_d=interpreter.get_input_details()[0]; out_d=interpreter.get_output_details()[0]
    interpreter.set_tensor(in_d['index'],X)
    interpreter.invoke()
    return interpreter.get_tensor(out_d['index'])

def cosine(a,b): return float(np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b)+1e-9))

# ---- Fenêtrisation ----
def word_windows(text,min_w=2,max_w=6):
    toks=normalize_text(text).split(); spans=[]
    for w in range(min_w,min(max_w,len(toks))+1):
        for i in range(0,len(toks)-w+1):
            spans.append(' '.join(toks[i:i+w]))
    return spans or [normalize_text(text)]

def best_cosine_over_spans(utter,candidate):
    spans=word_windows(utter)
    e_sp=embed_texts(spans); e_c=embed_texts([candidate])[0]
    dots=(e_sp@e_c)/(np.linalg.norm(e_sp,axis=1)*np.linalg.norm(e_c)+1e-9)
    j=int(np.argmax(dots))
    return float(dots[j]),spans[j]

# ---- Démo ----
utter="opération de l'appendice"
cands=["Paul Dupont","Dupont Paul","10:30","Salle 3","Dr. Lefèvre","Appendicectomie"]

for c in cands:
    s,span=best_cosine_over_spans(utter,c)
    print(f"{c:20s} → {s:.3f} (best span: '{span}')")


Paul Dupont          → 0.486 (best span: 'de l')
Dupont Paul          → 0.452 (best span: 'de l')
10:30                → 0.382 (best span: 'de l')
Salle 3              → 0.006 (best span: 'operation de l')
Dr. Lefèvre          → 0.448 (best span: 'de l')
Appendicectomie      → 0.979 (best span: 'l appendice')


    TF 2.20. Please use the LiteRT interpreter from the ai_edge_litert package.
    See the [migration guide](https://ai.google.dev/edge/litert/migration)
    for details.
    
