In [4]:
!pip install jupyter-analysis-tools

Collecting jupyter-analysis-tools
  Downloading jupyter_analysis_tools-1.6.2-py3-none-any.whl.metadata (47 kB)
  Downloading jupyter_analysis_tools-1.6.2-py3-none-any.whl.metadata (47 kB)
Collecting matplotlib (from jupyter-analysis-tools)
Collecting matplotlib (from jupyter-analysis-tools)
  Downloading matplotlib-3.10.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
  Downloading matplotlib-3.10.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting ipywidgets (from jupyter-analysis-tools)
Collecting ipywidgets (from jupyter-analysis-tools)
  Downloading ipywidgets-8.1.8-py3-none-any.whl.metadata (2.4 kB)
  Downloading ipywidgets-8.1.8-py3-none-any.whl.metadata (2.4 kB)
Collecting pybis (from jupyter-analysis-tools)
Collecting pybis (from jupyter-analysis-tools)
  Downloading pybis-1.37.4.tar.gz (184 kB)
  Downloading pybis-1.37.4.tar.gz (184 kB)
  Installing build dependencies ... [?25l  Installing build dependencies ..

In [None]:
import os, random, re, unicodedata, pandas as pd

# Number of (utterance, value, label) rows the generation loop must reach.
TARGET = 180_000
# Fix the global RNG state so every run draws the same sequence.
random.seed(42)

def strip_accents(s):
    """Return `s` with combining diacritical marks removed.

    The text is decomposed with NFD so an accented letter becomes its base
    character followed by combining marks; every character whose Unicode
    category is 'Mn' is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def norm(s):
    """Normalise free text to a lowercase, accent-free, single-spaced form.

    Steps, in order: lowercase and trim, strip accents, replace every
    character other than a-z, 0-9, ':', '/', space and '-' with a space,
    then collapse whitespace runs to one space and trim again.
    """
    lowered = strip_accents(s.lower().strip())
    cleaned = re.sub(r"[^a-z0-9:/ \-]", " ", lowered)
    return re.sub(r"\s+", " ", cleaned).strip()
def stt_noise(s):
    """Randomly perturb `s` with a few transcription-style substitutions.

    Apostrophes (typographic and ASCII) are always removed. Then, each
    decided by its own draw from the global RNG:
      * with probability 0.25, every ':' becomes 'h';
      * with probability 0.15, the whole word 'docteur' becomes 'dr';
      * with probability 0.08, 'bloc ' becomes 'blog '.
    Whitespace is finally collapsed to single spaces and trimmed.
    """
    # The three draws are taken unconditionally and in this order, so the
    # RNG stream does not depend on the content of `s`.
    swap_colon = random.random() < 0.25
    shorten_doctor = random.random() < 0.15
    mishear_bloc = random.random() < 0.08

    text = s.replace("’", "").replace("'", "")
    if swap_colon:
        text = text.replace(":", "h")
    if shorten_doctor:
        text = re.sub(r"\bdocteur\b", "dr", text)
    if mishear_bloc:
        text = text.replace("bloc ", "blog ")
    return re.sub(r"\s+", " ", text).strip()

# First names drawn for the synthetic patient.
FIRST = [
    "Claire", "Sophie", "Julie", "Emma", "Zoe", "Elise",
    "Anais", "Chloe", "Lea", "Maeva", "Nina", "Sarah",
    "Ava", "Lina", "Noemie", "Camille", "Manon", "Jeanne",
    "Paul", "Michel", "Thierry", "Hugo", "Jules", "Arthur",
    "Alexandre", "Louis", "Tom", "Noah", "Ethan", "Lucas",
    "Antoine", "Maxime", "Yanis",
]

# Last names drawn for the synthetic patient.
LAST = [
    "Dupont", "Martin", "Durand", "Nguyen", "Bernard", "Petit",
    "Lefevre", "Marchand", "Cohen", "Leroy", "Robert", "Morel",
    "Garcia", "Da Silva", "Rossi", "De la Tour", "ONeill",
]

# Last names drawn for surgeons / anaesthetists.
DOCS_LAST = [
    "Lefevre", "Bernard", "Marchand", "Cohen", "Leroy",
    "Robert", "Giraud", "Parent", "Perrot", "Renard",
    "Petit", "Martin", "Nguyen", "Dubois", "Lopez",
]

# Titles that may precede a doctor's last name.
DOC_PREFIX = ["Dr", "Docteur", "Pr", "Professeur"]

# Short acknowledgement phrases substituted for the {sv} template slot.
SHORT_VALID = ["ok", "oui", "valide", "confirme", "present", "c est ca"]

# Canonical operative site -> spoken aliases. Insertion order matters:
# entries are picked by position with the seeded RNG.
SITES = {
    "Fosse iliaque droite": [
        "FID",
        "fosse droite",
        "cote droit",
        "cote droit bas ventre",
    ],
    "Fosse iliaque gauche": [
        "FIG",
        "fosse gauche",
        "cote gauche",
    ],
    "Thorax": ["poitrine", "sternal"],
    "Genou droit": ["genou droit"],
    "Genou gauche": ["genou gauche"],
}

# Canonical procedure name -> spoken aliases (same ordering caveat as SITES).
OPS = {
    "Appendicectomie": ["appendice", "appendicite"],
    "Cholecystectomie": ["vesicule biliaire", "cholecystectomie"],
    "Herniorraphie inguinale": ["hernie inguinale"],
    "Prothese de hanche": ["prothese hanche", "arthroplastie hanche"],
    "Arthroscopie du genou": ["arthroscopie genou"],
    "Thyroidectomie": ["thyroidectomie", "thyroide"],
    "Mastectomie": ["mastectomie", "sein"],
}

# Utterance templates, one list per checklist field. Placeholders are
# filled with str.format; {sv} receives a short acknowledgement phrase.

# Patient identity.
TPL_PAT = [
    "le patient est {fn} {ln}",
    "la patiente est {fn} {ln}",
    "je repete {fn} {ln}",
    "{ln} {fn} ? {sv}",
]
# Scheduled time.
TPL_HEURE = [
    "a {h}",
    "heure prevue {h}",
    "on commence a {h}",
]
# Operating room.
TPL_SALLE = [
    "salle {n}",
    "bloc {n}",
    "en salle {n}",
]
# Surgeon.
TPL_CHIR = [
    "chirurgien {doc}",
    "docteur {doc} present",
    "chir {doc}",
    "{doc} {sv}",
]
# Anaesthetist.
TPL_ANES = [
    "anesth {doc}",
    "anesthesiste {doc}",
    "docteur {doc} pour anesthesie",
    "{doc} anesth {sv}",
]
# Procedure type.
TPL_TYPE = [
    "intervention {typ}",
    "on fait une {typ}",
    "type {typ}",
    "{typ} prevue",
]
# Operative site.
TPL_SITE = [
    "site {site}",
    "fosse {dir}",
    "site operatoire {site}",
    "{site} {sv}",
]

def rand_hour():
    """Draw a random time of day and return it as a string.

    The hour is uniform in 7..19 and the minute is one of 0/15/30/45.
    The result is either zero-padded 'HH:MM' or 'HhMM' (hour not padded),
    chosen at random.
    """
    hour = random.randint(7, 19)
    minute = random.choice([0, 15, 30, 45])
    colon_form = f"{hour:02d}:{minute:02d}"
    letter_form = f"{hour}h{minute:02d}"
    return random.choice([colon_form, letter_form])
def rand_room():
    """Return 'Salle N' or 'Bloc N' with N drawn uniformly from 1..8."""
    number = random.randint(1, 8)
    labels = [f"Salle {number}", f"Bloc {number}"]
    return random.choice(labels)
def person_variants(fn, ln):
    """List the accepted ways to refer to a person.

    Returns, in order: 'first last', 'last first', last name alone,
    first name alone.
    """
    first_then_last = f"{fn} {ln}"
    last_then_first = f"{ln} {fn}"
    return [first_then_last, last_then_first, ln, fn]
def surgeon_variants():
    """Pick a random doctor and return two ways of naming them.

    A last name is drawn from DOCS_LAST, then a title from DOC_PREFIX.
    Returns ['<title> <last name>', '<last name>'].
    """
    surname = random.choice(DOCS_LAST)
    title = random.choice(DOC_PREFIX)
    titled = f"{title} {surname}"
    return [titled, surname]
def site_variants(canon, aliases):
    """Return the normalised, de-duplicated names accepted for a site.

    Starts from the canonical name followed by its aliases. If any of them
    contains 'droite' the shorthands 'droite' and 'a droite' are appended;
    likewise 'gauche' / 'a gauche'. Each entry is passed through `norm`
    and duplicates are removed, keeping first-occurrence order.
    """
    candidates = [canon] + aliases
    for side in ("droite", "gauche"):
        if side in " ".join(candidates):
            candidates.extend([side, f"a {side}"])
    # NOTE(review): the masculine form "droit" (e.g. "Genou droit") does not
    # contain "droite", so it gets no right-side shorthands, whereas
    # "Genou gauche" does get the left-side ones - confirm this is intended.

    unique = []
    for text in candidates:
        normalised = norm(text)
        if normalised not in unique:
            unique.append(normalised)
    return unique
def op_variants(canon, aliases):
    """Return the normalised names accepted for a procedure.

    The canonical name comes first, followed by its aliases; each is passed
    through `norm` and duplicates are dropped, keeping first-occurrence order.
    """
    unique = []
    for text in [canon] + aliases:
        normalised = norm(text)
        if normalised not in unique:
            unique.append(normalised)
    return unique

def gen_pairs(fn,ln,hour,room,site_c,site_a,op_c,op_a,neg_k=2):
    """Generate labelled (utterance, value, label) rows for one synthetic checklist.

    For each checklist field (patient, time, room, surgeon/anaesthetist,
    procedure type, operative site) every template of that field is rendered
    into a noisy utterance, paired with its matching value(s) under label 1.0
    and with `neg_k` values taken from other fields under label 0.0.

    Args:
        fn, ln: patient first and last name.
        hour: scheduled time as a string.
        room: room label; its first run of digits is used as the room number
            ("3" when it contains none).
        site_c, site_a: canonical operative site and its list of aliases.
        op_c, op_a: canonical procedure name and its list of aliases.
        neg_k: number of negative rows attempted per utterance.

    Returns:
        A list of ``[utterance, value, label]`` lists, label being 1.0 or 0.0.
    """
    pairs = []

    # PATIENT
    vals = [norm(v) for v in person_variants(fn, ln)]
    for tpl in TPL_PAT:
        u = stt_noise(norm(tpl.format(fn=fn, ln=ln, sv=random.choice(SHORT_VALID))))
        for v in vals:
            pairs.append([u, v, 1.0])
        for _ in range(neg_k):
            candidates = [
                norm(hour),
                norm(room),
                norm(random.choice(surgeon_variants())),
                norm(random.choice(op_variants(op_c, op_a))),
            ]
            # Doctor and patient surname pools overlap, so a bare doctor
            # surname can equal one of this patient's accepted values.
            # Emitting it as a negative would give the same (utterance, value)
            # pair both labels, so such candidates are discarded.
            candidates = [c for c in candidates if c not in vals]
            if not candidates:
                continue
            pairs.append([u, random.choice(candidates), 0.0])

    # HEURE
    for tpl in TPL_HEURE:
        u = stt_noise(norm(tpl.format(h=hour, sv=random.choice(SHORT_VALID))))
        pairs.append([u, norm(hour), 1.0])
        for _ in range(neg_k):
            pairs.append([u, norm(random.choice(person_variants(fn, ln))), 0.0])

    # SALLE
    digits = re.findall(r"\d+", room)
    n = digits[0] if digits else "3"
    for tpl in TPL_SALLE:
        u = stt_noise(norm(tpl.format(n=n, sv=random.choice(SHORT_VALID))))
        pairs.append([u, norm(room), 1.0])
        for _ in range(neg_k):
            pairs.append([u, norm(random.choice(surgeon_variants())), 0.0])

    # CHIRURGIEN / ANESTHESISTE
    for tpl in TPL_CHIR + TPL_ANES:
        doc = random.choice(DOCS_LAST)
        u = stt_noise(norm(tpl.format(doc=doc, sv=random.choice(SHORT_VALID))))
        pairs.append([u, norm(f"Dr {doc}"), 1.0])
        for _ in range(neg_k):
            pairs.append([u, norm(hour), 0.0])

    # TYPE
    for tpl in TPL_TYPE:
        typ = random.choice(op_variants(op_c, op_a))
        u = stt_noise(norm(tpl.format(typ=typ, sv=random.choice(SHORT_VALID))))
        pairs.append([u, typ, 1.0])
        for _ in range(neg_k):
            pairs.append([u, norm(room), 0.0])

    # SITE
    # The side used by "{dir}" templates comes from the canonical site name,
    # not from the randomly chosen alias: aliases such as "fid" or
    # "cote droit" do not contain "droite" and would otherwise be rendered as
    # "gauche". Sites that are not a "fosse" have no valid side, so templates
    # using {dir} are skipped for them instead of producing a wrong positive.
    site_key = norm(site_c)
    if "fosse" in site_key and "droite" in site_key:
        side = "droite"
    elif "fosse" in site_key and "gauche" in site_key:
        side = "gauche"
    else:
        side = None
    for tpl in TPL_SITE:
        if "{dir}" in tpl and side is None:
            continue
        s = random.choice(site_variants(site_c, site_a))
        u = stt_noise(norm(tpl.format(site=s, dir=side, sv=random.choice(SHORT_VALID))))
        pairs.append([u, s, 1.0])
        for _ in range(neg_k):
            pairs.append([u, norm(random.choice(op_variants(op_c, op_a))), 0.0])

    return pairs

# Build the dataset: keep generating synthetic checklists until at least
# TARGET rows exist, then trim to exactly TARGET.
site_items = list(SITES.items())  # loop-invariant, built once
op_items = list(OPS.items())
pairs = []
while len(pairs) < TARGET:
    fn = random.choice(FIRST); ln = random.choice(LAST)
    hour = rand_hour(); room = rand_room()
    site_c, site_a = random.choice(site_items)
    op_c, op_a = random.choice(op_items)
    pairs.extend(gen_pairs(fn, ln, hour, room, site_c, site_a, op_c, op_a))
pairs = pairs[:TARGET]

out_path = "../data/pairs_checklist_180k.tsv"
# Create the output directory if needed so a fresh checkout does not fail
# with FileNotFoundError on open().
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "w", encoding="utf-8") as f:
    # One tab-separated row per pair: utterance, value, label.
    for u, v, label in pairs:
        f.write(f"{u}\t{v}\t{label}\n")

print(f"Généré : {len(pairs)} paires → {out_path}")
# Last expression of the cell: preview of the first ten rows.
pd.DataFrame(pairs[:10], columns=["U", "V", "label"])


✅ Généré : 180000 paires → ../data/pairs_checklist_180k.tsv


Unnamed: 0,U,V,label
0,le patient est chloe dupont,chloe dupont,1.0
1,le patient est chloe dupont,dupont chloe,1.0
2,le patient est chloe dupont,dupont,1.0
3,le patient est chloe dupont,chloe,1.0
4,le patient est chloe dupont,18:30,0.0
5,le patient est chloe dupont,thyroidectomie,0.0
6,la patiente est chloe dupont,chloe dupont,1.0
7,la patiente est chloe dupont,dupont chloe,1.0
8,la patiente est chloe dupont,dupont,1.0
9,la patiente est chloe dupont,chloe,1.0
