In [None]:
import pandas as pd, random, re, unicodedata, os
def norm(s):
    """Return a canonical ASCII form of *s* for pair generation.

    The value is converted with ``str()``, accents are stripped (NFD
    decomposition followed by dropping every non-ASCII code point), the text
    is lower-cased, every character other than ``a-z``, ``0-9``, ``:``, ``/``,
    space and ``-`` becomes a space, and runs of whitespace collapse to a
    single space with no leading or trailing blank.

    ``None`` yields the empty string.
    """
    if s is None:
        return ""
    decomposed = unicodedata.normalize("NFD", str(s))
    # Encoding with "ignore" silently drops the combining marks left by NFD
    # (and any other non-ASCII character).
    ascii_only = decomposed.encode("ascii", "ignore").decode("utf-8")
    folded = ascii_only.lower().strip()
    cleaned = re.sub(r"[^a-z0-9:/ \ -]", " ", folded)
    # After the substitution spaces are the only whitespace left, so a
    # split/join collapses runs and trims both ends in one go.
    return " ".join(cleaned.split())
# Value pools the generator samples from. A handful of realistic entries are
# padded with numbered placeholders so the dataset has plenty of variety.
FIRST_BASE = ["Julie","Paul","Sophie","Michel","Hugo","Claire","Léa","Arthur","Louis","Emma","Nguyen"]
LAST_BASE  = ["Dupont","Nguyen","Bernard","Lefevre","Petit","Martin","Durand","Cohen"]

# Synthetic first/last names: Prenom1..Prenom300 and Nom1..Nom300.
FIRST = [*FIRST_BASE, *(f"Prenom{n}" for n in range(1, 301))]
LAST = [*LAST_BASE, *(f"Nom{n}" for n in range(1, 301))]

# Doctor names: every title combined with every surname of LAST_BASE, then
# with the first 80 surnames of LAST. LAST starts with LAST_BASE, so the real
# surnames appear twice in DOCS and are sampled twice as often.
titles = ["Dr.", "Dr", "Docteur", "Professeur", "Prof.", "Pr."]
DOCS = [
    f"{title} {surname}"
    for surname in LAST_BASE + LAST[:80]
    for title in titles
]

# Operative sites and intervention types.
SITES = ["Fosse iliaque droite", "Fosse iliaque gauche", "Thorax", "Abdomen droit", "Abdomen gauche", "Epaule droite", "Epaule gauche", "Cuisse droite"]
TYPES = ["Appendicectomie", "Cholécystectomie", "Hernie inguinale", "Hysterectomie", "Prothese de hanche", "Arthroscopie", "Biopsie"]
TYPES.extend(f"Procedure{n}" for n in range(1, 51))

# Rooms: "Salle 1".."Salle 40" followed by "Bloc 1".."Bloc 20".
ROOMS = [f"Salle {n}" for n in range(1, 41)]
ROOMS.extend(f"Bloc {n}" for n in range(1, 21))

# Quarter-hour slots from 07:00 up to 19:45, each in two spellings
# ("07:15" and "7h15").
HOURS = [
    spelling
    for hr in range(7, 20)
    for mn in (0, 15, 30, 45)
    for spelling in (f"{hr:02d}:{mn:02d}", f"{hr}h{mn:02d}")
]
# A few common times are appended again, which makes them slightly more
# likely to be drawn.
HOURS.extend(["10:30", "10h30", "11:00", "15:45"])
# Utterance templates per checklist field. Placeholders are filled by the
# generation loop: {fn}/{ln} patient first/last name, {h} hour, {r} room
# number, {doc} doctor name, {typ} intervention type, {site} operative site,
# {dir} side (droite/gauche).
templates = {
    "PATIENT_IDENTITE": [
        "le patient est {fn} {ln}",
        "la patiente est {fn} {ln}",
        "je repete {fn} {ln}",
        "{ln} {fn} confirme",
    ],
    "HEURE_PREVUE": [
        "a {h}",
        "prevue a {h}",
        "heure prevue {h}",
    ],
    "SALLE": [
        "salle {r}",
        "bloc {r}",
        "en salle {r}",
    ],
    "CHIRURGIEN": [
        "chirurgien {doc}",
        "docteur {doc} present",
        "professeur {doc}",
        "{doc}",
    ],
    "ANESTHESISTE": [
        "anesthesiste {doc}",
        "docteur {doc} pour anesthesie",
        "anesth {doc}",
    ],
    "INTERVENTION_TYPE": [
        "intervention {typ}",
        "on fait une {typ}",
        "procedure: {typ}",
    ],
    "SITE_OPERATOIRE": [
        "site {site}",
        "fosse {dir}",
        "au niveau de {site}",
    ],
}
# Rows of the final dataset: [utterance, candidate value, label] where label
# is 1.0 when the value belongs to the field the utterance talks about and
# 0.0 otherwise.
data = []
# Target number of pairs we want to produce (>= 150000)
TARGET_PAIRS = 150000
# Safety cap to avoid infinite loop (max records to attempt)
MAX_RECORDS = 200000
record_count = 0
# Keep generating records until we reach target pairs. The check happens once
# per record, so the final count may exceed TARGET_PAIRS by up to one record's
# worth of pairs.
while len(data) < TARGET_PAIRS and record_count < MAX_RECORDS:
    record_count += 1
    fn, ln = random.choice(FIRST), random.choice(LAST)
    doc = random.choice(DOCS)
    surgeon_last = doc.split()[-1]
    # The anaesthetist is a distinct person from the surgeon. Utterances often
    # quote the surname only, so the two must not share one; otherwise a
    # cross-field negative would read exactly like a positive.
    # Falls back to the full pool if no other surname exists.
    other_docs = [d for d in DOCS if d.split()[-1] != surgeon_last]
    anesth = random.choice(other_docs or DOCS)
    site = random.choice(SITES)
    typ = random.choice(TYPES)
    room = random.choice(ROOMS)
    hour = random.choice(HOURS)
    values = {
        "PATIENT_IDENTITE": [f"{fn} {ln}", f"{ln} {fn}"],
        "HEURE_PREVUE": [hour],
        "SALLE": [room],
        "CHIRURGIEN": [doc, surgeon_last],
        "ANESTHESISTE": [anesth],
        "INTERVENTION_TYPE": [typ],
        "SITE_OPERATOIRE": [site],
    }
    # Which doctor each doctor-type field talks about. The utterance must name
    # the same person as the positive value it is paired with.
    speakers = {"CHIRURGIEN": doc, "ANESTHESISTE": anesth}

    # Per-record invariants, computed once instead of once per template.
    room_digits = re.findall(r"\d+", room)
    room_label = room_digits[0] if room_digits else room
    site_key = norm(site)
    # The "fosse {dir}" template only makes sense for an iliac-fossa site;
    # for any other site it would produce a false positive pair.
    is_fosse = site_key.startswith("fosse")
    side = "gauche" if "gauche" in site_key else "droite"

    # Emit positive and negative pairs for this record.
    for field, temps in templates.items():
        vals = values.get(field, [])
        positives = [norm(v) for v in vals]
        neg_fields = [k for k in values if k != field]
        for t in temps:
            if field == "SITE_OPERATOIRE":
                if "{dir}" in t and not is_fosse:
                    continue
                u = norm(t.format(site=site, dir=side))
            elif field == "HEURE_PREVUE":
                u = norm(t.format(h=hour))
            elif field == "SALLE":
                u = norm(t.format(r=room_label))
            elif field in speakers:
                person = speakers[field]
                # Use the surname alone 70% of the time, the titled name otherwise.
                docname = person.split()[-1] if random.random() < 0.7 else person
                u = norm(t.format(doc=docname))
            elif field == "INTERVENTION_TYPE":
                u = norm(t.format(typ=typ))
            else:
                u = norm(t.format(fn=fn, ln=ln))
            # Positives: every accepted spelling of this field's value.
            for v in positives:
                data.append([u, v, 1.0])
            # Negative: one value borrowed from a different field.
            neg_field = random.choice(neg_fields)
            vneg = norm(random.choice(values[neg_field]))
            data.append([u, vneg, 0.0])
# Save the pairs as a headerless, tab-separated file (U, V, label) under
# ../data relative to the current working directory, then print a summary.
path = os.path.join('..', 'data', 'pairs_checklist.tsv')
os.makedirs(os.path.dirname(path), exist_ok=True)
df = pd.DataFrame(data, columns=["U","V","label"])
df.to_csv(path, index=False, header=False, sep="\t", encoding="utf-8")
for caption, figure in (
    ("✅ Fichier généré :", path),
    ("Nombre total de paires :", len(df)),
    ("Records générés :", record_count),
):
    print(caption, figure)


✅ Fichier généré : ../data/pairs_checklist.tsv
Nombre total de paires : 18000
