In [5]:
import pandas as pd
df = pd.read_csv("../data/raw/biocup_subset.csv")
df.head()


Unnamed: 0,case_id,primary_site,tcga_type,patient_id,patient_filename,report_text
0,BIOCUP_00001,lung,LUSC,TCGA-18-4086,TCGA-18-4086.f83c1343-f2dc-4602-a34c-3a259edfb343,Amended. SPECIMEN (s) RECEIVED. 1. Soft-Tissue...
1,BIOCUP_00002,lung,LUAD,TCGA-S2-AA1A,TCGA-S2-AA1A.83A6F838-3F30-411F-9982-B8985C7FA52E,Female. Admission Date: Discharge Date: Collec...
2,BIOCUP_00003,lung,LUSC,TCGA-22-5474,TCGA-22-5474.e0bf9b86-3d7e-4a37-9545-b74dd999e128,"DIAGNOSIS: Lung, left upper lobe, wedge resect..."
3,BIOCUP_00004,lung,LUAD,TCGA-55-A48Y,TCGA-55-A48Y.3542521A-EA16-4CD1-A61A-1FB3AE928FC9,PROCEDURE DATE: RECEIVED DATE: REPORT DATE: Pr...
4,BIOCUP_00005,lung,LUAD,TCGA-49-AARE,TCGA-49-AARE.D15D2040-F72D-4913-92FC-54741F573151,"INTERPRETATION AND DIAGNOSIS: 1. LUNG, RIGHT U..."


In [20]:
import re

def clean_medical_report_v3(text: str) -> str:
    if pd.isna(text):
        return ""

    import unicodedata
    text = unicodedata.normalize("NFKC", text)

    # Corriger mots collés par points
    text = re.sub(r"([a-z])\.([A-Z])", r"\1. \2", text)
    text = re.sub(r"([a-z])\.([a-z])", r"\1 \2", text)

    # Corrections OCR fréquentes
    corrections = {
        "tymph": "lymph",
        "attendir g": "attending",
        "alides": "slides",
        "materiala": "materials",
        "(D/": "(0/"
    }
    for wrong, right in corrections.items():
        text = text.replace(wrong, right)

    # Supprimer points inutiles après chiffres ou majuscules avant unité
    text = re.sub(r"(?<=\d)\.(?=\s*[a-zA-Z])", "", text)  # 5.5. cm → 5.5 cm
    text = re.sub(r"\s*\.\s*(?=[a-z])", " ", text)        # "tumor. does" → "tumor does"
    
    # Supprimer points multiples
    text = re.sub(r"\.{2,}", ".", text)

    # Supprimer parenthèses vides
    text = re.sub(r"\(\s*\)", "", text)
    text = re.sub(r"\(\s*[\W_]+\s*\)", "", text)

    # Nettoyer retours ligne inutiles
    text = re.sub(r"\s*\n\s*", " ", text)

    # Ajouter retours ligne avant sections clés
    sections = [
        "DIAGNOSIS",
        "TISSUE DESCRIPTION",
        "Comment",
        "Key Pathological Findings",
        "Specimen type",
        "Clinical History",
        "Preoperative Diagnosis",
        "Gross Description",
        "Intraoperative Consultation"
    ]
    for sec in sections:
        text = re.sub(sec, f"\n\n{sec}", text, flags=re.I)

    # Espaces propres
    text = re.sub(r"\s+", " ", text)

    return text.strip()


In [23]:
df["clean_text"] = df["report_text"].apply(clean_medical_report_v3)
print(f"\n=== REPORT {316} ===")
print("Original Text:\n")
print(df.loc[316, "report_text"][:600])  # afficher les 600 premiers caractères
print("\nCleaned Text:\n")
print(df.loc[316, "clean_text"][:600])  # afficher les 600 premiers caractères
for i in df.sample(2).index:
    print(f"\n=== REPORT {i} ===")
    print("Original Text:\n")
    print(df.loc[i, "report_text"][:600])  # afficher les 600 premiers caractères
    print("\nCleaned Text:\n")
    print(df.loc[i, "clean_text"][:600])  # afficher les 600 premiers caractères




=== REPORT 316 ===
Original Text:

GA-3A-A9IH-01A-PR. Final Diagnosis. A. OMENTUM, NODULE, EXCISION: Benign mesothelial cysts. There is no evidence of malignancy. B. LYMPH NODE, HEPATIC ARTERY, EXCISION: Lymph node with reactive changes and no evidence of malignancy (0/1). C. PANCREAS AND SPLEEN, SUBTOTAL PANCREATECTOMY AND SPLENECTOMY: Invasive pancreatic ductal adenocarcinoma, well to moderately differentiated. See Key Pathological Findings. Spleen with no significant pathologic changes and free of carcinoma. Five regional tymph nodes with reactive changes and no evidence of malignancy (D/5). D. LYMPH NODE, HEPATIC ARTERY #2,

Cleaned Text:

GA-3A-A9IH-01A-PR. Final DIAGNOSIS. A. OMENTUM, NODULE, EXCISION: Benign mesothelial cysts. There is no evidence of malignancy. B. LYMPH NODE, HEPATIC ARTERY, EXCISION: Lymph node with reactive changes and no evidence of malignancy (0/1). C. PANCREAS AND SPLEEN, SUBTOTAL PANCREATECTOMY AND SPLENECTOMY: Invasive pancreatic ductal adenocarcinoma, 