In [75]:
!pip install -q transformers accelerate pymupdf
!pip install transformers datasets pypdf
!pip install pdfplumber
!pip install openai



In [78]:
# Import libraries
import pdfplumber
from openai import OpenAI
from google.colab import files
from io import BytesIO
import re

# Initialize OpenAI client.
# The key is taken from the OPENAI_API_KEY environment variable, falling back
# to an interactive (non-echoing) prompt, so that no secret is ever stored in
# the notebook source or its saved outputs.
import os
from getpass import getpass

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: "))

# Upload PDFs (the result is iterated below as filename -> file content)
uploaded = files.upload()

# Extract text
def extract_text_from_pdf(file_bytes):
    """Return the text of every page of an in-memory PDF as one string.

    Args:
        file_bytes: Raw PDF content. It is wrapped in a ``BytesIO`` before
            being handed to ``pdfplumber.open``.

    Returns:
        The per-page results of ``page.extract_text()`` joined with a newline.
        A page whose extraction result is falsy (e.g. ``None``) contributes an
        empty string.
    """
    page_texts = []
    with pdfplumber.open(BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)

# Extract the raw text of every uploaded file, keeping it paired with its name
raw_texts = []
for filename in uploaded:
    print(f"Reading {filename}...")
    raw_texts.append({
        "filename": filename,
        "text": extract_text_from_pdf(uploaded[filename]),
    })

# Fix Hebrew word direction
def reverse_hebrew_words_in_line(line):
    """Reverse the characters of every Hebrew-block run found in `line`.

    A run is a maximal sequence of code points in U+0590..U+05FF. Everything
    else (digits, Latin text, punctuation, spaces) is left where it is, and the
    order of the runs within the line is not changed.

    Args:
        line: The string to process.

    Returns:
        A copy of `line` in which each matched run is written back-to-front.
    """
    def _flip(match):
        return "".join(reversed(match.group(0)))

    hebrew_run = re.compile(r'[\u0590-\u05FF]+')
    return hebrew_run.sub(_flip, line)

def fix_hebrew_direction(text):
    """Apply ``reverse_hebrew_words_in_line`` to each line of `text`.

    The text is split with ``str.splitlines()`` and the processed lines are
    joined back with a single newline, so the original line terminators
    (including any trailing one) are not preserved.

    Args:
        text: Multi-line string to process.

    Returns:
        The processed lines joined by newlines.
    """
    corrected_lines = map(reverse_hebrew_words_in_line, text.splitlines())
    return "\n".join(corrected_lines)

# Apply Hebrew fix to every extracted document
fixed_texts = [
    {"filename": doc["filename"], "text": fix_hebrew_direction(doc["text"])}
    for doc in raw_texts
]

# Preview fixed text (first 1000 characters per document)
for doc in fixed_texts:
    print(f"\n--- {doc['filename']} ---\n")
    snippet = doc["text"][:1000]
    print(snippet)


Saving 2test.pdf to 2test (8).pdf
Saving 1test.pdf to 1test (8).pdf
Reading 2test (8).pdf...
Reading 1test (8).pdf...

--- 2test (8).pdf ---

07:41 01/04/2025 :הדפסה ושעת תאריך
**רפואי חסוי**
אונקולוגיה :מערך
רפואית אנמנזה סיכום
רונן ברנר 'דר : מחלקה מנהל מרפאה : יחידה שם
דראושה עורוא : ת/אחראי ות/אח
29966.ר.מ אונקולוגית
ל"דוא
03­5018202 : פקס 03­5028795 : טלפון
Mailto:oncolog@wmc.gov.il:
320465776 :ז.ת ילנה : פרטי שם וולושינה : משפחה שם
050­9707038 : טלפון 51 : גיל 01/07/1973 : לידה תאריך
מוצקין קריית :כתובת
מכבי :חולים קופת נקבה : מין
מוצקין קרית 39/8 החשמונאים
אונקולוגיות אבחנות
אבחנה קוד צד תאריך S/P M/P Rec
MALIGNANT NEOPLASM OF BREAST (FEMALE), UNSPECIFIED 174.9
עקרית תלונה
28/1/2025 RT BREAST IDC ER POS PR POS HER2 NEG KI67=5% T3N1
23.2.2025 ACDD+T
נוכחית מחלה
אליי עברה ,ציון בני חולם ב במעקב הייתה החולה .במשרד עובדת .מוצקין בקרית מתגוררת, 2+ג, 51 בת
PS=0
בנות בליווי הגיעה
פרטי ביטוח
בטן מתיחת ,דיסליפידמיה ,התריס בלוטת פעילות תתת ,ד"יל ,סכרת :רקע מחלות
, ATOZET' GLUCOMINE, VECTO

In [80]:
# === STEP 4: Chunk the text into translation-safe segments ===
def chunk_text(text, max_chars=400):  # safe for < 2048 tokens
    return [text[i:i+max_chars] for i in range(0, len(text), max_chars)]

# Translation function (fixed for openai 1.0+)
def translate_text(text, model="gpt-4"):
    """Translate Hebrew `text` to English with the chat-completions API.

    The request goes through the module-level ``client``. This is deliberately
    best-effort: any failure is reported with ``print`` and the untranslated
    input is returned, so one bad chunk does not abort a whole document.

    Args:
        text: The text to translate.
        model: Chat model name passed to the API (e.g. "gpt-3.5-turbo").

    Returns:
        "" when `text` is empty or whitespace-only (no request is made);
        the stripped model reply on success; `text` unchanged when the request
        fails or the reply carries no text content.
    """
    if not text.strip():
        return ""

    prompt = f"Translate the following Hebrew text to English:\n\n{text}"
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            max_tokens=1500  # safe margin
        )
        translated_text = response.choices[0].message.content
    except Exception as e:
        print(f"Translation error: {e}")
        return text  # fallback

    # The reply content can be None; handle it explicitly instead of relying
    # on an AttributeError from .strip() to reach the fallback.
    if translated_text is None:
        print("Translation error: model returned no text content")
        return text  # fallback
    return translated_text.strip()

# Translate every chunk of every document, tagging each result with its source
# file so the pieces can be regrouped afterwards
translated_chunks = [
    {"filename": doc["filename"], "text": translate_text(chunk)}
    for doc in fixed_texts
    for chunk in chunk_text(doc["text"])
]


In [81]:
from collections import defaultdict

# Group translated chunks back by filename (chunk order within a file is kept)
merged_translations = defaultdict(list)
for record in translated_chunks:
    merged_translations[record["filename"]].append(record["text"])

# Join each file's chunks into one text. The per-file list is bound to its own
# name inside a comprehension: reusing `translated_chunks` as the loop variable
# would overwrite the list of chunk records, so that running this cell a second
# time fails when `record["filename"]` is evaluated on a plain string.
final_translations = [
    {"filename": filename, "translated": "\n".join(chunk_texts)}
    for filename, chunk_texts in merged_translations.items()
]

In [82]:
# === STEP 8: Preview or save translated text ===
for doc in final_translations:
    print(f"\n--- {doc['filename']} (Translated) ---\n")
    snippet = doc["translated"][:1000]  # Preview first 1000 chars
    print(snippet)



--- 2test (8).pdf (Translated) ---

Print and time date: 07:41 01/04/2025
**Confidential Medical**
Department: Oncology
Summary Anamnesis Medical
Unit Name: Clinic Manager: Department: Dr. Ronen Berner
Nurse in charge and nurse: Oruah Drausha
Oncological M.R. 29966
Email: oncolog@wmc.gov.il
Fax: 03-5018202 Phone: 03-5028795
Name Details: Yelena ID: 320465776 Family Name: Voloshina
Phone: 050-9707038 Age: 51 Birth Date: 01/07/1973
Address: Kiryat Motzkin
Health Fund: Maccabi Gender: Female
Mozkin Kiryat 39/8 The Hasmoneans
Diagnostic Oncologies
Diagnosis Code Side Date S/P M/P Rec
MALIGNANT NEOPLASM OF BREAST (FEMALE), UNSPECIFIED 174.9
Main Complaint
28/1/2025 RT BREAST IDC ER POS PR POS HER2 NEG KI67=5% T3N1
23.2.2025 ACDD+T
Current Illness
The patient was followed up in my clinic. She works in an office. She lives in Kiryat Mozkin, 51 years old, 2+G. 
PS=0
She arrived accompanied by her daughters
Insurance Details
Abdominal distension, dyslipidemia, thyroid gland enlargement.
Backgr

# Text translated: fill the DataFrame

In [96]:
# Install libraries
!pip install openai pandas tqdm

# Import
import pandas as pd
import json
import re
from openai import OpenAI
from google.colab import files
from tqdm import tqdm




In [97]:
# Upload your empty Excel template
uploaded = files.upload()

# Fail early with a clear message: with no upload, `df_template` would never be
# bound and the column extraction below would die with a NameError.
if not uploaded:
    raise ValueError("No template file was uploaded.")

# Only one template is used. With several uploads the last one wins, so say so
# explicitly instead of silently discarding the others.
if len(uploaded) > 1:
    print(f"Warning: {len(uploaded)} files uploaded; using the last one as the template.")
filename = list(uploaded)[-1]
df_template = pd.read_excel(filename)

# Extract columns to fill
column_names = df_template.columns.tolist()
print(f"Columns to fill: {column_names}")


Saving patient_sum_table.xlsx to patient_sum_table (9).xlsx
Columns to fill: ['Participant', 'Participant initials', 'ID', 'Medical Center', 'Date of Birth', 'Date of Death', 'Gender', 'DM', 'HPL', 'HTN', 'Ischemia', 'Arrhythmia', 'COPD', 'Asthma', 'ILD', 'Peptic disease', 'IBD', 'Liver Disease', 'Hepatitis', 'Chronic Renal Failure', 'Nephrolithiasis', 'BPH', 'Bones, Muscles and Joints', 'Thyroid', 'CVA', 'Other', 'Autoimmune', 'Hematological', 'Other Diseases', 'Other Malignancies', 'Other Medications', 'Canncer Family History First degree relatives', 'Canncer Family History Second degree relatives', 'Tobacco', 'Alcohol', 'Drugs', 'Jewish', 'Arab', 'Date of Diagnosis', 'Stage at Diagnosis', 'surgery type', 'Date of Surgery', 'Primary tumor location', 'Grade', 'IHC', 'Immunohistochemistry (IHC)', 'Date of Metastatic Spread Outcome', 'Molecular Profile ', 'RT Site', 'RT Date', 'RT Dose', 'RT Drugs', 'RT Start', 'Adjuvant treatment End', 'Adjuvant treatment Number of cycles', 'Adjuvant t

In [98]:
# Initialize OpenAI client.
# The key is taken from the OPENAI_API_KEY environment variable, falling back
# to an interactive (non-echoing) prompt, so that no secret is ever stored in
# the notebook source or its saved outputs.
import os
from getpass import getpass

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: "))

In [99]:
def safe_truncate(text, max_chars=7000):
    """Cap `text` at `max_chars` characters.

    Args:
        text: The string to shorten.
        max_chars: Maximum number of leading characters to keep.

    Returns:
        `text` itself when it is not longer than `max_chars`, otherwise its
        first `max_chars` characters. Anything past the cut is dropped.
    """
    if len(text) <= max_chars:
        return text
    return text[:max_chars]

In [100]:
def _extract_json_object(content):
    """Parse the span from the first "{" to the last "}" of `content` as JSON."""
    json_match = re.search(r'\{.*\}', content, re.DOTALL)
    if not json_match:
        raise ValueError("No JSON found in response")
    return json.loads(json_match.group())


def _fill_missing_fields(fields, column_names):
    """Set every requested column that is absent, null or blank to "Unknown"."""
    for col in column_names:
        value = fields.get(col)
        if value is None or (isinstance(value, str) and not value.strip()):
            fields[col] = "Unknown"
    return fields


def extract_fields_from_full_text(text, column_names, model="gpt-4"):
    """Ask the chat model to fill the given columns from a translated report.

    The request goes through the module-level ``client``. The reply is searched
    for a JSON object, which is parsed and then completed so that every entry
    of `column_names` has a usable value.

    Args:
        text: The English report text that is embedded in the prompt.
        column_names: Names of the fields to extract.
        model: Chat model name passed to the API.

    Returns:
        A dict with at least one entry per column. Columns the model omitted,
        or answered with JSON null or a blank string, are set to "Unknown".
        Keys the model added on its own are left in place. If the request or
        the parsing fails, the error is printed and every column maps to
        "Unknown".
    """
    prompt = f"""
You are given a translated English medical report.

Your task:
- Go through the text carefully.
- For each of the following fields: {', '.join(column_names)},
  extract the relevant information immediately if it appears.
- If a field is missing, write "Unknown".

Return a JSON dictionary like:
{{
{', '.join([f'"{col}": "..."' for col in column_names])}
}}

Here is the full report:
{text}
"""
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=2000
        )
        # A reply without text content is treated as an empty string, so it is
        # reported as "No JSON found" rather than as an AttributeError.
        content = (response.choices[0].message.content or "").strip()

        # Extract only the JSON part safely
        fields = _extract_json_object(content)

        # Ensure all fields exist and none of them is null/blank
        return _fill_missing_fields(fields, column_names)

    except Exception as e:
        print(f"Field extraction error: {e}")
        return {col: "Unknown" for col in column_names}


In [101]:
# One record per translated document: the extracted fields plus the source
# filename for tracking
filled_records = []
for record in tqdm(final_translations, desc="Processing Patients"):
    # The report is capped with safe_truncate before being sent for extraction
    fields = extract_fields_from_full_text(
        safe_truncate(record["translated"]),
        column_names,
    )
    fields["Filename"] = record["filename"]
    filled_records.append(fields)

# Build the final DataFrame with the template's column order, filename last
final_cols = column_names + ["Filename"]
filled_df = pd.DataFrame(filled_records)[final_cols]

# Preview
filled_df.head()


Processing Patients: 100%|██████████| 2/2 [00:41<00:00, 20.76s/it]


Unnamed: 0,Participant,Participant initials,ID,Medical Center,Date of Birth,Date of Death,Gender,DM,HPL,HTN,...,Start date,metastatic 2nd line,Best response.1,Date of best response .1,End date.1,Number of cycles.1,Stop reason.1,Drugs.2,Start date.1,Filename
0,Yelena Voloshina,YV,320465776,Unknown,01/07/1973,Unknown,Female,Yes,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,2test (8).pdf
1,Natalia Gofman,NG,328859046,Unknown,09/05/1958,Unknown,Female,Unknown,Unknown,Unknown,...,6/2024,Enhertu,Unknown,Unknown,31/10/2024,Unknown,Unknown,Unknown,Unknown,1test (8).pdf


In [102]:
# Write the completed table to an Excel file (without the index column),
# then offer that file for download
output_path = "completed_patient_summary.xlsx"
filled_df.to_excel(output_path, index=False)
files.download(output_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [103]:
# Show the first rows transposed (one row per field) so the wide table is readable
first_rows = filled_df.head()
first_rows.T

Unnamed: 0,0,1
Participant,Yelena Voloshina,Natalia Gofman
Participant initials,YV,NG
ID,320465776,328859046
Medical Center,Unknown,Unknown
Date of Birth,01/07/1973,09/05/1958
...,...,...
Number of cycles.1,Unknown,Unknown
Stop reason.1,Unknown,Unknown
Drugs.2,Unknown,Unknown
Start date.1,Unknown,Unknown


In [105]:
# Count the cells of the DataFrame that hold exactly the string "Unknown"
unknown_mask = filled_df == "Unknown"
total_unknowns = unknown_mask.sum().sum()
total_fields = filled_df.size

print(f"\nTotal number of 'Unknown' fields: {total_unknowns}")
print(f"Total number of fields: {total_fields}")
print(f"Percentage of 'Unknown' fields: {total_unknowns / total_fields * 100:.2f}%")


Total number of 'Unknown' fields: 101
Total number of fields: 148
Percentage of 'Unknown' fields: 68.24%
