<a href="https://colab.research.google.com/github/InbalBolshinsky/LungCancer_Research_Project/blob/main/ExtractFromPDFtoCSV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import fitz  # PyMuPDF for PDF text extraction
import os
import re
import pandas as pd
import spacy
import openai

# Load the spaCy pipeline "en_core_sci_sm" (medical/scientific NLP model).
# NOTE(review): `nlp` is not referenced by the functions shown in this cell;
# confirm whether another cell still needs it before removing the load.
nlp = spacy.load("en_core_sci_sm")

# OpenAI API key (only needed when GPT is used for complex structuring).
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into the notebook; the placeholder is kept as the fallback so the
# default behaviour is unchanged when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your_openai_api_key")

# Folder containing the input PDF files.
PDF_FOLDER = "medical_pdfs/"
# Path of the CSV file that process_all_pdfs writes.
OUTPUT_CSV = "structured_medical_data.csv"

def extract_text_from_pdf(pdf_path):
    """Extract raw text from a PDF file using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of every page joined with newlines, or ``None`` when the
        document could not be opened or read (the error is printed).
    """
    try:
        # Open the document as a context manager so its handle is released
        # even when extracting one of the pages raises.
        with fitz.open(pdf_path) as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        # Deliberately broad: this is a best-effort reader and the caller
        # skips files for which nothing could be extracted.
        print(f"Error reading {pdf_path}: {e}")
        return None

# One compiled pattern per output field, listed in output-column order.
# Each label is anchored with \b so that it only matches as a whole word:
# without the anchor, and with IGNORECASE, "Age:" also matches inside
# "Stage: 3" and "Name:" inside "Surname:". Group 1 is the field value.
_FIELD_PATTERNS = {
    "Patient Name": re.compile(r"\b(?:Patient Name|Name):\s*(.+)", re.IGNORECASE),
    "Age": re.compile(r"\bAge:\s*(\d+)", re.IGNORECASE),
    "Gender": re.compile(r"\bGender:\s*(Male|Female|Other)", re.IGNORECASE),
    "Procedure": re.compile(r"\bProcedure:\s*(.+)", re.IGNORECASE),
    "Diagnosis": re.compile(r"\bDiagnosis:\s*(.+)", re.IGNORECASE),
    "Medications": re.compile(r"\bMedications?:\s*(.+)", re.IGNORECASE),
    "Doctor's Notes": re.compile(r"\b(?:Doctor's Notes|Notes):\s*(.+)", re.IGNORECASE),
}


def extract_patient_info(text):
    """Extract key medical details from unstructured text using regex.

    Each field is located by its "Label: value" pattern; only the first
    occurrence of a label in ``text`` is used.

    Args:
        text: Raw text of one document.

    Returns:
        A dict with the keys "Patient Name", "Age", "Gender", "Procedure",
        "Diagnosis", "Medications" and "Doctor's Notes". Each value is the
        stripped matched string, or ``None`` when the label was not found.
        "Age" is returned as a string of digits, not an int.
    """
    # Start with every field unset so the result always has the same keys.
    extracted_data = dict.fromkeys(_FIELD_PATTERNS)

    for field, pattern in _FIELD_PATTERNS.items():
        match = pattern.search(text)
        if match:
            extracted_data[field] = match.group(1).strip()

    return extracted_data

def _parse_structured_records(raw):
    """Parse the model's reply into a list of records without executing it."""
    import ast
    import json

    payload = raw.strip()
    try:
        # JSON first: it is the most common machine-readable reply format and
        # accepts null/true/false, which are not valid Python literals.
        records = json.loads(payload)
    except ValueError:
        try:
            # Fall back to Python literal syntax (e.g. single-quoted dicts).
            # literal_eval only builds literals; it never runs code.
            records = ast.literal_eval(payload)
        except (ValueError, SyntaxError, TypeError) as err:
            raise ValueError(
                "GPT response is not a JSON document or Python literal"
            ) from err

    # The caller extends a list with the result, so a single record must be
    # wrapped; otherwise its keys would be added as rows.
    if isinstance(records, dict):
        return [records]
    if isinstance(records, (list, tuple)):
        return list(records)
    raise ValueError(
        f"GPT response parsed to {type(records).__name__}, expected a list of records"
    )


def gpt_extract_medical_info(text):
    """Use GPT to extract structured patient records for highly unstructured text.

    Args:
        text: Raw document text sent to the model as the user message.

    Returns:
        A list of records parsed from the model's reply (a single dict reply
        is wrapped in a one-element list).

    Raises:
        ValueError: If the reply cannot be parsed as JSON or as a Python
            literal, or does not parse to a list/dict.
    """
    # NOTE(review): openai.ChatCompletion looks like the pre-1.0 interface of
    # the openai package; confirm it matches the installed version.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "Extract structured medical records from the following text."},
            {"role": "user", "content": text}
        ]
    )

    structured_data = response["choices"][0]["message"]["content"]
    # SECURITY: this reply was previously passed to eval(). The reply is
    # derived from untrusted PDF text, so eval() would execute any code the
    # model was induced to emit. It is now parsed strictly as data.
    return _parse_structured_records(structured_data)

def process_all_pdfs(pdf_folder, use_gpt=False, output_csv=None):
    """Extract and process data from all PDFs in the folder and save a CSV.

    Args:
        pdf_folder: Directory whose PDF files are processed (not recursive).
        use_gpt: When true, structure each document with
            gpt_extract_medical_info (which may yield several records per
            file); otherwise use the regex-based extract_patient_info
            (one record per file).
        output_csv: Path of the CSV to write. Defaults to OUTPUT_CSV.

    Returns:
        None. The combined records are written to ``output_csv``.
    """
    if output_csv is None:
        output_csv = OUTPUT_CSV

    all_data = []

    # os.listdir returns entries in arbitrary order; sort so the CSV rows come
    # out in the same order on every run.
    for filename in sorted(os.listdir(pdf_folder)):
        # Compare the extension case-insensitively so files such as
        # "REPORT.PDF" are not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_folder, filename)
        print(f"Processing: {pdf_path}")

        # Extract text; None (read error) or an empty string means there is
        # nothing to structure for this file.
        raw_text = extract_text_from_pdf(pdf_path)
        if not raw_text:
            continue

        # Extract structured data
        if use_gpt:
            # GPT returns multiple records per file
            all_data.extend(gpt_extract_medical_info(raw_text))
        else:
            all_data.append(extract_patient_info(raw_text))

    # Convert to DataFrame and write it out without the index column
    df = pd.DataFrame(all_data)
    df.to_csv(output_csv, index=False)
    print(f"✅ CSV saved successfully as {output_csv}")

# Entry point: process every PDF in PDF_FOLDER with the regex-based extractor.
# Pass use_gpt=True instead to enable AI-based structuring.
process_all_pdfs(pdf_folder=PDF_FOLDER, use_gpt=False)
