In [175]:
# !pip install requests
# !pip install langchain-core==0.3.29 langchain-groq==0.2.3
# !pip install transformers

In [239]:
import os
import re
import pandas as pd
import json
import requests
from langchain_groq import ChatGroq
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from torch.nn import functional as F
from tqdm import tqdm
from transformers import DataCollatorWithPadding

In [217]:
project_path = "/Users/varaprasad/System/dev/PyEnv/Mendel_AI/code"

# Directory
mimicIV_data = os.path.join(project_path, "data/MIMIC-IV")
data_path = os.path.join(project_path, "data")
csv_path = os.path.join(data_path, "ontology.csv")

# List to store extracted data
data = []

In [218]:
# Function to extract sections from text
def extract_sections(text, filename):
    sections = {
        "file_id": filename,  # This is a string, not a regex match object
        "chief_complaint": re.search(r'Chief Complaint:\n(.*?)(?:\n\n|\Z)', text, re.DOTALL),
        "history_of_present_illness": re.search(r'History of Present Illness:\n(.*?)(?:\n\n|\Z)', text, re.DOTALL),
        "past_medical_history": re.search(r'Past Medical History:\n(.*?)(?:\n\n|\Z)', text, re.DOTALL),
        "medications_on_admission": re.search(r'Medications on Admission:\n(.*?)(?:\n\n|\Z)', text, re.DOTALL),
        "discharge_medications": re.search(r'Discharge Medications:\n(.*?)(?:\n\n|\Z)', text, re.DOTALL),
        "discharge_diagnosis": re.search(r'Discharge Diagnosis:\n(.*?)(?:\n\n|\Z)', text, re.DOTALL),
        "brief_hospital_course": re.search(r'Brief Hospital Course:\n(.*?)(?:\n\n|\Z)', text, re.DOTALL),
        "major_procedure": re.search(r'Major Surgical or Invasive Procedure:\n(.*?)(?:\n\n|\Z)', text, re.DOTALL),
        "pertinent_results": re.search(r'Pertinent Results:\n(.*?)(?:\n\n|\Z)', text, re.DOTALL),
        "social_history": re.search(r'Social History:\n(.*?)(?:\n\n|\Z)', text, re.DOTALL),
#         "allergies": re.search(r'Allergies:\n(.*?)(?:\n\n|\Z)', text, re.DOTALL)
    }
    
    # Extract text from regex matches, skip non-regex fields like 'file_id'
    for key, match in sections.items():
        if isinstance(match, re.Match):  # Only apply .group() to regex match objects
            sections[key] = match.group(1).strip()
        else:
            sections[key] = match  # Keep filename or None as is
    
    return sections

In [219]:
for filename in os.listdir(mimicIV_data):
    if filename.endswith(".txt"):
        file_path = os.path.join(mimicIV_data, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            extracted_data = extract_sections(content, filename)
            data.append(extracted_data)

In [220]:
# Create a DataFrame and save to CSV
df = pd.DataFrame(data)

df.to_csv(csv_path, index=False)

print(f"Data successfully extracted and saved to {data_path}")

Data successfully extracted and saved to /Users/varaprasad/System/dev/PyEnv/Mendel_AI/code/data


### Setting Up the LLM Pipeline

In [221]:
df = pd.read_csv(csv_path)

df.head()

Unnamed: 0,file_id,chief_complaint,history_of_present_illness,past_medical_history,medications_on_admission,discharge_medications,discharge_diagnosis,brief_hospital_course,major_procedure,pertinent_results,social_history
0,oncology-report-10056223-DS-14.txt,Abdominal pain\n \nMajor Surgical or Invasive ...,"___ yo ___ speaking M with HCV, EtOH cirrhosis...",1. HCC s/p TACEx 2 with CT in ___ negative for...,Preadmission medications listed are correct an...,1. Clotrimazole 1 TROC PO 5X/DAY \nRX *clotrim...,Alcoholic Cirrhosis,HOSPIAL COURSE\n___ yo ___ speaking M with HCV...,No paracentesis\n \nHistory of Present Illness...,___ 08:20PM BLOOD WBC-5.3 RBC-3.51* Hgb-10.1* ...,"___\nFamily History:\n He is divorced, has two..."
1,oncology-report-10060764-DS-8.txt,Biliary obstruction\n \nMajor Surgical or Inva...,___ yo w/cholangiocarinoma presents as transfe...,HTN\nAfib s/p pacemaker placement on coumadin\...,The Preadmission Medication list is accurate a...,1. Cyanocobalamin 100 mcg PO DAILY \n2. Digoxi...,Cholangitis\nE Coli and Enterococcal Septicemi...,___ yo female with cholangiocarcinoma who was ...,ERCP\nPTBD\nTransesophageal echo,___ Results:\nCT ___: mild intrahepatic ductal...,"___\nFamily History:\nMother with colon CA, si..."
2,oncology-report-10041836-DS-21.txt,"C. Diff Colitis, Pathologic Fracture Femur\n \...",___ year old female with hx of endometrial can...,# endometrial CA s/p resection ___ in staging ...,Senna 1 tab po q day prn \nAtivan 0.5 mg po q...,1. Lorazepam 0.5 mg Tablet Sig: One (1) Tablet...,C diff colitis\nIntratrochanteric Femur Fractu...,# C. diff colitis\n- 14 Day course of Flagyl i...,External Beam Radiation\n \nHistory of Present...,___ 07:15AM BLOOD WBC-10.2 RBC-3.25* Hgb-8.1* ...,"___\nFamily History:\nmother - died ""blood can..."
3,oncology-report-10061124-DS-12.txt,CC: dyspnea\nreason for transfer: IP evaluation,"___ yo M with CAD, PAD, RAS, CHF (EF 20%) init...",1. Hypertension.\n2. Hyperlipidemia.\n3. CA...,meds on transfer:\nTylenol ___ Q6 hours prn PO...,1. Metoprolol Tartrate 25 mg Tablet Sig: One (...,Small Cell Lung Cancer \nHypertension'Hyperlip...,"Hospital Course: ___ yo M with CAD, PAD, RAS, ...",bronchoscopy\n \nHistory of Present Illness:\n...,___ 07:30PM UREA N-16 CREAT-1.3* SODIUM-141 ...,___\nFamily History:\nno known h/o lung CA\n \...
4,oncology-report-10120037-DS-9.txt,"""jaundice, light colored stools"" \n \nMajor Su...",This is a ___ yo male with probable metastatic...,Mr. ___ initially developed abdominal/epigastr...,ATENOLOL - (Prescribed by Other Provider) - 50...,1. atenolol 50 mg Tablet Sig: One (1) Tablet P...,"Bile duct obstruction, Pancreatic cancer, diff...",Mr. ___ is a ___ year-old man with probable me...,ERCP,Admission Labs:\nCBC: WBC-7.4# RBC-3.42* Hgb-1...,___\nFamily History:\nBoth his mother and fath...


In [222]:
# Function to clean text
def clean_text(text):
    if pd.isnull(text):
        return None
    text = str(text)
    
    # Remove placeholders like ___
    text = re.sub(r'_{2,}', '', text)
    
    # Load the abbreviations from JSON
    with open(os.path.join(data_path, 'medical_abbreviations.json'), 'r') as file:
        abbreviations = json.load(file)
    
    for abbr, full_form in abbreviations.items():
        # Use word boundaries to ensure accurate replacements
        pattern = re.compile(rf'\b{re.escape(abbr)}\b', re.IGNORECASE)
        text = pattern.sub(full_form, text)
    
    # Normalize date formats (MM/DD/YYYY to YYYY-MM-DD)
    text = re.sub(r'\b(\d{1,2})/(\d{1,2})/(\d{2,4})\b', lambda x: f"{x.group(3).zfill(4)}-{x.group(1).zfill(2)}-{x.group(2).zfill(2)}", text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply cleaning to all relevant columns except 'file_id'
for col in df.columns:
    if col != 'file_id':
        df[col] = df[col].apply(clean_text)

# Remove rows where all fields except 'file_id' are empty
df_cleaned = df.dropna(subset=df.columns.difference(['file_id']), how='all')

# Convert cleaned data to JSON format for LLM input
cleaned_json = df_cleaned.to_dict(orient='records')

# Save cleaned data
df_cleaned.to_csv(os.path.join(data_path,"cleaned_ontology.csv"), index=False)

# with open(os.path.join(data_path,"cleaned_ontology.json"), 'w') as json_file:
#     json.dump(cleaned_json, json_file, indent=4)

print("Data cleaning complete.")

Data cleaning complete.


### LLM - Prompt engineering approach for Information Extraction task

In [229]:
# Load cleaned data
new_df = pd.read_csv(os.path.join(data_path,"cleaned_ontology.csv"))
new_df = new_df.head()
new_df.head()

Unnamed: 0,file_id,chief_complaint,history_of_present_illness,past_medical_history,medications_on_admission,discharge_medications,discharge_diagnosis,brief_hospital_course,major_procedure,pertinent_results,social_history
0,oncology-report-10056223-DS-14.txt,Abdominal pain Major Surgical or Invasive Proc...,"yo speaking M with Hepatitis C Virus, Alcohol ...",1. Hepatocellular Carcinoma status post TACEx ...,Preadmission medications listed are correct an...,1. Clotrimazole 1 TROC by mouth 5X/DAY Prescri...,Alcoholic Cirrhosis,HOSPIAL COURSE yo speaking M with Hepatitis C ...,No paracentesis History of Present Illness: yo...,08:20PM BLOOD White Blood Cells-5.3 RBC-3.51* ...,"Family History: He is divorced, has two kids i..."
1,oncology-report-10060764-DS-8.txt,Biliary obstruction Major Surgical or Invasive...,yo w/cholangiocarinoma presents as transfer fr...,Hypertension Afib status post pacemaker placem...,The Preadmission Medication list is accurate a...,1. Cyanocobalamin 100 mcg by mouth DAILY 2. Di...,Cholangitis E Coli and Enterococcal Septicemia...,yo female with cholangiocarcinoma who was admi...,ERCP PTBD Transesophageal echo,Results: Computed Tomography : mild intrahepat...,"Family History: Mother with colon CA, sister w..."
2,oncology-report-10041836-DS-21.txt,"Clostridium difficile Colitis, Pathologic Frac...",year old female with hx of endometrial cancer ...,# endometrial CA status post resection in stag...,Senna 1 tab by mouth q day as needed Ativan 0....,1. Lorazepam 0.5 mg Tablet Sig: One (1) Tablet...,C diff colitis Intratrochanteric Femur Fractur...,# Clostridium difficile colitis - 14 Day cours...,External Beam Radiation History of Present Ill...,07:15AM BLOOD White Blood Cells-10.2 RBC-3.25*...,"Family History: mother - died ""blood cancer"" f..."
3,oncology-report-10061124-DS-12.txt,CC: dyspnea reason for transfer: IP evaluation,"yo M with Coronary Artery Disease, PAD, RAS, C...",1. Hypertension. 2. Hyperlipidemia. 3. Coronar...,meds on transfer: Tylenol Q6 hours as needed b...,1. Metoprolol Tartrate 25 mg Tablet Sig: One (...,Small Cell Lung Cancer Hypertension'Hyperlipid...,Hospital Course: yo M with Coronary Artery Dis...,bronchoscopy History of Present Illness: yo M ...,07:30PM UREA N-16 CREAT-1.3* SODIUM-141 POTASS...,Family History: no known h/o lung CA Physical ...
4,oncology-report-10120037-DS-9.txt,"""jaundice, light colored stools"" Major Surgica...",This is a yo male with probable metastatic pan...,Mr. initially developed abdominal/epigastric d...,ATENOLOL - (Prescribed by Other Provider) - 50...,1. atenolol 50 mg Tablet Sig: One (1) Tablet b...,"Bile duct obstruction, Pancreatic cancer, diff...",Mr. is a year-old man with probable metastatic...,ERCP,Admission Labs: Complete Blood Count: White Bl...,Family History: Both his mother and father die...


In [230]:
# Groq API configuration
GROQ_API_URL = 'https://api.groq.com/v1/extract'

# Replace with your actual API key
GROQ_API_KEY = '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'

In [231]:
def create_prompt(row_data):
    prompt = f"""
    Extract chemotherapy-related information from the following medical report:

    - Diagnosis
    - Chemotherapy Type
    - Chemotherapy Drugs
    - Treatment Dates
    - Procedures
    - Outcomes

    Return the extracted data in JSON format. Here's the report:

    {row_data}
    """
    return prompt

In [232]:
# Prepare for storing extracted information
extracted_data = []

# Initialize Groq LLM
llm = ChatGroq(
    api_key=GROQ_API_KEY,
    model="deepseek-r1-distill-llama-70b", 
    temperature=0.7,
    max_completion_tokens=1024,
    reasoning_format="raw"
)

# Function to generate response from Groq LLM
def generate_response(prompt_text: str) -> str:
    response = llm.invoke(prompt_text)
    return response.content

# Loop through each row and send to Groq API
for index, row in new_df.iterrows():
    file_id = row['file_id']
    row_data = row.drop('file_id').to_json()

    prompt = create_prompt(row_data)

    print(f"Processing file_id: {file_id}...")
    extracted_info = generate_response(prompt)

    if extracted_info:
        extracted_entry = {
            'file_id': file_id,
            'extracted_info': extracted_info  # Store the raw extracted info
        }
        extracted_data.append(extracted_entry)

                    max_completion_tokens was transferred to model_kwargs.
                    Please confirm that max_completion_tokens is what you intended.
                    top_p was transferred to model_kwargs.
                    Please confirm that top_p is what you intended.
                    reasoning_format was transferred to model_kwargs.
                    Please confirm that reasoning_format is what you intended.


Processing file_id: oncology-report-10056223-DS-14.txt...
Processing file_id: oncology-report-10060764-DS-8.txt...
Processing file_id: oncology-report-10041836-DS-21.txt...
Processing file_id: oncology-report-10061124-DS-12.txt...
Processing file_id: oncology-report-10120037-DS-9.txt...


In [233]:
# output_csv = os.path.join(data_path, 'llm_extract.csv')
# pd.DataFrame(extracted_data).to_csv(output_csv, index=False)
extracted_df = pd.DataFrame(extracted_data)

# output_json = os.path.join(data_path, 'extracted_chemo_info.json')
# with open(output_json, 'w') as json_file:
#     json.dump(extracted_data, json_file, indent=4)

print(f"Information Extraction complete.")

Information Extraction complete.


In [235]:
# Function to clean and extract JSON content from the 'extracted_info' field
def llm_parse(text):
    # Attempt to find JSON between ```json and ```
    json_match = re.search(r'```json(.*?)```', text, re.DOTALL)
    
    # If JSON delimiters are found
    if json_match:
        json_text = json_match.group(1).strip()
    else:
        # If delimiters aren't found, attempt to parse the entire text
        json_text = text.strip()

    try:
        # Try loading the JSON content
        data = json.loads(json_text)
        
        # Normalize key names to lowercase for consistent extraction
        normalized_data = {key.lower().replace(" ", "_"): value for key, value in data.items()}
        return normalized_data

    except json.JSONDecodeError:
        print(f"JSON parsing failed for text: {text[:100]}...")  # Show first 100 characters for debugging
        return None

# Apply the extraction to each row
structured_data = []

for _, row in extracted_df.iterrows():
    file_id = row['file_id']
    extracted_info = llm_parse(row['extracted_info'])
    
    if extracted_info:
        # Safely extract data and handle missing fields
        procedures = extracted_info.get('procedures', [])
        if isinstance(procedures, list):
            procedures = ", ".join(procedures)
        elif isinstance(procedures, str):
            procedures = procedures
        else:
            procedures = ''

        structured_entry = {
            'file_id': file_id,
            'diagnosis': extracted_info.get('diagnosis', 'Not Available'),
            'chemotherapy_type': extracted_info.get('chemotherapy_type', 'Not Available'),
            'chemotherapy_drugs': extracted_info.get('chemotherapy_drugs', 'Not Available'),
            'treatment_dates': extracted_info.get('treatment_dates', 'Not Available'),
            'procedures': procedures,
            'outcomes': extracted_info.get('outcomes', 'Not Available')
        }
        structured_data.append(structured_entry)

# Convert the structured data into a DataFrame
structured_df = pd.DataFrame(structured_data)

# Save the structured data to a new CSV file
output_csv_path = os.path.join(data_path, 'chemo_info.csv')
structured_df.to_csv(output_csv_path, index=False)

# Display the first few rows to verify
structured_df.head()

Unnamed: 0,file_id,diagnosis,chemotherapy_type,chemotherapy_drugs,treatment_dates,procedures,outcomes
0,oncology-report-10056223-DS-14.txt,Hepatocellular Carcinoma,Transarterial Chemoembolization (TACE) x2,Not explicitly mentioned,Not explicitly mentioned,"TACE x2, Radiofrequency Ablation",No evidence of recurrence on CT scan; MRI show...
1,oncology-report-10060764-DS-8.txt,Cholangiocarcinoma,Adjuvant,Not specified,Not specified,"ERCP, PTBD, Transesophageal echo",Treated with antibiotics and drains placed to ...
2,oncology-report-10041836-DS-21.txt,Endometrial cancer with metastasis to bone and...,Not explicitly mentioned in the report,No chemotherapy drugs explicitly mentioned in ...,No specific chemotherapy treatment dates menti...,"External beam radiation, Orthopedic pinning of...",Patient was diarrhea-free at the time of disch...
3,oncology-report-10061124-DS-12.txt,Small Cell Lung Cancer,Not explicitly specified (likely palliative or...,"[Carboplatin, Etoposide]",Not specified,Bronchoscopy,Completed chemotherapy without complications.
4,oncology-report-10120037-DS-9.txt,"Pancreatic cancer, diffuse large B-cell lymphoma",Adjuvant chemotherapy,"[Gemcitabine, Navelbine (Vinorelbine), Capecit...",Treatment dates are mentioned but not explicit...,"ERCP, Radiation therapy, Surgery (T8-L2 poster...",The patient's levels have been stable in the 1...


### Proxy Reward - Reinforcement Learning Framework