In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input directory tree and print the full path of every file found.
# The loop variables are plain notebook globals and remain defined after the cell runs.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name to get a full path
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
!pip install groq pandas gradio


Collecting groq
  Downloading groq-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting gradio
  Downloading gradio-5.6.0-py3-none-any.whl.metadata (16 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.3 (from gradio)
  Downloading gradio_client-1.4.3-py3-none-any.whl.metadata (7.1 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<1.0,>=0.1.1 (from gradio)
  Downloading safehttpx-0.1.1-py3-none-any.whl.metadata (4.1 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting starlette<1.0,>=0.40.0

In [19]:
import groq
import os
import re
import json
import pandas as pd
import gradio as gr
from typing import Dict, List, Optional
from dataclasses import dataclass, asdict
import tempfile

In [20]:
@dataclass
class PatientInfo:
    """Structured patient details extracted from a free-text clinical note.

    Instances are built by `extract_patient_info`, which leaves a field as an
    empty string or empty list when the note contains nothing for it.
    """
    name: str  # text captured after "Pt:" in the note
    address: str  # text captured after "residing @", ending at a 5-digit number
    hospital: str  # text captured after "Treating facility:"
    allergies: List[str]  # comma-separated items listed after "known allergies to:"
    major_medical_problems: List[str]  # normalized condition names, e.g. "Type 2 Diabetes Mellitus"
    sdoh_factors: List[str]  # standardized SDOH factor labels, without duplicates

@dataclass
class SDOHMatch:
    """One SDOH factor paired with a code and the rationale for the pairing.

    NOTE(review): no cell visible in this notebook constructs this class.
    `process_files` reads these same four attribute names from every match it
    formats, so this is presumably the element type produced by the matcher;
    confirm against the matcher's definition.
    """
    factor: str  # SDOH factor text
    code: str  # code assigned to the factor
    confidence: float  # presumably in the 0-1 range requested from the LLM; verify
    explanation: str  # free-text justification for the match

@dataclass
class OutputFormat:
    """Top-level result payload that `process_files` serializes to JSON via `asdict`."""
    patient_information: Dict  # patient fields keyed by display label, e.g. "Name", "Address"
    sdoh_factors_with_codes: List[Dict]  # one dict per matched SDOH factor (factor, code, confidence, explanation)

In [16]:
import os
from getpass import getpass

# Never hardcode the Groq API key in the notebook: it is a credential and would be
# exposed through saved cells, rendered outputs, and version history.
# NOTE(review): the key that was previously committed in this cell must be treated
# as compromised; revoke it and issue a new one.
#
# Reuse a key already present in the environment (for example one injected from the
# platform's secret store); otherwise prompt for it without echoing the input.
if not os.environ.get('GROQ_API_KEY'):
    os.environ['GROQ_API_KEY'] = getpass('Enter your Groq API key: ')

# Verify it's set without revealing the value in the cell output
print('GROQ_API_KEY is set' if os.environ.get('GROQ_API_KEY') else 'GROQ_API_KEY is not set')


[REDACTED - API key removed from cell output]


In [21]:
def extract_patient_info(clinical_note: str) -> PatientInfo:
    """Extract patient details from a free-text clinical note using fixed patterns.

    Args:
        clinical_note: Full text of the clinical note.

    Returns:
        A PatientInfo whose fields are empty strings / empty lists for anything
        the note does not contain.

    Raises:
        ValueError: If `clinical_note` is not a string.
    """
    if not isinstance(clinical_note, str):
        raise ValueError("Clinical note must be a string")

    def _capture(pattern: str, flags: int = 0) -> str:
        """Return the stripped first capture group of `pattern`, or "" when absent."""
        found = re.search(pattern, clinical_note, flags)
        return found.group(1).strip() if found else ""

    def _mentioned(pattern: str) -> bool:
        """Report whether `pattern` occurs anywhere in the note, ignoring case."""
        return re.search(pattern, clinical_note, re.IGNORECASE) is not None

    # Name: text after "Pt:" up to an opening parenthesis or the end of the line
    name = _capture(r"Pt:[\s]*([^(\n]+)")

    # Address: text after "residing @" through the first 5-digit number
    address = _capture(r"residing @\s*(.*?(?:\d{5}))")

    # Hospital: text after "Treating facility:" up to a 5-digit number or the end
    # of the line. The `$` alternative also ends the capture when the note stops
    # right after the facility name with no trailing newline; without it the
    # hospital was silently left empty in that case.
    hospital = _capture(r"Treating facility:\s*(.*?)(?:\d{5}|\n|$)")

    # Allergies: comma-separated list that ends at the first period
    allergies_text = _capture(r"known allergies to:\s*(.*?)\.", re.IGNORECASE)
    allergies = [item.strip() for item in allergies_text.split(',') if item.strip()]

    # Medical problems mapping: note shorthand -> normalized condition name
    medical_problem_map = {
        r"T2DM": "Type 2 Diabetes Mellitus",
        r"COVID\+": "COVID-19",
        r"DM foot ulcer": "Diabetic Foot Ulcer"
    }
    medical_problems = [
        condition
        for pattern, condition in medical_problem_map.items()
        if _mentioned(pattern)
    ]

    # SDOH factors mapping: phrase found in the note -> standardized factor label
    sdoh_pattern_map = {
        r"construction dust/debris": "Exposure to dust and smoke",
        r"poor office ventilation": "Exposure to environmental pollutants",
        r"2nd hand smoke": "Exposure to dust and smoke",
        r"Lives 2nd flr apt w/o elevator": "Poor housing conditions",
        r"Poor breakfast compliance": "Inadequate nutrition",
        r"relies on wife for transport": "Limited access to healthcare services"
    }
    sdoh_factors = []
    for pattern, factor in sdoh_pattern_map.items():
        # Several phrases map to the same label; keep the first occurrence only
        if _mentioned(pattern) and factor not in sdoh_factors:
            sdoh_factors.append(factor)

    return PatientInfo(
        name=name,
        address=address,
        hospital=hospital,
        allergies=allergies,
        major_medical_problems=medical_problems,
        sdoh_factors=sdoh_factors
    )

In [22]:
def match_sdoh_with_llm(
    clinical_sdoh: str,
    reference_factors: List[Dict[str, str]],
    model: str = "mixtral-8x7b-32768",
) -> Dict:
    """Match one clinical SDOH factor to a standardized factor using a Groq chat model.

    Args:
        clinical_sdoh: SDOH factor text to be matched.
        reference_factors: Reference factors and codes; embedded in the prompt as JSON.
        model: Groq model identifier. Defaults to the model this notebook was written
            for. NOTE(review): confirm the default is still served by Groq.

    Returns:
        The JSON object parsed from the model reply (the prompt requests the keys
        matched_factor, code, confidence, explanation). If the reply contains no
        usable JSON object, a fallback dict with code "UNKNOWN", confidence 0.0 and
        the parsing error in `explanation` is returned instead.

    Raises:
        Whatever the Groq client raises for request failures; only reply parsing is
        guarded by the fallback.
    """
    client = groq.Groq(
        api_key=os.environ.get("GROQ_API_KEY")
    )
    
    prompt = f"""
    Task: Match the following clinical SDOH factor with the most appropriate standardized SDOH factor from the reference list.
    
    Clinical SDOH factor: "{clinical_sdoh}"
    
    Reference SDOH factors and codes:
    {json.dumps(reference_factors, indent=2)}
    
    Please provide:
    1. The best matching standardized SDOH factor
    2. Its corresponding code
    3. A confidence score (0-1)
    4. A brief explanation for the match
    
    Format your response as a JSON object with these keys: matched_factor, code, confidence, explanation
    """
    
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a healthcare coding expert specializing in Social Determinants of Health (SDOH). Your task is to match clinical SDOH descriptions to standardized codes."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        model=model,
        temperature=0.1,
        max_tokens=500
    )
    
    try:
        response_text = chat_completion.choices[0].message.content
        try:
            match_result = json.loads(response_text)
        except json.JSONDecodeError:
            # Chat models often wrap the JSON in prose or Markdown code fences.
            # Retry on the outermost {...} span before giving up.
            start = response_text.find("{")
            end = response_text.rfind("}")
            if start == -1 or end <= start:
                raise
            match_result = json.loads(response_text[start:end + 1])
        # The declared return type is a dict; valid JSON of another shape is unusable
        if not isinstance(match_result, dict):
            raise ValueError("LLM response is not a JSON object")
        return match_result
    except Exception as e:
        return {
            "matched_factor": clinical_sdoh,
            "code": "UNKNOWN",
            "confidence": 0.0,
            "explanation": f"Error processing LLM response: {str(e)}"
        }


In [24]:
def process_files(clinical_note_file, sdoh_codes_file) -> str:
    """Process an uploaded clinical note and SDOH codes CSV with LLM-based matching.

    Args:
        clinical_note_file: Clinical note content as UTF-8 `bytes` or as `str`.
        sdoh_codes_file: CSV content as `bytes` or `str`; it must contain the
            columns 'SDOH factor' and 'Code'.

    Returns:
        A JSON string. On success it holds "patient_information" and
        "sdoh_factors_with_codes"; on failure it holds an "error" message and,
        for some failures, a "details" entry. Any `Exception` raised while
        processing is reported this way rather than propagated.
    """
    import io  # local import: only needed to parse the uploaded CSV from memory

    try:
        # Validate file inputs
        if clinical_note_file is None or sdoh_codes_file is None:
            return json.dumps({
                "error": "Please upload both a clinical note (TXT) and SDOH codes (CSV) file"
            })

        # Read clinical note
        try:
            if isinstance(clinical_note_file, str):
                clinical_note = clinical_note_file
            else:
                clinical_note = clinical_note_file.decode('utf-8')
        except (UnicodeDecodeError, AttributeError):
            return json.dumps({
                "error": "Unable to read clinical note. Please ensure it is a valid UTF-8 encoded text file"
            })

        # Read SDOH codes
        try:
            if isinstance(sdoh_codes_file, str):
                csv_bytes = sdoh_codes_file.encode('utf-8')
            else:
                csv_bytes = sdoh_codes_file

            # Parse straight from memory. The previous temp-file round trip left
            # the file on disk whenever parsing raised, because the cleanup call
            # came after `read_csv`.
            sdoh_df = pd.read_csv(io.BytesIO(csv_bytes))

            required_columns = {'SDOH factor', 'Code'}
            if not required_columns.issubset(sdoh_df.columns):
                return json.dumps({
                    "error": "The SDOH codes CSV file must contain 'SDOH factor' and 'Code' columns"
                })

            sdoh_codes_dict = dict(zip(sdoh_df['SDOH factor'], sdoh_df['Code']))
        except Exception as e:
            return json.dumps({
                "error": "Error reading SDOH codes file: Please ensure it is a valid CSV file with the required columns",
                "details": str(e)
            })

        # Extract and process information with LLM matching
        patient_info = extract_patient_info(clinical_note)
        # NOTE(review): `match_sdoh_codes` is not defined in any cell visible in this
        # notebook (only `match_sdoh_with_llm` is). Confirm it exists before a fresh
        # Run All; otherwise the NameError is reported by the outer handler below as
        # an unexpected error.
        sdoh_matches = match_sdoh_codes(patient_info.sdoh_factors, sdoh_codes_dict)

        # Format output with enhanced matching information
        # NOTE(review): "original_factor" and "matched_factor" are both filled from
        # `match.factor`; confirm whether the match objects should carry the two
        # values separately.
        output = OutputFormat(
            patient_information={
                "Name": patient_info.name,
                "Address": patient_info.address,
                "Hospital": patient_info.hospital,
                "Allergies": patient_info.allergies,
                "Major Medical Problems": patient_info.major_medical_problems
            },
            sdoh_factors_with_codes=[{
                "original_factor": match.factor,
                "matched_factor": match.factor,
                "code": match.code,
                "confidence": match.confidence,
                "explanation": match.explanation
            } for match in sdoh_matches]
        )

        return json.dumps(asdict(output), indent=2, ensure_ascii=False)

    except Exception as e:
        return json.dumps({
            "error": "An unexpected error occurred while processing the files",
            "details": str(e)
        })

In [25]:
# Create Gradio interface
# Both uploads use type="binary", so `process_files` receives each file's raw bytes.
iface = gr.Interface(
    fn=process_files,
    inputs=[
        gr.File(
            label="Upload Clinical Note (TXT file)",
            file_types=[".txt"],
            type="binary"
        ),
        gr.File(
            label="Upload SDOH Codes (CSV file)",
            file_types=[".csv"],
            type="binary"
        )
    ],
    outputs=gr.JSON(label="Extracted Information"),
    title="Clinical Note and SDOH Code Matcher (LLM-Enhanced)",
    description="""
    Upload a clinical note (TXT) and SDOH codes (CSV) to extract patient information and match SDOH factors with their corresponding codes.
    This version uses AI to provide more accurate matching and detailed explanations.
    
    The CSV file should contain two columns:
    - 'SDOH factor': The social determinant of health factor
    - 'Code': The corresponding diagnostic code
    """,
    # `allow_flagging` is deprecated in Gradio 5 (the version this notebook installs);
    # `flagging_mode` is its replacement.
    flagging_mode="never"
)

if __name__ == "__main__":
    # Launch the app only when a Groq API key is configured; otherwise tell the user
    if os.environ.get("GROQ_API_KEY"):
        iface.launch()
    else:
        print("Please set your GROQ_API_KEY environment variable")



* Running on local URL:  http://127.0.0.1:7861
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://063b35d88531bb17de.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
