In [1]:
import os
import pandas as pd
import duckdb
from pathlib import Path
import re

# username = os.getlogin()  # Works locally
# desc_path = os.path.join("C:\\Users", username, "Documents", "GitHub", 
#                          "DEEP_ML_Project", "data", "extracted" , "D_ICD_DIAGNOSES.csv")
# diag_path = os.path.join("C:\\Users", username, "Documents", "GitHub", 
#                          "DEEP_ML_Project", "data", "extracted" , "DIAGNOSES_ICD.csv")
# notes_path = os.path.join("C:\\Users", username, "Documents", "GitHub", 
#                          "DEEP_ML_Project", "data", "extracted" , "NOTEEVENTS.csv")

extracted_dir = Path("../data/extracted")
desc_path = extracted_dir / "D_ICD_DIAGNOSES.csv"
diag_path = extracted_dir / "DIAGNOSES_ICD.csv"
notes_path = extracted_dir / "NOTEEVENTS.csv"

print(desc_path)
print(diag_path)
print(notes_path)

..\data\extracted\D_ICD_DIAGNOSES.csv
..\data\extracted\DIAGNOSES_ICD.csv
..\data\extracted\NOTEEVENTS.csv


In [2]:
# Define improved section extraction functions
def identify_sections(text):
    """
    Identify common section headers in MIMIC-III discharge summaries
    Returns a dictionary of section names and their starting positions
    """
    if pd.isna(text) or text is None:
        return {}
    
    # Convert to string if not already
    text = str(text)
    
    # Common section headers in MIMIC-III discharge summaries
    # Format: (regex pattern, standardized name)
    section_patterns = [
        # Common discharge summary sections
        (r'(?i)(?:admission date|date of admission)\s*:', 'ADMISSION_DATE'),
        (r'(?i)(?:discharge date|date of discharge)\s*:', 'DISCHARGE_DATE'),
        (r'(?i)date of birth\s*:', 'DATE_OF_BIRTH'),
        (r'(?i)sex\s*:', 'SEX'),
        (r'(?i)service\s*:', 'SERVICE'),
        (r'(?i)(?:chief\s+complaint|reason for admission|reason for hospitalization)\s*:', 'CHIEF_COMPLAINT'),
        (r'(?i)(?:history of (?:the )?present illness|hpi)\s*:', 'HISTORY_OF_PRESENT_ILLNESS'),
        (r'(?i)(?:brief )?(?:hospital course|hospital stay)\s*:', 'HOSPITAL_COURSE'),
        (r'(?i)(?:past medical history|pmh|medical history)\s*:', 'PAST_MEDICAL_HISTORY'),
        (r'(?i)(?:past surgical history|surgical history)\s*:', 'SURGICAL_HISTORY'),
        (r'(?i)preoperative diagnos(?:is|es)\s*:', 'PREOP_DIAGNOSIS'),
        (r'(?i)postoperative diagnos(?:is|es)\s*:', 'POSTOP_DIAGNOSIS'),
        (r'(?i)(?:admission|interim) diagnos(?:is|es)\s*:', 'ADMISSION_DIAGNOSIS'),
        (r'(?i)(?:final diagnos(?:is|es)|discharge diagnos(?:is|es))\s*:', 'DISCHARGE_DIAGNOSIS'),
        (r'(?i)(?:condition (?:upon|on) discharge|discharge condition)\s*:', 'DISCHARGE_CONDITION'),
        (r'(?i)discharge disposition\s*:', 'DISCHARGE_DISPOSITION'),
        (r'(?i)discharge (?:instructions|status)\s*:', 'DISCHARGE_INSTRUCTIONS'),
        (r'(?i)(?:discharge )?medications(?: on discharge| at discharge)?\s*:', 'DISCHARGE_MEDICATIONS'),
        (r'(?i)medications(?: on admission)?\s*:', 'ADMISSION_MEDICATIONS'),
        (r'(?i)current medications\s*:', 'CURRENT_MEDICATIONS'),
        (r'(?i)allergies\s*:', 'ALLERGIES'),
        (r'(?i)physical (?:examination|exam)\s*:', 'PHYSICAL_EXAMINATION'),
        (r'(?i)review of systems\s*:', 'REVIEW_OF_SYSTEMS'),
        (r'(?i)(?:social history|social)\s*:', 'SOCIAL_HISTORY'),
        (r'(?i)(?:family history|fh)\s*:', 'FAMILY_HISTORY'),
        (r'(?i)laboratory(?: data| results| values)?\s*:', 'LABORATORY_DATA'),
        (r'(?i)imaging\s*:', 'IMAGING'),
        (r'(?i)imaging studies\s*:', 'IMAGING_STUDIES'),
        (r'(?i)radiology\s*:', 'RADIOLOGY'),
        (r'(?i)(?:procedures performed|procedures)\s*:', 'PROCEDURES'),
        (r'(?i)assessment\s*(?:and plan)?:', 'ASSESSMENT'),
        (r'(?i)assessment and plan\s*:', 'ASSESSMENT_AND_PLAN'),
        (r'(?i)(?:plan|plans)\s*:', 'PLAN'),
        (r'(?i)consultations\s*:', 'CONSULTATIONS'),
        (r'(?i)follow(?:[-\s])?up\s*:', 'FOLLOWUP'),
        (r'(?i)disposition\s*:', 'DISPOSITION'),
        (r'(?i)code status\s*:', 'CODE_STATUS'),
        (r'(?i)impression\s*:', 'IMPRESSION'),
        (r'(?i)findings\s*:', 'FINDINGS'),
        
        # Additional formats sometimes found in MIMIC-III
        (r'(?i)^chief complaint:\s*', 'CHIEF_COMPLAINT'),  # At beginning of line
        (r'(?i)^history of present illness:\s*', 'HISTORY_OF_PRESENT_ILLNESS'),
        (r'(?i)^hospital course:\s*', 'HOSPITAL_COURSE'),
        (r'(?i)^discharge diagnosis:\s*', 'DISCHARGE_DIAGNOSIS'),
        (r'(?i)^past medical history:\s*', 'PAST_MEDICAL_HISTORY'),
        (r'(?i)^assessment:\s*', 'ASSESSMENT'),
        (r'(?i)^impression:\s*', 'IMPRESSION'),
    ]
    
    # Find all section positions
    section_positions = {}
    
    for pattern, name in section_patterns:
        for match in re.finditer(pattern, text):
            # Store the position of the section header
            section_positions[match.start()] = name
    
    # Sort by position
    return {pos: section_positions[pos] for pos in sorted(section_positions.keys())}

def extract_sections(text):
    """
    Extract sections from a discharge summary
    Returns a dictionary with section names as keys and section text as values
    """
    if pd.isna(text) or text is None:
        return {}
    
    # Identify sections and their positions
    section_positions = identify_sections(text)
    positions = list(section_positions.keys())
    
    # If no sections found, return the entire text as 'FULL_TEXT'
    if not positions:
        return {'FULL_TEXT': text.strip()}
    
    # Extract text for each section
    sections = {}
    
    for i, pos in enumerate(positions):
        section_name = section_positions[pos]
        
        # Find the end of this section (start of next section or end of text)
        if i < len(positions) - 1:
            section_end = positions[i + 1]
        else:
            section_end = len(text)
        
        # Extract section text (skip the header)
        # Find the end of the header (look for colon)
        colon_pos = text.find(':', pos)
        if colon_pos != -1 and colon_pos < section_end:
            header_end = colon_pos + 1
        else:
            # If no colon found, look for newline
            nl_pos = text.find('\n', pos)
            if nl_pos != -1 and nl_pos < section_end:
                header_end = nl_pos + 1
            else:
                # If neither found, use position + length of pattern as approximation
                header_end = pos + 30  # Approximate header length
        
        section_text = text[header_end:section_end].strip()
        
        # Store the extracted section
        sections[section_name] = section_text
    
    return sections

def preprocess_mimic_text(text):
    """
    Apply comprehensive preprocessing to MIMIC-III text
    """
    if pd.isna(text) or text is None:
        return ""
    
    # Convert to string if not already
    text = str(text)
    
    # Replace de-identified elements
    text = re.sub(r'\[\*\*\d{4}-\d{1,2}-\d{1,2}\*\*\]', '[DATE]', text)
    text = re.sub(r'\[\*\*\d{1,2}/\d{1,2}/\d{2,4}\*\*\]', '[DATE]', text)  # MM/DD/YYYY format
    text = re.sub(r'\[\*\*\d{4}\*\*\]', '[YEAR]', text)
    text = re.sub(r'\[\*\*.*?[Hh]ospital.*?\*\*\]', '[HOSPITAL]', text)
    text = re.sub(r'\[\*\*.*?[Cc]linic.*?\*\*\]', '[CLINIC]', text)
    text = re.sub(r'\[\*\*.*?[Dd]octor.*?\*\*\]', '[DOCTOR]', text)
    text = re.sub(r'\[\*\*.*?[Nn]ame.*?\*\*\]', '[NAME]', text)
    text = re.sub(r'\[\*\*.*?[Pp]hone.*?\*\*\]', '[PHONE]', text)
    text = re.sub(r'\[\*\*.*?[Ss]tate.*?\*\*\]', '[STATE]', text)
    text = re.sub(r'\[\*\*.*?[Cc]ity.*?\*\*\]', '[CITY]', text)
    text = re.sub(r'\[\*\*.*?[Nn]umber.*?\*\*\]', '[NUMBER]', text)
    text = re.sub(r'\[\*\*.*?\*\*\]', '[DEIDENTIFIED]', text)
    
    # Normalize spacing
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    
    return text.strip()

def create_clinical_weighted_text(sections):
    """
    Create a weighted clinical text by combining the most predictive sections
    Returns a focused text with the most diagnostically relevant information
    """
    if not sections:
        return ""
    
    weighted_pieces = []
    
    # Add the most diagnostically relevant sections
    if 'CHIEF_COMPLAINT' in sections and sections['CHIEF_COMPLAINT']:
        weighted_pieces.append(f"CHIEF COMPLAINT: {sections['CHIEF_COMPLAINT']}")
        
    if 'HISTORY_OF_PRESENT_ILLNESS' in sections and sections['HISTORY_OF_PRESENT_ILLNESS']:
        weighted_pieces.append(f"HPI: {sections['HISTORY_OF_PRESENT_ILLNESS']}")
        
    if 'DISCHARGE_DIAGNOSIS' in sections and sections['DISCHARGE_DIAGNOSIS']:
        weighted_pieces.append(f"DIAGNOSIS: {sections['DISCHARGE_DIAGNOSIS']}")
    
    if 'ASSESSMENT' in sections and sections['ASSESSMENT']:
        weighted_pieces.append(f"ASSESSMENT: {sections['ASSESSMENT']}")
    elif 'ASSESSMENT_AND_PLAN' in sections and sections['ASSESSMENT_AND_PLAN']:
        weighted_pieces.append(f"ASSESSMENT: {sections['ASSESSMENT_AND_PLAN']}")
        
    if 'IMPRESSION' in sections and sections['IMPRESSION']:
        weighted_pieces.append(f"IMPRESSION: {sections['IMPRESSION']}")
    
    # Add hospital course (contains important clinical details)
    if 'HOSPITAL_COURSE' in sections and sections['HOSPITAL_COURSE']:
        weighted_pieces.append(f"COURSE: {sections['HOSPITAL_COURSE']}")
    
    return " ".join(weighted_pieces)

In [3]:
# Create an in-memory DuckDB connection
con = duckdb.connect(database=':memory:')

# Load CSVs directly using DuckDB (efficient for large files)
query = f"""
-- Load NOTEEVENTS (discharge summaries only, no known errors)
CREATE VIEW noteevents AS
SELECT *
FROM read_csv_auto('{notes_path}')
WHERE category = 'Discharge summary' AND ISERROR IS NULL;

-- Load DIAGNOSES_ICD
CREATE VIEW diagnoses_icd AS
SELECT *
FROM read_csv_auto('{diag_path}');

-- Load ICD-9 descriptions
CREATE VIEW d_icd_diagnoses AS
SELECT *
FROM read_csv_auto('{desc_path}');
"""

# Run the multi-query
con.execute(query)

<duckdb.duckdb.DuckDBPyConnection at 0x1f809536cb0>

In [4]:
# Display NOTEEVENTS column names and top 5 rows
#print("NOTEEVENTS Table Schema:")
#noteevents_schema = con.execute("PRAGMA table_info(noteevents);").fetchdf()
#print(noteevents_schema[['name']])  # Display only column names

#print("\nTop 5 rows from NOTEEVENTS:")
#noteevents_preview = con.execute("SELECT * FROM noteevents LIMIT 5;").fetchdf()
#print(noteevents_preview)

In [5]:
# Get top 20 most frequent ICD-9 codes
top_codes_query = f"""
SELECT icd9_code, COUNT(*) as count
FROM read_csv_auto('{diag_path}')
GROUP BY icd9_code
ORDER BY count DESC
LIMIT 20;
"""
top_codes_df = con.execute(top_codes_query).fetchdf()
top_codes = top_codes_df['ICD9_CODE'].tolist()
#print(top_codes)

# Load ICD-9 descriptions into a DataFrame
desc_df = con.execute("SELECT * FROM d_icd_diagnoses").fetchdf()

# Optional: check column names
print("Top codes columns:", top_codes_df.columns.tolist())
print("Descriptions columns:", desc_df.columns.tolist())

# Try both UPPER and lowercase just in case
count_col = 'COUNT' if 'COUNT' in top_codes_df.columns else 'count'

# Merge with description
merged_df = pd.merge(top_codes_df, desc_df[['ICD9_CODE', 'LONG_TITLE']], on='ICD9_CODE', how='left')

# Sort and print
merged_df = merged_df.sort_values(by=count_col, ascending=False).reset_index(drop=True)

# for idx, row in merged_df.iterrows():
#     print(f"{idx+1}. Code: {row['ICD9_CODE']} → {row['LONG_TITLE']} → {row[count_col]} instances")

Top codes columns: ['ICD9_CODE', 'count']
Descriptions columns: ['ROW_ID', 'ICD9_CODE', 'SHORT_TITLE', 'LONG_TITLE']


In [6]:
# Join, filter for discharge notes, keep only top codes
fetch_query = f"""
WITH joined AS (
    SELECT 
        n.subject_id,
        n.hadm_id,
        n.text AS summary_snippet,
        d.icd9_code,
        icd.long_title
    FROM read_csv_auto('{notes_path}') n
    JOIN read_csv_auto('{diag_path}') d
      ON n.subject_id = d.subject_id AND n.hadm_id = d.hadm_id
    JOIN read_csv_auto('{desc_path}') icd
      ON d.icd9_code = icd.icd9_code
    WHERE n.category = 'Discharge summary'
      AND d.icd9_code IN ('{("', '").join(top_codes)}')
)
SELECT 
    subject_id,
    hadm_id,
    MAX(summary_snippet) AS summary_snippet,  
    STRING_AGG(icd9_code, ', ') AS icd9_codes,
    STRING_AGG(long_title, '; ') AS diagnoses
FROM joined
GROUP BY subject_id, hadm_id
"""
diagnoses_df = con.execute(fetch_query).fetchdf()

# Add diagnosis text fields and full summary text
diagnoses_df['key'] = diagnoses_df['SUBJECT_ID'].astype(str) + '_' + diagnoses_df['HADM_ID'].astype(str)

# Create a base dataframe for results with unique patient admissions
base_results = diagnoses_df[['SUBJECT_ID', 'HADM_ID', 'key', 'summary_snippet', 'icd9_codes', 'diagnoses']].copy()

# Create one-hot encoding columns initialized to 0
for code in top_codes:
    base_results[f'ICD9_{code}'] = 0

# Populate the one-hot encoding columns based on the icd9_codes column
for idx, row in base_results.iterrows():
    if pd.notna(row['icd9_codes']):  # Check if icd9_codes is not NaN
        patient_codes = [code.strip() for code in row['icd9_codes'].split(',')]
        
        # Set 1 for each code that the patient has
        for code in patient_codes:
            if code in top_codes:  # Only process codes in our top list
                base_results.at[idx, f'ICD9_{code}'] = 1

print(f"Dataset contains {len(base_results)} records with one-hot encoded diagnoses")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Dataset contains 46614 records with one-hot encoded diagnoses


In [7]:
# ---------- SECTION EXTRACTION ENHANCEMENTS ----------
print("Applying text preprocessing and section extraction...")

# Step 1: Apply basic preprocessing to clean the text
base_results['summary_snippet_clean'] = base_results['summary_snippet'].apply(preprocess_mimic_text)

# Step 2: Extract sections from each note
all_sections = base_results['summary_snippet_clean'].apply(extract_sections)

# Calculate section statistics
section_counts = {}
for sections in all_sections:
    for section in sections:
        section_counts[section] = section_counts.get(section, 0) + 1

# Get top 15 most common sections
top_sections = sorted(section_counts.items(), key=lambda x: x[1], reverse=True)[:15]
print("\nTop 15 most common sections in the dataset:")
for section, count in top_sections:
    percentage = (count / len(base_results)) * 100
    print(f"{section}: {count} occurrences ({percentage:.2f}%)")

# Step 3: Extract key clinical sections
key_sections = [
    'CHIEF_COMPLAINT', 
    'HISTORY_OF_PRESENT_ILLNESS', 
    'HOSPITAL_COURSE',
    'PAST_MEDICAL_HISTORY', 
    'DISCHARGE_DIAGNOSIS',
    'ASSESSMENT',
    'ASSESSMENT_AND_PLAN',
    'IMPRESSION'
]

# Add columns for key sections
for section in key_sections:
    base_results[f'section_{section}'] = all_sections.apply(
        lambda x: x.get(section, '') if isinstance(x, dict) else ''
    )
    
    # Add binary indicator for section presence
    base_results[f'has_{section}'] = base_results[f'section_{section}'].apply(
        lambda x: 1 if isinstance(x, str) and len(x.strip()) > 0 else 0
    )

# Step 4: Create a weighted clinical text focusing on the most predictive sections
base_results['clinical_weighted_text'] = all_sections.apply(create_clinical_weighted_text)

# Step 5: Add diagnosis count 
base_results['diagnosis_count'] = base_results['icd9_codes'].apply(
    lambda x: len(x.split(',')) if isinstance(x, str) else 0
)

# Step 6: Add section length metrics for potentially important sections
for section in ['CHIEF_COMPLAINT', 'HISTORY_OF_PRESENT_ILLNESS', 'ASSESSMENT', 'DISCHARGE_DIAGNOSIS']:
    section_col = f'section_{section}'
    # Calculate character length 
    base_results[f'{section_col}_length'] = base_results[section_col].apply(
        lambda x: len(x) if isinstance(x, str) else 0
    )
    # Calculate word count
    base_results[f'{section_col}_words'] = base_results[section_col].apply(
        lambda x: len(x.split()) if isinstance(x, str) else 0
    )

Applying text preprocessing and section extraction...

Top 15 most common sections in the dataset:
ADMISSION_DATE: 46560 occurrences (99.88%)
SERVICE: 46359 occurrences (99.45%)
DISCHARGE_DATE: 46358 occurrences (99.45%)
ADMISSION_MEDICATIONS: 41356 occurrences (88.72%)
HISTORY_OF_PRESENT_ILLNESS: 40862 occurrences (87.66%)
DISCHARGE_DIAGNOSIS: 40740 occurrences (87.40%)
SEX: 40155 occurrences (86.14%)
DATE_OF_BIRTH: 40129 occurrences (86.09%)
HOSPITAL_COURSE: 40043 occurrences (85.90%)
ALLERGIES: 39983 occurrences (85.77%)
PAST_MEDICAL_HISTORY: 38842 occurrences (83.33%)
DISPOSITION: 38184 occurrences (81.92%)
PHYSICAL_EXAMINATION: 37848 occurrences (81.19%)
SOCIAL_HISTORY: 37556 occurrences (80.57%)
DISCHARGE_MEDICATIONS: 37553 occurrences (80.56%)


In [8]:
# Remove uneccesarry columns
results = base_results.drop(columns=['key', 'diagnoses', 'icd9_codes', 'summary_snippet', 'diagnosis_count'])

# Print sample sections to verify extraction
sample_idx = min(5, len(results)-1)  # Get a valid index
print(f"\nSample sections from record {sample_idx}:")
for section in key_sections:
    section_text = results.loc[sample_idx, f'section_{section}']
    if isinstance(section_text, str) and len(section_text.strip()) > 0:
        # Print just the first 100 characters of each section for readability
        print(f"{section}: {section_text[:100]}...")

# #Print the results of the summary and which codes were assocated with it Example
# print(results.loc[5, 'summary_snippet'])
# print("-" * 20)  # Separator for clarity

# icd9_codes = results.loc[1, 'icd9_codes'].split(', ')
# diagnoses = results.loc[1, 'diagnoses'].split('; ')

# for i in range(min(len(icd9_codes), len(diagnoses))):
#   print(f"{icd9_codes[i]} --> {diagnoses[i]}")

# # handling the case where icd9_codes and diagnoses are different lengths.
# if len(icd9_codes) > len(diagnoses):
#     for i in range(len(diagnoses), len(icd9_codes)):
#         print(f"{icd9_codes[i]} --> No corresponding diagnosis")
# elif len(diagnoses) > len(icd9_codes):
#     for i in range(len(icd9_codes), len(diagnoses)):
#         print(f"No corresponding ICD9 --> {diagnoses[i]}")



Sample sections from record 5:
CHIEF_COMPLAINT: respiratory failure
Major Surgical or Invasive Procedure:
intubation, PICC line placement...
HISTORY_OF_PRESENT_ILLNESS: 76 yo M tranferred from [HOSPITAL] intubated with
pneumococcal pneumonia. Patient developed malaise ...
HOSPITAL_COURSE: 76 year old male tranferred from [HOSPITAL] intubated
with pneumococcal pneumonia and resuscitated s...
PAST_MEDICAL_HISTORY: Recent GI bleed - admitted to [HOSPITAL] [DEIDENTIFIED]; Capsule endoscopy [DEIDENTIFIED]
with angio...
DISCHARGE_DIAGNOSIS: Primary: streptococcal pneumonia
Secondary: change in mental status, thrombocytopenia, Type II
diabe...
IMPRESSION: No evidence of an acute infarct or enhancing
abnormalities. Tiny amounts of chronic microvascular is...


In [9]:
# Export results to CSV
#summary_results_path = os.path.join("C:\\Users", username, "Documents", "GitHub", "DEEP_ML_Project", "data", "preprocessed", "summary_results.csv")
#summary_results_trimmed_path = os.path.join("C:\\Users", username, "Documents", "GitHub", "DEEP_ML_Project", "data", "preprocessed", "summary_results_trimmed.csv")

# Export function that handles large files properly
def export_to_csv(df, file_path, chunk_size=None):
    """
    Export DataFrame to CSV with proper handling of large files
    
    Args:
        df: DataFrame to export
        file_path: Path for the output file
        chunk_size: If provided, split into chunks of this size
    """
    if chunk_size is None:
        # Export as a single file with minimal options to ensure correct handling
        df.to_csv(file_path, index=False, quoting=1)  # quoting=1 ensures proper text field handling
        return [file_path]
    else:
        # Split into multiple files if chunk_size is specified
        file_base = file_path.stem
        file_ext = file_path.suffix
        file_dir = file_path.parent
        
        chunk_files = []
        num_chunks = (len(df) + chunk_size - 1) // chunk_size  # Ceiling division
        
        for i in range(num_chunks):
            start_idx = i * chunk_size
            end_idx = min((i + 1) * chunk_size, len(df))
            chunk_file = file_dir / f"{file_base}_part{i+1}{file_ext}"
            df.iloc[start_idx:end_idx].to_csv(chunk_file, index=False, quoting=1)
            chunk_files.append(chunk_file)
            
        return chunk_files

chunk_size = None  # Set to a value like 10000 if you want to split the file

preprocessed_dir = Path("../data/preprocessed")
preprocessed_dir.mkdir(parents=True, exist_ok=True)

summary_results_path = preprocessed_dir / "summary_results.csv"
summary_results_trimmed_path = preprocessed_dir / "summary_results_trimmed.csv"

#results.iloc[:-20].to_csv(summary_results_path, index=False)
export_files = export_to_csv(results.iloc[:-20], summary_results_path, chunk_size)
results.iloc[-20:].to_csv(summary_results_trimmed_path, index=False, quoting=1)

print(f"Full training results exported to: {summary_results_path}")
print(f"Trimmed training results exported to: {summary_results_trimmed_path}")

Full training results exported to: ..\data\preprocessed\summary_results.csv
Trimmed training results exported to: ..\data\preprocessed\summary_results_trimmed.csv
