In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
# Load Dataset
print("Loading MIMIC-III data files")

# Load the three main files
patients = pd.read_csv(r"C:\Users\kehin\Downloads\mimic-iii-extracted\mimic-iii-clinical-database-1.4\PATIENTS.csv.gz")
admissions = pd.read_csv(r"C:\Users\kehin\Downloads\mimic-iii-extracted\mimic-iii-clinical-database-1.4\ADMISSIONS.csv.gz")
notes = pd.read_csv(r"C:\Users\kehin\Downloads\mimic-iii-extracted\mimic-iii-clinical-database-1.4\NOTEEVENTS.csv.gz")
print(f" PATIENTS.csv: {len(patients):,} patients")
print(f" ADMISSIONS.csv: {len(admissions):,} admissions")
print(f" NOTEEVENTS.csv: {len(notes):,} clinical notes")
print()

Loading MIMIC-III data files
 PATIENTS.csv: 46,520 patients
 ADMISSIONS.csv: 58,976 admissions
 NOTEEVENTS.csv: 2,083,180 clinical notes



In [11]:
import pandas as pd
patients = pd.read_csv(r"C:\Users\kehin\Downloads\mimic-iii-extracted\mimic-iii-clinical-database-1.4\PATIENTS.csv.gz")
notes = pd.read_csv(r"C:\Users\kehin\Downloads\mimic-iii-extracted\mimic-iii-clinical-database-1.4\NOTEEVENTS.csv.gz")
admissions = pd.read_csv(r"C:\Users\kehin\Downloads\mimic-iii-extracted\mimic-iii-clinical-database-1.4\ADMISSIONS.csv.gz")

# Keep relevant columns
patients = patients[['SUBJECT_ID', 'GENDER', 'DOB']]
admissions = admissions[['SUBJECT_ID', 'HADM_ID', 'ETHNICITY']]

# Merge patients → admissions
patient_adm = pd.merge(admissions, patients, on='SUBJECT_ID', how='left')

# Merge notes → patient + admission info
notes_merged = pd.merge(notes, patient_adm, on=['SUBJECT_ID', 'HADM_ID'], how='left')

print(notes_merged.head())

chunk_size = 10000  # adjust based on memory
filename =r"C:\Users\kehin\Downloads\mimic-iii-extracted\mimic-iii-clinical-database-1.4\NOTEEVENTS.csv.gz"
  # path to your note events file

def process_notes_chunk(chunk, patients_df):
    # Merge chunk with patient demographics
    merged_chunk = chunk.merge(patients_df, on="SUBJECT_ID", how="left")
    
    # Example: count negative compliance terms
    neg_compliance = ["non-compliant", "refuses", "uncooperative"]
    merged_chunk['neg_count'] = merged_chunk['TEXT'].str.lower().str.count(r'\b(?:' + '|'.join(neg_compliance) + r')\b')
    
    # Example: count positive compliance terms
    pos_compliance = ["cooperative", "adherent", "compliant"]
    merged_chunk['pos_count'] = merged_chunk['TEXT'].str.lower().str.count(r'\b(?:' + '|'.join(pos_compliance) + r')\b')
    
    return merged_chunk
all_notes = []  # store processed chunks

for chunk in pd.read_csv(filename, chunksize=chunk_size):
    processed_chunk = process_notes_chunk(chunk, patients)
    all_notes.append(processed_chunk)

# Combine all chunks into a single DataFrame
full_notes = pd.concat(all_notes, ignore_index=True)

print(full_notes.head())


   ROW_ID  SUBJECT_ID   HADM_ID   CHARTDATE CHARTTIME STORETIME  \
0     174       22532  167853.0  2151-08-04       NaN       NaN   
1     175       13702  107527.0  2118-06-14       NaN       NaN   
2     176       13702  167118.0  2119-05-25       NaN       NaN   
3     177       13702  196489.0  2124-08-18       NaN       NaN   
4     178       26880  135453.0  2162-03-25       NaN       NaN   

            CATEGORY DESCRIPTION  CGID  ISERROR  \
0  Discharge summary      Report   NaN      NaN   
1  Discharge summary      Report   NaN      NaN   
2  Discharge summary      Report   NaN      NaN   
3  Discharge summary      Report   NaN      NaN   
4  Discharge summary      Report   NaN      NaN   

                                                TEXT              ETHNICITY  \
0  Admission Date:  [**2151-7-16**]       Dischar...  UNKNOWN/NOT SPECIFIED   
1  Admission Date:  [**2118-6-2**]       Discharg...                  WHITE   
2  Admission Date:  [**2119-5-4**]              D... 

In [29]:
import pandas as pd

filename_notes = r"C:\Users\kehin\Downloads\mimic-iii-extracted\mimic-iii-clinical-database-1.4\NOTEEVENTS.csv.gz"
filename_adm = r"C:\Users\kehin\Downloads\mimic-iii-extracted\mimic-iii-clinical-database-1.4\ADMISSIONS.csv.gz"

usecols_notes = ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'TEXT']
usecols_adm = ['SUBJECT_ID', 'HADM_ID', 'ETHNICITY']

chunk_size = 1000  # adjust down if still too large


In [31]:
# Load admissions
admissions = pd.read_csv(filename_adm, usecols=usecols_adm)

# Standardize ethnicity
def standardize_ethnicity(x):
    x = str(x).upper()
    if "BLACK" in x: return "BLACK"
    elif "WHITE" in x: return "WHITE"
    elif "ASIAN" in x: return "ASIAN"
    elif "HISPANIC" in x: return "HISPANIC"
    elif "UNKNOWN" in x or x.strip() == "": return "UNKNOWN"
    else: return "OTHER"

admissions['ETHNICITY'] = admissions['ETHNICITY'].apply(standardize_ethnicity)


In [33]:
neg_compliance = ["non-compliant", "refuses", "uncooperative"]
pos_compliance = ["cooperative", "adherent", "compliant"]
content_words = ["diagnosis", "procedure", "symptom", "medication"]
style_words = ["pleasant", "difficult", "helpful", "agitated"]
framing_markers = ["claims", "reports", "denies", "states"]
stereotype_words = ["stoic", "hysterical", "competent", "warm"]


In [35]:
def process_chunk(chunk, adm_df):
    # Merge with admissions
    chunk = pd.merge(chunk, adm_df, on=['SUBJECT_ID', 'HADM_ID'], how='left')
    
    # Lowercase text
    chunk['TEXT'] = chunk['TEXT'].astype(str).str.lower()
    
    # H1: negative/positive compliance
    chunk['neg_count'] = chunk['TEXT'].str.count('|'.join(neg_compliance))
    chunk['pos_count'] = chunk['TEXT'].str.count('|'.join(pos_compliance))
    
    # H2: content vs style words
    chunk['content_count'] = chunk['TEXT'].str.count('|'.join(content_words))
    chunk['style_count'] = chunk['TEXT'].str.count('|'.join(style_words))
    
    # H3: framing markers
    chunk['framing_count'] = chunk['TEXT'].str.count('|'.join(framing_markers))
    
    # H4: stereotype descriptors
    chunk['stereotype_count'] = chunk['TEXT'].str.count('|'.join(stereotype_words))
    
    return chunk


In [45]:
# Lexicons
neg_compliance = ["non-compliant", "refuses", "uncooperative"]
pos_compliance = ["cooperative", "adherent", "compliant"]

summary_dict = {}

# Example: just process the first few chunks to get some preliminary results
for i, chunk in enumerate(chunk_iter):
    if i > 2:  # only process first 3 chunks for now
        break
    processed = process_chunk(chunk, admissions)
    
    # Count negative/positive terms
    processed['neg_count'] = processed['TEXT'].apply(
        lambda x: sum(1 for word in neg_compliance if word in x.lower())
    )
    processed['pos_count'] = processed['TEXT'].apply(
        lambda x: sum(1 for word in pos_compliance if word in x.lower())
    )

    # Group by ethnicity and summarize
    for eth, group in processed.groupby('ETHNICITY'):
        if eth not in summary_dict:
            summary_dict[eth] = {'notes_count': 0, 'neg_count': 0, 'pos_count': 0}
        summary_dict[eth]['notes_count'] += len(group)
        summary_dict[eth]['neg_count'] += group['neg_count'].sum()
        summary_dict[eth]['pos_count'] += group['pos_count'].sum()

# Quick view of results
for eth, stats in summary_dict.items():
    print(f"{eth}: Notes={stats['notes_count']}, Neg={stats['neg_count']}, Pos={stats['pos_count']}")


ASIAN: Notes=84, Neg=0, Pos=4
BLACK: Notes=215, Neg=5, Pos=19
HISPANIC: Notes=122, Neg=2, Pos=12
OTHER: Notes=138, Neg=1, Pos=7
UNKNOWN: Notes=303, Neg=1, Pos=13
WHITE: Notes=2138, Neg=27, Pos=175


In [47]:
summary = notes_processed.groupby('ETHNICITY').agg(
    notes_count=('ROW_ID', 'count'),
    avg_neg=('neg_count', 'mean'),
    avg_pos=('pos_count', 'mean'),
    avg_content=('content_count', 'mean'),
    avg_style=('style_count', 'mean'),
    avg_framing=('framing_count', 'mean'),
    avg_stereotype=('stereotype_count', 'mean')
).reset_index()

print(summary)


  ETHNICITY  notes_count   avg_neg   avg_pos  avg_content  avg_style  \
0     ASIAN        55253  0.001484  0.012868     0.662715   0.109551   
1     BLACK       180416  0.005986  0.029232     0.750249   0.160014   
2  HISPANIC        65219  0.003159  0.023306     0.782456   0.133182   
3     OTHER       109628  0.002855  0.019420     0.771454   0.133351   
4   UNKNOWN       144843  0.002893  0.026429     0.432171   0.126668   
5     WHITE      1295985  0.004432  0.027324     0.823405   0.162248   

   avg_framing  avg_stereotype  
0     0.116464        0.113007  
1     0.198408        0.101726  
2     0.176743        0.107576  
3     0.127659        0.117187  
4     0.102608        0.078844  
5     0.173565        0.109361  
