In [None]:
import os
import pandas as pd
import duckdb
from pathlib import Path
import sys
import io

# username = os.getlogin()  # Works locally
# desc_path = os.path.join("C:\\Users", username, "Documents", "GitHub", 
#                          "DEEP_ML_Project", "data", "extracted" , "D_ICD_DIAGNOSES.csv")
# diag_path = os.path.join("C:\\Users", username, "Documents", "GitHub", 
#                          "DEEP_ML_Project", "data", "extracted" , "DIAGNOSES_ICD.csv")
# notes_path = os.path.join("C:\\Users", username, "Documents", "GitHub", 
#                          "DEEP_ML_Project", "data", "extracted" , "NOTEEVENTS.csv")
# Check if running in Google Colab
def is_colab():
    try:
        import google.colab
        return True
    except:
        return False

# Set up environment-specific configurations
if is_colab():
    from google.colab import files
    import gzip
    
    print("Running in Google Colab environment")
    
    # Create directory structure in Colab
    for dir_path in ["./data/extracted", "./data/preprocessed"]:
        os.makedirs(dir_path, exist_ok=True)
    
    # Function to handle file uploads in Colab
    def upload_and_save_files():
        print("Please upload the required CSV files:")
        print("1. D_ICD_DIAGNOSES.csv[.gz]")
        print("2. DIAGNOSES_ICD.csv[.gz]")
        print("3. NOTEEVENTS.csv[.gz]")
        
        uploaded = files.upload()
        
        for filename, content in uploaded.items():
            # Handle both compressed and uncompressed files
            if filename.endswith('.gz'):
                # Decompress .gz files
                base_name = filename[:-3]  # Remove .gz extension
                with gzip.open(io.BytesIO(content), 'rb') as f_in:
                    with open(f'./data/extracted/{base_name}', 'wb') as f_out:
                        f_out.write(f_in.read())
                print(f"Decompressed and saved: {base_name}")
            else:
                # Save as-is if not compressed
                with open(f'./data/extracted/{filename}', 'wb') as f:
                    f.write(content)
                print(f"Saved: {filename}")
    
    # Ask user if they want to upload files or use previously uploaded ones
    if not all(os.path.exists(f"./data/extracted/{f}") for f in 
              ["D_ICD_DIAGNOSES.csv", "DIAGNOSES_ICD.csv", "NOTEEVENTS.csv"]):
        print("Required files not found. You'll need to upload them.")
        upload_and_save_files()
    else:
        response = input("Use previously uploaded files? (y/n): ")
        if response.lower() != 'y':
            upload_and_save_files()
            
    # Set paths for Colab environment
    extracted_dir = Path("./data/extracted")
    preprocessed_dir = Path("./data/preprocessed")
else:
    print("Running in local environment")
    # Use the project's directory structure as defined in README
    extracted_dir = Path("../data/extracted")
    preprocessed_dir = Path("../data/preprocessed")
    preprocessed_dir.mkdir(parents=True, exist_ok=True)
    
extracted_dir = Path("../data/extracted")
desc_path = extracted_dir / "D_ICD_DIAGNOSES.csv"
diag_path = extracted_dir / "DIAGNOSES_ICD.csv"
notes_path = extracted_dir / "NOTEEVENTS.csv"

print(desc_path)
print(diag_path)
print(notes_path)

C:\Users\Alex\Documents\GitHub\DEEP_ML_Project\Raw Data\D_ICD_DIAGNOSES.csv
C:\Users\Alex\Documents\GitHub\DEEP_ML_Project\Raw Data\DIAGNOSES_ICD.csv
C:\Users\Alex\Documents\GitHub\DEEP_ML_Project\Raw Data\NOTEEVENTS.csv


In [2]:
# Create an in-memory DuckDB connection
con = duckdb.connect(database=':memory:')

# Load CSVs directly using DuckDB (efficient for large files)
query = f"""
-- Load NOTEEVENTS (discharge summaries only, no known errors)
CREATE VIEW noteevents AS
SELECT *
FROM read_csv_auto('{notes_path}')
WHERE category = 'Discharge summary' AND ISERROR IS NULL;

-- Load DIAGNOSES_ICD
CREATE VIEW diagnoses_icd AS
SELECT *
FROM read_csv_auto('{diag_path}');

-- Load ICD-9 descriptions
CREATE VIEW d_icd_diagnoses AS
SELECT *
FROM read_csv_auto('{desc_path}');
"""

# Run the multi-query
con.execute(query)

<duckdb.duckdb.DuckDBPyConnection at 0x20b7ecd5ef0>

In [3]:
# Display NOTEEVENTS column names and top 5 rows
print("NOTEEVENTS Table Schema:")
noteevents_schema = con.execute("PRAGMA table_info(noteevents);").fetchdf()
print(noteevents_schema[['name']])  # Display only column names

print("\nTop 5 rows from NOTEEVENTS:")
noteevents_preview = con.execute("SELECT * FROM noteevents LIMIT 5;").fetchdf()
print(noteevents_preview)

NOTEEVENTS Table Schema:
           name
0        ROW_ID
1    SUBJECT_ID
2       HADM_ID
3     CHARTDATE
4     CHARTTIME
5     STORETIME
6      CATEGORY
7   DESCRIPTION
8          CGID
9       ISERROR
10         TEXT

Top 5 rows from NOTEEVENTS:


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

   ROW_ID  SUBJECT_ID  HADM_ID  CHARTDATE CHARTTIME STORETIME  \
0     174       22532   167853 2151-08-04      None      None   
1     175       13702   107527 2118-06-14      None      None   
2     176       13702   167118 2119-05-25      None      None   
3     177       13702   196489 2124-08-18      None      None   
4     178       26880   135453 2162-03-25      None      None   

            CATEGORY DESCRIPTION  CGID ISERROR  \
0  Discharge summary      Report  None    None   
1  Discharge summary      Report  None    None   
2  Discharge summary      Report  None    None   
3  Discharge summary      Report  None    None   
4  Discharge summary      Report  None    None   

                                                TEXT  
0  Admission Date:  [**2151-7-16**]       Dischar...  
1  Admission Date:  [**2118-6-2**]       Discharg...  
2  Admission Date:  [**2119-5-4**]              D...  
3  Admission Date:  [**2124-7-21**]              ...  
4  Admission Date:  [**2162-3-3**

In [4]:
# Get top 20 most frequent ICD-9 codes
top_codes_query = f"""
SELECT icd9_code, COUNT(*) as count
FROM read_csv_auto('{diag_path}')
GROUP BY icd9_code
ORDER BY count DESC
LIMIT 20;
"""
top_codes_df = con.execute(top_codes_query).fetchdf()
top_codes = top_codes_df['ICD9_CODE'].tolist()
#print(top_codes)

# Load ICD-9 descriptions into a DataFrame
desc_df = con.execute("SELECT * FROM d_icd_diagnoses").fetchdf()

# Optional: check column names
print("Top codes columns:", top_codes_df.columns.tolist())
print("Descriptions columns:", desc_df.columns.tolist())

# Try both UPPER and lowercase just in case
count_col = 'COUNT' if 'COUNT' in top_codes_df.columns else 'count'

# Merge with description
merged_df = pd.merge(top_codes_df, desc_df[['ICD9_CODE', 'LONG_TITLE']], on='ICD9_CODE', how='left')

# Sort and print
merged_df = merged_df.sort_values(by=count_col, ascending=False).reset_index(drop=True)

for idx, row in merged_df.iterrows():
    print(f"{idx+1}. Code: {row['ICD9_CODE']} → {row['LONG_TITLE']} → {row[count_col]} instances")

Top codes columns: ['ICD9_CODE', 'count']
Descriptions columns: ['ROW_ID', 'ICD9_CODE', 'SHORT_TITLE', 'LONG_TITLE']
1. Code: 4019 → Unspecified essential hypertension → 20703 instances
2. Code: 4280 → Congestive heart failure, unspecified → 13111 instances
3. Code: 42731 → Atrial fibrillation → 12891 instances
4. Code: 41401 → Coronary atherosclerosis of native coronary artery → 12429 instances
5. Code: 5849 → Acute kidney failure, unspecified → 9119 instances
6. Code: 25000 → Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled → 9058 instances
7. Code: 2724 → Other and unspecified hyperlipidemia → 8690 instances
8. Code: 51881 → Acute respiratory failure → 7497 instances
9. Code: 5990 → Urinary tract infection, site not specified → 6555 instances
10. Code: 53081 → Esophageal reflux → 6326 instances
11. Code: 2720 → Pure hypercholesterolemia → 5930 instances
12. Code: V053 → Need for prophylactic vaccination and inoculation agains

In [5]:
# Join, filter for discharge notes, keep only top codes
top_codes_str = "', '".join(top_codes)
fetch_query = f"""
WITH joined AS (
    SELECT 
        n.subject_id,
        n.hadm_id,
        SUBSTRING(n.text, 1, 5000) AS summary_snippet,
        d.icd9_code,
        icd.long_title
    FROM read_csv_auto('{notes_path}') n
    JOIN read_csv_auto('{diag_path}') d
      ON n.subject_id = d.subject_id AND n.hadm_id = d.hadm_id
    JOIN read_csv_auto('{desc_path}') icd
      ON d.icd9_code = icd.icd9_code
    WHERE n.category = 'Discharge summary'
      AND d.icd9_code IN ('{top_codes_str}')
)
SELECT 
    subject_id,
    hadm_id,
    summary_snippet,
    STRING_AGG(icd9_code, ', ') AS icd9_codes,
    STRING_AGG(long_title, '; ') AS diagnoses,
    COUNT(icd9_code) AS code_count
FROM joined
GROUP BY subject_id, hadm_id, summary_snippet
LIMIT 50000;
"""
results = con.execute(fetch_query).fetchdf()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [6]:
print(results.head(3))  # Show sample output
#print(len(results))

   SUBJECT_ID  HADM_ID                                    summary_snippet  \
0        8556   179945  Admission Date:  [**2113-9-10**]       Dischar...   
1        6999   127232  Admission Date:  [**2164-3-22**]              ...   
2        9598   139201  Admission Date:  [**2141-2-24**]       Dischar...   

                icd9_codes                                          diagnoses  \
0                    42731                                Atrial fibrillation   
1  4019, 41401, 2720, 2851  Unspecified essential hypertension; Coronary a...   
2       4280, 42731, 41401  Congestive heart failure, unspecified; Atrial ...   

   code_count  
0           1  
1           4  
2           3  


In [7]:
#Print the results of the summary and which codes were assocated with it Example
print(results.loc[5, 'summary_snippet'])
print("-" * 20)  # Separator for clarity

icd9_codes = results.loc[1, 'icd9_codes'].split(', ')
diagnoses = results.loc[1, 'diagnoses'].split('; ')

for i in range(min(len(icd9_codes), len(diagnoses))):
  print(f"{icd9_codes[i]} --> {diagnoses[i]}")

# handling the case where icd9_codes and diagnoses are different lengths.
if len(icd9_codes) > len(diagnoses):
    for i in range(len(diagnoses), len(icd9_codes)):
        print(f"{icd9_codes[i]} --> No corresponding diagnosis")
elif len(diagnoses) > len(icd9_codes):
    for i in range(len(icd9_codes), len(diagnoses)):
        print(f"No corresponding ICD9 --> {diagnoses[i]}")


Admission Date:  [**2159-10-5**]     Discharge Date:  [**2159-10-10**]

Date of Birth:   [**2114-8-15**]     Sex:  M

Service:  [**Hospital1 **]

HISTORY OF PRESENT ILLNESS:  Patient is admitted with a chief
complaint of fever.  This is a 44-year-old male with multiple
medical problems including several episodes of endocarditis,
status post homograft aortic valve replacement times two,
status post AV outflow tract route debridement, status post
pseudomonal pneumonia, intermittent pancreatitis, diabetes
mellitus type 2, coronary artery disease status post coronary
artery bypass graft, hypercalcemia, critical care neuropathy,
who was recently cared for at [**Hospital3 672**] from [**2159-8-17**]
to [**2159-9-14**] following his pseudomonal pneumonia who
presented on [**2159-10-5**] with fevers, chills, dizziness, and a
nonproductive cough as well as one-day history of vomiting.

Patient denied any headache, visual changes, chest pain,
pleuritic pain, back pain, abdominal pain, diarrhea, 

In [None]:
# Export results to CSV
#summary_results_path = os.path.join("C:\\Users", username, "Documents", "GitHub", "DEEP_ML_Project", "data", "preprocessed", "summary_results.csv")
#summary_results_trimmed_path = os.path.join("C:\\Users", username, "Documents", "GitHub", "DEEP_ML_Project", "data", "preprocessed", "summary_results_trimmed.csv")

preprocessed_dir = Path("../data/preprocessed")
preprocessed_dir.mkdir(parents=True, exist_ok=True)

summary_results_path = preprocessed_dir / "summary_results.csv"
summary_results_trimmed_path = preprocessed_dir / "summary_results_trimmed.csv"

results.iloc[:-20].to_csv(summary_results_path, index=False)
results.iloc[-20:].to_csv(summary_results_trimmed_path, index=False)

print(f"Full training results exported to: {summary_results_path}")
print(f"Trimmed training results exported to: {summary_results_trimmed_path}")

# If running in Colab, provide download option for the generated files
if is_colab():
    print("\nDownload processed files:")
    for file_path in [summary_results_path, summary_results_trimmed_path]:
        if os.path.exists(file_path):
            files.download(str(file_path))

Results exported to: C:\Users\Alex\Documents\GitHub\DEEP_ML_Project\summary_results.csv


In [9]:
# # Sample JOIN query to fetch ICD-9 + Discharge Summary
# fetch_query = f"""
# SELECT 
#     n.subject_id,
#     n.hadm_id,
#     SUBSTRING(n.text, 1, 4000) AS summary_snippet,
#     STRING_AGG(d.icd9_code, ', ') AS icd9_codes,
#     STRING_AGG(icd.long_title, '; ') AS diagnoses
# FROM read_csv_auto('{notes_path}') n
# JOIN read_csv_auto('{diag_path}') d
#   ON n.subject_id = d.subject_id AND n.hadm_id = d.hadm_id
# JOIN read_csv_auto('{desc_path}') icd
#   ON d.icd9_code = icd.icd9_code
# WHERE n.category = 'Discharge summary'
# GROUP BY n.subject_id, n.hadm_id, n.text
# # LIMIT 5000;
# """



# # Execute and display results
# results = con.execute(fetch_query).fetchdf()
# print(results.head(3))  # Show sample output