In [None]:
# Import libraries
import os

import numpy as np
import pandas as pd


# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

In [None]:
# Authenticate the Colab user before any BigQuery call is made.
auth.authenticate_user()

# Google Cloud project handed to the BigQuery client and to run_query().
project_id = 'comp90089s2mingjun'
# Build the client from project_id so the project name is defined in one place only.
client = bigquery.Client(project=project_id)

In [None]:
# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
  """Run a BigQuery Standard SQL query and return the result as a DataFrame.

  Uses the google-cloud-bigquery client (the same API the later cells call
  through `client.query(...).to_dataframe()`) instead of the deprecated
  `pd.io.gbq.read_gbq`, which emits a FutureWarning on every call.

  Args:
    query: SQL text to execute (Standard SQL is the client's default dialect).
    project_id: Google Cloud project the query job is created in. Defaults
      to the notebook-level `project_id`.

  Returns:
    pandas.DataFrame holding the full query result.
  """
  return bigquery.Client(project=project_id).query(query).to_dataframe()

### Find patients with hypertension based on ICD code and find the first recorded case of hypertension

In [None]:
# ICD-10 hypertension diagnoses joined to their admission.
# icd_code is stored without the dot, and I15 (secondary hypertension) is only
# ever coded with a 4th character (I150, I151, I152, I158, I159), so it must be
# matched by prefix: an exact match on 'I15' never returns a row.
# icd_version = 10 makes explicit that ICD-10 codes are being matched.
query1 = """
SELECT DISTINCT
    d.subject_id,
    d.hadm_id,
    d.icd_code,
    a.admittime,
    a.dischtime,
    a.deathtime,
    'ICD_based' AS hypertension_criteria

FROM
    `physionet-data.mimiciv_hosp.diagnoses_icd` d
JOIN
    `physionet-data.mimiciv_hosp.admissions` a ON d.hadm_id = a.hadm_id
WHERE
    d.icd_version = 10
    AND (d.icd_code = 'I10' OR d.icd_code LIKE 'I15%')

"""

# Execute the query and get the results
pt = run_query(query1)
pt_data = pd.DataFrame(pt)
print(f'Total Records: {len(pt_data)}')

# Calculate the number of unique subject_id
unique_subject_ids = pt_data['subject_id'].nunique()

# Output the number of unique subject_id
print(f"Total unique subject_id: {unique_subject_ids}")

# First, convert the 'admittime' column to datetime format to ensure proper comparison
pt_data['admittime'] = pd.to_datetime(pt_data['admittime'])

# Group by subject_id and find the first admission time for each patient
first_admission = pt_data.groupby('subject_id')['admittime'].min().reset_index()

# Merge the first admission time back to recover the full record, then keep
# exactly one row per patient: the earliest admission can carry several
# matching codes (e.g. I10 and I15x), which would otherwise yield duplicates.
# Sorting first makes the surviving row deterministic.
first_hypertension_records = (
    pd.merge(first_admission, pt_data, on=['subject_id', 'admittime'], how='inner')
    .sort_values(['subject_id', 'hadm_id', 'icd_code'])
    .drop_duplicates(subset='subject_id', keep='first')
    .reset_index(drop=True)
)
assert first_hypertension_records['subject_id'].is_unique

# Output the total number of first hypertension records
print(f"Total First Hypertension Records: {len(first_hypertension_records)}")

# Display the first 10 records of the first hypertension occurrences
first_hypertension_records.head(10)


  return pd.io.gbq.read_gbq(


Total Records: 51704
Total unique subject_id: 31521
Total First Hypertension Records: 31521


Unnamed: 0,subject_id,admittime,hadm_id,icd_code,dischtime,deathtime,hypertension_criteria
0,10001401,2131-06-04 00:00:00,21544441,I10,2131-06-15 16:10:00,NaT,ICD_based
1,10001884,2130-08-21 15:26:00,26202981,I10,2130-08-23 16:40:00,NaT,ICD_based
2,10002013,2167-07-05 06:10:00,28629319,I10,2167-07-05 11:44:00,NaT,ICD_based
3,10002131,2128-03-17 14:53:00,24065018,I10,2128-03-19 16:25:00,NaT,ICD_based
4,10002348,2112-11-30 22:22:00,22725460,I10,2112-12-10 17:56:00,NaT,ICD_based
5,10002428,2160-04-14 12:30:00,28295257,I10,2160-04-18 16:00:00,NaT,ICD_based
6,10002430,2125-06-23 09:00:00,27218502,I10,2125-06-25 11:42:00,NaT,ICD_based
7,10002443,2183-10-17 23:20:00,21329021,I10,2183-10-20 18:47:00,NaT,ICD_based
8,10002495,2141-05-22 20:17:00,24982426,I10,2141-05-29 17:41:00,NaT,ICD_based
9,10002807,2152-03-30 16:09:00,28464737,I10,2152-03-31 08:20:00,NaT,ICD_based


### According to international standards, find patients with hypertension, find the corresponding hadm_id through subject_id and stay_id, and remove the data with the same subject_id and hadm_id, leaving only one. The ultimate goal is to find cases of patients who have hypertension for the first time.

In [None]:
# SQL query.
# The CTE finds, per subject_id, the earliest vitalsign charttime with
# sbp >= 140 or dbp >= 90. The outer SELECT joins back to that vitalsign row
# and to icustay_detail (on subject_id + stay_id) to pick up the hadm_id.
# Note: charttime is returned under the column name `admittime` so the result
# shares column names with the ICD-based frame; it is a measurement time,
# not a hospital admission time.
query2 = """
WITH First_Hypertension AS (
    SELECT
        v.subject_id,
        MIN(v.charttime) AS first_charttime  -- Get the first time of hypertension measurement
    FROM
        `physionet-data.mimiciv_derived.vitalsign` v
    WHERE
        v.sbp >= 140 OR v.dbp >= 90  -- 2024 new hypertension criteria
    GROUP BY
        v.subject_id
)
-- Join to retrieve corresponding hadm_id, stay_id, and other details
SELECT
    v.subject_id,
    icu.hadm_id,
    v.stay_id,
    v.charttime AS admittime,
    'BP_based' AS hypertension_criteria
FROM
    First_Hypertension fh
JOIN
    `physionet-data.mimiciv_derived.vitalsign` v
    ON fh.subject_id = v.subject_id AND fh.first_charttime = v.charttime
JOIN
    `physionet-data.mimiciv_derived.icustay_detail` icu
    ON v.subject_id = icu.subject_id AND v.stay_id = icu.stay_id

"""




# Run the query; run_query returns the result as a pandas DataFrame
pt_by_vitalsign = run_query(query2)
# Wrap in pd.DataFrame; this is the frame name later cells read from
pt_by_vitalsign_data = pd.DataFrame(pt_by_vitalsign)

# Output the total number of records
print(f'Total Records: {len(pt_by_vitalsign_data)}')

# Calculate the number of unique subject_ids; equal to the record count when
# the query returned exactly one row per patient
unique_subject_ids = pt_by_vitalsign_data['subject_id'].nunique()
print(f'Unique subject_id count: {unique_subject_ids}')

# View the first 20 records
pt_by_vitalsign_data.head(20)


  return pd.io.gbq.read_gbq(


Total Records: 42644
Unique subject_id count: 42644


Unnamed: 0,subject_id,hadm_id,stay_id,admittime,hypertension_criteria
0,16022256,27633132,31153217,2118-12-03 13:00:00,BP_based
1,10455364,20537341,36385525,2148-08-12 21:03:00,BP_based
2,14393652,20079282,38197697,2123-12-21 21:55:00,BP_based
3,17372212,22110838,36347718,2162-08-17 14:02:00,BP_based
4,15851682,24999949,31682246,2167-04-11 19:27:00,BP_based
5,15223513,28241689,34462332,2135-09-09 16:00:00,BP_based
6,19602578,27362418,36960817,2136-04-05 18:34:00,BP_based
7,11651054,27078841,39589257,2113-02-05 19:00:00,BP_based
8,17923746,25878929,31079581,2122-11-23 08:14:00,BP_based
9,10247613,28010779,34941013,2147-11-02 20:00:00,BP_based


### Merge the data and remove the data with the same **subject_id and hadm_id**

In [None]:
# Convert 'admittime' columns to datetime format to ensure proper comparison.
# For the BP-based frame this column holds the measurement charttime (aliased
# to admittime in query2), so both sources can be ordered on one time axis.
pt_data['admittime'] = pd.to_datetime(pt_data['admittime'])
pt_by_vitalsign_data['admittime'] = pd.to_datetime(pt_by_vitalsign_data['admittime'])

# Combine both query results by concatenating them on their shared columns
shared_columns = ['subject_id', 'hadm_id', 'admittime', 'hypertension_criteria']
combined_data = pd.concat(
    [pt_data[shared_columns], pt_by_vitalsign_data[shared_columns]],
    ignore_index=True,
)

# Drop exact duplicate rows and sort by subject_id and admittime
combined_data = combined_data.drop_duplicates()
combined_data = combined_data.sort_values(by=['subject_id', 'admittime'])

# Keep only the earliest row for each subject_id.
# drop_duplicates keeps the whole first row. groupby().first() instead takes
# the first NON-NULL value of every column independently, so a missing value
# in the earliest row would silently be filled from a later row.
final_data = (
    combined_data
    .drop_duplicates(subset='subject_id', keep='first')
    .reset_index(drop=True)
)
assert final_data['subject_id'].is_unique

# Output the total number of unique subject_id and records
print(f"Total unique subject_id: {final_data['subject_id'].nunique()}")
print(f"Total records after merge: {len(final_data)}")

# Display the first 10 records of the final dataset
final_data.head(10)


Total unique subject_id: 63606
Total records after merge: 63606


Unnamed: 0,subject_id,hadm_id,admittime,hypertension_criteria
0,10000980,26913865,2189-06-27 08:55:00,BP_based
1,10001217,24597018,2157-11-20 19:19:00,BP_based
2,10001401,21544441,2131-06-04 00:00:00,ICD_based
3,10001884,26202981,2130-08-21 15:26:00,ICD_based
4,10002013,23581541,2160-05-18 17:00:00,BP_based
5,10002131,24065018,2128-03-17 14:53:00,ICD_based
6,10002348,22725460,2112-11-30 22:22:00,ICD_based
7,10002428,28662225,2156-04-12 18:04:00,BP_based
8,10002430,27218502,2125-06-23 09:00:00,ICD_based
9,10002443,21329021,2183-10-17 23:20:00,ICD_based


In [None]:
# SQL query returning every cerebrovascular diagnosis (ICD-10 I60-I69) together
# with the admission time of the hospital stay it was recorded in.
# (Plain string: the query has no placeholders, so no f-string is needed.)
query_first_stroke = """
WITH stroke_patients AS (
    -- Select stroke patients based on ICD codes
    SELECT subject_id, hadm_id, icd_code
    FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
    WHERE icd_code LIKE 'I60%' OR icd_code LIKE 'I61%' OR icd_code LIKE 'I62%' OR icd_code LIKE 'I63%' OR icd_code LIKE 'I64%'
       OR icd_code LIKE 'I65%' OR icd_code LIKE 'I66%' OR icd_code LIKE 'I67%' OR icd_code LIKE 'I68%' OR icd_code LIKE 'I69%'
)
-- Join with admissions table to get admission time
SELECT
    sp.subject_id,
    sp.hadm_id,
    a.admittime,
    sp.icd_code
FROM
    stroke_patients sp
JOIN
    `physionet-data.mimiciv_hosp.admissions` a
ON
    sp.hadm_id = a.hadm_id
ORDER BY
    sp.subject_id, a.admittime
"""

# Execute the query and convert the results to a DataFrame
stroke_admissions = client.query(query_first_stroke).to_dataframe()

# Convert 'admittime' to datetime format for proper comparison
stroke_admissions['admittime'] = pd.to_datetime(stroke_admissions['admittime'])

# Sort so the earliest stroke admission of each patient comes first.
# hadm_id and icd_code act as tie-breakers: one admission can carry several
# stroke codes, and without them the surviving icd_code would depend on the
# arbitrary order in which BigQuery returned the tied rows.
stroke_admissions_sorted = stroke_admissions.sort_values(
    by=['subject_id', 'admittime', 'hadm_id', 'icd_code']
)

# Keep the whole first row per patient. groupby().first() would instead take
# the first non-null value of each column independently.
first_stroke_admission = (
    stroke_admissions_sorted
    .drop_duplicates(subset='subject_id', keep='first')
    .reset_index(drop=True)
)
assert first_stroke_admission['subject_id'].is_unique

# Output the total number of unique subject_id and records
print(f"Total unique subject_id: {first_stroke_admission['subject_id'].nunique()}")
print(f"Total records after filtering for first stroke diagnosis: {len(first_stroke_admission)}")

# Display the first 10 records
first_stroke_admission.head(10)


Total unique subject_id: 7855
Total records after filtering for first stroke diagnosis: 7855


Unnamed: 0,subject_id,hadm_id,admittime,icd_code
0,10002221,21729093,2204-06-27 16:57:00,I671
1,10002430,24648311,2129-04-29 12:24:00,I6523
2,10003299,29323205,2181-10-22 19:08:00,I639
3,10004113,29879900,2173-03-20 00:00:00,I619
4,10004457,28108313,2147-12-19 00:00:00,I6521
5,10004606,28731738,2159-04-05 01:58:00,I69398
6,10006457,24934308,2151-11-06 01:57:00,I69351
7,10011189,23456305,2188-03-21 20:43:00,I671
8,10011365,26722872,2166-01-23 06:02:00,I69351
9,10013310,22098926,2153-06-10 11:55:00,I63412


### Divide patients into three groups:

**Only_hypertension:** Only diagnosed with hypertension and never with stroke

**stroke_after_hypertension:** Diagnosed with stroke after being diagnosed with hypertension

**stroke_before_hypertension:** Diagnosed with stroke before hypertension

In [None]:
# Attach each patient's first stroke admission (if any) to their first
# hypertension record. A left join keeps every hypertension patient;
# overlapping column names get the _hypertension / _stroke suffixes.
merged_data = final_data.merge(
    first_stroke_admission,
    how='left',
    on='subject_id',
    suffixes=('_hypertension', '_stroke'),
)

# Classify one patient row by the relative timing of hypertension and stroke
def classify_patient(row):
    """Return the cohort label for one merged hypertension/stroke row.

    Reads row['admittime_stroke'] and row['admittime_hypertension'].

    Returns:
        'Only_hypertension' when the stroke time is missing,
        'stroke_after_hypertension' when hypertension is strictly earlier
        than the stroke, otherwise 'stroke_before_hypertension' (this also
        covers identical timestamps).
    """
    stroke_time = row['admittime_stroke']

    # No stroke on record at all
    if pd.isna(stroke_time):
        return 'Only_hypertension'

    # Hypertension strictly precedes the stroke
    if row['admittime_hypertension'] < stroke_time:
        return 'stroke_after_hypertension'

    # Stroke at the same time as, or earlier than, hypertension
    return 'stroke_before_hypertension'

# Label every row of merged_data with its cohort
merged_data['classification'] = merged_data.apply(classify_patient, axis=1)

# Show how many patients fall into each cohort
print(merged_data.loc[:, 'classification'].value_counts())

# Preview the first 10 rows to sanity-check the labels against the timestamps
merged_data.loc[
    :, ['subject_id', 'admittime_hypertension', 'admittime_stroke', 'classification']
].iloc[:10]


classification
Only_hypertension             57026
stroke_before_hypertension     4855
stroke_after_hypertension      1725
Name: count, dtype: int64


Unnamed: 0,subject_id,admittime_hypertension,admittime_stroke,classification
0,10000980,2189-06-27 08:55:00,NaT,Only_hypertension
1,10001217,2157-11-20 19:19:00,NaT,Only_hypertension
2,10001401,2131-06-04 00:00:00,NaT,Only_hypertension
3,10001884,2130-08-21 15:26:00,NaT,Only_hypertension
4,10002013,2160-05-18 17:00:00,NaT,Only_hypertension
5,10002131,2128-03-17 14:53:00,NaT,Only_hypertension
6,10002348,2112-11-30 22:22:00,NaT,Only_hypertension
7,10002428,2156-04-12 18:04:00,NaT,Only_hypertension
8,10002430,2125-06-23 09:00:00,2129-04-29 12:24:00,stroke_after_hypertension
9,10002443,2183-10-17 23:20:00,NaT,Only_hypertension


### Find the patients who had hypertension first and later suffered a stroke, together with the patients who only ever had hypertension, and exclude patients who had a stroke before hypertension.

In [None]:
# Outer join on subject_id so patients present in only one of the two frames
# (hypertension-only or stroke-only) are kept as well; overlapping column
# names get the _hypertension / _stroke suffixes.
merged_data = final_data.merge(
    first_stroke_admission,
    how='outer',
    on='subject_id',
    suffixes=('_hypertension', '_stroke'),
)

# Classify one patient row by which diagnoses exist and their relative timing
def classify_patient(row):
    """Return the cohort label for one outer-merged hypertension/stroke row.

    Reads row['admittime_hypertension'] and row['admittime_stroke'].

    Returns:
        'Only_hypertension' when only the hypertension time is present,
        'Only_stroke' when only the stroke time is present,
        'stroke_after_hypertension' when hypertension is strictly earlier
        than the stroke, otherwise 'stroke_before_hypertension' (this also
        covers identical timestamps).
    """
    hypertension_time = row['admittime_hypertension']
    stroke_time = row['admittime_stroke']
    has_hypertension = pd.notna(hypertension_time)
    has_stroke = pd.notna(stroke_time)

    # Exactly one of the two diagnoses is on record
    if has_hypertension and not has_stroke:
        return 'Only_hypertension'
    if has_stroke and not has_hypertension:
        return 'Only_stroke'

    # Both on record: order them in time
    if hypertension_time < stroke_time:
        return 'stroke_after_hypertension'
    return 'stroke_before_hypertension'

# Label every row of merged_data with its cohort
merged_data['classification'] = merged_data.apply(classify_patient, axis=1)

# Keep the two cohorts of interest; 'Only_stroke' and
# 'stroke_before_hypertension' rows are dropped here
filtered_data = merged_data.loc[
    merged_data['classification'].isin(['stroke_after_hypertension', 'Only_hypertension'])
]

# Narrow to the reporting columns and expose the hypertension admission id
# under the plain name 'hadm_id'
output_data = (
    filtered_data
    .loc[:, ['subject_id', 'hadm_id_hypertension', 'admittime_hypertension', 'admittime_stroke', 'classification']]
    .rename(columns={'hadm_id_hypertension': 'hadm_id'})
)

# Report the cohort size and preview the first 10 rows
print(f"Total patients with Only_hypertension or stroke_after_hypertension: {len(output_data)}")
output_data.head(10)





Total patients with Only_hypertension or stroke_after_hypertension: 58751


Unnamed: 0,subject_id,hadm_id,admittime_hypertension,admittime_stroke,classification
0,10000980,26913865,2189-06-27 08:55:00,NaT,Only_hypertension
1,10001217,24597018,2157-11-20 19:19:00,NaT,Only_hypertension
2,10001401,21544441,2131-06-04 00:00:00,NaT,Only_hypertension
3,10001884,26202981,2130-08-21 15:26:00,NaT,Only_hypertension
4,10002013,23581541,2160-05-18 17:00:00,NaT,Only_hypertension
5,10002131,24065018,2128-03-17 14:53:00,NaT,Only_hypertension
7,10002348,22725460,2112-11-30 22:22:00,NaT,Only_hypertension
8,10002428,28662225,2156-04-12 18:04:00,NaT,Only_hypertension
9,10002430,27218502,2125-06-23 09:00:00,2129-04-29 12:24:00,stroke_after_hypertension
10,10002443,21329021,2183-10-17 23:20:00,NaT,Only_hypertension


### Add the two features of gender and age. In the physionet-data.mimiciv_hosp.patients table, gender is directly associated with subject_id. Age is calculated as follows: the year in admittime_hypertension in output_data minus the value of anchor_year in the physionet-data.mimiciv_hosp.patients table plus anchor_age in the physionet-data.mimiciv_hosp.patients table is the age when the patient was first diagnosed with hypertension.

In [None]:
# Pull the demographic anchor columns for every patient
query_patients = """
SELECT
    subject_id,
    gender,
    anchor_year,
    anchor_age
FROM
    `physionet-data.mimiciv_hosp.patients`
"""

# Run the query and materialise it as a DataFrame
patients_data = client.query(query_patients).to_dataframe()

# Left-join the demographics onto the cohort, then derive:
#   admittime_year = calendar year of admittime_hypertension
#   age            = admittime_year - anchor_year + anchor_age
merged_with_patients = (
    output_data
    .merge(patients_data, on='subject_id', how='left')
    .assign(admittime_year=lambda df: pd.to_datetime(df['admittime_hypertension']).dt.year)
    .assign(age=lambda df: df['admittime_year'] - df['anchor_year'] + df['anchor_age'])
)

# Keep the cohort columns plus the new gender and age features
output = merged_with_patients.loc[
    :,
    ['subject_id', 'hadm_id', 'admittime_hypertension', 'admittime_stroke', 'classification', 'gender', 'age'],
]
print(len(output))
# Display the first 20 records with the added gender and age columns
output.head(20)

58751


Unnamed: 0,subject_id,hadm_id,admittime_hypertension,admittime_stroke,classification,gender,age
0,10000980,26913865,2189-06-27 08:55:00,NaT,Only_hypertension,F,76
1,10001217,24597018,2157-11-20 19:19:00,NaT,Only_hypertension,F,55
2,10001401,21544441,2131-06-04 00:00:00,NaT,Only_hypertension,F,89
3,10001884,26202981,2130-08-21 15:26:00,NaT,Only_hypertension,F,76
4,10002013,23581541,2160-05-18 17:00:00,NaT,Only_hypertension,F,57
5,10002131,24065018,2128-03-17 14:53:00,NaT,Only_hypertension,F,92
6,10002348,22725460,2112-11-30 22:22:00,NaT,Only_hypertension,F,77
7,10002428,28662225,2156-04-12 18:04:00,NaT,Only_hypertension,F,81
8,10002430,27218502,2125-06-23 09:00:00,2129-04-29 12:24:00,stroke_after_hypertension,M,86
9,10002443,21329021,2183-10-17 23:20:00,NaT,Only_hypertension,M,53


### add the feature of race

In [None]:
# Pull race for every hospital admission
query_admissions = """
SELECT
    subject_id,
    hadm_id,
    race
FROM
    `physionet-data.mimiciv_hosp.admissions`
"""

# Run the query and materialise it as a DataFrame
admissions_data = client.query(query_admissions).to_dataframe()

# Left-join race onto the cohort, matching on both subject_id and hadm_id so
# the race recorded for the cohort's own admission is the one attached
merged_with_race = output.merge(
    admissions_data.loc[:, ['subject_id', 'hadm_id', 'race']],
    how='left',
    on=['subject_id', 'hadm_id'],
)

# Preview the first 10 rows with the added race column
merged_with_race.head(10)


Unnamed: 0,subject_id,hadm_id,admittime_hypertension,admittime_stroke,classification,gender,age,race
0,10000980,26913865,2189-06-27 08:55:00,NaT,Only_hypertension,F,76,BLACK/AFRICAN AMERICAN
1,10001217,24597018,2157-11-20 19:19:00,NaT,Only_hypertension,F,55,WHITE
2,10001401,21544441,2131-06-04 00:00:00,NaT,Only_hypertension,F,89,WHITE
3,10001884,26202981,2130-08-21 15:26:00,NaT,Only_hypertension,F,76,BLACK/AFRICAN AMERICAN
4,10002013,23581541,2160-05-18 17:00:00,NaT,Only_hypertension,F,57,OTHER
5,10002131,24065018,2128-03-17 14:53:00,NaT,Only_hypertension,F,92,WHITE
6,10002348,22725460,2112-11-30 22:22:00,NaT,Only_hypertension,F,77,WHITE
7,10002428,28662225,2156-04-12 18:04:00,NaT,Only_hypertension,F,81,WHITE
8,10002430,27218502,2125-06-23 09:00:00,2129-04-29 12:24:00,stroke_after_hypertension,M,86,WHITE
9,10002443,21329021,2183-10-17 23:20:00,NaT,Only_hypertension,M,53,WHITE


### Find all the diseases these cases had before the first diagnosis of hypertension

In [None]:
# Query every diagnosis (with its admission time) recorded for the cohort
# patients. seq_num is selected so that the codes of one admission can be put
# in a deterministic order below.
query_previous_icd = """
SELECT
    d.subject_id,
    d.hadm_id,
    d.seq_num,
    d.icd_code,
    a.admittime
FROM
    `physionet-data.mimiciv_hosp.diagnoses_icd` d
JOIN
    `physionet-data.mimiciv_hosp.admissions` a ON d.hadm_id = a.hadm_id
WHERE
    d.subject_id IN ({subject_ids})  -- We will replace this with the list of subject_ids from merged_with_race
"""

# Get the list of subject_ids to limit the query
subject_ids = ','.join(merged_with_race['subject_id'].astype(str).unique())

# Update the query with the list of subject_ids
query_previous_icd = query_previous_icd.format(subject_ids=subject_ids)

# Execute the query and get the results
previous_icd_data = client.query(query_previous_icd).to_dataframe()

# Convert 'admittime' to datetime format for comparison
previous_icd_data['admittime'] = pd.to_datetime(previous_icd_data['admittime'])

# Merge with the merged_with_race data to bring in admittime_hypertension
merged_icd_with_hypertension = pd.merge(previous_icd_data, merged_with_race[['subject_id', 'admittime_hypertension']], on='subject_id')

# Keep only diagnoses from admissions that started before the hypertension
# time, in chronological order (admission, then position within admission).
# The query has no ORDER BY, so without this sort the code lists would follow
# whatever row order BigQuery happened to return.
# NOTE(review): for BP_based patients admittime_hypertension is a measurement
# time inside an admission, so that same admission's codes pass this filter;
# confirm this is intended.
previous_icd_before_hypertension = merged_icd_with_hypertension[
    merged_icd_with_hypertension['admittime'] < merged_icd_with_hypertension['admittime_hypertension']
].sort_values(['subject_id', 'admittime', 'hadm_id', 'seq_num'])

# Build one comma-separated list per subject_id. dict.fromkeys drops repeated
# codes (the same code recorded at several admissions) while preserving the
# chronological order of first occurrence.
previous_icd_grouped = (
    previous_icd_before_hypertension
    .groupby('subject_id')['icd_code']
    .apply(lambda codes: ', '.join(dict.fromkeys(codes)))
    .reset_index()
)

# Merge the ICD codes with the original merged_with_race dataframe
final_data_with_icd = pd.merge(merged_with_race, previous_icd_grouped, on='subject_id', how='left')

# Rename the column for clarity
final_data_with_icd = final_data_with_icd.rename(columns={'icd_code': 'previous_icd_codes'})
print(len(final_data_with_icd))
# Display the final dataframe with the added previous ICD codes
final_data_with_icd.head(10)




58751


Unnamed: 0,subject_id,hadm_id,admittime_hypertension,admittime_stroke,classification,gender,age,race,previous_icd_codes
0,10000980,26913865,2189-06-27 08:55:00,NaT,Only_hypertension,F,76,BLACK/AFRICAN AMERICAN,"41071, 42833, 42823, 41189, 41412, 40390, 5854..."
1,10001217,24597018,2157-11-20 19:19:00,NaT,Only_hypertension,F,55,WHITE,"3240, 3484, 3485, 5180, 340, 04109, 3051, V168..."
2,10001401,21544441,2131-06-04 00:00:00,NaT,Only_hypertension,F,89,WHITE,
3,10001884,26202981,2130-08-21 15:26:00,NaT,Only_hypertension,F,76,BLACK/AFRICAN AMERICAN,"7850, 4871, 49392, 486, 78659, 78650, 49322, 4..."
4,10002013,23581541,2160-05-18 17:00:00,NaT,Only_hypertension,F,57,OTHER,"99672, 78659, 72709, 42832, 42832, 4139, 4928,..."
5,10002131,24065018,2128-03-17 14:53:00,NaT,Only_hypertension,F,92,WHITE,"78650, 2948, 00863, 4263, 27651, 5641, 2948, 3..."
6,10002348,22725460,2112-11-30 22:22:00,NaT,Only_hypertension,F,77,WHITE,
7,10002428,28662225,2156-04-12 18:04:00,NaT,Only_hypertension,F,81,WHITE,"78097, E9478, 4940, 56210, 53550, 5768, 7904, ..."
8,10002430,27218502,2125-06-23 09:00:00,2129-04-29 12:24:00,stroke_after_hypertension,M,86,WHITE,
9,10002443,21329021,2183-10-17 23:20:00,NaT,Only_hypertension,M,53,WHITE,


In [None]:
# Write the final cohort table to a CSV file, without the DataFrame index
final_data_with_icd.to_csv(path_or_buf='final_data_with_icd.csv', index=False)

# Announce the output file (reached only if to_csv did not raise)
print("Data saved to final_data_with_icd.csv")

Data saved to final_data_with_icd.csv
