In [12]:
# Import libraries
import os

import numpy as np
import pandas as pd


# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

In [13]:
# Authenticate the Colab user before any BigQuery call is made.
auth.authenticate_user()

# Set up the BigQuery client. The project ID is defined once and passed by name,
# so the variable and the client can never point at different projects.
project_id = 'comp90089s2mingjun'
client = bigquery.Client(project=project_id)

In [14]:
# Read the table holding the separated ICD codes from disk into a DataFrame.
final_data = pd.read_csv(filepath_or_buffer='final_data.csv')


In [17]:
# Fetch icd_code and long_title from physionet-data.mimiciv_hosp.d_icd_diagnoses
# for every ICD code that appears as a column of final_data.

# Number of leading columns in final_data that are NOT ICD codes; every column
# after them is treated as an ICD-code column. Adjust this single value if the
# layout of final_data.csv changes.
N_METADATA_COLUMNS = 6

# The ICD codes are the remaining column names of final_data.
icd_codes_list = final_data.columns[N_METADATA_COLUMNS:]

# The codes are bound to the @icd_codes array parameter instead of being
# formatted into the SQL text.
# NOTE(review): the filter matches on icd_code only. If the table stores the
# same code under more than one ICD version, several rows may come back for
# one code -- confirm whether a version filter is needed.
query = """
    SELECT icd_code, long_title
    FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
    WHERE icd_code IN UNNEST(@icd_codes)
"""

# Pass the ICD codes as a STRING array query parameter.
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("icd_codes", "STRING", icd_codes_list.tolist())
    ]
)

# Run the query and materialize the result as a DataFrame.
icd_diagnoses_df = client.query(query, job_config=job_config).to_dataframe()

# Preview the first rows of the lookup result.
print(icd_diagnoses_df.head())


  icd_code                                         long_title
0    00845  Intestinal infection due to Clostridium difficile
1     0389                             Unspecified septicemia
2    04104  Streptococcus infection in conditions classifi...
3    04111  Methicillin susceptible Staphylococcus aureus ...
4     0414  Escherichia coli [E. coli] infection in condit...


In [19]:
# Build the lookup table icd_code -> long_title from the query result.
# NOTE(review): the table is keyed by icd_code alone, so if a code occurs on
# more than one row only one title is kept -- confirm this is acceptable.
icd_to_title = (
    icd_diagnoses_df
    .set_index('icd_code')
    .loc[:, 'long_title']
    .to_dict()
)

# Helper that turns an ICD-code column name into "long_title[icd_code]".
def format_icd_code(icd_column_name):
    """Return the display label for an ICD-code column name.

    Looks the code up in the module-level ``icd_to_title`` mapping.

    Args:
        icd_column_name: The ICD code used as a column name.

    Returns:
        ``"<long_title>[<icd_code>]"`` when the code is a key of
        ``icd_to_title``; otherwise the code itself, unchanged.
    """
    # Unknown code: keep the original column name as-is.
    if icd_column_name not in icd_to_title:
        return icd_column_name
    return f"{icd_to_title[icd_column_name]}[{icd_column_name}]"

# New header for final_data: the six fixed leading column names, followed by
# every ICD column relabelled through format_icd_code.
updated_columns = [
    'subject_id', 'hadm_id', 'gender', 'age', 'race', 'classification',
    *map(format_icd_code, icd_codes_list),
]
final_data.columns = updated_columns

# Preview the relabelled table.
print(final_data.head())

# Persist the relabelled table as CSV, without the row index.
final_data.to_csv(path_or_buf='final_data_with_long_titles.csv', index=False)


   subject_id   hadm_id gender  age                    race  \
0    10000980  26913865      F   76  BLACK/AFRICAN AMERICAN   
1    10001217  24597018      F   55                   WHITE   
2    10001401  21544441      F   89                   WHITE   
3    10001884  26202981      F   76  BLACK/AFRICAN AMERICAN   
4    10002013  23581541      F   57                   OTHER   

      classification  \
0  Only_hypertension   
1  Only_hypertension   
2  Only_hypertension   
3  Only_hypertension   
4  Only_hypertension   

   Intestinal infection due to Clostridium difficile[00845]  \
0                                                  0          
1                                                  0          
2                                                  0          
3                                                  0          
4                                                  0          

   Unspecified septicemia[0389]  \
0                             0   
1                          