In [1]:
import sys
import pickle
import numpy as np
import os
import pandas as pd
import argparse
import gzip
from itertools import chain
from collections import defaultdict, Counter
from typing import Dict, Optional, Tuple, List

In [6]:
def reformat_icd(code: str, version: int, is_diag: bool) -> str:
    """
    Reformat the given ICD code based on the specified version.

    Args:
        code (str): The ICD code to be reformatted.
        version (int): The version of the ICD code (9 or 10).
        is_diag (bool): Indicates whether the code is a diagnosis code or not.

    Returns:
        str: The reformatted ICD code.

    Raises:
        ValueError: If the version is neither 9 nor 10.
    """
    if version == 9:
        return reformat_icd9(code, is_diag)
    elif version == 10:
        return reformat_icd10(code, is_diag)
    else:
        raise ValueError("version must be 9 or 10")

def reformat_icd10(code: str, is_diag: bool) -> str:
    """
    Reformat the given ICD-10 code.

    Args:
    code (str): The ICD-10 code to be reformatted.
    is_diag (bool): Indicates whether the code is a diagnosis code or not.

    Returns:
    str: The reformatted ICD-10 code.

    """
    code = "".join(code.split("."))
    if not is_diag:
        return code
    return code[:3] + "." + code[3:]


def reformat_icd9(code: str, is_diag: bool) -> str:
    """
    Reformat the given ICD-9 code based on the provided parameters putting a point in the right place.

    Args:
        code (str): The ICD-9 code to be reformatted.
        is_diag (bool): A flag indicating whether the code is a diagnosis code or not.

    Returns:
        str: The reformatted ICD-9 code.

    """
    code = "".join(code.split("."))
    if is_diag:
        if code.startswith("E"):
            if len(code) > 4:
                return code[:4] + "." + code[4:]
        else:
            if len(code) > 3:
                return code[:3] + "." + code[3:]
    else:
        if len(code) > 2:
            return code[:2] + "." + code[2:]
    return code


In [7]:
def get_ICDs_from_mimic_file(fileName: str, isdiagnosis: bool = True) -> Dict[int, List[str]]:
    """
    Retrieves ICD codes from a MIMIC file.

    Args:
        fileName (str): The path to the MIMIC file.
        isdiagnosis (bool, optional): Specifies whether to retrieve diagnosis codes (True) or procedure codes (False). 
                                      Defaults to True.

    Returns:
        dict: A dictionary mapping hospital admission IDs (hadm_id) to a list of corresponding ICD codes.

    """

    mapping = {}
    mimicFile = gzip.open(fileName, 'r')
        
    codes = []
    
    number_of_null_ICD9_codes = 0
    number_of_null_ICD10_codes = 0
    mimicFile.readline()
    for line in mimicFile:  #   0  ,     1    ,    2   ,   3  ,    4
        tokens = line.decode('utf-8').strip().split(',')
        #print(tokens)
        hadm_id = int(tokens[1])
        # depending on the file, the ICD code is in different columns (3 for diagnosis, 4 for procedures)
        if (isdiagnosis and len(tokens[3]) == 0) or (not isdiagnosis and len(tokens[4]) == 0):
            if isdiagnosis:
                if (tokens[4] =='9'):
                    # ignore diagnoses where ICD9_code is null
                    number_of_null_ICD9_codes += 1
                else:
                    number_of_null_ICD10_codes += 1
                continue
            else:
                if (tokens[5] =='9'):
                    # ignore diagnoses where ICD9_code is null
                    number_of_null_ICD9_codes += 1
                else:
                    number_of_null_ICD10_codes += 1
                continue
                
        if isdiagnosis:
            ICD_code = tokens[3]
        else:
            ICD_code = tokens[4] 

        if ICD_code.find("\"") != -1:
            ICD_code = ICD_code[1:-1].strip()  # toss off quotes and proceed

        # since diagnosis and procedure ICD9 codes have intersections, a prefix is necessary for disambiguation
        if isdiagnosis:
            ICD_code = 'D' + tokens[4]+ '_' +ICD_code
        else:
            ICD_code = 'P' + tokens[5] + '_' + ICD_code

        if hadm_id in mapping:
            mapping[hadm_id].append(ICD_code.strip())
        else:
            mapping[hadm_id]= [ICD_code.strip()]  

    mimicFile.close()
    print ('-Number of null ICD9 codes in file ' + fileName + ': ' + str(number_of_null_ICD9_codes))
    print ('-Number of null ICD10 codes in file ' + fileName + ': ' + str(number_of_null_ICD10_codes))
    return mapping

In [8]:
def get_drugs_from_mimic_file(fileName :str, choice : Optional[str] ='ndc') -> Tuple[Dict[str, str], Dict[int, list]]:
    """
    Extracts drug information from a MIMIC file.

    Args:
        fileName (str): The path to the MIMIC file.
        choice (str, optional): The choice of drug code to extract. Defaults to 'ndc'.

    Returns:
        tuple: A tuple containing two dictionaries:
            - drugDescription: A dictionary mapping drug codes to their descriptions.
            - mapping: A dictionary mapping hospital admission IDs to lists of drug codes.

    Raises:
        Exception: If an error occurs while processing the MIMIC file.
    """
    mapping = {}
    drugDescription = {}
    mimicFile = gzip.open(fileName, 'r')  # subject_id,hadm_id,gsn,ndc,drug
    mimicFile.readline()
    number_of_null_NDC_codes = 0
    try:
        for line in mimicFile:
            tokens = line.decode('utf-8').strip().split(',')
            hadm_id = int(tokens[1])
            if choice =='ndc':
                drug_code = tokens[12]
            else:
                drug_code = tokens[11]
            drug_code = drug_code.strip()
            drug_code = 'DR'+'_'+drug_code
            if hadm_id in mapping:
                mapping[hadm_id].append(drug_code.strip())
            else:
                mapping[hadm_id]=[drug_code.strip()]
            if drug_code not in drugDescription:
                drugDescription[drug_code] = tokens[9]
    except Exception as e:
        print(e)
    mimicFile.close()
    return drugDescription, mapping

In [9]:
def load_mimic_data(choice : Optional[str] = 'ndc'):
    """
    Loads MIMIC data and returns various mappings.

    Args:
    - choice (Optional[str]): The choice of drug mapping. Defaults to 'ndc'.

    Returns:
    - subject_idAdmMap (dict): A dictionary mapping subject_id to a list of admission IDs.
    - admDxMap (dict): A dictionary mapping admission IDs to diagnosis codes.
    - admPxMap (dict): A dictionary mapping admission IDs to procedure codes.
    - admDrugMap (dict): A dictionary mapping admission IDs to drug codes.
    - drugDescription (dict): A dictionary mapping drug codes to drug descriptions.
    """
    print ('Building subject_id-admission mapping, admission-date mapping')
    previous_subject = 0
    previous_admission = 0
    subject_idAdmMap = {}
    admDateMap = {}
    subject_idStatic = {}   # adm type, Insurance , ethnicity , marital status
    infd = gzip.open(admissionFile, 'r')
    infd.readline()
    for line in infd:
        tokens = line.decode('utf-8').strip().split(',')
        subject_id = int(tokens[0])
        hadm_id = int(tokens[1])
        if subject_id in subject_idAdmMap: 
            subject_idAdmMap[subject_id].add(hadm_id)
        else: 
            subject_idAdmMap[subject_id] = set()
            subject_idAdmMap[subject_id].add(hadm_id)
    for subject_id in subject_idAdmMap.keys():
        subject_idAdmMap[subject_id] = list(subject_idAdmMap[subject_id])  
    infd.close()

    print ('Building admission-diagnosis mapping')
    admDxMap = get_ICDs_from_mimic_file(diagnosisFile)

    print ('Building admission-procedure mapping')
    admPxMap = get_ICDs_from_mimic_file(procedureFile, isdiagnosis=False)

    print ('Building admission-drug mapping')
    drugDescription, admDrugMap = get_drugs_from_mimic_file(prescriptionFile, choice)
    return subject_idAdmMap,admDxMap,admPxMap,admDrugMap,drugDescription

In [10]:
def updateAdmCodeList(subject_idAdmMap: Dict[int, List[int]], admDxMap:  Dict[int, List[str]], admPxMap : Dict[int, List[str]], admDrugMap :  Dict[int, List[str]]) -> Tuple[Dict[int, List[str]], Dict[int, List[str]], Dict[int, List[str]]]:
    """
    Update the admission code lists for each admission ID (to take into account deleted elements).

    Args:
        subject_idAdmMap (dict): A dictionary mapping subject IDs to a list of admission IDs.
        admDxMap (dict): A dictionary mapping admission IDs to diagnosis codes.
        admPxMap (dict): A dictionary mapping admission IDs to procedure codes.
        admDrugMap (dict): A dictionary mapping admission IDs to drug codes.

    Returns:
        tuple: A tuple containing three dictionaries:
            - adDx: A dictionary mapping admission IDs to diagnosis codes.
            - adPx: A dictionary mapping admission IDs to procedure codes.
            - adDrug: A dictionary mapping admission IDs to drug codes.
    """
    adDx = {}
    adPx = {}
    adDrug = {}
    for subject_id, admIdList in subject_idAdmMap.items():
        for admId in admIdList:
            adDx[admId] = admDxMap[admId]
            adPx[admId] = admPxMap[admId]
            adDrug[admId] = admDrugMap[admId]
            
    return adDx, adPx, adDrug

In [11]:
def ListAvgVisit(dic: Dict[int, List[int]]) -> float:
    a =[len(intList) for k,intList in dic.items()]
    return sum(a)/len(a)

In [12]:
# New
def countCodes(*dicts: Dict[int, List[str]]) -> int:
    all_values = [value for dic in dicts for value in dic.values()]
    code_counts = Counter(code for sublist in all_values for code in sublist)
    return len(code_counts)

In [13]:
def display(pidAdmMap,admDxMap,admPxMap,admDrugMap):
    print(f" Total Number of patients {len(pidAdmMap)}")
    print(f" Total Number of admissions {len(admDxMap)}")
    print(f" Average number of admissions per patient {ListAvgVisit(pidAdmMap)}")
    print(f" Total Number of diagnosis code {countCodes(admDxMap)}")
    print(f" Total Number of procedure code {countCodes(admPxMap)}")
    print(f" Total Number of drug code {countCodes(admDrugMap)}")
    print(f" Total Number of codes {countCodes(admPxMap) +countCodes(admDxMap)+countCodes(admDrugMap) }")
    print(f" average Number of procedure code per visit {ListAvgVisit(admPxMap)}")
    print(f" average Number of diagnosis code per visit {ListAvgVisit(admDxMap)}")
    print(f" average Number of Drug code per visit {ListAvgVisit(admDrugMap)}")

In [14]:
def clean_data(subject_idAdmMap : Dict[int, List[int]], admDxMap : Dict[int, List[str]], admPxMap : Dict[int, List[str]], admDrugMap : Dict[int, List[str]], min_admissions_threshold : int = 2) -> Tuple[Dict[int, List[int]], Dict[int, List[str]], Dict[int, List[str]], Dict[int, List[str]]]:
    """
    Cleans the data by removing patient records that do not have all three medical codes for an admission
    and removing patients who made less than a specified number of admissions.

    Args:
        subject_idAdmMap (dict): A dictionary mapping subject IDs to a list of admission IDs.
        admDxMap (dict): A dictionary mapping admission IDs to diagnostic codes.
        admPxMap (dict): A dictionary mapping admission IDs to procedure codes.
        admDrugMap (dict): A dictionary mapping admission IDs to drug codes.
        min_admissions_threshold (int, optional): The minimum number of admissions required for a patient to be included. Defaults to 2.

    Returns:
        tuple: A tuple containing the updated subject_idAdmMap, adDx, adPx, and adDrug dictionaries.
    """
    print("Cleaning data...")
    subDelList = []

    print("Removing patient records who do not have all three medical codes for an admission")
    for subject_id, hadm_ids in subject_idAdmMap.items():
        for hadm_id in hadm_ids:
            if hadm_id not in admDxMap.keys():
                subDelList.append(subject_id)
            if hadm_id not in admPxMap.keys():
                subDelList.append(subject_id)
            if hadm_id not in admDrugMap.keys():
                subDelList.append(subject_id)

    subDelList = list(set(subDelList))

    for subject_id_to_rm in subDelList:
        del subject_idAdmMap[subject_id_to_rm]


    adDx, adPx, adDrug = updateAdmCodeList(subject_idAdmMap, admDxMap, admPxMap, admDrugMap)

    print(f"Removing patients who made less than {min_admissions_threshold} admissions")
    pidMap = {}
    adm = []
    subDelList = []
    subject_idAdmMap1 = subject_idAdmMap
    for pid, admIdList in subject_idAdmMap.items():
        if len(admIdList) < min_admissions_threshold:
            subDelList.append(pid)
            continue

    for i in subDelList:
        del subject_idAdmMap[i]

    adDx, adPx, adDrug = updateAdmCodeList(subject_idAdmMap, adDx, adPx, adDrug)
    display(subject_idAdmMap, adDx, adPx, adDrug)
    return subject_idAdmMap, adDx, adPx, adDrug


In [15]:
def create_CCS_CCSR_mapping(CCSRDX_file : str, CCSRPCS_file : str, CCSDX_file : str, CCSPX_file : str, dump : bool = True) -> Dict[str, str]:
    """
    Creates a mapping of ICD-10 diagnosis and procedure codes to CCS tokens.

    Args:
        CCSRDX_file (str): The file path of the CCSRDX file containing ICD-10 diagnosis codes and CCS categories.
        CCSRPCS_file (str): The file path of the CCSRPCS file containing ICD-10 procedure codes and CCS categories.
        CCSDX_file (str): The file path of the CCSDX file containing ICD-9 diagnosis codes and CCS categories.
        CCSPX_file (str): The file path of the CCSPX file containing ICD-9 procedure codes and CCS categories.
        dump (bool, optional): Whether to dump the mapping dictionaries to pickle files. Defaults to True.

    Returns:
        dict: A dictionary mapping CCS codes to their descriptions.
    """
    # This part creates an ICD-10 Diagnosis, Procedures map to CCS token list
    df = pd.read_csv(CCSRDX_file)
    a = df[["'ICD-10-CM CODE'", "'CCSR CATEGORY 1'", "'CCSR CATEGORY 2'", "'CCSR CATEGORY 3'", "'CCSR CATEGORY 4'", "'CCSR CATEGORY 5'", "'CCSR CATEGORY 6'"]]

    a = a.map(lambda x: str(x)[1:-1])

    a = a.set_index("'ICD-10-CM CODE'").T.to_dict('list')
    # remove null values
    for key, value in a.items():
        newValue = []
        value = list(filter(lambda x: x.strip(), value))
        for value in value:
            newValue.append('D10_' + value)
        a[key] = newValue

    b = {}
    for key in a.keys():
        new_key = 'D10_' + key
        b[new_key] = a[key]

    df = pd.read_csv(CCSRPCS_file, on_bad_lines='skip')
    df = df[["'ICD-10-PCS'", "'PRCCSR'"]]
    df = df.map(lambda x: str(x)[1:-1])
    df = df.set_index("'ICD-10-PCS'").T.to_dict('list')

    for key, value in df.items():
        newValue = []
        value = list(filter(lambda x: x.strip(), value))
        for value in value:
            newValue.append('P10_' + value)
        df[key] = newValue

    for key in df.keys():
        new_key = 'P10_' + key
        b[new_key] = df[key]

    # ICD-9 diagnosis code and prescription to CCS
    ccsTOdescription_Map = {}
    dxref_ccs_file = open(CCSDX_file, 'r')
    dxref_ccs_file.readline()  # note
    dxref_ccs_file.readline()  # header
    dxref_ccs_file.readline()  # null
    for line in dxref_ccs_file:
        tokens = line.strip().split(',')
        b['D9_' + str(tokens[0][1:-1]).strip()] = 'D9_' + str(tokens[1][1:-1]).strip()
        ccsTOdescription_Map['D9_' + str(tokens[1][1:-1]).strip()] = str(tokens[2][1:-1]).strip()
    dxref_ccs_file.close()

    dxprref_ccs_file = open(CCSPX_file, 'r')
    dxprref_ccs_file.readline()  # note
    dxprref_ccs_file.readline()  # header
    dxprref_ccs_file.readline()  # null
    for line in dxprref_ccs_file:
        tokens = line.strip().split(',')
        b['P9_' + str(tokens[0][1:-1]).strip()] = 'P9_' + str(tokens[1][1:-1]).strip()
        ccsTOdescription_Map['P9_' + str(tokens[1][1:-1]).strip()] = str(tokens[2][1:-1]).strip()
    dxprref_ccs_file.close()

    if dump:
        pickle.dump(b, open('ICD_9_10_to_CSS', 'wb'), -1)
        pickle.dump(ccsTOdescription_Map, open('ccs_to_description_dictionary', 'wb'), -1)
    print('Total ICD to CCS entries: ' + str(len(b)))
    print('Total CCS codes/descriptions: ' + str(len(ccsTOdescription_Map)))

    v1 = []
    for v in b.values():
        for val in v:
            v1.append(val)
    v1 = list(set(v1))
    print("Total number of unique codes (Diag + Proc):", len(v1))

    return ccsTOdescription_Map

In [16]:
def map_ccsr_description(filename: str, cat: str = 'Diag') -> Dict[str, str]:
    """
    Maps CCSR (Clinical Classifications Software Refined) category codes to their descriptions.

    Args:
        filename (str): The path to the Excel file containing the CCSR categories.
        cat (str, optional): The category type ('Diag' or 'Proc'). Defaults to 'Diag'.

    Returns:
        Dict[str, str]: A dictionary mapping CCSR category codes to their descriptions.
    """
    if cat == 'Diag':
        padStr = 'D10_'
    else:
        padStr = 'P10_'
    df = pd.read_excel(filename, sheet_name="CCSR_Categories", skiprows=1)
    if type != 'Diag':
        df = df[:-1]
    codeDescription = df[["CCSR Category", "CCSR Category Description"]]
    codeDescription = codeDescription.map(lambda x: padStr + str(x))
    codeDescription = codeDescription.set_index("CCSR Category").T.to_dict('list')
    for key, value in codeDescription.items():
        newValue = value[0][4:]
        codeDescription[key] = newValue

    return codeDescription

In [4]:
def convValuestoList(codeDic : Dict[str, str]) -> Dict[str, List[str]]:
    for key, value in codeDic.items():
        codeDic[key] =  [value]
    return codeDic

In [18]:
def map_ICD_to_CCSR(mapping : Dict[int, List[int]]) -> Tuple[Dict[int, List[str]], List[str], Set[str]]:
    """
    Maps ICD codes to CCSR codes based on a given mapping.

    Args:
        mapping (Dict[int, List[int]]): A dictionary containing the mapping of hadm_id to a list of ICD codes.

    Returns:
        Tuple[Dict[int, List[str]], List[str], Set[str]]: A tuple containing the following:
            - CodesToInternalMap: A dictionary mapping hadm_id to a list of CCSR codes.
            - missingCodes: A list of ICD codes that could not be mapped to CCSR codes.
            - set_of_used_codes: A set of ICD codes that were successfully mapped to CCSR codes.
    """
    
    icdTOCCS_Map = pickle.load(open('ICD_9_10_to_CSS','rb'))
    CodesToInternalMap = {}
    missingCodes = []
    set_of_used_codes = set()
    number_of_codes_missing = 0
    countICD9=0
    countICD10 =0
    for (hadm_id, ICDs_List) in mapping.items():
        for ICD in ICDs_List:
            #print(ICD,type(ICD),len(ICD))
            #while (len(ICD9) < 6): ICD9 += ' '  #pad right white spaces because the CCS mapping uses this pattern
            if ICD.startswith('D10_'):
                padStr = 'D10_'
            elif ICD.startswith('D9_'):
                padStr = 'D9_'
            elif ICD.startswith('P10_'):
                padStr = 'P10_'    
            elif ICD.startswith('P9_'):
                padStr = 'P9_'  
            else:
                print("Wrong coding format")

            try:

                CCS_code = icdTOCCS_Map[ICD]

                if hadm_id in CodesToInternalMap:
                    if(isinstance(CCS_code, str)): 
                        CodesToInternalMap[hadm_id].append(CCS_code)
                    else:
                        for code in CCS_code:
                            CodesToInternalMap[hadm_id].append(code)
                        
                else:
                    if(isinstance(CCS_code, str)): 
                        CodesToInternalMap[hadm_id] = [CCS_code]
                    else:
                        for i in range(len(CCS_code)):
                            if i==0:
                                CodesToInternalMap[hadm_id] = [CCS_code[i]]
                            else:
                                CodesToInternalMap[hadm_id].append(CCS_code[i])
                                
                            
                set_of_used_codes.add(ICD)

            except KeyError:
                #print(f"the mapping of {ICD} {hadm_id}")
                missingCodes.append(ICD)
                #print(f"the mapping of  is : {icdTOCCS_Map[ICD]}")
                number_of_codes_missing +=1
                #print (str(sys.exc_info()[0]) + '  ' + str(ICD) + ". ICD9 code not found, please verify your ICD9 to CCS mapping before proceeding.")


            
    print(f"total number of ICD9 codes used {countICD9} and ICD10 codes: {countICD10}")  
    print ('-Total number (complete set) of ICD9+ICD10 codes (diag + proc): ' + str(len(set(icdTOCCS_Map.keys()))))
    #print ('-Total number (complete set) of CCS codes (diag + proc): ' + str(len(set(icd9TOCCS_Map.values()))))
    print ('-Total number of ICD codes actually used: ' + str(len(set_of_used_codes)))
    print ('-Total number of ICD codes missing in the admissions list: ' , number_of_codes_missing)
    #print(icd9TOCCS_Map)
    
    return CodesToInternalMap,missingCodes,set_of_used_codes

In [19]:
def displayCodeStats(adDx : ,adPx,adDrug):
    print(f" Total Number of diagnosis code {countCodes(adDx)}")
    print(f" Total Number of procedure code {countCodes(adPx)}")
    print(f" Total Number of drug code {countCodes(adDrug)}")
    print(f" Total Number of unique  D,P codes {countCodes(adDx,adPx) }")
    print(f" Total Number of all codes {countCodes(adDx,adPx,adDrug) }")


    print(f" average Number of procedure code per visit {ListAvgVisit(adPx)}")
    print(f" average Number of diagnosis code per visit {ListAvgVisit(adDx)}")
    print(f" average Number of drug code per visit {ListAvgVisit(adDrug)}")

    print(f" Min. and max. Number of diagnosis code per admission {minMaxCodes(adDx)}")
    print(f" Min. and max. Number of procedure code  per admission{minMaxCodes(adPx)}")
    print(f" Min. and max. Number of drug code  per admission {minMaxCodes(adDrug)}")

In [20]:
def minMaxCodes(dic):
    countCode = []
    for codes in dic.values():
        countCode.append(len(codes))    
                
    return min(countCode),max(countCode)

In [21]:
def icd_mapping(CCSRDX_file: str, CCSRPCS_file: str, CCSDX_file: str, CCSPX_file: str, D_CCSR_Ref_file: str, P_CCSR_Ref_file: str, adDx: Dict[int, List[int]], adPx: Dict[int, List[int]], adDrug: Dict[int, List[int]], drugDescription: Dict[str, str]) -> Tuple[Dict[int, List[int]], Dict[int, List[int]], Dict[str, str]]:
    """
    Maps ICD codes to CCS and CCSR codes and returns the mapped diagnosis codes, procedure codes, and code descriptions.

    Args:
    - CCSRDX_file (str): Path to the CCSRDX file.
    - CCSRPCS_file (str): Path to the CCSRPCS file.
    - CCSDX_file (str): Path to the CCSDX file.
    - CCSPX_file (str): Path to the CCSPX file.
    - D_CCSR_Ref_file (str): Path to the D_CCSR_Ref file.
    - P_CCSR_Ref_file (str): Path to the P_CCSR_Ref file.
    - adDx (Dict[int, List[int]]): Dictionary containing the diagnosis codes.
    - adPx (Dict[int, List[int]]): Dictionary containing the procedure codes.
    - adDrug (Dict[int, List[int]]): Dictionary containing the drug codes.
    - drugDescription (Dict[str, str]): Dictionary containing the descriptions of drug codes.

    Returns:
    - Tuple[Dict[int, List[int]], Dict[int, List[int]], Dict[str, str]]: A tuple containing the mapped diagnosis codes, procedure codes, and code descriptions.
    """
    # creating mappint between all ICD codes to CCS and CCSR mapping
    ccsTOdescription_Map = create_CCS_CCSR_mapping(CCSRDX_file,CCSRPCS_file,CCSDX_file,CCSPX_file)
    # getting the description of all codes
    DxcodeDescription = map_ccsr_description(D_CCSR_Ref_file)
    PxcodeDescription = map_ccsr_description(P_CCSR_Ref_file, cat = 'Proc')
    codeDescription ={**DxcodeDescription ,**PxcodeDescription }
    codeDescription ={**codeDescription , **convValuestoList(ccsTOdescription_Map), **drugDescription}
    # mapping diagnois codes
    adDx,missingDxCodes,set_of_used_codes1 = map_ICD_to_CCSR(adDx)
    # mapping procedure codes
    print('here it should be working')
    print(adPx[23384508])
    print('---------------')
    adPx,missingPxCodes,set_of_used_codes2 = map_ICD_to_CCSR(adPx)
    print( 'P10_0QS604Z' in missingPxCodes)
    codeDescription['SOH'] = 'Start of history'
    codeDescription['EOH'] = 'End of history'
    codeDescription['BOV'] = 'Beginning of visit'
    codeDescription['EOV'] = 'End of visit'
    codeDescription['BOS'] = 'Beginning of sequence'
    codeDescription['PAD'] = 'Padding'
    displayCodeStats(adDx,adPx,adDrug)
    return adDx,adPx,codeDescription

In [22]:
def trim(adDx  : Dict[int,List[int]], adPx  : Dict[int,List[int]], adDrug  : Dict[int,List[int]], min_dxm : int, min_px : int, min_drg: int) -> Tuple[Dict[int,List[int]], Dict[int,List[int]], Dict[int,List[int]]]:
    """
    Trims the diagnosis, procedure, and medication codes for each visit.

    Args:
    adDx (dict): A dictionary containing admission IDs as keys and diagnosis codes as values.
    adPx (dict): A dictionary containing admission IDs as keys and procedure codes as values.
    adDrug (dict): A dictionary containing admission IDs as keys and medication codes as values.
    min_dxm (int): The minimum number of diagnosis codes to keep for each admission.
    min_px (int): The minimum number of procedure codes to keep for each admission.
    min_drg (int): The minimum number of medication codes to keep for each admission.

    Returns:
    tuple: A tuple containing the trimmed dictionaries for diagnosis codes, procedure codes, and medication codes.
    """

    print("Trimming the diagnosis, procedure, and medication codes for each visit")
    
    for admission, DiagCodes in adDx.items():
        adDx[admission] = DiagCodes[:min_dx]
        
    for admission, ProcCodes in adPx.items():
        adPx[admission] = ProcCodes[:min_px]
        
    for admission, DrugCodes in adDrug.items():
        adDrug[admission] = DrugCodes[:min_drg]
        
    displayCodeStats(adDx, adPx, adDrug)
    return adDx, adPx, adDrug

In [23]:
def buildData(subject_idAdmMap : Dict[int, List[int]], adDx: Dict[int, List[int]], adPx: Dict[int, List[int]], adDrug: Dict[int, List[int]], minVisits: int = 2) -> Tuple[List[List[List[int]]], Dict[str, int]]:
    """
    Builds the data for patient trajectory forecasting.

    Args:
        subject_idAdmMap (dict): A dictionary mapping subject IDs to admission IDs.
        adDx (dict): A dictionary mapping admission IDs to diagnosis codes.
        adPx (dict): A dictionary mapping admission IDs to procedure codes.
        adDrug (dict): A dictionary mapping admission IDs to drug codes.
        minVisits (int, optional): The minimum number of visits required for a patient. Defaults to 2.

    Returns:
        Tuple[List[List[List[int]]], Dict[str, int]]: A tuple containing the processed patient sequences and the code types dictionary.
    """
    
    adPx, adDx, adDrug = map(lambda d: defaultdict(list, d), (adPx, adDx, adDrug)) # add default [] for missing values

    print(f'Building admission-Visits mapping & filtering patients with less than {minVisits} ')
    pidSeqMap = {}
    
    skipped = 0 
    for subject_id, admIdList in subject_idAdmMap.items():
        if len(admIdList) < minVisits: 
            skipped += 1
            continue # skip patients with less than minVisits ( default 1 )
        sortedList = [(adDx[admId], adPx[admId], adDrug[admId]) for admId in admIdList]
        
        pidSeqMap[subject_id] = sortedList
        
    adPx, adDx, adDrug = map(dict, (adPx, adDx, adDrug))  # remove default [] behavior to not break something

    print(f'{skipped} subjects were removed')
    print('Building subject-id, diagnosis, procedure, drugs mapping')
    subject_ids = []
    dates = []
    seqs = []
    ages = []
    for subject_id, visits in pidSeqMap.items():
        subject_ids.append(subject_id)
        diagnose = []
        procedure = []
        drugs = []
        date = []
        seq = []
        for visit in visits:
            joined = list(dict.fromkeys(chain.from_iterable(visit))) # dict.fromkeys used as an ordered set func
            seq.append(joined)
        seqs.append(seq)

    print('Converting Strings Codes into unique integer, and making types')
    types = {}
    newSeqs = []
    for patient in seqs:
        newPatient = []
        #print("patient",patient)
        for visit in patient:
            #print("visit",visit)
            newVisit = []
            for code in visit:
                #print("code",code)
                if code in types:
                    newVisit.append(types[code])
                else:
                    types[code] = len(types)
                    newVisit.append(types[code])
                    #print("newVisit",newVisit)
            newPatient.append(newVisit)
        newSeqs.append(newPatient)
    return newSeqs, types


In [24]:
def ListAvgVisitForRemoveCode(dic):
    a =[len(intList) for intList in dic]
    return sum(a)/len(a)

In [25]:
def removeCode(currentSeqs : List[List[List[int]]], types, threshold :int = 5) -> Tuple[List[List[List[int]]], Dict[str, int], Dict[int, str]]:
    """
    Removes infrequent codes from the given sequences.

    Args:
        currentSeqs (List[List[List[int]]]): The input sequences containing codes.
        types (Dict[str, int]): A dictionary mapping code types to their corresponding integer values.
        threshold (int, optional): The threshold value for removing infrequent codes. Codes with a count less than or equal to this threshold will be removed. Defaults to 5.

    Returns:
        Tuple[List[List[List[int]]], Dict[str, int], Dict[int, str]]: A tuple containing the updated sequences, the updated types dictionary, and the reverse types dictionary.
    """
    countCode = Counter()
    
    for visits in currentSeqs:
        for visit in visits:
            countCode.update(visit)
            
    codes = [key for key, value in countCode.items() if value <= threshold]
    
    print(f" Total number of codes removed: {len(codes)}  ")
    print(f" Total number of  unique codes : {len(countCode)}  ")

    reverseTypes = {v:k for k,v in types.items()}

    # List of codes like : D9_660...
    types = defaultdict(lambda: len(types), {"PAD": 0,"BOH":1 ,"BOS": 2, "BOV": 3, "EOV": 4, "EOH": 5})

    # Recreates a new mapping while taking into consideration the removed tokens
    updatedSeqs = [[[types[reverseTypes[code]] for code in visit if code not in codes] for visit in patient] for patient in currentSeqs]
    
    reverseTypes = {v:k for k,v in types.items()}

    return updatedSeqs, dict(types), reverseTypes

In [26]:
def saveFiles(updatedSeqs : List[List[List[int]]], types, codeDescription : str, outpath : str = 'outputData/originalData'):
    """
    Save the updated sequences, types, and code description to files.

    Args:
    updatedSeqs (List[List[List[int]]]): The updated sequences to be saved.
    types: The types to be saved.
    codeDescription (str): The code description to be saved.
    outpath (str, optional): The output path where the files will be saved. Defaults to 'outputData/originalData'.
    """

    if not os.path.exists(outpath):
        os.makedirs(outpath)
    
    pickle.dump(updatedSeqs, open(outFile+'.seqs', 'wb'), -1)
    pickle.dump(types, open(outFile+'.types', 'wb'), -1)
    pickle.dump(codeDescription, open(outFile+'.description', 'wb'), -1)

In [27]:
def generateCodeTypes(outFile: str, reverseTypes: Dict[int, str]) -> Dict[str, int]:
    """
    Generate code types based on reverse types dictionary.

    Args:
    - outFile (str): The name of the output file.
    - reverseTypes (Dict[int, str]): A dictionary containing reverse types.

    Returns:
    - codeType (Dict[str, int]): A dictionary containing the generated code types.
    """
    ICD_9_10_to_CSS = pickle.load(open('ICD_9_10_to_CSS', 'rb'))
    codeType = {}
    countD = 0
    countP = 0
    countDr = 0
    countT = 0

    for keys, values in reverseTypes.items():
        found = 0
        if keys not in codeType:
            if values.startswith('DR_'):
                found = 1
                codeType[keys] = 'DR'
                countDr = countDr + 1
            elif values == 'PAD' or values == 'BOH' or values == "BOS" or values == 'BOV' or values == 'EOV' or values == 'EOH':
                found = 1
                codeType[keys] = 'T'
                countT = countT + 1
            else:
                for k, v in ICD_9_10_to_CSS.items():
                    if values in v:
                        found = 1
                        if keys not in codeType:
                            if k.startswith('D'):
                                codeType[keys] = 'D'
                                countD = countD + 1
                            elif k.startswith('P'):
                                codeType[keys] = 'P'
                                countP = countP + 1
            if found == 0:
                print(keys, values)

    print(countD, countP, countDr, countT)
    pickle.dump(codeType, open(outFile + '.codeType', 'wb'), -1)

    return codeType

In [28]:
def load_data(outFile: str) -> Tuple[List[List[List[int]]], Dict[str, ], Dict[str, int], Dict[int, str]]:
        """
        Load data from the specified file.

        Args:
        - outFile (str): The path to the file containing the data.

        Returns:
        - Tuple[List[List[List[int]]], Dict[str, ], Dict[str, int], Dict[int, str]]: A tuple containing the loaded data.
            - The first element is a list of sequences, where each sequence is a list of events, and each event is a list of integers.
            - The second element is a dictionary mapping event types to their corresponding codes.
            - The third element is a dictionary mapping event codes to their corresponding types.
            - The fourth element is a dictionary mapping event codes to their corresponding types (reversed mapping).
        """
        # load the data again
        seqs = pickle.load(open(outFile + '.seqs', 'rb'))
        types = pickle.load(open(outFile + '.types', 'rb'))
        codeType = pickle.load(open(outFile + '.codeType', 'rb'))
        reverseTypes = {v: k for k, v in types.items()}
        return seqs, types, codeType, reverseTypes

In [29]:
def PrepareForTF(sequence : List[List[int]]) -> Tuple[List[List[int]]]:
    """
    Prepares the input sequence for trajectory forecasting training by creating pairs of input and output sequences.

    Args:
        sequence (List[List[int]]): The input sequence of integers.

    Returns:
        List[Tuple[List[List[int]], List[List[int]]]]: A list of pairs, where each pair consists of an input sequence and its corresponding output sequence.
    """
    X, y, pairs = list(), list(), list()
    for i in range(len(sequence)):
        # find the end of this pattern
        if i+1 >= len(sequence):
            break
        X.append(sequence[:i+1])
        y.append(sequence[i+1:])
    pairs = pairing1(X, y)
    return pairs

In [30]:
def PrepareForSDP(sequence: List[List[int]]):
    """
    Prepare the sequence for Sequential disease prediction modeling.

    Args:
        sequence (list): The input sequence.

    Returns:
        list: A list of pairs containing the input sequence and the corresponding target sequence.

    """
    X, y, pairs = list(), list(), list()
    for i in range(len(sequence)):
        if i + 1 >= len(sequence):
            break
        X.append(sequence[:i+1])
        y.append([sequence[i+1]])
    pairs = pairing1(X, y)
    return pairs

In [34]:
def removePairs(newPairs, mn = 600):
    print(f"\n  Total no of pairs before removing :{len(newPairs)}")
    b = len(newPairs)
    x,y,curPair = [],[],[]
    count,county,counts =0,0,0
    for pair in newPairs:
        if len(pair[0]) > mn and len(pair[1]) > mn:
            counts =counts +1
            #newPairs.remove(pair)   
        elif len(pair[0]) > mn or len(pair[1]) > mn:
            count =count +1
            #newPairs.remove(pair)
        else:
            curPair.append(pair)
            
    print(f"\n  Total no of pairs after removing :{len(curPair)}")
    print(f"\n  Total no of pairs removed :{b-len(curPair)}")
    return curPair

In [None]:
def AddSpecialTokens(pairs : List[Tuple[list[List[int]]]]) -> List[Tuple[list[List[int]]]]:
    """
    Adds special tokens to the input and output sequences in the given list of pairs.

    Args:
        pairs (List[Tuple[list[List[int]]]]): A list of pairs, where each pair contains an input source and target sequence.

    Returns:
        List[Tuple[list[List[int]]]]: A new list of pairs with special tokens added to the input source and target sequence.
    """
    newPairs = []
        
    # this part must be updated
    for pair in pairs:
        inputs, output, p =[], [], []
        for i in pair[0]:
            i = i + [2]
            inputs.extend(i)
        p.append([1] + inputs + [3])
        for o in pair[1]:
            o = o + [2]
            output.extend(o)
        p.append([1]+ output+ [3])

        newPairs.append(tuple(p))
    return newPairs

In [None]:
def formatData(originalSeqs : List[List[List[int]]], dataFormat : Optional[str] = 'TF', mn : Optional[int] = 400) -> List[Tuple[List[List[int]]]]:
    """
    Formats the original sequences based on the specified data format.

    Args:
        originalSeqs (List[List[List[int]]]): The original sequences to be formatted.
        dataFormat (str, optional): The data format to use. Can be either 'TF' for trajectory forecasting or 'SDP' for sequential disease prediction. Defaults to 'TF'.
        mn (int, optional): The minimum sequence length. Pairs with a length greater than this value will be removed. Defaults to 400.

    Returns:
        List[Tuple[List[List[int]]]]: The formatted pairs of input and output sequences.
    """

    pairs = []
    
    for i in range(len(originalSeqs)):
        # Trajectory forecasting (TF): predict until the end of EOH
        if dataFormat == 'TF':
            pairs.extend(PrepareForTF(originalSeqs[i]))
        # Sequential disease prediction (SDP): predict until the next visit
        elif dataFormat == 'SDP':
            pairs.extend(PrepareForSDP(originalSeqs[i]))
        else:
            raise Exception('Wrong strategy, must choose either TF, SDP')
            
    newPairs = AddSpecialTokens(pairs)
        
    if(stats(newPairs, mn = mn)):
        print(f"\n\n\nRemoving pairs greater than  {mn} seq length")
        newPairs = removePairs(newPairs,mn=mn)
        stats(newPairs)
    return newPairs

In [36]:
def pairing1(x : List[List[int]], y : List[List[int]]) -> List[Tuple[List[int],List[int]]]:
    pairs = []
    for i, a in enumerate(zip(x,y)):
        pairs.append(a)
    return pairs

In [39]:
def stats(newPairs, mn: int = 600) -> bool:
    """
    Calculates statistics based on the length of pairs in the given list.

    Args:
        newPairs (list): A list of pairs.
        mn (int, optional): The minimum length threshold. Defaults to 600.

    Returns:
        bool: True if any pair in the list has a length greater than mn, False otherwise.
    """
    x, y = [], []
    count, county, counts = 0, 0, 0
    for pair in newPairs:
        if len(pair[0]) > mn:
            count = count + 1

        if len(pair[1]) > mn:
            county = county + 1

        if len(pair[0]) > mn and len(pair[1]) > mn:
            counts = counts + 1
        x.append(len(pair[0]))
        y.append(len(pair[1]))

    if count > 0 or county > 0 or counts > 0:
        run = True
    else:
        run = False
    return run

In [47]:
def resetIntegerOutput(updSeqs: List[List[int]]) -> Tuple[List[List[int]], Dict[int, int]]:
    """
    Resets the integer output codes in the given sequence of pairs.

    Args:
        updSeqs (List[List[int]]): A list of pairs where each pair contains an integer and a list of codes.

    Returns:
        Tuple[List[List[int]], Dict[int, int]]: A tuple containing the updated pairs and a dictionary mapping the old codes to new codes.
    """
    updPair = []
    outTypes = {}
    outTypes.update({0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5})
    for i, pair in enumerate(updSeqs):
        newVisit = []
        for code in pair[1]:
            if code in outTypes:
                newVisit.append(outTypes[code])
            else:
                outTypes[code] = len(outTypes)
                newVisit.append(outTypes[code])
        updPair.append((pair[0], newVisit))
    return updPair, outTypes

In [30]:
def updateOutput(newPairs : List[Tuple[List[List[int]], List[List[int]]]], codeType: Dict[int, str], diagnosis: bool = False, procedure : bool = False , drugs : bool = False, all_ : bool = False):
    """
    Update the output sequences based on the specified criteria.

    Args:
    - newPairs (list): List of pairs containing the input and output sequences.
    - codeType (dict): Dictionary mapping codes to their corresponding types.
    - diagnosis (int): Flag indicating whether to include diagnosis codes in the output. Default is 0.
    - procedure (int): Flag indicating whether to include procedure codes in the output. Default is 0.
    - drugs (int): Flag indicating whether to include drug codes in the output. Default is 0.
    - all (int): Flag indicating whether to keep all codes in the output. Default is 0.

    Returns:
    - updSeqs (list): List of updated pairs containing the input and updated output sequences.
    """

    updSeqs = []

    if procedure and drugs:
        print("\n Removing drug and procedure codes from output for forecasting diagnosis code only")
        for i, pair in enumerate(newPairs):
            newOutput = []
            for code in pair[1]:
                if codeType[code] == 'D' or codeType[code] == 'T':
                    newOutput.append(code)

            if len(newOutput) >= 4:
                updSeqs.append((pair[0], newOutput))

    if drugs and not(procedure):
        print("\n Removing only drug codes from output for forecasting diagnosis and procedure code only")
        for i, pair in enumerate(newPairs):
            newOutput = []
            for code in pair[1]:
                if not (codeType[code] == 'DR'):
                    newOutput.append(code)
            if len(newOutput) >= 4:
                updSeqs.append((pair[0], newOutput))

    if all_:
        print("\n keeping all codes")
        updSeqs = newPairs.copy()

    return updSeqs

In [52]:
def storeFiles(pair,outTypes,codeType,types,reverseTypes,outFile):
    if not os.path.exists(outFile):
        os.makedirs(outFile)
    pickle.dump(pair, open(outFile+'.seqs', 'wb'), -1)
    pickle.dump(outTypes, open(outFile+'.outTypes', 'wb'), -1)
    pickle.dump(codeType, open(outFile+'.codeType', 'wb'), -1)
    pickle.dump(types, open(outFile+'.types', 'wb'), -1)
    pickle.dump(reverseTypes, open(outFile+'.reverseTypes', 'wb'), -1)
    reverseOutTypes = {v:k for k,v in outTypes.items()}
    pickle.dump(reverseOutTypes, open(outFile+'.reverseTypes', 'wb'), -1)

In [40]:
mimic_iv_path = 'mimic-iv-2.2/hosp'
CCS_DIR = './CSS/'

admissionFile = os.path.join(mimic_iv_path, 'admissions.csv.gz')
diagnosisFile = os.path.join(mimic_iv_path, 'diagnoses_icd.csv.gz')
procedureFile = os.path.join(mimic_iv_path, 'procedures_icd.csv.gz')
prescriptionFile = os.path.join(mimic_iv_path, 'prescriptions.csv.gz')

CCSRDX_file = os.path.join(CCS_DIR, 'DXCCSR_v2021-2.csv')
CCSRPCS_file = os.path.join(CCS_DIR, 'PRCCSR_v2021-1.CSV')

CCSDX_file = os.path.join(CCS_DIR, '$dxref 2015.csv')
CCSPX_file = os.path.join(CCS_DIR, '$prref 2015.csv')

D_CCSR_Ref_file = os.path.join(CCS_DIR, 'DXCCSR-Reference-File-v2021-2.xlsx')
P_CCSR_Ref_file = os.path.join(CCS_DIR, 'PRCCSR-Reference-File-v2021-1.xlsx')

In [54]:
print("Loading the data...")
subject_idAdmMap,admDxMap,admPxMap,admDrugMap,drugDescription = load_mimic_data()
print("\n Completed...")
#stage 2 and 3
print("\n Cleaning data...")
subject_idAdmMap,adDx,adPx,adDrug = clean_data(subject_idAdmMap,admDxMap,admPxMap,admDrugMap)
print("\n Completed...")
#stage 4
print("\nMapping ICD data to CCS and CCSR...")
adDx,adPx,codeDescription = icd_mapping(CCSRDX_file,CCSRPCS_file,CCSDX_file,CCSPX_file,D_CCSR_Ref_file,P_CCSR_Ref_file,adDx,adPx,adDrug,drugDescription)
print("\n Completed...")
#stage 5
print("\n Trimming the codes assigned per visit based on a threshold...")
min_dx, min_px, min_drg = 80, 80, 80 
adDx, adPx, adDrug= trim(adDx, adPx, adDrug, min_dx, min_px, min_drg)
print("\n Completed...")
print("\n Building the data..")
newSeqs,types=buildData(subject_idAdmMap,adDx,adPx,adDrug)
#stage 6
threshold = 5
print(f"\n removing the code whose occurence is less than a certain threshold: {threshold}")
updatedSeqs ,types ,reverseTypes  = removeCode(newSeqs,types,threshold=threshold)
# outFile - is a folder path in the working directory where the data is going to get stored
outFile = os.path.join('outputData','originalData')
print("\n Save the data before formmating based on the task")
saveFiles(updatedSeqs,dict(types),codeDescription)
codeType = generateCodeTypes(outFile,reverseTypes)
seqs,types,codeType,reverseTypes = load_data(outFile)
print("\n Completed...")
print("\n Preparing data for Trajectory Forecasting....")
# sequence length threshold  -mn
seqLength = 500
newPairs = formatData(seqs,dataFormat = 'TF', mn = seqLength)
diagnosisOutputFile = os.path.join('outputData','TF','Inp_d_p_dr_out_d')
diagnosisProcedureOutputFile = os.path.join('outputData','TF','Inp_d_p_dr_out_d_p')
AllOutputFile = os.path.join('outputData','TF','Inp_d_p_dr_out_d_p_dr')
# sequence length threshold  -mn
seqLength = 500
newPairs = formatData(seqs,dataFormat = 'TF', mn = seqLength)
diagnosisOutputFile = os.path.join('outputData','TF','Inp_d_p_dr_out_d')
diagnosisProcedureOutputFile = os.path.join('outputData','TF','Inp_d_p_dr_out_d_p')
AllOutputFile = os.path.join('outputData','TF','Inp_d_p_dr_out_d_p_dr')
AllUpdPair,AllOutTypes= resetIntegerOutput(updateOutput(newPairs.copy(),codeType, _all = True))
print(f"\n Remove certain codes from output for different data formats")
AllUpdPair,AllOutTypes= resetIntegerOutput(updateOutput(newPairs.copy(),codeType, _all = True))
diagnosisUpdPair,diagnosisOutTypes= resetIntegerOutput(updateOutput(newPairs.copy(),codeType, procedure=True, drugs = True))
diagnosisProcedureUpdPair,diagnosisProcedureOutTypes= resetIntegerOutput(updateOutput(newPairs.copy(),codeType, drugs = True))
AllUpdPair,AllOutTypes= resetIntegerOutput(updateOutput(newPairs.copy(),codeType, _all = True))
print(f"\n Remove certain codes from output for different data formats")
AllUpdPair,AllOutTypes= resetIntegerOutput(updateOutput(newPairs.copy(),codeType, _all = True))
diagnosisUpdPair,diagnosisOutTypes= resetIntegerOutput(updateOutput(newPairs.copy(),codeType, procedure=  True, drugs = True))
diagnosisProcedureUpdPair,diagnosisProcedureOutTypes= resetIntegerOutput(updateOutput(newPairs.copy(),codeType, drugs = True))

print(f"\n total # S1 records : {len(diagnosisUpdPair)}\n total # S2 records :{len(diagnosisProcedureUpdPair)}\n total # S3 records :{len(AllUpdPair)}")
print(f"\n total Dx codes:{len(diagnosisOutTypes)} \n  total Dx,Px codes:{len(diagnosisProcedureOutTypes)} \n total Dx,Px,Rx codes:{len(AllOutTypes)}")
print("\n Storing all the information related to Trajectory Forecasting...")


storeFiles(diagnosisUpdPair,diagnosisOutTypes,codeType,types,reverseTypes,diagnosisOutputFile)
storeFiles(diagnosisProcedureUpdPair,diagnosisProcedureOutTypes,codeType,types,reverseTypes,diagnosisProcedureOutputFile)
storeFiles(AllUpdPair,AllOutTypes,codeType,types,reverseTypes,AllOutputFile)
print("\n Completed...")

print("\nPreparing data for Sequential disease prediction....")
newPairs = formatData(seqs,dataFormat = 'SDP',mn =500)
diagnosisOutputFile = os.path.join('outputData','SDP','Inp_d_p_dr_out_d')

print(f"\n\n Remove certain codes from output for different data formats")
diagnosisUpdPair,diagnosisOutTypes= resetIntegerOutput(updateOutput(newPairs.copy(), codeType, procedure = True,drugs = True))

print(f"\n total # records: {len(diagnosisUpdPair)} \n total # of codes: {len(diagnosisOutTypes)}")

print("\n Storing all the information related to TSequential disease prediction...")
storeFiles(diagnosisUpdPair,diagnosisOutTypes,codeType,types,reverseTypes,diagnosisOutputFile)
print("\n Completed...")
print("\n All the preprocessing step has been completed, Now use the data in the outputData folder to build the model...")

Loading the data...
Building subject_id-admission mapping, admission-date mapping
Building admission-diagnosis mapping
-Number of null ICD9 codes in file mimic-iv-2.2/hosp/diagnoses_icd.csv.gz: 0
-Number of null ICD10 codes in file mimic-iv-2.2/hosp/diagnoses_icd.csv.gz: 0
Building admission-procedure mapping
-Number of null ICD9 codes in file mimic-iv-2.2/hosp/procedures_icd.csv.gz: 0
-Number of null ICD10 codes in file mimic-iv-2.2/hosp/procedures_icd.csv.gz: 0
Building admission-drug mapping

 Completed...

 Cleaning data...
Cleaning data...
Removing patient records who does not have all three medical codes for an admission
Removing patient who made less than 2 admissions
 Total Number of patients 19834
 Total Number of admissions 52957
 Average number of admissions per patient 2.6700110920641325
 Total Number of diagnosis code 14119
 Total Number of procedure code 7511
 Total Number of drug code 4891
 Total Number of codes 26521
 average Number of procedure code per visit 3.0819910