In [1]:
import pandas as pd
import os
import datetime
from dotenv import dotenv_values
import snowflake.connector as sc
import snowflake.connector.pandas_tools as snp
import re

In [2]:
# Reference files live in ./resources relative to the kernel's working directory.
working_dir = os.getcwd()
csv_path = os.path.join(working_dir, 'resources')

# One timestamp captured up front; the fixed-column cell reuses it for all date columns.
time_st = datetime.datetime.now()

In [3]:
# Load the data-item catalogue and keep only the columns used to build the ontology.
naaccr_hierarchy_path = os.path.join(csv_path, 'DataItems.csv')
hierarchy_columns = ["Data Item Number", "Data Item Name", "Section Name", "XML NAACCR ID"]
df_hierarchy = pd.read_csv(naaccr_hierarchy_path).loc[:, hierarchy_columns]

# Drop the record-id and confidential sections entirely.
excluded_sections = ['Record ID', 'Patient-Confidential', 'Hospital-Confidential', 'Other-Confidential']
df_hierarchy = df_hierarchy.loc[~df_hierarchy['Section Name'].isin(excluded_sections)]
print(df_hierarchy.shape)

# Drop four individual data items by number.
# NOTE(review): presumably the ICD-O-2 / ICD-O-3 histology and behavior items that the
# morphology tree built later replaces - confirm against DataItems.csv.
excluded_item_numbers = [420, 430, 522, 523]
df_hierarchy = df_hierarchy.loc[~df_hierarchy['Data Item Number'].isin(excluded_item_numbers)]
print(df_hierarchy.shape)


(733, 4)
(729, 4)


In [4]:
# Load the permitted codes per data item, de-duplicated on the four columns kept.
naaccr_code_list_path = os.path.join(csv_path, 'CodeList.csv')
code_list_columns = ["Data Item Number", "Data Item Name", "Code", "Description"]
df_code_list = (
    pd.read_csv(naaccr_code_list_path)
    .loc[:, code_list_columns]
    .drop_duplicates()
)
df_code_list.shape

(3877, 4)

In [5]:
# Load the alternate display names for data items.
naaccr_alternate_names_path = os.path.join(csv_path, 'AlternateNames.csv')
alternate_name_columns = ["Data Item Number", "Data Item Name", "Alternate Name"]
df_alternate_names = pd.read_csv(naaccr_alternate_names_path).loc[:, alternate_name_columns]
df_alternate_names.shape

(423, 3)

In [6]:
# ontology schema
# Maps each ontology column to the pandas dtype used for the empty template frame.
# Insertion order matters: it becomes the column order of that template.
ddl_def = dict(
    C_HLEVEL="int64",
    C_FULLNAME="str",
    C_NAME="str",
    C_SYNONYM_CD="str",
    C_VISUALATTRIBUTES="str",
    C_TOTALNUM="int64",
    C_BASECODE="str",
    C_METADATAXML="str",
    C_FACTTABLECOLUMN="str",
    C_TABLENAME="str",
    C_COLUMNNAME="str",
    C_COLUMNDATATYPE="str",
    C_OPERATOR="str",
    C_DIMCODE="str",
    C_COMMENT="str",
    C_TOOLTIP="str",
    M_APPLIED_PATH="str",
    UPDATE_DATE="datetime64[ns]",
    DOWNLOAD_DATE="datetime64[ns]",
    IMPORT_DATE="datetime64[ns]",
    SOURCESYSTEM_CD="str",
    VALUETYPE_CD="str",
    M_EXCLUSION_CD="str",
    C_PATH="str",
    C_SYMBOL="str",
)

In [7]:
# Fixed columns
# Values that are identical on every ontology row. Each row dict is merged with this
# mapping (`entry | fixed`), so a key defined here overrides the same key in the entry.

# All three date columns carry the single timestamp captured at the top of the notebook.
load_dates = dict.fromkeys(("UPDATE_DATE", "DOWNLOAD_DATE", "IMPORT_DATE"), time_st)

fixed = {
    "C_SYNONYM_CD": 'N',
    "C_TOTALNUM": None,
    "C_METADATAXML": None,
    "C_FACTTABLECOLUMN": 'TUMOR_FACT.CONCEPT_CD',
    "C_TABLENAME": 'CONCEPT_DIMENSION',
    "C_COLUMNNAME": 'CONCEPT_PATH',
    "C_COLUMNDATATYPE": 'T',
    "C_OPERATOR": 'LIKE',
    "C_COMMENT": None,
    "M_APPLIED_PATH": '@',
    **load_dates,
    "SOURCESYSTEM_CD": 'MU',
    "VALUETYPE_CD": None,
    "M_EXCLUSION_CD": '@',
    "C_PATH": None,
    "C_SYMBOL": None,
}

In [8]:
# create template table
# Zero-row frame that carries the ontology column order and dtypes from ddl_def.
template_columns = ddl_def.keys()
final_df = pd.DataFrame(columns=template_columns).astype(ddl_def)

In [9]:
# create root
# insert top order
# Root path of the ontology; the section, item and code cells prefix their paths with it.
c_fullname = '\\i2b2\\naaccr\\'

root_entry = dict(
    C_HLEVEL=1,
    C_FULLNAME=c_fullname,
    C_NAME='Cancer Cases (NAACCR Hierarchy)',
    C_BASECODE=None,
    C_VISUALATTRIBUTES='FA',
    C_DIMCODE=c_fullname,
    # Tooltip is the path with spaces around every backslash.
    C_TOOLTIP=c_fullname.replace('\\', ' \\ '),
)
root_df = pd.DataFrame([{**root_entry, **fixed}])

In [10]:
## create sections
# One level-2 folder per distinct section; the display name is prefixed with the
# two-digit ordering number looked up in SectionHierarchy.csv.
hierarchy_order_path = os.path.join(csv_path, 'SectionHierarchy.csv')
hierarchy_order = pd.read_csv(hierarchy_order_path)

elements = []
for section_name in df_hierarchy["Section Name"].unique():
    section_fullname = c_fullname + f"{section_name}\\"
    # First matching row wins; a section missing from the CSV raises IndexError here.
    order_matches = hierarchy_order.loc[hierarchy_order['Section Name'] == section_name, 'Section Hierarchy']
    section_order = int(order_matches.iloc[0])
    section_entry = {
        "C_HLEVEL": 2,
        "C_FULLNAME": section_fullname,
        "C_NAME": f"{section_order:02} " + section_name,
        "C_BASECODE": None,
        "C_VISUALATTRIBUTES": 'FA',
        "C_DIMCODE": section_fullname,
        "C_TOOLTIP": section_fullname.replace('\\', ' \\ '),
    }
    elements.append({**section_entry, **fixed})

section_df = pd.DataFrame(elements)


In [11]:
## create items
# One level-3 folder per data item, placed under its section and keyed by XML NAACCR ID.
item_source = df_hierarchy[["Section Name", "Data Item Name", "Data Item Number", "XML NAACCR ID"]]

elements = []
for section_name, item_name, item_num, naaccr_id in item_source.itertuples(index=False, name=None):
    item_fullname = c_fullname + f"{section_name}\\" + f"{naaccr_id}\\"
    item_entry = {
        "C_HLEVEL": 3,
        "C_FULLNAME": item_fullname,
        # Zero-padded item number followed by the item name.
        "C_NAME": f"{int(item_num):04} " + item_name,
        "C_BASECODE": f"NAACCR|{item_num}",
        "C_VISUALATTRIBUTES": 'FA',
        "C_DIMCODE": item_fullname,
        "C_TOOLTIP": item_fullname.replace('\\', ' \\ '),
    }
    elements.append({**item_entry, **fixed})

item_df = pd.DataFrame(elements)

In [12]:
# create code items
# One level-4 leaf per (data item, code) pair, placed under the item's folder.
elements = []

# Merge into a NEW frame instead of reassigning df_code_list. Overwriting the input
# made this cell fail when re-run: the second merge suffixes the overlapping columns
# ("Section Name_x"/"_y"), so row["Section Name"] raises KeyError.
# The inner join also drops codes whose data item was filtered out of df_hierarchy.
code_items_df = df_code_list.merge(
    df_hierarchy[["Data Item Number", "Section Name", "XML NAACCR ID"]],
    on="Data Item Number",
    how='inner'
)

for _, row in code_items_df.iterrows():
    item_num = row["Data Item Number"]
    item_desc = row["Description"]
    unique_id = row["XML NAACCR ID"]
    # Codes are truncated to 30 characters before being used in the path and base code.
    code_value = row["Code"][:30]
    section = row["Section Name"]
    base_code = f"NAACCR|{item_num}:{code_value}"
    item_c_fullname = c_fullname + f"{section}\\" + f"{unique_id}\\" + f"{code_value}\\"
    entry = {
        "C_HLEVEL" : 4,
        "C_FULLNAME" :  item_c_fullname,
        "C_NAME" : item_desc,
        "C_BASECODE" : base_code,
        "C_VISUALATTRIBUTES" : 'LA',
        "C_DIMCODE" : item_c_fullname,
        "C_TOOLTIP" : item_c_fullname.replace('\\',' \\ ')
    }
    elements.append(entry | fixed)
code_df = pd.DataFrame(elements)

In [13]:
 ## primary sites topography

def is_code_in_range(code, range_str):
    """Tell whether a site code such as 'C05' lies inside a range such as 'C00-C14'.

    Args:
        code: A one-letter prefix followed by digits; everything after the first
            character is parsed as an integer.
        range_str: A range written exactly as 'CXX-CXX' (two digits on each side).

    Returns:
        True when the numeric part of ``code`` is between both bounds, inclusive.

    Raises:
        ValueError: If ``range_str`` is not of the form 'CXX-CXX', or if the part of
            ``code`` after its first character is not an integer.
    """
    bounds = re.match(r'^C(\d{2})-C(\d{2})$', range_str)
    if bounds is None:
        raise ValueError("Range string does not match the pattern CXX-CXX")

    lower, upper = (int(digits) for digits in bounds.groups())
    # Drop the leading letter and compare numerically.
    return lower <= int(code[1:]) <= upper

def extract_topoloy(path):
    """Split a topography listing into its three kinds of line.

    Each line is stripped and classified by its leading code:

    * ``CXX-CXX <text>`` goes to the main categories,
    * ``CXX <text>`` goes to the sub categories,
    * ``CXX.X <text>`` goes to the items.

    Blank lines are skipped. Any other line is reported on stdout together with
    its content and is otherwise ignored.

    Args:
        path: Path of the text file to read.

    Returns:
        A tuple ``(main_categories, sub_categories, items)`` of lists holding the
        stripped lines in file order.
    """
    # Compile once instead of once per line.
    range_pattern = re.compile(r'^C\d{2}-C\d{2} .*$')     # Matches CXX-CXX
    site_pattern = re.compile(r'^C\d{2} .*$')             # Matches CXX
    subsite_pattern = re.compile(r'^C\d{2}\.\d .*$')      # Matches CXX.X

    main_categories = []
    sub_categories = []
    items = []
    with open(path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Empty separator lines are not a format problem; stay quiet.
                continue
            if range_pattern.match(line):
                main_categories.append(line)
            elif site_pattern.match(line):
                sub_categories.append(line)
            elif subsite_pattern.match(line):
                items.append(line)
            else:
                # Show the offending line so it can be fixed in the source file.
                print(f"Does not match any pattern: {line!r}")

    return main_categories, sub_categories, items
    
# Extract topography
# Parse the manually maintained topography listing into its three kinds of line.
topography_path = os.path.join(csv_path, 'seer-api', 'topography-manually.txt')
topography_tiers = extract_topoloy(topography_path)
main_categories, sub_categories, items = topography_tiers


In [14]:
# Build the primary-site tree underneath the existing NAACCR|400 item folder.
c_base_code = 'NAACCR|400'
primary_site = item_df.loc[item_df["C_BASECODE"] == c_base_code].iloc[0]
base_fullname = primary_site['C_FULLNAME']
base_level = primary_site['C_HLEVEL']


def _enclosing_range(site_code):
    """Return the first main-category range (e.g. 'C00-C14') containing site_code, or None."""
    for category_line in main_categories:
        candidate_range = category_line.split(' ')[0]
        if is_code_in_range(site_code, candidate_range):
            return candidate_range
    return None


def _primary_site_node(path_segments, display_name, code_suffix, visual_attributes):
    """Build one ontology row placed len(path_segments) levels below the primary-site folder.

    Reads base_fullname, base_level, c_base_code and fixed from this cell's globals,
    so it is only meant to be called from within this cell.
    """
    node_fullname = base_fullname + ''.join(f"{segment}\\" for segment in path_segments)
    return {
        "C_HLEVEL": base_level + len(path_segments),
        "C_FULLNAME": node_fullname,
        "C_NAME": display_name,
        "C_BASECODE": c_base_code + ':' + code_suffix,
        "C_VISUALATTRIBUTES": visual_attributes,
        "C_DIMCODE": node_fullname,
        "C_TOOLTIP": node_fullname.replace('\\', ' \\ '),
    } | fixed


primary_site_elements = []

## create main hierarchy
# Range folders ("CXX-CXX ...") sit directly under the primary-site folder.
for category_line in main_categories:
    range_code = category_line.split(' ')[0]
    primary_site_elements.append(
        _primary_site_node([range_code], category_line, range_code, 'FA')
    )

## create sub hierarchy
# Site folders ("CXX ...") go under their enclosing range when one exists,
# otherwise directly under the primary-site folder.
for site_line in sub_categories:
    site_code = site_line.split(' ')[0]
    parent_range = _enclosing_range(site_code)
    site_segments = [parent_range, site_code] if parent_range else [site_code]
    primary_site_elements.append(
        _primary_site_node(site_segments, site_line, site_code, 'FA')
    )

## populate items
# Leaves ("CXX.X ...") go under their site folder; the base code drops the dot.
# NOTE(review): the last path segment is the whole line (code plus description),
# unlike the folders above which use only the code - confirm this is intended.
for subsite_line in items:
    subsite_code = subsite_line.split(' ')[0]
    site_code = subsite_code.split('.')[0]
    parent_range = _enclosing_range(site_code)
    leaf_segments = ([parent_range] if parent_range else []) + [site_code, subsite_line]
    primary_site_elements.append(
        _primary_site_node(leaf_segments, subsite_line, subsite_code.replace('.', ''), 'LA')
    )


primary_site_df = pd.DataFrame(primary_site_elements)

In [15]:
# morphology from histology + behavior
# hide histology and behavior icd-0-2 and icd-0-3
morph_path = os.path.join(csv_path, 'morphology.xls')

# Keep the first three columns of the sheet (two leading rows skipped, no header row).
morph_df = (
    pd.read_excel(morph_path, index_col=None, header=None, skiprows=2)
    .iloc[:, :3]
    .set_axis(['Code', 'Level', 'Term'], axis=1)
)
# Synonym and Related rows are not part of the tree.
morph_df = morph_df[~morph_df["Level"].isin(['Synonym', 'Related'])]

# 'Preferred' rows become the leaves; every other remaining row is a folder.
is_preferred = morph_df["Level"] == 'Preferred'
morph_hierarchy = morph_df[~is_preferred]
morph_items = morph_df[is_preferred]

In [16]:
def code_in_range(code: str, parent_code: str):
    """Tell whether ``code`` belongs under ``parent_code``.

    Args:
        code: A numeric code, or a 'low-high' range of which only the low end is used.
        parent_code: Either a single code, matched by string equality, or a
            'low-high' range, matched numerically with both ends inclusive.

    Returns:
        True when ``code`` equals a single-valued parent or its low end lies in the parent range.

    Raises:
        ValueError: If the parent is a range and any compared part is not an integer.
    """
    parent_bounds = parent_code.split('-')
    if len(parent_bounds) < 2:
        # Single-valued parent: exact string match only.
        return code == parent_code

    code_start = int(code.split('-')[0])
    range_start = int(parent_bounds[0])
    range_end = int(parent_bounds[1])
    return range_start <= code_start <= range_end

In [17]:
# Morphology folders are created under the '01 Cancer Identification' section.
root_node = section_df.loc[section_df["C_NAME"] == '01 Cancer Identification'].iloc[0]
base_fullname = root_node["C_FULLNAME"] + 'MORPHOLOGY' + '\\'
base_level = root_node["C_HLEVEL"]
base_code = 'NAACCR|MORPH'

morph_elements = []

# Loop-specific names (node_fullname / node_base_code) are used on purpose. The previous
# version assigned `c_fullname`, which is the ontology root path that the section, item
# and code cells read, so re-running any of those cells after this one built wrong paths.
for _, node in morph_hierarchy.iterrows():
    code_item = str(node["Code"])
    code_level = int(node["Level"])
    code_term = str(node["Term"])
    c_hlevel = base_level + int(code_level)
    # A row without a code (NaN stringifies to 'nan') is the MORPHOLOGY root folder itself.
    is_root = code_item == 'nan'
    if is_root:
        node_fullname = base_fullname
        node_base_code = base_code
    else:
        # Parent is the first shallower row whose code or range contains this code;
        # without a match the node hangs directly under the MORPHOLOGY root.
        parent_full_name = base_fullname
        shallower_rows = morph_hierarchy.loc[morph_hierarchy["Level"] < code_level]
        for _, candidate in shallower_rows.iterrows():
            candidate_code = str(candidate["Code"])
            if code_in_range(code_item, candidate_code):
                parent_full_name = base_fullname + candidate_code + '\\'
                break

        node_fullname = parent_full_name + code_item + '\\'
        node_base_code = base_code + ":" + code_item

    entry = {
            "C_HLEVEL" : c_hlevel,
            "C_FULLNAME" :  node_fullname,
            "C_NAME" : ('' if is_root else code_item + ' ') + code_term,
            "C_BASECODE" : node_base_code,
            "C_VISUALATTRIBUTES" : 'FA',
            "C_DIMCODE" : node_fullname,
            "C_TOOLTIP" : node_fullname.replace('\\',' \\ ')
        }
    morph_elements.append(entry | fixed)

morph_elements_df = pd.DataFrame(morph_elements)

In [18]:
# Attach every 'Preferred' morphology term as a leaf under its closest folder.
morph_elements_items = []

# Sort once (it does not depend on the item): deepest folders first, so each term
# lands under the most specific folder that contains it.
folders_deepest_first = morph_elements_df.sort_values("C_HLEVEL", ascending=False)

# Loop-specific names are used on purpose: the previous version assigned the global
# `c_fullname` (the ontology root path read by earlier cells) and rebound the inner
# loop variable `entry` to the new dict.
for _, term_row in morph_items.iterrows():
    code_item = str(term_row["Code"])
    code_term = str(term_row["Term"])
    # Folders are matched on the first three characters of the term's code.
    pref = code_item[:3]
    item_base_code = base_code + ":" + code_item

    for _, folder in folders_deepest_first.iterrows():
        b_level = folder["C_HLEVEL"]
        # NOTE(review): level 3 appears to be the MORPHOLOGY root folder (section
        # level 2 plus hierarchy level 1), which never holds terms directly - confirm.
        if b_level == 3:
            continue

        folder_code = folder["C_BASECODE"].split(':')[-1]
        if code_in_range(pref, folder_code):
            item_fullname = folder["C_FULLNAME"] + code_item + '\\'
            leaf_entry = {
                "C_HLEVEL" : b_level + 1,
                "C_FULLNAME" :  item_fullname,
                "C_NAME" : ('' if code_item == 'nan' else code_item + ' ') + code_term,
                "C_BASECODE" : item_base_code,
                "C_VISUALATTRIBUTES" : 'LA',
                "C_DIMCODE" : item_fullname,
                "C_TOOLTIP" : item_fullname.replace('\\',' \\ ')
            }
            morph_elements_items.append(leaf_entry | fixed)
            break
    # A term that matches no folder is left out of the ontology without any message.

morph_elements_items_df = pd.DataFrame(morph_elements_items)

In [19]:
# Assemble the full ontology table.
# The empty template is rebuilt from ddl_def here instead of concatenating onto the
# existing final_df: the old form appended a full copy of every row each time the
# cell was re-run. The template goes first so its column order is the one kept.
ontology_template = pd.DataFrame(columns=ddl_def.keys()).astype(ddl_def)

final_df = pd.concat(
    [
        ontology_template,
        root_df,
        section_df,
        item_df,
        code_df,
        primary_site_df,
        morph_elements_df,
        morph_elements_items_df
    ],
    ignore_index=True
)
final_df.shape

(6170, 25)

In [20]:
# Connection settings are read from the local env file, never hard-coded in the notebook.
config = dotenv_values('env/.env')

# Every listed key must be present in the env file; a missing one raises KeyError.
connection_keys = ("user", "password", "account", "warehouse", "role", "database", "schema")
con = sc.connect(**{key: config[key] for key in connection_keys})

In [21]:
# Upload the ontology, replacing any existing NAACCR_ONTOLOGY table.
# The connection is closed when the upload finishes or fails, so the session is not
# left open for the lifetime of the kernel. Re-run the connection cell before
# re-running this one.
try:
    success, nchunks, nrows, _ = snp.write_pandas(con,
                                                  final_df,
                                                  'NAACCR_ONTOLOGY',
                                                  auto_create_table=True,
                                                  overwrite=True)
finally:
    con.close()
print(success, nchunks, nrows)

# Fail loudly if the upload is reported as unsuccessful or rows are missing.
if not success or nrows != len(final_df):
    raise RuntimeError(
        f"NAACCR_ONTOLOGY upload incomplete: success={success}, "
        f"rows written={nrows}, rows expected={len(final_df)}"
    )

True 1 6170
