In [1]:
import pandas as pd
from pathlib import Path
import os
import numpy as np
from numpy import nan

## Retrieve data

In [2]:
%reload_kedro

2022-09-29 17:43:23,598 - kedro.extras.extensions.ipython - INFO - No path argument was provided. Using: C:\Users\gdbt0\Projects\cancer-data-analytics
2022-09-29 17:43:23,886 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2022-09-29 17:43:23,992 - kedro.extras.extensions.ipython - INFO - ** Kedro project Cancer data analytics
2022-09-29 17:43:23,993 - kedro.extras.extensions.ipython - INFO - Defined global variable `context`, `session`, `catalog` and `pipelines`


No files found in ['C:\\Users\\gdbt0\\Projects\\cancer-data-analytics\\conf\\base', 'C:\\Users\\gdbt0\\Projects\\cancer-data-analytics\\conf\\local'] matching the glob pattern(s): ['credentials*', 'credentials*/**', '**/credentials*']
  warn(f"Credentials not found in your Kedro project config.\n{str(exc)}")


In [3]:
# Check the DataCatalog list
catalog.list()

['Research data Mongolian',
 'Research data English',
 'Patient',
 'Diagnosis',
 'External beam radiotherapy',
 'Brachytherapy',
 'Chemotherapy',
 'Acute toxicity',
 'Response status',
 'Late morbidity',
 'Disease status',
 'parameters']

## Selecting the research variables

In this notebook, I'm creating a joint table that contains all variables of interest. The sample size of the research group is determined by the number of patients for which we have complete data.

The table is created by selecting and joining interim tables that contain the variables that we are using in the research.

In [4]:
# Importing data from dataframes
patient = catalog.load('Patient')
diagnosis = catalog.load('Diagnosis')
external_radiotherapy = catalog.load('External beam radiotherapy')
brachytherapy = catalog.load('Brachytherapy')
chemotherapy = catalog.load('Chemotherapy')
acute_toxicity = catalog.load('Acute toxicity')
response_status = catalog.load('Response status')
late_morbidity = catalog.load('Late morbidity')
disease_status = catalog.load('Disease status')

2022-09-29 17:43:24,027 - kedro.io.data_catalog - INFO - Loading data from `Patient` (ExcelDataSet)...
2022-09-29 17:43:24,425 - kedro.io.data_catalog - INFO - Loading data from `Diagnosis` (ExcelDataSet)...
2022-09-29 17:43:24,721 - kedro.io.data_catalog - INFO - Loading data from `External beam radiotherapy` (ExcelDataSet)...
2022-09-29 17:43:24,826 - kedro.io.data_catalog - INFO - Loading data from `Brachytherapy` (ExcelDataSet)...
2022-09-29 17:43:24,938 - kedro.io.data_catalog - INFO - Loading data from `Chemotherapy` (ExcelDataSet)...
2022-09-29 17:43:25,040 - kedro.io.data_catalog - INFO - Loading data from `Acute toxicity` (ExcelDataSet)...
2022-09-29 17:43:25,125 - kedro.io.data_catalog - INFO - Loading data from `Response status` (ExcelDataSet)...
2022-09-29 17:43:25,221 - kedro.io.data_catalog - INFO - Loading data from `Late morbidity` (ExcelDataSet)...
2022-09-29 17:43:25,323 - kedro.io.data_catalog - INFO - Loading data from `Disease status` (ExcelDataSet)...


In [5]:
df_merged = diagnosis\
    .join(late_morbidity,on='id',lsuffix='', rsuffix='_y')\
    .join(external_radiotherapy,on='id',lsuffix='', rsuffix='_y')\
    .join(brachytherapy,on='id',lsuffix='', rsuffix='_y')\
    .join(patient,on='id',lsuffix='', rsuffix='_y')\
    .join(diagnosis,on='id',lsuffix='', rsuffix='_y')\
    .join(chemotherapy,on='id',lsuffix='', rsuffix='_y')\
    .join(acute_toxicity,on='id',lsuffix='', rsuffix='_y')\
    .join(disease_status,on='id',lsuffix='', rsuffix='_y')\
    .join(response_status,on='id',lsuffix='', rsuffix='_y')

df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 101 to 180
Columns: 139 entries, card_no to Last_follup_timing
dtypes: datetime64[ns](9), float64(40), int64(19), object(71)
memory usage: 131.2+ KB


In [6]:
# Select variables of interest
df = df_merged[[
            'age',
              'perfor_status',
             'has_concom',
             'tumour_stage',
             'N',
             'M',
             'figo',
             'pathological_type',
             'diagnos_tumor_size',
             'growth_type',
             'vaginal_invasion',
             'PaR',
             'PaL',
             'uterine_invasion',
             'ct_abdomen',
             'ct_pelvic',
             'mri_diagnostic',
             'mri_before_brachy',
             'pelvic_node_mts',
             'Paraaort_node_mts',
             'treatment_total_days',
             'pelvic_total_dose',
             'Pelv_fr',
             'Dose_per_fr',
             'midline_block_dose',
             'midline_block_frac',
             'paramet_boost_dose',
             'paramet_boost_fr',
             'card_no',
             'last_brachy_date',
             'applicator_name',
             # 'fraction_number',
             'total_apoint_left',
             'total_apoint_right',
             'icru_rectum',
             'icru_bladder',
             'hrctv_volume',
             'eqd2_bladder',
             'eqd2_rectum',
             'eqd2_sigmoid',
             'eqd2_hrbrachy_dose',
             'eqd2_total_dose',
             'chemo_dose',
             'chemo_numcycles',
             'before_brachy_tumor_size',
             'before_brachy_categor',
             'post_treatment_response',
             'post_treatment_response_date',
             'early_response_status',
             # 'early_response_date',
             'last_response_date',
             'last_response_status',
             'Last_follup_timing',
             'rect_sig_grade',
             # 'Last_rect_grade',
             'Last_rect_eval_date',
             # 'Last_rectum_duration',
             # 'Last_bladder_grade',
             # 'Last_blad_eval_date',
             # 'Last_blad_duration'
            ]].copy()

In [7]:
df['id'] = df_merged.index

## Create a new age value column

In [8]:
def define_age_group(age):
    if age < 41:
        return "31-40"
    elif age < 51:
        return "41-50"
    elif age < 61:
        return "51-60"
    elif age < 71:
        return "61-70"
    elif age < 81:
        return "71-80"
    else:
        return "81-90"

In [9]:
df["age_group"] = [define_age_group(age) for age in df['age']]

In [10]:
catalog.save('Research data English', df)

2022-09-29 17:43:25,562 - kedro.io.data_catalog - INFO - Saving data to `Research data English` (CSVDataSet)...


## Translate column names

In [11]:
# English column names
english_col_names = df.columns
print(english_col_names)

Index(['age', 'perfor_status', 'has_concom', 'tumour_stage', 'N', 'M', 'figo',
       'pathological_type', 'diagnos_tumor_size', 'growth_type',
       'vaginal_invasion', 'PaR', 'PaL', 'uterine_invasion', 'ct_abdomen',
       'ct_pelvic', 'mri_diagnostic', 'mri_before_brachy', 'pelvic_node_mts',
       'Paraaort_node_mts', 'treatment_total_days', 'pelvic_total_dose',
       'Pelv_fr', 'Dose_per_fr', 'midline_block_dose', 'midline_block_frac',
       'paramet_boost_dose', 'paramet_boost_fr', 'card_no', 'last_brachy_date',
       'applicator_name', 'total_apoint_left', 'total_apoint_right',
       'icru_rectum', 'icru_bladder', 'hrctv_volume', 'eqd2_bladder',
       'eqd2_rectum', 'eqd2_sigmoid', 'eqd2_hrbrachy_dose', 'eqd2_total_dose',
       'chemo_dose', 'chemo_numcycles', 'before_brachy_tumor_size',
       'before_brachy_categor', 'post_treatment_response',
       'post_treatment_response_date', 'early_response_status',
       'last_response_date', 'last_response_status', 'Last_follu

In [14]:
mongolian_col_names = ['Нас', 'Биеийн ерөнхий байдал', 'Хавсарсан өвчин', 'Үе шат (T)', 'N', 'M',
                       'FIGO үе шат',
                       'Эмгэг судлалын дүгнэлт', 'Оношлогдох үеийн хавдрын хэмжээ', 'Ургалтын хэлбэр',
                       'Үтрээний нэвчилт', 'PaR', 'PaL', 'Умайн их биеийн нэвчилт', 'Хэвлийн КТ',
                       'Аарцгийн КТ', 'Оношилгооны СРТ', 'ДТЭ-ний өмнөх СРТ', 'pelvic_node_mts',
                       'Paraaort_node_mts', 'treatment_total_days', 'pelvic_total_dose',
                       'Pelv_fr', 'Dose_per_fr', 'midline_block_dose', 'midline_block_frac',
                       'paramet_boost_dose', 'paramet_boost_fr', 'card_no', 'last_brachy_date',
                       'applicator_name', 'total_apoint_left', 'total_apoint_right',
                       'Шулуун гэдэсний ICRU цэгний тун', 'Давсагны ICRU цэгний тун', 'HRCTV эзэлхүүн', 'Давсагны EQD2 тун', 'Шулуун гэдэсний EQD2 тун',
                       'eqd2_sigmoid', 'eqd2_hrbrachy_dose', 'EQD2 нийлбэр тун', 'chemo_dose',
                       'Химийн эмчилгээний давтамжийн тоо', 'ДТЭ-ний өмнөх хавдрын хэмжээ', 'ДТЭ-ний өмнөх хавдрын багасалтын категори',
                       'Хосолсон ТЭ-ий дараахь үр дүн',
                       'post_treatment_response_date',
                       'early_response_status',
                       'last_response_date',
                       'Өвчний ',
                       'Last_follup_timing',
                       'Шулуун гэдэсний гаж нөлөөний зэрэг',
                       'Last_rect_eval_date', 'id', 'Насны ангилал']

In [15]:
# Change the column names
df.columns = dict(zip(mongolian_col_names,english_col_names,))
print(dict(zip(mongolian_col_names,english_col_names,)))

{'Нас': 'age', 'Биеийн ерөнхий байдал': 'perfor_status', 'Хавсарсан өвчин': 'has_concom', 'Үе шат (T)': 'tumour_stage', 'N': 'N', 'M': 'M', 'FIGO үе шат': 'figo', 'Эмгэг судлалын дүгнэлт': 'pathological_type', 'Оношлогдох үеийн хавдрын хэмжээ': 'diagnos_tumor_size', 'Ургалтын хэлбэр': 'growth_type', 'Үтрээний нэвчилт': 'vaginal_invasion', 'PaR': 'PaR', 'PaL': 'PaL', 'Умайн их биеийн нэвчилт': 'uterine_invasion', 'Хэвлийн КТ': 'ct_abdomen', 'Аарцгийн КТ': 'ct_pelvic', 'Оношилгооны СРТ': 'mri_diagnostic', 'ДТЭ-ний өмнөх СРТ': 'mri_before_brachy', 'pelvic_node_mts': 'pelvic_node_mts', 'Paraaort_node_mts': 'Paraaort_node_mts', 'treatment_total_days': 'treatment_total_days', 'pelvic_total_dose': 'pelvic_total_dose', 'Pelv_fr': 'Pelv_fr', 'Dose_per_fr': 'Dose_per_fr', 'midline_block_dose': 'midline_block_dose', 'midline_block_frac': 'midline_block_frac', 'paramet_boost_dose': 'paramet_boost_dose', 'paramet_boost_fr': 'paramet_boost_fr', 'card_no': 'card_no', 'last_brachy_date': 'last_brachy_

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 101 to 180
Data columns (total 55 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   Нас                                        120 non-null    int64         
 1   Биеийн ерөнхий байдал                      120 non-null    int64         
 2   Хавсарсан өвчин                            120 non-null    object        
 3   Үе шат (T)                                 120 non-null    object        
 4   N                                          120 non-null    int64         
 5   M                                          120 non-null    int64         
 6   FIGO үе шат                                120 non-null    object        
 7   Эмгэг судлалын дүгнэлт                     120 non-null    object        
 8   Оношлогдох үеийн хавдрын хэмжээ            120 non-null    object        
 9   Ургалтын хэлбэр    

In [17]:
df.head()

Unnamed: 0_level_0,Нас,Биеийн ерөнхий байдал,Хавсарсан өвчин,Үе шат (T),N,M,FIGO үе шат,Эмгэг судлалын дүгнэлт,Оношлогдох үеийн хавдрын хэмжээ,Ургалтын хэлбэр,...,Хосолсон ТЭ-ий дараахь үр дүн,post_treatment_response_date,early_response_status,last_response_date,Өвчний,Last_follup_timing,Шулуун гэдэсний гаж нөлөөний зэрэг,Last_rect_eval_date,id,Насны ангилал
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101,51,1,Тийм,2B,1,0,IIIC1,SCC (NOS),≤4см,1,...,CR,2020.02.04,,2022.05.18,CR,27,,1/5/2022,101,51-60
102,50,1,Үгүй,2B,1,0,IIIC1,SCC (NOS),≤4см,2,...,CR,2019.12.18,,2022.05.18,CR,29,,1/5/2022,102,41-50
103,41,1,Үгүй,3B,1,0,IIIC1,SCC (NOS),>4см,2,...,-0.75,2020.02.03,,,D,6,,,103,41-50
104,51,1,Үгүй,2B,1,0,IIIC1,SCC (NOS),>4см,1,...,CR,2021.01.30,Үсэрхийлэлтэй,2022.05.25,DM/verchov,28,1.0,1/5/2022,104,51-60
105,59,1,Тийм,3B,1,0,IIIC1,SCC (NOS),>4см,1,...,-0.9,2020.03.17,,2022.05.30,CR,26,,1/5/2022,105,51-60


In [18]:
catalog.save('Research data Mongolian', df)

2022-09-29 17:44:00,311 - kedro.io.data_catalog - INFO - Saving data to `Research data Mongolian` (CSVDataSet)...
