In [38]:
import pandas as pd
from pathlib import Path
import os
import numpy as np
from numpy import nan
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date, timedelta

## Retrieve data

In [39]:
# Check the DataCatalog list
catalog.list()

['Research data',
 'Patient',
 'Diagnosis',
 'External beam radiotherapy',
 'Brachytherapy',
 'Chemotherapy',
 'Acute toxicity',
 'Response status',
 'Late morbidity',
 'Disease status',
 'parameters']

## Selecting the research variables

In this notebook, I'm creating a joint table that contains all variables of interest. The sample size of the research group is determined by the number of patients for which we have complete data.

The table is created by selecting and joining interim tables that contain the variables that we are using in the research.

In [40]:
# Importing data from dataframes
patient = catalog.load('Patient')
diagnosis = catalog.load('Diagnosis')
external_radiotherapy = catalog.load('External beam radiotherapy')
brachytherapy = catalog.load('Brachytherapy')
chemotherapy = catalog.load('Chemotherapy')
acute_toxicity = catalog.load('Acute toxicity')
response_status = catalog.load('Response status')
late_morbidity = catalog.load('Late morbidity')
disease_status = catalog.load('Disease status')

2022-07-25 17:02:23,327 - kedro.io.data_catalog - INFO - Loading data from `Patient` (ExcelDataSet)...
2022-07-25 17:02:23,509 - kedro.io.data_catalog - INFO - Loading data from `Diagnosis` (ExcelDataSet)...
2022-07-25 17:02:23,896 - kedro.io.data_catalog - INFO - Loading data from `External beam radiotherapy` (ExcelDataSet)...
2022-07-25 17:02:24,052 - kedro.io.data_catalog - INFO - Loading data from `Brachytherapy` (ExcelDataSet)...
2022-07-25 17:02:24,214 - kedro.io.data_catalog - INFO - Loading data from `Chemotherapy` (ExcelDataSet)...
2022-07-25 17:02:24,331 - kedro.io.data_catalog - INFO - Loading data from `Acute toxicity` (ExcelDataSet)...
2022-07-25 17:02:24,461 - kedro.io.data_catalog - INFO - Loading data from `Response status` (ExcelDataSet)...
2022-07-25 17:02:24,623 - kedro.io.data_catalog - INFO - Loading data from `Late morbidity` (ExcelDataSet)...
2022-07-25 17:02:24,771 - kedro.io.data_catalog - INFO - Loading data from `Disease status` (ExcelDataSet)...


In [41]:
df_merged = diagnosis\
    .join(late_morbidity,on='id',lsuffix='', rsuffix='_y')\
    .join(external_radiotherapy,on='id',lsuffix='', rsuffix='_y')\
    .join(brachytherapy,on='id',lsuffix='', rsuffix='_y')\
    .join(patient,on='id',lsuffix='', rsuffix='_y')\
    .join(diagnosis,on='id',lsuffix='', rsuffix='_y')\
    .join(chemotherapy,on='id',lsuffix='', rsuffix='_y')\
    .join(acute_toxicity,on='id',lsuffix='', rsuffix='_y')\
    .join(disease_status,on='id',lsuffix='', rsuffix='_y')\
    .join(response_status,on='id',lsuffix='', rsuffix='_y')

df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 101 to 180
Columns: 139 entries, card_no to Last_follup_timing
dtypes: datetime64[ns](9), float64(40), int64(20), object(70)
memory usage: 131.2+ KB


In [42]:
# Select variables of interest
df = df_merged[['perfor_status',
             'has_concom',
             'tumour_stage',
             'N',
             'M',
             'figo',
             'pathological_type',
             'diagnos_tumor_size',
             'growth_type',
             'vaginal_invasion',
             'PaR',
             'PaL',
             'uterine_invasion',
             'ct_abdomen',
             'ct_pelvic',
             'mri_diagnostic',
             'mri_before_brachy',
             'pelvic_node_mts',
             'Paraaort_node_mts',
             'treatment_total_days',
             'pelvic_total_dose',
             'Pelv_fr',
             'Dose_per_fr',
             'midline_block_dose',
             'midline_block_frac',
             'paramet_boost_dose',
             'paramet_boost_fr',
             'card_no',
             'last_brachy_date',
             'applicator_name',
             # 'fraction_number',
             'total_apoint_left',
             'total_apoint_right',
             # 'icru_rectum',
             'icru_bladder',
             'hrctv_volume',
             'eqd2_bladder',
             'eqd2_rectum',
             'eqd2_sigmoid',
             'eqd2_hrbrachy_dose',
             'eqd2_total_dose',
             'chemo_dose',
             'chemo_numcycles',
             'before_brachy_tumor_size',
             'before_brachy_categor',
             'post_treatment_response',
             'post_treatment_response_date',
             # 'early_response_status',
             # 'early_response_date',
             'last_response_date',
             'last_response_status',
             'Last_follup_timing',
             # 'Last_rect_grade',
             'Last_rect_eval_date',
             # 'Last_rectum_duration',
             # 'Last_bladder_grade',
             # 'Last_blad_eval_date',
             # 'Last_blad_duration'
            ]].copy()

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 101 to 180
Data columns (total 49 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   perfor_status                 120 non-null    int64         
 1   has_concom                    120 non-null    object        
 2   tumour_stage                  120 non-null    object        
 3   N                             120 non-null    int64         
 4   M                             120 non-null    int64         
 5   figo                          120 non-null    object        
 6   pathological_type             120 non-null    object        
 7   diagnos_tumor_size            120 non-null    object        
 8   growth_type                   120 non-null    int64         
 9   vaginal_invasion              120 non-null    object        
 10  PaR                           120 non-null    object        
 11  PaL                           

In [44]:
catalog.save('Research data',df)

2022-07-25 17:02:25,034 - kedro.io.data_catalog - INFO - Saving data to `Research data` (CSVDataSet)...
