In [1]:
import os
import sys
import logging
import itertools

In [4]:
# Make the parent directory importable (presumably where the project-local
# `utils` package imported in the next cell lives — confirm).
# Guarded so that re-running this cell does not keep appending duplicates.
if '..' not in sys.path:
    sys.path.append('..')

In [5]:
import utils

In [6]:
from trousse.dataset import Dataset


In [7]:
# Root-logger setup: timestamp, level, emitting module and message separated
# by tabs, with INFO as the minimum level shown.
LOG_FORMAT = '%(asctime)s \t %(levelname)s \t Module: %(module)s \t %(message)s '
LOG_DATE_FORMAT = '%d-%b-%y %H:%M:%S'
logging.basicConfig(format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT, level=logging.INFO)

## Load DataFrame from Anonymized .csv
Create an object that holds some information about the dataframe.

We also define some metadata columns manually (referring to the patient identity -> not defined by clinical exams)

The list is obtained by copying the names of the first 31 columns of 'Sani_15300_anonym.csv'.

We use this information to isolate the clinical exam features that can be used for partitioning.

In [14]:
# Names of the first 31 header columns of the .csv file, taken from its first
# row (this could be automated, but an explicit listing offers visual control).
# Each name is its own string literal, so the result does not depend on
# invisible tab characters surviving inside one long pasted string.
metadata_cols = {
    'GROUPS', 'TAG', 'DATA_SCHEDA', 'NOME', 'ID_SCHEDA', 'COMUNE', 'PROV',
    'MONTH', 'YEAR', 'BREED', 'SEX', 'AGE', 'SEXUAL STATUS', 'BODYWEIGHT',
    'PULSE RATE', 'RESPIRATORY RATE', 'TEMP',
    'BLOOD PRESS MAX', 'BLOOD PRESS MIN', 'BLOOD PRESS MEAN',
    'BODY CONDITION SCORE', 'HT', 'H', 'DEATH', 'TIME OF DEATH',
    'PROFILO_PAZIENTE', 'ANAMNESI_AMBIENTALE', 'ANAMNESI_ALIMENTARE',
    'VACCINAZIONI', 'FILARIOSI', 'GC_SEQ',
}

In [35]:
# Locate the data folder as "<working directory>/../data".
# NOTE(review): this cell's execution counter (35) is later than that of the
# cells that read DATA_PATH — verify the notebook with Restart & Run All.
CWD = os.getcwd()
_data_path_parts = (CWD, "..", "data")
DATA_PATH = os.path.join(*_data_path_parts)

In [16]:
# Full path of the anonymized source .csv inside DATA_PATH
# (a file path, despite the "_dir" suffix in the variable name).
sani_csv_name = 'Sani_15300_anonym.csv'
df_sani_dir = os.path.join(DATA_PATH, sani_csv_name)

In [17]:
# Build the Dataset from the anonymized .csv, passing the manually listed
# metadata column names so they can be told apart from the other columns.
dataset = Dataset(metadata_cols=metadata_cols, data_file=df_sani_dir)

  exec(code_obj, self.user_global_ns, self.user_ns)
28-May-20 17:31:43 	 INFO 	 Module: dataframe_with_info 	 Data imported from file successfully 


### Correct some errors in DF 
We look for:
- Columns where we have different types mixed up
- Columns that are not float or int

In [18]:
from utils.refactoring.row_fix import RowFix

In [19]:
# The RowFix instance is kept because the following cells query it for the
# error counts (count_errors / print_errors_per_column).
fix_tool = RowFix()
# Per the log output of this cell, string columns are converted to numeric and
# the values that could not be converted are reported as "lost".
# NOTE(review): whether `dataset` is modified in place or a new object is
# returned is not visible here — confirm.
df_correct = fix_tool.fix_common_errors(dataset)



28-May-20 17:32:11 	 INFO 	 Module: row_fix 	 Osmolal Gap is converted from String to Numeric. Lost values are: 
{'ASSENTI', 'PRESENTI'} 
28-May-20 17:32:11 	 INFO 	 Module: row_fix 	 TLI is converted from String to Numeric. Lost values are: 
{'>100', '>50.0'} 
28-May-20 17:32:11 	 INFO 	 Module: row_fix 	 pH (quantitative) is converted from String to Numeric. Lost values are: 
{'8.0.'} 
28-May-20 17:32:11 	 INFO 	 Module: row_fix 	 Serum Total Bilirubin is converted from String to Numeric. Lost values are: 
{'0-22'} 
28-May-20 17:32:11 	 INFO 	 Module: row_fix 	 Lipase/Crea is converted from String to Numeric. Lost values are: 
{'PRESENTI'} 
28-May-20 17:32:11 	 INFO 	 Module: row_fix 	 D Dimer is converted from String to Numeric. Lost values are: 
{'0.0.6'} 


In [20]:
# Print the number of rows with initial mistakes and the total error count
# before and after the fix.
fix_tool.count_errors()


 Rows with initial mistakes: 63

 Total:  BEFORE: 155 errors  -->  AFTER: 3 errors


In [21]:
# Print, feature by feature, the invalid values found before the fix and the
# ones still left after it.
fix_tool.print_errors_per_column()

The errors per feature are:
TTKG: 2 : {'ASSENTI'} ---> 0 : set()
Serum PON-1: 1 : {'-'} ---> 0 : set()
MONOCYTE: 17 : {'3%', '10%', '6%', '8%', '4%', '2%', '5%', '7%'} ---> 0 : set()
PUCU: 1 : {'ASSENTI'} ---> 0 : set()
EOSINOPHIL: 17 : {'3%', '10%', '1%', '6%', '8%', '0%', '2%', '5%'} ---> 0 : set()
Bile Acids/Crea: 1 : {'ASSENTI'} ---> 0 : set()
EF Posphate: 2 : {'ASSENTI'} ---> 0 : set()
VolLTTHY: 1 : {'0,68'} ---> 0 : set()
BASOPHIL: 16 : {'0%'} ---> 0 : set()
D Dimer: 2 : {'0.0.6', '0,03'} ---> 1 : {'0.0.6'}
Plasma Lactate: 3 : {'1,3', '19,6', '-'} ---> 0 : set()
pH (quantitative): 2 : {'8,9', '8.0.'} ---> 1 : {'8.0.'}
Amylase/Crea: 1 : {'PRESENTI'} ---> 0 : set()
Serum Total Protein: 1 : {'6,4'} ---> 0 : set()
Fibrinogen: 3 : {'<60'} ---> 0 : set()
Bilirubin/Crea: 1 : {'PRESENTI'} ---> 0 : set()
EF Potassium: 2 : {'ASSENTI'} ---> 0 : set()
TT4: 1 : {'-'} ---> 0 : set()
FT4: 3 : {'<3.86', '---', '-'} ---> 0 : set()
Serum Ferritin: 1 : {'Error'} ---> 0 : set()
TSH: 4 : {'<0.030', '

### Categorical Encoding

In [22]:
from utils.refactoring import feature_fix

In [23]:
# Encode the categorical 'FILARIOSI' column; the lookup in the next cell
# reports the encoded column as 'FILARIOSI_enc'.
# NOTE(review): `df_correct` is rebound to the call's result, so re-running
# this cell feeds the already-encoded object back in — confirm that is harmless.
df_correct = feature_fix.encode_single_categorical_column(df_correct, col_name='FILARIOSI')

In [24]:
# Look up the name(s) of the encoded column(s) derived from 'FILARIOSI'
# (displayed as a tuple).
df_correct.find_encoded_column('FILARIOSI')

('FILARIOSI_enc',)

In [25]:
# Show the first operation recorded for 'FILARIOSI_enc'; it is displayed as a
# FeatureOperation instance.
df_correct.feature_elaborations['FILARIOSI_enc'][0]

<utils.dataframe_with_info.FeatureOperation at 0x7fef6d348d00>

## Create age_partition column to split age intervals into three parts

In [26]:
# Cut points for the AGE bins: two thresholds give the three intervals
# announced in the section heading.
# NOTE(review): the unit looks like months (12 = 1 year, 84 = 7 years) — confirm.
AGE_BIN_THRESHOLDS = [12, 84]
df_correct = feature_fix.split_continuous_column_into_bins(
    df_correct,
    col_name='AGE',
    bin_threshold=AGE_BIN_THRESHOLDS,
)

Look for the encoded column derived from 'AGE'

In [27]:
from utils.refactoring.feature_enum import OperationTypeEnum
from utils.dataframe_with_info import FeatureOperation

In [28]:
# Describe the operation to search for: a bin splitting applied to 'AGE'.
age_binning_query = FeatureOperation(
    original_columns='AGE',
    operation_type=OperationTypeEnum.BIN_SPLITTING,
)
# Retrieve the matching operation recorded on df_correct.
operation_on_age = df_correct.find_operation_in_column(feat_operation=age_binning_query)

In [29]:
# Display the first column derived by the AGE binning operation
# (the output below names it AGE_bin_id).
df_correct.df[operation_on_age.derived_columns[0]]

0        1
1        2
2        1
3        1
4        1
        ..
15212    2
15213    1
15214    1
15215    1
15216    2
Name: AGE_bin_id, Length: 15217, dtype: int8

We may want to retrieve information about what has been done.
We select the first operation because it is the only one present.

## Multiple combinations of categorical columns (metadata)

In [30]:
from utils.refactoring import feature_fix

In [31]:
# Categorical columns that are combined with each other in the next cell.
partition_cols = [
    'SEX',
    'SEXUAL STATUS',
    'AGE_bin_id',
]

In [32]:
# Build the multiple combinations of the partition columns; the two-element
# result is unpacked into `df_output` and `new_columns`.
combination_result = feature_fix.make_categorical_columns_multiple_combinations(
    df_correct, col_names=partition_cols
)
df_output, new_columns = combination_result

## Set column to datetime (TODO: add to library functionality)

In [33]:
import pandas as pd
import numpy as np

# Parse the 'DATA_SCHEDA' strings (month/day/year) into pandas datetimes.
# NOTE(review): this cell edits `dataset.df`, while the surrounding cells (and
# the export at the end) work on `df_correct` — confirm this is the intended
# target.
parsed_dates = pd.to_datetime(dataset.df['DATA_SCHEDA'], format='%m/%d/%Y')
dataset.df['DATA_SCHEDA'] = parsed_dates

# Store 'AGE' as a 16-bit integer.
age_as_int16 = dataset.df['AGE'].astype(np.int16)
dataset.df['AGE'] = age_as_int16

### Export and import of df_correct instance to file using 'shelve'

In [36]:
# Write the df_correct instance to 'df_correct_dump' inside DATA_PATH.
# NOTE(review): `df_output` and the edits made on `dataset.df` are only part of
# this dump if they share state with `df_correct` — confirm.
df_correct.to_file(os.path.join(DATA_PATH, 'df_correct_dump'))