# UMAP plots and Clustering

### Load DataFrame 
Create an object that holds some information about the DataFrame.

We also define some metadata columns manually (columns that refer to the patient's identity, i.e. that are not defined by clinical exams).

Their list is retrieved by copying the first 31 columns of the 'Sani_15300_anonym.csv'

We use this information to isolate the clinical exam features that can be used for partitioning.

In [1]:
import os
import sys
import logging
import itertools

In [2]:
from pd_extras.dataframe_with_info import DataFrameWithInfo
from pd_extras.dataframe_with_info import import_df_with_info_from_file

In [3]:
# Root logger setup: records from INFO upwards, each line carrying the
# timestamp, the level, the emitting module and the message (tab-separated).
logging.basicConfig(
    level=logging.INFO,
    datefmt='%d-%b-%y %H:%M:%S',
    format='%(asctime)s \t %(levelname)s \t Module: %(module)s \t %(message)s ',
)

Import the `df_correct` instance from file (it was stored using 'shelve')

In [4]:
from pathlib import Path

In [6]:
# Locate and load the stored `df_correct` instance.
#
# The path is anchored on the current working directory: `parents[1]` is the
# directory two levels above it, so despite its name `CWD` is NOT the working
# directory itself.
# NOTE(review): this only works when the notebook is started from a folder two
# levels below the one that contains `smvet/` — confirm against the repo layout.
CWD = Path.cwd().parents[1]
df_correct_dir = CWD / 'smvet' / 'data' / 'output_data' / 'ordinal_encoded' / 'df_ordinal_pat'
df_correct = import_df_with_info_from_file(df_correct_dir)

## Create partitions and show plots

In [7]:
# Settings for the partitioning step.
bin_count = 13  # presumably the number of bins behind 'AGE_bin_id' — TODO confirm
# Columns used to build the partitions.
partition_cols = [
    'SEX',
    'SEXUAL STATUS',
    'AGE_bin_id',
]
# Name of the exam feature selected as input (x) feature.
input_feat_x = "Serum Albumin"

# UMAP Plot to check separation

In [9]:
# Make the parent directory importable so that the `src.umapviz` imports in the
# next cell can be resolved. The membership check keeps `sys.path` free of
# duplicate entries when this cell is executed more than once.
if '..' not in sys.path:
    sys.path.append('..')

In [10]:
from src.umapviz.umap_exp import UmapExperiment
from src.umapviz.umap_metrics import tanimoto_gower

In [11]:
# Alias: the following cells refer to the loaded object as `df_info`.
df_info = df_correct

In [12]:
# Column names grouped by type, as exposed by `column_list_by_type`.
col_by_type = df_info.column_list_by_type

In [13]:
col_by_type

ColumnListByType(same_value_cols={'GC_SEQ', 'Serum SAA', 'GROUPS', 'TAG'}, mixed_type_cols={'D Dimer', 'Serum Total Bilirubin', 'pH (quantitative)'}, numerical_cols={'WBC', 'PMDW', 'Thrombin Time', 'Serum Potassium_3.4_enc', 'TTKG', 'Serum Potassium_6.0_enc', 'MONTH_7_enc', 'HCT', 'Serum Albumin_4.4_enc', 'Serum Albumin_1.9_enc', 'Serum Albumin_4.6_enc', 'Serum Potassium_3.2_enc', 'Sodium/Crea', 'HT_20_enc', 'Serum Chloride', 'Prothrombin Time', 'MCV', 'Serum Albumin_3.2_enc', 'HT_17_enc', 'SEX-SEXUAL STATUS-AGE_bin_id_enc', 'YEAR_2010_enc', 'VACCINAZIONI_Irregolari_enc', 'Serum Ca x Pi', 'Serum Sodium_144.0_enc', 'Serum Sodium_157.0_enc', 'VolRTTHY', 'Measured Osmolaity', 'NEUTROPHIL', 'BODY CONDITION SCORE_5.0_enc', 'EOSINOPHIL', 'Serum AST', 'ANAMNESI_AMBIENTALE_Casa_enc', 'Serum IgG', 'Serum Sodium_138.0_enc', 'ANAMNESI_ALIMENTARE_Mista_enc', 'Bilirubin/Crea', 'MPM', 'VolLTTHY', 'CH_R', 'CORTISOLO_LC_MS_MS', 'RETICULOCYTE COUNT', 'BODYWEIGHT', 'Serum Potassium_3.1_enc', 'Serum Sodi

In [14]:
# Configure a UmapExperiment on `df_info` (points coloured by 'AGE_bin_id',
# markers chosen from 'SEX_enc' / 'SEXUAL STATUS_enc'), then fit it and plot it.

# -----------------------
# UMAP (DEFAULT) SETTINGS
# -----------------------
# NOTE(review): none of these defaults is referenced by the UmapExperiment call
# in this cell (it passes n_neighbors=5 directly and takes its seed from
# `random_state` below) — confirm whether they are still needed.
NEIGHBOURHOOD = 10
N_COMPONENTS = 2
RANDOM_STATE = 23
PLOT_HEIGHT = 600
PLOT_WIDTH = 900


# --------------
# Bokeh Settings
# --------------
# Comma-separated tool names, forwarded to the experiment through `tools=`.
TOOLS = (
    "hover,crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,"
    "undo,redo,reset,tap,save,box_select,poly_select,lasso_select,"
)
# NOTE(review): TOOLTIPS is defined here but not passed to the call below.
TOOLTIPS = [
    ("index", "$index"),
    ("(x,y)", "($x, $y)"),
    ("study_UID", "@study_uid"),
]

# Set before `column_list_by_type` is read again.
# NOTE(review): presumably this makes metadata columns count as features in the
# returned lists — confirm in DataFrameWithInfo.
df_info.metadata_as_features = True
cols_by_type = df_info.column_list_by_type

# Columns handed to the experiment as `exclude_feat_list`.
exclude_cols = {
    "BODYWEIGHT",
    "ID_SCHEDA",
    "TIME OF DEATH",
    "YEAR",
    "AGE",
    "MONTH",
    "AGE_bin_id",
    "SEX_enc",
    # Trailing comma is required: without it, uncommenting the quoted names
    # below would silently concatenate the first of them to this string.
    "SEXUAL STATUS_enc",
    # FIXED COLS
    # 'Osmolal Gap', 'TLI', 'pH (quantitative)', 'Serum Total Bilirubin', 'Lipase/Crea', 'D Dimer',
    # 'RETICULOCYTE COUNT'
}

# NOTE(review): CAT_FEATURES and BOOL_FEATURES are currently referenced only
# from commented-out code in this cell.
CAT_FEATURES = {
    "FILARIOSI_enc",
    "ANAMNESI_AMBIENTALE_enc",
    "VACCINAZIONI_enc",
    "ANAMNESI_ALIMENTARE_enc",
    "PROFILO_PAZIENTE_enc",
}

BOOL_FEATURES = {
    "SEX_enc",
    "SEXUAL STATUS_enc",
}
# cat_features_list = set()
# bool_features_list = set()
# num_features_list = df_info.med_exam_col_list - df_info.metadata_cols - exclude_cols -
#       CAT_FEATURES - BOOL_FEATURES
# cols_by_type.numerical_cols - exclude_cols - CAT_FEATURES - BOOL_FEATURES

# Palettes forwarded as `color_tuple` / `test_color_tuple`.
color_list = (
    "#34c616",  # Green
    "#735911",  # Brown
    "#12b6c4",  # Light blue
)
test_color_list = ("#da2e1a", "#da971a", "#1a43da")  # Red  # Orange  # Blue

# NOTE(review): this variable is not used by the call below, which passes its
# own literal with the two breeds in the opposite order — confirm which order
# is intended before wiring it in.
group_values_to_be_shown = (
    "BREED",
    (tuple(["MONGREL"]), tuple(["LABRADOR RETRIEVER"])),
)
# ['GERMAN SHEPHERD'], ['GOLDEN RETRIEVER']],

# Seed passed to the experiment as `random_seed`.
random_state = 42

# Every column classified as numerical, numerical-categorical or boolean is
# handed over as a numerical feature (see the TODO inside the call).
num_cols = cols_by_type.numerical_cols.union(
    cols_by_type.num_categorical_cols
).union(cols_by_type.bool_cols)

umap_exp = UmapExperiment(
    df_info=df_info,
    n_neighbors=5,
    min_distance=0.01,
    not_nan_percentage_threshold=0.88,
    train_test_split_ratio=0.2,
    feature_to_color="AGE_bin_id",
    multi_marker_feats=("SEX_enc", "SEXUAL STATUS_enc"),
    enc_value_to_str_map={
        "SEX_enc": {0: "F", 1: "M"},
        "SEXUAL STATUS_enc": {0: "I", 1: "NI"},
    },
    file_title_prefix="AGE",
    exclude_feat_list=exclude_cols,
    numer_feat_list=num_cols,
    # TODO: We have a problem to understand which columns are numerical and categorical based on the
    #   count of unique values. At the moment even Serum Albumin is there.
    #   Moreover:
    #   1. Numer / categ /bool feat_list cannot share some columns names!
    categ_feat_list=(),  # cols_by_type.num_categorical_cols,
    bool_feat_list=(),  # BOOL_FEATURES,  # cols_by_type.bool_cols,
    random_seed=random_state,
    metric=tanimoto_gower,
    numer_feat_weight=1.0,
    categ_feat_weight=0,
    bool_feat_weight=0.0,
    group_values_to_be_shown=("BREED", (("LABRADOR RETRIEVER",), ("MONGREL",))),
    color_tuple=color_list,
    test_color_tuple=test_color_list,
    tooltip_feats=(
        "SEX",
        "SEXUAL STATUS",
        "AGE",
        "BODYWEIGHT",
    ),
    marker_size=8,
    marker_fill_alpha=0.0,
    tools=TOOLS,
)
# TODO:
#   2. Check the clustering labels and how they are split between train and test set
#    3. Check how the colors are passed and managed
# hdbscan_labels = umap_exp.clustering(min_cluster_size=10, umap_components=40,
#                                      min_samples=2, use_umap_preprocessing=True,)
# umap_exp.plot_cluster_labels(multi_marker_feats=('SEX_enc', 'SEXUAL STATUS_enc'))

# NOTE(review): the output recorded for this cell ends with
# "ValueError: unable to broadcast argument 1 to output array", raised from
# src/umapviz/umap_metrics.py (the module `tanimoto_gower` is imported from).
# The cause is not visible from this notebook — investigate in that module.
umap_exp.fit_transform()
umap_exp.plot(return_plot=False)

03-Sep-20 11:48:23 	 INFO 	 Module: umap_exp 	 The features that have too high number of Nan (and will not be considered in UMAP) are: ['TTKG', 'Sodium/Crea', 'VACCINAZIONI_Irregolari_enc', 'Serum Ca x Pi', 'VolRTTHY', 'BODY CONDITION SCORE_5.0_enc', 'Serum IgG', 'Bilirubin/Crea', 'BROMURO_2336.0_enc', 'FILARIOSI_Irregolare_enc', 'FILARIOSI_Regolare_enc', 'Serum IgA', 'BODY CONDITION SCORE_2.0_enc', 'ANAMNESI_AMBIENTALE_Casa-Giardino_enc', 'ANAMNESI_ALIMENTARE_Commerciale_enc', 'FT4 LC-MS-MS', 'GGT/Crea', 'HPT', 'Calcium/Crea', 'Lipase/Crea', 'VolLiver_19.0_enc', 'FT4', 'EF Posphate', 'VACCINAZIONI_Regolari_enc', 'Serum Haptoglobin', 'Aldosteron', 'Potassium/Crea', 'TSH', 'BLOOD PRESS MIN', 'RESPIRATORY RATE', 'ANAMNESI_AMBIENTALE_Casa_enc', 'ANAMNESI_ALIMENTARE_Mista_enc', 'VolLTTHY', 'CORTISOLO_LC_MS_MS', 'LIPASI_DGGR', 'VolLiver_1177.0_enc', 'Serum PON-1', 'TT4', 'PROFILO_PAZIENTE_Scadente_enc', 'BLOOD PRESS MEAN', 'PROFILO_PAZIENTE_Pessimo_enc', 'AMMONIUM', 'Serum Bile Acid', 'Cort

ValueError: unable to broadcast argument 1 to output array
File "../src/umapviz/umap_metrics.py", line 203, 

# Using functions from umap_functions

In [13]:
# Make the parent directory importable so that `src.umapviz` can be imported in
# the next cell. The membership check avoids adding '..' to `sys.path` again
# when it is already there (e.g. when the cell is re-run).
if '..' not in sys.path:
    sys.path.append('..')

In [14]:
from src.umapviz import umap_plot, umap_functions


# Build the UMAP inputs from `df_correct`. The call returns four objects, in
# this order: UMAP data and full-feature frame for the training part, then the
# same pair for the test part.
# NOTE(review): presumably 0.88 is the minimum share of non-NaN values a feature
# needs to be kept and 0.25 the test share of the split — confirm in
# umap_functions.prepare_umap_data.
(
    train_umap_data,
    train_notna_full_features_df,
    test_umap_data,
    test_notna_full_features_df,
) = umap_functions.prepare_umap_data(
    df_correct,
    not_nan_percentage_threshold=0.88,
    test_over_set_ratio=0.25,
)

02-Sep-20 12:34:20 	 INFO 	 Module: umap_functions 	 The features that have too high number of Nan (and will not be considered in UMAP) are: ['Serum Bile Acid', 'ANAMNESI_AMBIENTALE_enc', 'TIME OF DEATH', 'FT4', 'BODY CONDITION SCORE_enc', 'Serum IgA', 'EF Posphate', 'Aldosteron', 'Serum Haptoglobin', 'TTKG', 'TSH', 'Serum PON-1', 'BLOOD PRESS MIN', 'Serum Ca x Pi', 'Osmolal Gap', 'TT4', 'Serum IgM', 'Lipase/Crea', 'AMMONIUM', 'TLI', 'FILARIOSI_enc', 'Potassium/Crea', 'HPT', 'RESPIRATORY RATE', 'Calcium/Crea', 'CORTISOLO_LC_MS_MS', 'VACCINAZIONI_enc', 'GGT/Crea', 'Serum IgG', 'EF Urea', 'EF Calcium', 'AMMONIACA_NORM_CREA', 'LIPASI_DGGR', 'Chloride/Crea', 'ANAMNESI_ALIMENTARE_enc', 'Bile Acids/Crea', 'VolLTTHY', 'BLOOD PRESS MEAN', 'Amylase/Crea', 'Glucose/Crea', 'Quantitative FDP', 'Bilirubin/Crea', 'TEMP', 'VolRTTHY', 'PROFILO_PAZIENTE_enc', 'Sodium/Crea', 'BROMURO_enc', 'P/Crea', 'FT4 LC-MS-MS', 'CREATINURIA', 'Cortisol/Crea', 'PULSE RATE', 'VolLiver_enc', 'BLOOD PRESS MAX'] 


In [16]:
# Palettes: `color_list` goes with the training frames and `test_color_list`
# with the test frames in the call below.
color_list = ("#34c616", "#735911", "#12b6c4")  # green, brown, light blue
test_color_list = ("#da2e1a", "#da971a", "#1a43da")  # red, orange, blue

# Compute the UMAP embedding and plot it; the call returns the reducer and the
# embedding.
reducer, embedding = umap_functions.calculate_plot_umap(
    # --- training data ---
    df_exams_only_umap=train_umap_data,
    df_full_feat=train_notna_full_features_df,
    color_tuple=color_list,
    # --- test data ---
    test_df_exams_only_umap=test_umap_data,
    test_df_full_feat=test_notna_full_features_df,
    test_color_tuple=test_color_list,
    # --- UMAP parameters ---
    n_neighbors=50,  # [5, 15, 50, 100],
    min_dist=0.03,  # [0.01, 0.1, 1],
    # --- grouping / colouring ---
    group_values_to_be_shown=("BREED", (("MONGREL",), ("LABRADOR RETRIEVER",))),
    # ['GERMAN SHEPHERD'], ['GOLDEN RETRIEVER']],
    feature_to_color="AGE_bin_id",
    multi_feat_to_combine_partit_list=("SEX", "SEXUAL STATUS"),
    # NOTE(review): the prefix says "0.3" while min_dist above is 0.03 — confirm
    # which one is intended before renaming the output.
    filename_prefix="AGE_50_0.3_with_test",
    # show_plot=True
)


# calculate_plot_umap_multi_breed_multi_params(
#                                                 df_exams_only_umap=train_umap_data, 
#                                                 df_full_feat=train_notna_full_features_df, 
#                                                 n_neighbors_list=[50],  #[5, 15, 50, 100],
#                                                 min_dist_list=[0.03],  # [0.01, 0.1, 1],
#                                                 multiple_breed_list=[['MONGREL'], ['LABRADOR RETRIEVER'], ['GERMAN SHEPHERD'], ['GOLDEN RETRIEVER']],
# #                                                 feature_to_color='SEX-SEXUAL STATUS-AGE_bin_id_group_ID',
#                                                 three_feat_color_list=['SEX', 'SEXUAL STATUS', 'AGE_bin_id'],
#                                                 color_list=color_list, 
#                                                 ncols=3,
#                                                 test_df_exams_only_umap=test_umap_data,
#                                                 test_df_full_feat=test_notna_full_features_df,
#                                                 filename='Multibreed_TEST_UMAP_50_0.3.html'
# )

['#1dda1a', # Verde
 '#1adaae', # Azzurro
 '#1a71da'] # Blu
    #'#da2f1a']

NameError: name 'logging' is not defined