In [1]:
from data_preparation import DataPreparator
from apply_clustering import ClusteringApplier

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Set this flag to True to also run the pipeline the "normal way" (every step
# called without arguments) and check that it still works. With the flag left
# at False this cell does nothing.
# NOTE(review): presumably the argument-less calls take their settings from the
# project configuration, since the later cells pass use_config=False — confirm.
# NOTE(review): the explicit call further down passes
# optimal_number_of_clusters['n_clusters'], while this path hands the whole
# return value to _n_clusters — confirm which form the callee expects.
VALIDATE_THE_OTHER_WAY = False

if VALIDATE_THE_OTHER_WAY:
    DataPreparator.prepare_data()
    optimal_number_of_clusters = ClusteringApplier.draw_gap_statistic_plot()
    ClusteringApplier.calculate_observable_patterns(_n_clusters=optimal_number_of_clusters)
    ClusteringApplier.calculate_explainable_distances()

In [4]:
# Load the two input workbooks; both paths are relative to the notebook's
# working directory.
df_observable = pd.read_excel(
    '../data/biological_set_raw_data.xlsx'
)
df_explainable = pd.read_excel(
    '../data/biological_set_explaining_features.xlsx'
)

In [5]:
# Column selections handed to the pipeline steps below:
# - observed_features: passed as the observed / observable feature names
# - explaining_features: passed as the explaining features
# - index_name: passed as group_name
observed_features = [
    'perc_lying',
    'perc_lhd',
    'num_lying',
    'num_lhd',
]
explaining_features = [
    'Age',
    'Sex',
    'Zoo',
    'Stable',
    'Genus_ID',
    'Family_ID',
    'Order_ID',
    'SH',
    'Weight',
    'Habitat',
]
index_name = 'Individual'

In [6]:
# Prepare the data from explicit in-memory arguments (use_config=False).
# The result is indexed by 'observable_df' and 'explainable_df' further down.
preparation_result = DataPreparator.prepare_data(
    use_config=False,
    # input tables
    df_observable=df_observable,
    df_explainable=df_explainable,
    # column selections
    observed_features=observed_features,
    explaining_features=explaining_features,
    group_name=index_name,
    # numeric settings
    distortion_mean=0.0,
    distortion_std=0.001,
    nn_imputation_k=10,
)

In [7]:
# Run the gap-statistic step on the prepared observable table with plotting
# switched off; only the returned result is kept.
optimal_number_of_clusters = ClusteringApplier.draw_gap_statistic_plot(
    use_config=False,
    plot=False,
    df_observable=preparation_result['observable_df'],
    observed_features=observed_features,
    gap_statistic_cluster_range=10,
)

In [12]:
# Display the returned result; its 'n_clusters' entry feeds the pattern step.
optimal_number_of_clusters

{'n_clusters': 9,
 'knee': np.int64(2),
 'elbow': np.int64(2),
 'clusterer': OptimalK(n_jobs=16, parallel_backend="joblib")}

In [8]:
# Compute the observable patterns using the cluster count chosen by the
# gap-statistic step. The result's 'pw_norm_dist' entry is consumed by the
# explainable-distance calls further down.
observable_patterns_result = ClusteringApplier.calculate_observable_patterns(
    use_config=False,
    plot=False,
    # data and column selections
    df_observable_data=preparation_result['observable_df'],
    observable_feature_names=observed_features,
    group_name=index_name,
    # clustering settings
    _n_clusters=optimal_number_of_clusters['n_clusters'],
    number_observable_patterns='auto',
    distance_measure_fingerprint='jensenshannon',
    # labels
    observable_name='observable_name',
    plot_title='Name of the plot',
)

In [13]:
# Display the whole result for inspection.
# NOTE(review): this prints every contained DataFrame in full; consider showing
# only the keys or a .head() of the entry of interest.
observable_patterns_result

{'df_cluster_median':               perc_lying  perc_lhd  num_lying   num_lhd
 pattern_type                                           
 0               0.875073  0.185037   0.288052  0.364512
 1               0.733653  0.081344   0.356601  0.170481
 2               0.730759  0.144472   0.357078  0.317669
 3               0.621874  0.064722   0.429953  0.147196
 4               0.796103  0.201119   0.500393  0.366297
 5               0.453920  0.050946   0.214465  0.100429
 6               0.743415  0.116725   0.572055  0.219659
 7               0.826574  0.120236   0.428893  0.220934
 8               0.839516  0.280441   0.429165  0.584441,
 'df_observable_data':      Individual        date  perc_lying  perc_lhd  num_lying   num_lhd  \
 0      A.nas_01  2020-03-04    0.682063  0.278473   0.214550  0.294069   
 1      A.nas_01  2020-03-05    0.554511  0.278341   0.284784  0.245253   
 2      A.nas_01  2020-03-06    0.666260  0.237656   0.285209  0.560183   
 3      A.nas_01  2020-03-07 

In [14]:
# Explainable-distance step with method='exact'.
# (The progress bar counts 1023 iterations for the 10 explaining features —
# presumably one per non-empty feature subset; confirm against the library.)
#
# Every mutable input is handed over as a copy. explaining_features is now
# copied as well, matching the heuristic call in the next cell, so the shared
# module-level list stays intact for later cells even if the callee modifies
# its argument.
calculate_explainable_distances_result = ClusteringApplier.calculate_explainable_distances(
    use_config=False,
    df_explainable=preparation_result['explainable_df'].copy(),
    df_observable_distances=observable_patterns_result['pw_norm_dist'].copy(),
    explaining_features=explaining_features.copy(),
    method='exact',
    distance_measure='correlation',
    sparsity_parameter=0.05,
    max_num_threads=12,
    group_name=index_name,
    plot=False
)


  0%|          | 0/1023 [00:00<?, ?it/s]

In [15]:
# Same explainable-distance step as the previous cell, but with
# method='heuristic' and heuristics_N=2. All mutable inputs are passed as
# copies.
heuristic_calculate_explainable_distances_result = ClusteringApplier.calculate_explainable_distances(
    use_config=False,
    plot=False,
    # inputs (copied)
    df_explainable=preparation_result['explainable_df'].copy(),
    df_observable_distances=observable_patterns_result['pw_norm_dist'].copy(),
    explaining_features=explaining_features.copy(),
    group_name=index_name,
    # search settings
    method='heuristic',
    heuristics_N=2,
    distance_measure='correlation',
    sparsity_parameter=0.05,
    max_num_threads=12,
)

# Show the feature subset reported by the heuristic run.
heuristic_calculate_explainable_distances_result['optimal_feature_set']

  0%|          | 0/55 [00:00<?, ?it/s]

Started with ['Age', 'Genus_ID']
Refuted 'Weight'


['Age', 'Genus_ID']