In [1]:
import os
import json
import numpy as np
import pandas as pd
import altair as alt

In [2]:
# Notebook metadata: authorship, license, version and contact details.
__author__ = 'Aleksandar Anžel'
__copyright__ = ''
__credits__ = ['Aleksandar Anžel']
__license__ = 'GNU General Public License v3.0'
__version__ = '1.0'
__maintainer__ = 'Aleksandar Anžel'
__email__ = 'aleksandar.anzel@uni-marburg.de'
__status__ = 'Dev'

In [3]:
# Uncomment the next line if you are using a dark JupyterLab/notebook theme.
#alt.renderers.set_embed_options(theme='dark')
# Disable Altair's max-rows check on the data passed to charts.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

# 0. Define paths

In [4]:
# Root of the data tree, relative to the notebook's working directory.
path_root_data = os.path.join('..', 'Data')

# Directories holding the exported heatmap data.
path_vis_data = os.path.join(
    path_root_data,
    'Visualization_data', 'data', 'multiple_datasets', 'vis',
    'mds_1_Overview')
path_peptidereactor_vis_data = os.path.join(
    path_root_data, 'Visualization_data', 'peptidereactor_vis_data')

# Vega-Lite specification of the overview figure.
path_peptidereactor_vega_lite_spec = os.path.join(
    path_vis_data, 'mds_1_Overview.json')

# Heatmap data files located in the overview directory.
path_cmangoes_data = os.path.join(path_vis_data, 'hm_cmangoes_data.json')
path_cenact_data = os.path.join(path_vis_data, 'hm_cenact_data.json')
path_imb_data = os.path.join(path_vis_data, 'hm_imb_data.json')
path_bio_data = os.path.join(path_vis_data, 'hm_bio_data.json')

# Heatmap data files located in the Peptidereactor directory.
path_peptidereactor_imb_data = os.path.join(
    path_peptidereactor_vis_data, 'hm_imb_data.json')
path_peptidereactor_bio_data = os.path.join(
    path_peptidereactor_vis_data, 'hm_bio_data.json')

# Timing results of the performance experiments.
path_cmangoes_performance_data = os.path.join(
    path_root_data, 'Performance_experiments', 'results.csv')

# Data sets used throughout the notebook, in alphabetical order. The list is
# used both as a row filter and as the explicit y-axis domain of the heatmaps.
list_of_datasets = [
    'ace_vaxinpad',
    'acp_anticp', 'acp_iacp', 'acp_mlacp',
    'afp_amppred', 'afp_antifp',
    'aip_aippred', 'aip_antiinflam',
    'amp_antibp', 'amp_antibp2', 'amp_csamp', 'amp_fernandes',
    'amp_gonzales', 'amp_iamp2l', 'amp_modlamp',
    'atb_antitbp', 'atb_iantitb',
    'avp_amppred', 'avp_avppred',
    'bce_ibce',
    'cpp_cellppd', 'cpp_cellppdmod', 'cpp_cppredfl', 'cpp_kelmcpp',
    'cpp_mixed', 'cpp_mlcpp', 'cpp_mlcppue', 'cpp_sanders',
    'hem_hemopi',
]

# 1. Import data

In [5]:
# Heatmap data from the overview directory (paths are defined in section 0).
df_cmangoes_data = pd.read_json(path_cmangoes_data)
df_cenact_data = pd.read_json(path_cenact_data)
df_imb_data = pd.read_json(path_imb_data)
df_bio_data = pd.read_json(path_bio_data)

# Heatmap data from the Peptidereactor directory.
df_peptidereactor_imb_data = pd.read_json(path_peptidereactor_imb_data)
df_peptidereactor_bio_data = pd.read_json(path_peptidereactor_bio_data)

In [6]:
np.sort(df_imb_data['Encoding_max'].unique())

array(['seqbinary_centered_levels_1_and_2_encoding',
       'seqbinary_shifted_levels_1_and_2_encoding',
       'seqdiscretized_centered_levels_1_and_2_encoding',
       'seqdiscretized_shifted_levels_1_and_2_encoding',
       'strbinary_centered_levels_1_and_2_encoding',
       'strbinary_shifted_levels_1_and_2_encoding',
       'strdiscretized_centered_levels_1_and_2_encoding',
       'strdiscretized_shifted_levels_1_and_2_encoding'], dtype=object)

# 2. Clean the data

In [7]:
def df_fix_f1_scores_and_more(df_input, list_datasets=None):
    """Return a cleaned copy of a heatmap data frame.

    A row is removed when
      * its 'F1' value is the literal string 'separator',
      * its 'Encoding' or 'Dataset' ends with 'zzz' (flag rows that were used
        to separate the heatmap), or
      * its 'Dataset' is not one of the allowed data sets.
    The 'F1' values of the remaining rows are converted with
    ``pd.to_numeric``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame with at least the columns 'Dataset', 'Encoding' and 'F1'.
        It is not modified.
    list_datasets : list of str, optional
        Data set names to keep. Defaults to the notebook-level
        ``list_of_datasets``.

    Returns
    -------
    pandas.DataFrame
        Independent copy holding the surviving rows; the original index
        labels are kept.
    """
    if list_datasets is None:
        list_datasets = list_of_datasets

    # Rows are selected with boolean masks instead of being dropped by index
    # label, so repeated index labels cannot remove unrelated rows. Missing
    # names (na=False) are treated as "not a flag row" instead of producing
    # an invalid mask.
    mask_separator = df_input['F1'] == 'separator'
    mask_flag_encoding = df_input['Encoding'].str.endswith('zzz', na=False)
    mask_flag_dataset = df_input['Dataset'].str.endswith('zzz', na=False)
    mask_known_dataset = df_input['Dataset'].isin(list_datasets)

    mask_keep = (~mask_separator & ~mask_flag_encoding
                 & ~mask_flag_dataset & mask_known_dataset)

    # Explicit copy: the column assignment below must never write through to
    # (or warn about) the caller's frame.
    df_output = df_input.loc[mask_keep].copy(deep=True)
    df_output['F1'] = pd.to_numeric(df_output['F1'])

    return df_output

def df_clean_cmangoes_data(df_input):
    """Clean a CMANGOES heatmap frame and shorten its encoding names.

    Steps:
      1. Apply ``df_fix_f1_scores_and_more`` (it works on a copy, so
         ``df_input`` is left untouched).
      2. Remove the rows whose 'Encoding' starts with 'str'.
      3. In 'Encoding_max', remove the text '_levels_1_and_2_encoding' and
         replace 'seq' with 'CMANGOES_seq_'.
      4. Overwrite 'Encoding' with an abbreviation built from 'Encoding_max':
         the first three characters of every '_'-separated chunk after the
         second one, e.g. 'CMANGOES_seq_binary_shifted' -> 'bin_shi'.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame with the columns 'Dataset', 'Encoding', 'Encoding_max', 'F1'.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with a fresh 0..n-1 index.
    """
    df_output = df_fix_f1_scores_and_more(df_input)

    # Boolean mask instead of a label-based drop: safe with repeated index
    # labels and with missing encoding names.
    mask_structure = df_output['Encoding'].str.startswith('str', na=False)
    df_output = df_output.loc[~mask_structure].reset_index(drop=True)

    # Both patterns are plain text; regex=False states that explicitly so the
    # result does not depend on the pandas default for ``regex``.
    df_output['Encoding_max'] = (
        df_output['Encoding_max']
        .str.replace('_levels_1_and_2_encoding', '', regex=False)
        .str.replace('seq', 'CMANGOES_seq_', regex=False))

    # A plain list keeps the assignment positional.
    df_output['Encoding'] = [
        '_'.join(string_chunk[:3]
                 for string_chunk in string_encoding.split('_')[2:])
        for string_encoding in df_output['Encoding_max']]

    return df_output
    
def df_clean_data(df_input):
    """Clean a heatmap data frame.

    This function was a line-for-line duplicate of
    ``df_clean_cmangoes_data``. It now forwards to that function so the
    cleaning logic exists only once; inputs and outputs are the same as
    there.
    """
    return df_clean_cmangoes_data(df_input)

In [8]:
df_imb_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
0,amp_gonzales,strbinary_shifted_levels_1_and_2_encoding,strbinary_shifted_levels_1_and_2_encoding,0.5,structure based,0.209302,amp,2,False
1,amp_gonzales,strdiscretized_centered_levels_1_and_2_encoding,strdiscretized_centered_levels_1_and_2_encoding,0.571429,structure based,0.209302,amp,2,False
2,amp_gonzales,strbinary_centered_levels_1_and_2_encoding,strbinary_centered_levels_1_and_2_encoding,0.5,structure based,0.209302,amp,2,False
3,amp_gonzales,seqbinary_shifted_levels_1_and_2_encoding,seqbinary_shifted_levels_1_and_2_encoding,0.5,sequence based,0.209302,amp,1,False
4,amp_gonzales,seqdiscretized_centered_levels_1_and_2_encoding,seqdiscretized_centered_levels_1_and_2_encoding,0.571429,sequence based,0.209302,amp,1,False
...,...,...,...,...,...,...,...,...,...
235,cpp_sanders,seqdiscretized_shifted_levels_1_and_2_encoding,seqdiscretized_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False
236,cpp_sanders,strbinary_centered_levels_1_and_2_encoding,strbinary_centered_levels_1_and_2_encoding,0.862745,structure based,0.765517,cpp,2,False
237,cpp_sanders,strdiscretized_centered_levels_1_and_2_encoding,strdiscretized_centered_levels_1_and_2_encoding,0.862745,structure based,0.765517,cpp,2,False
238,cpp_sanders,seqdiscretized_centered_levels_1_and_2_encoding,seqdiscretized_centered_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False


In [9]:
# Clean every frame loaded in the import cell. The CMANGOES-style frames
# additionally lose their 'str...' encodings and get shortened encoding names.
# NOTE: each name is rebound to its cleaned version, so this cell is not
# idempotent: df_clean_cmangoes_data replaces 'seq' with 'CMANGOES_seq_' in
# 'Encoding_max', and running it on an already cleaned frame would apply that
# replacement a second time. Re-run the import cell before re-running this one.
df_cmangoes_data = df_clean_cmangoes_data(df_cmangoes_data)
df_cenact_data = df_fix_f1_scores_and_more(df_cenact_data)
df_imb_data = df_clean_cmangoes_data(df_imb_data)
df_bio_data = df_clean_cmangoes_data(df_bio_data)
df_peptidereactor_imb_data = df_fix_f1_scores_and_more(df_peptidereactor_imb_data)
df_peptidereactor_bio_data = df_fix_f1_scores_and_more(df_peptidereactor_bio_data)

df_imb_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
0,amp_gonzales,bin_shi,CMANGOES_seq_binary_shifted,0.500000,sequence based,0.209302,amp,1,False
1,amp_gonzales,dis_cen,CMANGOES_seq_discretized_centered,0.571429,sequence based,0.209302,amp,1,False
2,amp_gonzales,dis_shi,CMANGOES_seq_discretized_shifted,0.500000,sequence based,0.209302,amp,1,False
3,amp_gonzales,bin_cen,CMANGOES_seq_binary_centered,0.500000,sequence based,0.209302,amp,1,False
4,amp_iamp2l,dis_shi,CMANGOES_seq_discretized_shifted,0.716014,sequence based,0.267661,amp,1,False
...,...,...,...,...,...,...,...,...,...
111,cpp_mixed,dis_cen,CMANGOES_seq_discretized_centered,0.863636,sequence based,0.757812,cpp,1,False
112,cpp_sanders,bin_shi,CMANGOES_seq_binary_shifted,0.862745,sequence based,0.765517,cpp,1,False
113,cpp_sanders,bin_cen,CMANGOES_seq_binary_centered,0.862745,sequence based,0.765517,cpp,1,False
114,cpp_sanders,dis_shi,CMANGOES_seq_discretized_shifted,0.862745,sequence based,0.765517,cpp,1,False


In [10]:
df_cenact_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field
0,ace_vaxinpad,cenact_hyd,cenact_hyd,0.819358,sequence based,0.440407,ace
1,acp_anticp,cenact_hyd,cenact_dd,0.786194,sequence based,0.500000,acp
2,acp_iacp,cenact_hyd,cenact_hyd,0.757320,sequence based,0.401163,acp
3,acp_mlacp,cenact_hyd,cenact_hyd,0.584432,sequence based,0.319658,acp
4,afp_amppred,cenact_hyd,cenact_hyd,0.774120,sequence based,0.500000,afp
...,...,...,...,...,...,...,...
111,cpp_mixed,cenact_dd,cenact_hyd,0.870843,sequence based,0.757812,cpp
112,cpp_mlcpp,cenact_dd,cenact_nohyd,0.686391,sequence based,0.387809,cpp
113,cpp_mlcppue,cenact_dd,cenact_dd,0.662610,sequence based,0.500000,cpp
114,cpp_sanders,cenact_dd,cenact_dd,0.882308,sequence based,0.765517,cpp


In [11]:
df_peptidereactor_imb_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
138,amp_gonzales,geary_,geary_nlag_9,0.750000,sequence based,0.209302,amp,1,False
139,amp_gonzales,flgc_a,flgc_aaindex_ZIMJ680104,0.888889,sequence based,0.209302,amp,1,False
140,amp_gonzales,dist_f,dist_freq_dn_5_dc_50,0.888889,sequence based,0.209302,amp,1,False
141,amp_gonzales,ngram_,ngram_s3_5,0.816667,sequence based,0.209302,amp,1,False
142,amp_gonzales,fldpc_,fldpc_aaindex_ZIMJ680104,0.888889,sequence based,0.209302,amp,1,False
...,...,...,...,...,...,...,...,...,...
2339,cpp_sanders,ctdt,ctdt,0.872283,sequence based,0.765517,cpp,1,False
2340,cpp_sanders,ksctri,ksctriad_gap_1,0.872283,sequence based,0.765517,cpp,1,False
2341,cpp_sanders,flgc_a,flgc_aaindex_ZIMJ680104,0.875000,sequence based,0.765517,cpp,1,False
2342,cpp_sanders,ctriad,ctriad,0.875000,sequence based,0.765517,cpp,1,False


In [12]:
# Stack the CMANGOES, Peptidereactor and CENACT results into one long frame per
# view (imb / bio), with a fresh 0..n-1 index.
# The CENACT frame has no 'type_field' and 'missing' columns, so these are NaN
# for its rows in the combined imb frame (see the output below).
df_combined_imb_data = pd.concat(
    [df_imb_data, df_peptidereactor_imb_data, df_cenact_data], ignore_index=True)
df_combined_bio_data = pd.concat(
    [df_bio_data, df_peptidereactor_bio_data, df_cenact_data], ignore_index=True)

df_combined_imb_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
0,amp_gonzales,bin_shi,CMANGOES_seq_binary_shifted,0.500000,sequence based,0.209302,amp,1.0,False
1,amp_gonzales,dis_cen,CMANGOES_seq_discretized_centered,0.571429,sequence based,0.209302,amp,1.0,False
2,amp_gonzales,dis_shi,CMANGOES_seq_discretized_shifted,0.500000,sequence based,0.209302,amp,1.0,False
3,amp_gonzales,bin_cen,CMANGOES_seq_binary_centered,0.500000,sequence based,0.209302,amp,1.0,False
4,amp_iamp2l,dis_shi,CMANGOES_seq_discretized_shifted,0.716014,sequence based,0.267661,amp,1.0,False
...,...,...,...,...,...,...,...,...,...
1503,cpp_mixed,cenact_dd,cenact_hyd,0.870843,sequence based,0.757812,cpp,,
1504,cpp_mlcpp,cenact_dd,cenact_nohyd,0.686391,sequence based,0.387809,cpp,,
1505,cpp_mlcppue,cenact_dd,cenact_dd,0.662610,sequence based,0.500000,cpp,,
1506,cpp_sanders,cenact_dd,cenact_dd,0.882308,sequence based,0.765517,cpp,,


In [13]:
df_combined_imb_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
0,amp_gonzales,bin_shi,CMANGOES_seq_binary_shifted,0.500000,sequence based,0.209302,amp,1.0,False
1,amp_gonzales,dis_cen,CMANGOES_seq_discretized_centered,0.571429,sequence based,0.209302,amp,1.0,False
2,amp_gonzales,dis_shi,CMANGOES_seq_discretized_shifted,0.500000,sequence based,0.209302,amp,1.0,False
3,amp_gonzales,bin_cen,CMANGOES_seq_binary_centered,0.500000,sequence based,0.209302,amp,1.0,False
4,amp_iamp2l,dis_shi,CMANGOES_seq_discretized_shifted,0.716014,sequence based,0.267661,amp,1.0,False
...,...,...,...,...,...,...,...,...,...
1503,cpp_mixed,cenact_dd,cenact_hyd,0.870843,sequence based,0.757812,cpp,,
1504,cpp_mlcpp,cenact_dd,cenact_nohyd,0.686391,sequence based,0.387809,cpp,,
1505,cpp_mlcppue,cenact_dd,cenact_dd,0.662610,sequence based,0.500000,cpp,,
1506,cpp_sanders,cenact_dd,cenact_dd,0.882308,sequence based,0.765517,cpp,,


In [14]:
# Add one more column to flag missing values
# Flag column used by the charts to paint missing scores:
# 'NaN' where F1 is missing, 'notNaN' everywhere else.
df_combined_imb_data['Value'] = np.where(
    df_combined_imb_data['F1'].isnull(), 'NaN', 'notNaN')
df_combined_bio_data['Value'] = np.where(
    df_combined_bio_data['F1'].isnull(), 'NaN', 'notNaN')

In [15]:
df_combined_imb_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing,Value
0,amp_gonzales,bin_shi,CMANGOES_seq_binary_shifted,0.500000,sequence based,0.209302,amp,1.0,False,notNaN
1,amp_gonzales,dis_cen,CMANGOES_seq_discretized_centered,0.571429,sequence based,0.209302,amp,1.0,False,notNaN
2,amp_gonzales,dis_shi,CMANGOES_seq_discretized_shifted,0.500000,sequence based,0.209302,amp,1.0,False,notNaN
3,amp_gonzales,bin_cen,CMANGOES_seq_binary_centered,0.500000,sequence based,0.209302,amp,1.0,False,notNaN
4,amp_iamp2l,dis_shi,CMANGOES_seq_discretized_shifted,0.716014,sequence based,0.267661,amp,1.0,False,notNaN
...,...,...,...,...,...,...,...,...,...,...
1503,cpp_mixed,cenact_dd,cenact_hyd,0.870843,sequence based,0.757812,cpp,,,notNaN
1504,cpp_mlcpp,cenact_dd,cenact_nohyd,0.686391,sequence based,0.387809,cpp,,,notNaN
1505,cpp_mlcppue,cenact_dd,cenact_dd,0.662610,sequence based,0.500000,cpp,,,notNaN
1506,cpp_sanders,cenact_dd,cenact_dd,0.882308,sequence based,0.765517,cpp,,,notNaN


# 3. Visualize
## 3.0. CMANGOES results: plotting helpers

In [16]:
def chart_visualize(df_input, string_title, flag_cmangoes_style):
    """Build a Dataset x Encoding heatmap with an extra layer for NaN flags.

    Two rect layers are drawn from the same data and stacked:
    one colored by the quantitative 'F1' column and one colored by the
    nominal 'Value' column, whose color scale only knows the category 'NaN'.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Data with the columns 'Dataset', 'Encoding', 'Encoding_max', 'F1',
        'is_imbalanced', 'bio_field' and 'Value' referenced by the encodings.
    string_title : str
        Prefix of the chart title (' multi-dataset results' is appended).
    flag_cmangoes_style : bool
        True selects the grey F1 scheme with domain [0, 1], False the blue
        two-color range.

    Returns
    -------
    Layered Altair chart (F1 layer below, NaN layer on top).
    """
    # Pick the F1 color scale and the color of the 'NaN' category per style.
    if flag_cmangoes_style:
        scale_general = alt.Scale(scheme='greys', domain=[0, 1])
        string_na_color = '#d95f02'
    else:
        scale_general = alt.Scale(range=["#a6bddb", "#023858"])
        string_na_color = '#a6611a'
    scale_na_values = alt.Scale(domain=['NaN'], range=[string_na_color])

    # Everything except the color channel is shared by both layers.
    chart_base = alt.Chart(
        df_input, title=string_title + " multi-dataset results"
    ).mark_rect().encode(
        x=alt.X('Encoding', type='nominal', axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('Dataset', type='nominal',
                scale=alt.Scale(domain=list_of_datasets)),
        tooltip=['Dataset', 'Encoding_max', 'F1', 'is_imbalanced', 'bio_field']
    )

    chart_non_null = chart_base.encode(
        color=alt.Color('F1', type='quantitative', scale=scale_general))
    chart_null = chart_base.encode(
        color=alt.Color('Value', type='nominal', scale=scale_na_values))

    return chart_non_null + chart_null


def replicate_visualizations(flag_visualization_style):
    """Draw the six result heatmaps below each other.

    The notebook-level frames (CENACT, CMANGOES, Peptidereactor imb/bio and
    the two combined frames) are each rendered with ``chart_visualize`` and
    concatenated vertically in that order.

    Parameters
    ----------
    flag_visualization_style : bool
        Passed through to ``chart_visualize`` as ``flag_cmangoes_style``.

    Returns
    -------
    Vertically concatenated Altair chart.
    """
    list_frames_and_titles = [
        (df_cenact_data, 'CENACT source'),
        (df_cmangoes_data, 'CMANGOES source'),
        (df_peptidereactor_imb_data, 'Peptidereactor imb'),
        (df_peptidereactor_bio_data, 'Peptidereactor bio'),
        (df_combined_imb_data, 'Combined imb'),
        (df_combined_bio_data, 'Combined bio'),
    ]

    list_charts = [
        chart_visualize(df_one, string_one_title, flag_visualization_style)
        for df_one, string_one_title in list_frames_and_titles]

    # Fold the charts with '&' (vertical concatenation), first chart on top.
    chart_result = list_charts[0]
    for chart_next in list_charts[1:]:
        chart_result = chart_result & chart_next

    return chart_result


## 3.1. Peptidereactor-styled results

In [17]:
# The flag is negated here, so chart_visualize takes its non-CMANGOES branch
# (the blue two-color range) for this section.
flag_cmangoes_style = True
replicate_visualizations(not flag_cmangoes_style)

## 3.2. Our own visualizations

In [18]:
replicate_visualizations(flag_cmangoes_style)

### Create overview heatmap

In [19]:
def chart_visualize_new_design(df_input, string_title):
    """Build the overview figure: imbalance column plus three F1 heatmaps.

    From left to right the figure consists of
      * a 40 px wide column colored and labelled by 'is_imbalanced',
      * a heatmap restricted to the CENACT encodings,
      * a heatmap restricted to the CMANGOES encodings,
      * a heatmap of all remaining encodings, with an extra layer colored by
        the 'Value' flag column.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Data with the columns 'Dataset', 'Encoding', 'Encoding_max', 'F1',
        'is_imbalanced', 'bio_field' and 'Value' referenced by the encodings.
    string_title : str
        Currently not used inside the function.

    Returns
    -------
    Horizontally concatenated Altair chart with legends placed on top.
    """
    
    # Data set names ordered from the highest to the lowest 'is_imbalanced'.
    list_datasets_sorted = df_input.sort_values(
        'is_imbalanced', ascending=False)['Dataset'].drop_duplicates().to_list()
    
    # Color scales: grey scheme for F1, one fixed color for the 'NaN' flag and
    # a seven-color range for the imbalance ratio.
    scale_general = alt.Scale(scheme='greys', domain=[0, 1])
    scale_na_values = alt.Scale(domain=['NaN'], range=['#d95f02'])
    #list_scale_imbalance = ['#7570b3', '#8c86c0', '#a6a0ce',
    #                        '#ffffff',
    #                        '#8cc8af', '#5db393', '#1b9e77']
    # list_scale_imbalance = ['#de77ae','#f1b6da','#fde0ef','#f7f7f7','#e6f5d0','#b8e186','#7fbc41']
    list_scale_imbalance = ['#762a83', '#af8dc3', '#e7d4e8', '#f7f7f7',
                            '#d9f0d3','#7fbf7b','#1b7837']
    
    # Filter predicates selecting the abbreviated CMANGOES encodings, the
    # CENACT encodings, and the union of both.
    field_CMANGOES = alt.FieldOneOfPredicate(
        field='Encoding', oneOf=['bin_shi', 'dis_shi',
                                 'bin_cen', 'dis_cen'])
    
    field_CENACT = alt.FieldOneOfPredicate(
        field='Encoding', oneOf=['cenact_hyd', 'cenact_nohyd', 'cenact_dd'])
    
    field_CMANGOES_and_CENACT = alt.FieldOneOfPredicate(
        field='Encoding', oneOf=['cenact_hyd', 'cenact_nohyd', 'cenact_dd',
                                 'bin_shi', 'dis_shi',
                                 'bin_cen', 'dis_cen'])
    
    # Left-most column: one rect per data set, colored by the imbalance ratio.
    # NOTE(review): unlike the heatmaps below, this y channel sets neither an
    # explicit scale domain nor a sort order - confirm the rows line up when
    # df_input does not contain every entry of list_of_datasets.
    chart_imbalance = alt.Chart(df_input).mark_rect().encode(
        alt.Y('Dataset', type='nominal', axis=alt.Axis(title='Data set')),
              #sort=list_datasets_sorted),
              #sort=alt.EncodingSortField(
              #    field='is_imbalanced', order='descending')),
        alt.Color('is_imbalanced', type='quantitative',
                  scale=alt.Scale(range=list_scale_imbalance, domain=[0, 1]),
                  legend=alt.Legend(title='Imbalance ratio')),
        alt.Tooltip(['Dataset', 'is_imbalanced'])
    ).properties(
        width=40,
    )

    # Same chart drawn as text: the ratio printed with two decimals.
    chart_text = chart_imbalance.mark_text(size=10, stroke=None, fill='black',
                                           fillOpacity=0.5).encode(
            alt.Text('is_imbalanced', format=".2f"),
        )

    chart_imbalance_result = alt.layer(chart_imbalance, chart_text)
    
    # F1 heatmap of the CENACT encodings (no y labels, no color legend).
    chart_CENACT = alt.Chart(df_input).mark_rect().encode(
        x=alt.X('Encoding', type='nominal',
                axis=alt.Axis(labelAngle=-45, title='')),
        y=alt.Y('Dataset', type='nominal',
                axis=alt.Axis(title='', labels=False, ticks=False),
                scale=alt.Scale(domain=list_of_datasets),
                sort=list_datasets_sorted),
        color=alt.Color('F1', type='quantitative', scale=scale_general, 
                        legend=None),
        tooltip=['Dataset', 'Encoding_max', 'F1', 'is_imbalanced', 'bio_field']
    ).transform_filter(
        field_CENACT
    )
    
    # F1 heatmap of the CMANGOES encodings (no y labels, no color legend).
    chart_CMANGOES = alt.Chart(df_input).mark_rect().encode(
        x=alt.X('Encoding', type='nominal',
                axis=alt.Axis(labelAngle=-45, title='')),
        y=alt.Y('Dataset', type='nominal',
                axis=alt.Axis(title='', labels=False, ticks=False),
                scale=alt.Scale(domain=list_of_datasets),
                sort=list_datasets_sorted),
        color=alt.Color('F1', type='quantitative', scale=scale_general, 
                        legend=None),
        tooltip=['Dataset', 'Encoding_max', 'F1', 'is_imbalanced', 'bio_field']
    ).transform_filter(
        field_CMANGOES
    )
    
    # F1 heatmap of every encoding that is neither CMANGOES nor CENACT; this
    # is the only F1 layer that keeps its color legend.
    chart_non_null_no_param = alt.Chart(df_input).mark_rect().encode(
        x=alt.X('Encoding', type='nominal', axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('Dataset', type='nominal',
                axis=alt.Axis(title='', labels=False, ticks=False),
                scale=alt.Scale(domain=list_of_datasets),
                sort=list_datasets_sorted),
        color=alt.Color('F1', type='quantitative', scale=scale_general),
        tooltip=['Dataset', 'Encoding_max', 'F1', 'is_imbalanced', 'bio_field']
    ).transform_filter(
        {'not': field_CMANGOES_and_CENACT},
    )
    
    # Same cells colored by the 'Value' flag; the scale only knows 'NaN'.
    chart_null_no_param = alt.Chart(df_input).mark_rect().encode(
        x=alt.X('Encoding', type='nominal', axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('Dataset', type='nominal',
                scale=alt.Scale(domain=list_of_datasets),
                sort=list_datasets_sorted),
        color=alt.Color('Value', type='nominal', scale=scale_na_values),
        tooltip=['Dataset', 'Encoding_max', 'F1', 'is_imbalanced', 'bio_field']
    ).transform_filter(
        {'not': field_CMANGOES_and_CENACT}
    )
    
    # Assemble: NaN layer on top of the F1 layer, then the three heatmaps side
    # by side, then the imbalance column in front of them without spacing.
    chart_result = alt.layer(
        chart_non_null_no_param, chart_null_no_param)
    
    chart_result = alt.hconcat(
        chart_CENACT, chart_CMANGOES, chart_result)
    
    chart_result = alt.hconcat(
        chart_imbalance_result, chart_result, spacing=0).resolve_scale(
        color='independent', y='shared').configure_legend(
        orient='top')
    
    return chart_result

In [24]:
fig = chart_visualize_new_design(df_combined_imb_data, 'Combined')
# Optional export (pick a writable target path before uncommenting):
# fig.save(os.path.join(path_root_data, 'heatmap.png'))
fig

## 3.3 Further analysis of encodings

In [None]:
def chart_filter_and_visualize(df_input, string_column, float_value):
    """Heatmap that highlights the cells passing a filter on one column.

    Three rect layers share the same x ('Encoding'), y ('Dataset') and
    tooltip encodings:
      * upper - cells passing the filter, colored by F1 (grey scheme),
      * lower - the remaining cells, drawn with the white 'Cut-off' scale,
      * null  - all cells, colored by the 'Value' flag column.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Data with the columns 'Dataset', 'Encoding', 'Encoding_max', 'F1',
        'is_imbalanced', 'bio_field' and 'Value' referenced by the encodings.
    string_column : str
        'F1' or 'is_imbalanced'.
        For 'F1', ``float_value`` is a quantile level: cells with an F1 above
        ``df_input['F1'].quantile(float_value)`` are highlighted.
        For 'is_imbalanced', cells whose ratio equals ``float_value`` are
        highlighted.
    float_value : float
        Quantile level or exact ratio, depending on ``string_column``.

    Returns
    -------
    Layered Altair chart, or None (after printing a message) when
    ``string_column`` is neither 'F1' nor 'is_imbalanced'.
    """
    scale_upper = alt.Scale(scheme='greys', domain=[0, 1])
    scale_lower = alt.Scale(domain=['Cut-off'], range=['white']) # #9ecae1
    scale_na_values = alt.Scale(domain=['NaN'], range=['#d95f02'])
    string_title_addon = ' combined multi-dataset results.' +\
        ' Cells in white represent the cut-off values'
    
    if string_column == 'F1':
        # The threshold is the requested quantile of F1, not float_value
        # itself.
        float_new_value = df_input[string_column].quantile(q=float_value)
        # round() instead of int(): int() truncates, so a level such as 0.29
        # (0.29 * 100 == 28.999999999999996) was shown as 28 in the title.
        string_title = string_column + ' > ' +\
            str(round(float_value * 100)) + ' percent'
        field_upper = alt.FieldGTPredicate(field=string_column,
                                           gt=float_new_value)
        field_lower = alt.FieldLTEPredicate(field=string_column,
                                            lte=float_new_value)
    elif string_column == 'is_imbalanced':
        float_new_value = float_value
        string_title = string_column + ' == ' + str(float_value)
        field_upper =  alt.FieldEqualPredicate(field=string_column,
                                               equal=float_new_value)
        field_lower = {'not': alt.FieldEqualPredicate(field=string_column,
                                                      equal=float_new_value)}
    else:
        print('Not valid column name')
        return None
    
    # Encodings shared by all three layers.
    chart_base = alt.Chart(
        df_input, title=string_title + string_title_addon
    ).mark_rect().encode(
        alt.X('Encoding', type='nominal', axis=alt.Axis(labelAngle=-45)),
        alt.Y('Dataset', type='nominal',
                scale=alt.Scale(domain=list_of_datasets)),
        tooltip=['Dataset', 'Encoding_max', 'F1', 'is_imbalanced', 'bio_field']
    )
    
    chart_upper = chart_base.encode(
        alt.Color('F1', type='quantitative', scale=scale_upper),
    ).transform_filter(field_upper)
    
    chart_lower = chart_base.encode(
        alt.Color('F1', type='quantitative', scale=scale_lower, legend=None),
        alt.Opacity('F1', type='quantitative', scale=alt.Scale(
            domain=['Cut-off'], range=[0.2]))
    ).transform_filter(field_lower)
    
    chart_null = chart_base.encode(
        alt.Color('Value', type='nominal', scale=scale_na_values),
    )

    # Each layer keeps its own color scale.
    chart_result = alt.layer(
        chart_upper, chart_lower, chart_null).resolve_scale(
        color='independent')
    
    return chart_result


def chart_f1_correlation(df_input):
    """Plot the pairwise Pearson correlation of the encodings' F1 scores.

    The F1 values are pivoted to a Dataset x Encoding table, the columns
    (encodings) are correlated with each other, and the result is drawn as a
    heatmap with the rounded coefficient printed in every cell. Only the
    cells with ``var_1 < var_2`` are kept.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Data with the columns 'Encoding', 'Dataset' and 'F1'. Every
        (Dataset, Encoding) pair has to be unique for the pivot to succeed.

    Returns
    -------
    Layered Altair chart (colored rects plus text labels).
    """
    # Wide table: one row per data set, one column per encoding.
    df_wide = df_input[['Encoding', 'Dataset', 'F1']].pivot(
        index='Dataset', columns='Encoding', values='F1')

    # Long table with one row per encoding pair.
    df_corr = (
        df_wide.corr(method='pearson')
        .rename_axis(index='var_1', columns='var_2')
        .reset_index()
        .melt(id_vars='var_1', var_name='var_2', value_name='correlation'))

    # Colored correlation matrix.
    chart_matrix = alt.Chart(df_corr).mark_rect().encode(
        alt.X('var_1', title=None,
              axis=alt.Axis(labelAngle=-30, title='Encoding')),
        alt.Y('var_2', title=None, axis=alt.Axis(title='Encoding')),
        alt.Color('correlation',
                  scale=alt.Scale(domain=[-1, 1], scheme='redblue',
                                  reverse=True)),
        alt.Tooltip(['var_1', 'var_2'])
    ).properties(width=alt.Step(40), height=alt.Step(40))

    # Coefficient printed on top of every cell; white text on strong colors.
    chart_labels = chart_matrix.mark_text(size=12).encode(
        alt.Text('correlation', format=".2f"),
        color=alt.condition("abs(datum.correlation) > 0.6",
                            alt.value('white'), alt.value('black'))
    )

    return alt.layer(chart_matrix, chart_labels).transform_filter(
        "datum.var_1 < datum.var_2")


In [None]:
chart_f1_correlation(df_combined_imb_data)

In [None]:
chart_filter_and_visualize(df_combined_imb_data, 'F1', 0.75)

In [None]:
chart_filter_and_visualize(df_combined_imb_data, 'is_imbalanced', 0.5)

## 3.4 Performance analysis

In [None]:
# Timing results of the CMANGOES performance experiments. The remaining cells
# of this section build on this frame, so they fail when the CSV is missing.
df_performance_results = pd.read_csv(path_cmangoes_performance_data)
df_performance_results

FileNotFoundError: [Errno 2] No such file or directory: '../Data/Performance_experiments/results.csv'

In [None]:
# Long format: one row per (Dataset, measurement column) pair. The constant
# 'TMP_LEGEND' column only exists to obtain a legend entry in the chart below.
df_performance_results_new = (
    df_performance_results
    .drop('Encodings', axis=1)
    .melt('Dataset')
    .assign(TMP_LEGEND='Median'))
df_performance_results_new

NameError: name 'df_performance_results' is not defined

In [None]:
# Interval selection bound to the x scale. It is reused by the size and
# imbalance bar charts further down.
x_domain = alt.selection_interval(bind='scales', encodings=['x'])

# Error bars of the measured times per data set.
# The selection is deliberately NOT added to this layer: a selection must be
# registered only once per layered chart, and it lives on the point layer
# below. NOTE(review): Vega-Lite reportedly does not support selections on
# composite marks such as errorbar at all - confirm for the version in use.
chart_performance_error_bars = alt.Chart(
    df_performance_results_new).mark_errorbar(
    extent='ci', color='#afafaf').encode(
    x=alt.X('Dataset', type='nominal', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('value', type='quantitative', title='Seconds'),
    strokeWidth=alt.value(2)
)

# Median time per data set drawn as a point. The opacity channel on the
# constant 'TMP_LEGEND' column is only there to produce a legend entry.
chart_performance_points_error = alt.Chart(
    df_performance_results_new).mark_point(filled=True, color='black').encode(
    alt.X('Dataset', type='nominal', axis=alt.Axis(
        labelAngle=-45, labels=False, ticks=False, title='')),
    alt.Y('value', type='quantitative', aggregate='median', title=''),
    alt.Tooltip(['Dataset', 'median(value)']),
    alt.Opacity('TMP_LEGEND', type='nominal', legend=alt.Legend(
        title='Value', orient='none', legendX=520, legendY=5,)),

).add_selection(
    x_domain
)

chart_performance_error_bars_result = alt.layer(chart_performance_error_bars,
                                               chart_performance_points_error)
chart_performance_error_bars_result

NameError: name 'df_performance_results_new' is not defined

In [None]:
# Calculate dataset sizes
df_dataset_sizes = pd.DataFrame()
df_dataset_sizes['Dataset'] = list_of_datasets
path_original_data = os.path.join(path_root_data, 'Original_datasets')

list_sizes = []

for string_one_dataset in list_of_datasets:
    path_one_dataset = os.path.join(path_original_data, string_one_dataset)
    list_sizes.append(os.path.getsize(
        os.path.join(path_one_dataset, 'classes.txt')) + os.path.getsize(
        os.path.join(path_one_dataset, 'seqs.fasta')))

df_dataset_sizes['Size_in_bytes'] = list_sizes
df_dataset_sizes


Unnamed: 0,Dataset,Size_in_bytes
0,ace_vaxinpad,18410
1,acp_anticp,18425
2,acp_iacp,13820
3,acp_mlacp,25681
4,afp_amppred,139731
5,afp_antifp,206185
6,aip_aippred,37931
7,aip_antiinflam,66194
8,amp_antibp,38637
9,amp_antibp2,90013


In [None]:
# Bar chart of the data set sizes; x labels are hidden and the x scale is
# driven by the x_domain selection defined with the performance charts.
chart_size_bars = alt.Chart(df_dataset_sizes).mark_bar(color='gray').encode(
    x=alt.X('Dataset', type='nominal',
            axis=alt.Axis(labelAngle=-45, labels=False, ticks=False,
                          title='')),
    y=alt.Y('Size_in_bytes', type='quantitative', title='Bytes'),
    tooltip=alt.Tooltip(['Dataset', 'Size_in_bytes']),
).add_selection(x_domain)

chart_size_bars



In [None]:
# Bar chart of the imbalance ratio per data set.
# The original cell read from 'df_cmangoes_source_data', a name that is never
# defined in this notebook; the cleaned CMANGOES frame is used instead. That
# frame repeats the ratio for every encoding of a data set, so it is reduced
# to one row per data set first - otherwise the un-aggregated bar mark would
# stack the repeated rows.
chart_imbalance_bars = alt.Chart(
    df_cmangoes_data[['Dataset', 'is_imbalanced']].drop_duplicates()
).mark_bar(color='gray').encode(
    alt.X('Dataset', type='nominal', axis=alt.Axis(labelAngle=-45)),
    alt.Y('is_imbalanced', type='quantitative', title='Imbalance_ratio',
         scale=alt.Scale(domain=[0, 1])),
    alt.Tooltip(['Dataset', 'is_imbalanced'])
).add_selection(
    x_domain
)
chart_imbalance_bars




In [None]:
# Stack the three performance views: timing (error bars + medians) on top,
# then data set sizes, then imbalance ratios.
chart_final_performance_result = alt.vconcat(
    chart_performance_error_bars_result,
    chart_size_bars,
    chart_imbalance_bars
)
chart_final_performance_result

NameError: name 'chart_performance_error_bars_result' is not defined

In [None]:
# Calculate speed of execution
df_dataset_sizes

Unnamed: 0,Dataset,Size_in_bytes
0,ace_vaxinpad,18410
1,acp_anticp,18425
2,acp_iacp,13820
3,acp_mlacp,25681
4,afp_amppred,139731
5,afp_antifp,206185
6,aip_aippred,37931
7,aip_antiinflam,66194
8,amp_antibp,38637
9,amp_antibp2,90013


In [None]:
df_performance_results_new

NameError: name 'df_performance_results_new' is not defined

In [None]:
# Median run time per data set.
# Only the numeric 'value' column is aggregated: taking the median of the
# whole frame fails on recent pandas versions because of the text columns
# ('variable', 'TMP_LEGEND').
df_performance_results_speed = (
    df_performance_results_new
    .groupby('Dataset')['value'].median()
    .reset_index()
    .rename(columns={'value': 'Time_in_seconds'}))

# Attach the sizes by data set name. The previous positional join only paired
# the right rows when both frames happened to list the same data sets in the
# same order.
df_performance_results_speed = df_performance_results_speed.join(
    df_dataset_sizes.set_index('Dataset')['Size_in_bytes'], on='Dataset')

# 1 MB is taken as 2**20 bytes (the factor used before, 9.5367431640625e-7,
# is 1 / 2**20).
df_performance_results_speed['Size_in_MB'] =\
    df_performance_results_speed['Size_in_bytes'] / 2**20
df_performance_results_speed['Bytes_per_second'] =\
    df_performance_results_speed['Size_in_bytes'] /\
    df_performance_results_speed['Time_in_seconds']
df_performance_results_speed['Seconds_per_byte'] =\
    df_performance_results_speed['Time_in_seconds'] /\
    df_performance_results_speed['Size_in_bytes']
df_performance_results_speed['Seconds_per_MB'] =\
    df_performance_results_speed['Time_in_seconds'] /\
    df_performance_results_speed['Size_in_MB']
df_performance_results_speed

NameError: name 'df_performance_results_new' is not defined

In [None]:
df_performance_results_speed['Seconds_per_MB'].median()

NameError: name 'df_performance_results_speed' is not defined

In [None]:
df_performance_results_speed['Seconds_per_byte'].median()

NameError: name 'df_performance_results_speed' is not defined