In [1]:
import os
import json
import numpy as np
import pandas as pd
import altair as alt

In [2]:
# Render embedded Altair charts with the 'dark' theme. This line is active:
# comment it out if you are NOT using a dark JupyterLab/Notebook theme.
alt.renderers.set_embed_options(theme='dark')
# Turn off Altair's max-rows limit for chart data (see the Altair docs on
# data transformers) so larger frames can be plotted.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
# Input files, addressed relative to the current working directory
# (one level up from it).

# CSV with the CENACT F1 score overview.
path_cenact_results = os.path.join(
    os.pardir,
    'Results',
    'CENACT_f1_score_overview.csv',
)

# JSON with the F1 scores of the other encodings.
path_other_encodings_results = os.path.join(
    os.pardir,
    'Data',
    'Visualization_data',
    'peptidereactor_vis_data',
    'hm_bio_data.json',
)


In [4]:
# Load the CENACT F1 overview, discard rows that have no F1 value, and strip
# the extra columns that are not used further down.
df_cenact_results = (
    pd.read_csv(path_cenact_results)
    .dropna(subset=['F1'])
    .drop(columns=['Encoding_max', 'bio_field', 'is_imbalanced'])
)
df_cenact_results

Unnamed: 0,Dataset,Encoding,F1,type
0,ace_vaxinpad,cenact_hyd,0.819358,sequence based
1,acp_anticp,cenact_hyd,0.786194,sequence based
2,acp_iacp,cenact_hyd,0.757320,sequence based
3,acp_mlacp,cenact_hyd,0.584432,sequence based
4,afp_amppred,cenact_hyd,0.774120,sequence based
...,...,...,...,...
115,hem_hemopi,cenact_dd,0.859968,sequence based
116,effectorp,cenact_dd,0.522094,sequence based
117,toxinpred_swissprot,cenact_dd,0.823654,sequence based
118,toxinpred_trembl,cenact_dd,0.746116,sequence based


In [5]:
# Load the results of the other encodings and bring them to the same column
# layout as the CENACT frame:
#   1. discard rows that have no F1 value,
#   2. strip the extra columns,
#   3. discard rows whose F1 cell holds the literal string 'separator'
#      (presumably layout placeholders from the visualisation data -- verify).
df_other_encodings_results = (
    pd.read_json(path_other_encodings_results)
    .dropna(subset=['F1'])
    .drop(columns=['Encoding_max', 'index', 'type_field', 'missing',
                   'bio_field', 'is_imbalanced'])
    .loc[lambda frame: frame['F1'] != 'separator']
)
df_other_encodings_results

Unnamed: 0,Dataset,Encoding,F1,type
0,hiv_v3,dist_f,0.980728,sequence based
1,hiv_v3,blomap,0.978587,sequence based
2,hiv_v3,paac_l,0.97619,sequence based
3,hiv_v3,blosum,0.976344,sequence based
4,hiv_v3,distan,0.925277,structure based
...,...,...,...,...
5175,tce_zhao,cksaag,0.414286,sequence based
5176,tce_zhao,qsorde,0.333333,sequence based
5177,tce_zhao,fldpc_,0.572727,sequence based
5178,tce_zhao,ctdd,0.333333,sequence based


In [6]:
# Stack the CENACT rows on top of the rows of the other encodings. Each
# frame keeps its own row labels, so labels repeat in the combined frame.
df_combined_results = pd.concat(
    objs=(df_cenact_results, df_other_encodings_results),
    axis='index',
)
df_combined_results

Unnamed: 0,Dataset,Encoding,F1,type
0,ace_vaxinpad,cenact_hyd,0.819358,sequence based
1,acp_anticp,cenact_hyd,0.786194,sequence based
2,acp_iacp,cenact_hyd,0.75732,sequence based
3,acp_mlacp,cenact_hyd,0.584432,sequence based
4,afp_amppred,cenact_hyd,0.77412,sequence based
...,...,...,...,...
5175,tce_zhao,cksaag,0.414286,sequence based
5176,tce_zhao,qsorde,0.333333,sequence based
5177,tce_zhao,fldpc_,0.572727,sequence based
5178,tce_zhao,ctdd,0.333333,sequence based


In [7]:
# Keep only the rows typed 'sequence based', renumber them from 0, and drop
# the now-constant 'type' column.
df_sequence_encodings = (
    df_combined_results
    .loc[lambda frame: frame['type'].eq('sequence based')]
    .reset_index(drop=True)
    .drop(columns='type')
)
df_sequence_encodings

Unnamed: 0,Dataset,Encoding,F1
0,ace_vaxinpad,cenact_hyd,0.819358
1,acp_anticp,cenact_hyd,0.786194
2,acp_iacp,cenact_hyd,0.75732
3,acp_mlacp,cenact_hyd,0.584432
4,afp_amppred,cenact_hyd,0.77412
...,...,...,...
3703,tce_zhao,dist_f,0.381818
3704,tce_zhao,cksaag,0.414286
3705,tce_zhao,qsorde,0.333333
3706,tce_zhao,fldpc_,0.572727


In [12]:
# Keep only the rows typed 'structure based', renumber them from 0, drop the
# now-constant 'type' column, and convert F1 to a numeric dtype.
df_structure_encodings = (
    df_combined_results
    .loc[lambda frame: frame['type'].eq('structure based')]
    .reset_index(drop=True)
    .drop(columns='type')
    .assign(F1=lambda frame: pd.to_numeric(frame['F1']))
)
df_structure_encodings

Unnamed: 0,Dataset,Encoding,F1
0,hiv_v3,distan,0.925277
1,hiv_v3,delaun,0.926829
2,hiv_v3,ssec,0.920000
3,hiv_v3,electr,0.926122
4,hiv_v3,disord,0.920000
...,...,...,...
895,tce_zhao,distan,0.333333
896,tce_zhao,asa,0.000000
897,tce_zhao,sseb,0.066667
898,tce_zhao,disord,0.200000


In [14]:
# Convert the F1 column of the sequence-based frame to a numeric dtype
# (in place on the existing frame), then show the resulting schema.
df_sequence_encodings['F1'] = df_sequence_encodings['F1'].pipe(pd.to_numeric)
df_sequence_encodings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3708 entries, 0 to 3707
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Dataset   3708 non-null   object 
 1   Encoding  3708 non-null   object 
 2   F1        3708 non-null   float64
dtypes: float64(1), object(2)
memory usage: 87.0+ KB


In [23]:
# Median F1 per sequence-based encoding (across all its rows), ordered from
# the highest median to the lowest.
df_sorted_sequence_encodings = (
    df_sequence_encodings
    .groupby(by='Encoding')[['F1']]
    .median()
    .sort_values(by='F1', ascending=False)
)
df_sorted_sequence_encodings

Unnamed: 0_level_0,F1
Encoding,Unnamed: 1_level_1
dist_f,0.888316
cksaap,0.881567
dpc,0.872812
psekraac,0.872761
dde,0.864251
qsorde,0.859727
tpc,0.85908
ngram_,0.856912
apaac_,0.854452
paac_l,0.845311


In [24]:
# Median F1 per structure-based encoding (across all its rows), ordered from
# the highest median to the lowest.
df_sorted_structure_encodings = (
    df_structure_encodings
    .groupby(by='Encoding')[['F1']]
    .median()
    .sort_values(by='F1', ascending=False)
)

In [25]:
# Emit the sequence-encoding table as LaTeX source. print() writes the
# string returned by to_latex() as raw text, ready to paste into a document.
print(df_sorted_sequence_encodings.to_latex())

\begin{tabular}{lr}
\toprule
 & F1 \\
Encoding &  \\
\midrule
dist_f & 0.888316 \\
cksaap & 0.881567 \\
dpc & 0.872812 \\
psekraac & 0.872761 \\
dde & 0.864251 \\
qsorde & 0.859727 \\
tpc & 0.859080 \\
ngram_ & 0.856912 \\
apaac_ & 0.854452 \\
paac_l & 0.845311 \\
binary & 0.840302 \\
ctriad & 0.839711 \\
ksctri & 0.837366 \\
waac_a & 0.834585 \\
aac & 0.831702 \\
fldpc_ & 0.828188 \\
ctdd & 0.827441 \\
cksaag & 0.823189 \\
gtpc & 0.816525 \\
ctdt & 0.815965 \\
gdpc & 0.806208 \\
ctdc & 0.804434 \\
nmbrot & 0.800000 \\
aainde & 0.799147 \\
fft_aa & 0.779505 \\
flgc_a & 0.773763 \\
cenact_nohyd & 0.769537 \\
cenact_dd & 0.767157 \\
socnum & 0.762086 \\
cenact_hyd & 0.759348 \\
geary_ & 0.745778 \\
cgr_re & 0.745628 \\
moran_ & 0.743752 \\
gaac & 0.734044 \\
egaac_ & 0.729960 \\
eaac_w & 0.729595 \\
blomap & 0.727430 \\
zscale & 0.724898 \\
blosum & 0.713963 \\
\bottomrule
\end{tabular}



In [26]:
# Emit the structure-encoding table as LaTeX source. print() writes the
# string returned by to_latex() as raw text, ready to paste into a document.
print(df_sorted_structure_encodings.to_latex())

\begin{tabular}{lr}
\toprule
 & F1 \\
Encoding &  \\
\midrule
delaun & 0.789711 \\
qsar & 0.726743 \\
electr & 0.704956 \\
distan & 0.654590 \\
disord & 0.574943 \\
asa & 0.571013 \\
ta & 0.566491 \\
sseb & 0.565670 \\
ssec & 0.530964 \\
\bottomrule
\end{tabular}

