In [1]:
import os
import json
import numpy as np
import pandas as pd
import altair as alt

In [2]:
# Uncomment if you are using dark jupyter lab/notebook theme
alt.renderers.set_embed_options(theme='dark')
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

# Data importing and filtering

In [3]:
path_cenact_results = os.path.join(
    '..', 'Results', 'CENACT_f1_score_overview.csv')
path_other_encodings_results = os.path.join(
    '..', 'Data', 'Visualization_data', 'peptidereactor_vis_data',
    'hm_bio_data.json')


In [4]:
df_cenact_results = pd.read_csv(path_cenact_results).dropna(subset=['F1'])
df_cenact_results.drop(['Encoding_max', 'bio_field', 'is_imbalanced'],
                       axis=1, inplace=True)  # Removing extra columns
df_cenact_results

Unnamed: 0,Dataset,Encoding,F1,type
0,ace_vaxinpad,cenact_hyd,0.816760,sequence based
1,acp_anticp,cenact_hyd,0.786909,sequence based
2,acp_iacp,cenact_hyd,0.749565,sequence based
3,acp_mlacp,cenact_hyd,0.589155,sequence based
4,afp_amppred,cenact_hyd,0.769917,sequence based
...,...,...,...,...
181,hiv_v3,cenact_dd,0.800079,sequence based
182,isp_il10pred,cenact_dd,0.475933,sequence based
183,nep_neuropipred,cenact_dd,0.743103,sequence based
184,pip_pipel,cenact_dd,0.347928,sequence based


In [5]:
df_other_encodings_results = pd.read_json(path_other_encodings_results).dropna(
    subset=['F1'])
df_other_encodings_results.drop(
    ['Encoding_max', 'index', 'type_field', 'missing', 'bio_field',
     'is_imbalanced'],
    axis=1, inplace=True)  # Removing extra columns

df_other_encodings_results = df_other_encodings_results[
    df_other_encodings_results['F1'] != 'separator']
df_other_encodings_results

Unnamed: 0,Dataset,Encoding,F1,type
0,hiv_v3,dist_f,0.980728,sequence based
1,hiv_v3,blomap,0.978587,sequence based
2,hiv_v3,paac_l,0.97619,sequence based
3,hiv_v3,blosum,0.976344,sequence based
4,hiv_v3,distan,0.925277,structure based
...,...,...,...,...
5175,tce_zhao,cksaag,0.414286,sequence based
5176,tce_zhao,qsorde,0.333333,sequence based
5177,tce_zhao,fldpc_,0.572727,sequence based
5178,tce_zhao,ctdd,0.333333,sequence based


In [6]:
df_combined_results = pd.concat(
    [df_cenact_results, df_other_encodings_results], axis=0)
df_combined_results

Unnamed: 0,Dataset,Encoding,F1,type
0,ace_vaxinpad,cenact_hyd,0.81676,sequence based
1,acp_anticp,cenact_hyd,0.786909,sequence based
2,acp_iacp,cenact_hyd,0.749565,sequence based
3,acp_mlacp,cenact_hyd,0.589155,sequence based
4,afp_amppred,cenact_hyd,0.769917,sequence based
...,...,...,...,...
5175,tce_zhao,cksaag,0.414286,sequence based
5176,tce_zhao,qsorde,0.333333,sequence based
5177,tce_zhao,fldpc_,0.572727,sequence based
5178,tce_zhao,ctdd,0.333333,sequence based


# Splitting the data according to the type (sequence-based vs. structure-based)

In [7]:
df_sequence_encodings = df_combined_results[
    df_combined_results['type'] == 'sequence based'].reset_index(drop=True)
df_sequence_encodings.drop('type', axis=1, inplace=True)
df_sequence_encodings

Unnamed: 0,Dataset,Encoding,F1
0,ace_vaxinpad,cenact_hyd,0.81676
1,acp_anticp,cenact_hyd,0.786909
2,acp_iacp,cenact_hyd,0.749565
3,acp_mlacp,cenact_hyd,0.589155
4,afp_amppred,cenact_hyd,0.769917
...,...,...,...
3769,tce_zhao,dist_f,0.381818
3770,tce_zhao,cksaag,0.414286
3771,tce_zhao,qsorde,0.333333
3772,tce_zhao,fldpc_,0.572727


In [8]:
df_structure_encodings = df_combined_results[
    df_combined_results['type'] == 'structure based'].reset_index(drop=True)
df_structure_encodings.drop('type', axis=1, inplace=True)
df_structure_encodings['F1'] = pd.to_numeric(df_structure_encodings['F1'])
df_structure_encodings

Unnamed: 0,Dataset,Encoding,F1
0,hiv_v3,distan,0.925277
1,hiv_v3,delaun,0.926829
2,hiv_v3,ssec,0.920000
3,hiv_v3,electr,0.926122
4,hiv_v3,disord,0.920000
...,...,...,...
895,tce_zhao,distan,0.333333
896,tce_zhao,asa,0.000000
897,tce_zhao,sseb,0.066667
898,tce_zhao,disord,0.200000


In [9]:
df_sequence_encodings['F1'] = pd.to_numeric(df_sequence_encodings['F1'])
df_sequence_encodings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3774 entries, 0 to 3773
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Dataset   3774 non-null   object 
 1   Encoding  3774 non-null   object 
 2   F1        3774 non-null   float64
dtypes: float64(1), object(2)
memory usage: 88.6+ KB


# Data sorting and exporting
## Aggregation on data sets per encoding

In [10]:
df_sorted_sequence_encodings = df_sequence_encodings.groupby(
    'Encoding')[['F1']].median().sort_values('F1', ascending=False)
df_sorted_sequence_encodings

Unnamed: 0_level_0,F1
Encoding,Unnamed: 1_level_1
dist_f,0.888316
cksaap,0.881567
dpc,0.872812
psekraac,0.872761
dde,0.864251
qsorde,0.859727
tpc,0.85908
ngram_,0.856912
apaac_,0.854452
paac_l,0.845311


In [11]:
df_sorted_structure_encodings = df_structure_encodings.groupby(
    'Encoding')[['F1']].median().sort_values('F1', ascending=False)

In [12]:
print(df_sorted_sequence_encodings.to_latex())

\begin{tabular}{lr}
\toprule
 & F1 \\
Encoding &  \\
\midrule
dist_f & 0.888316 \\
cksaap & 0.881567 \\
dpc & 0.872812 \\
psekraac & 0.872761 \\
dde & 0.864251 \\
qsorde & 0.859727 \\
tpc & 0.859080 \\
ngram_ & 0.856912 \\
apaac_ & 0.854452 \\
paac_l & 0.845311 \\
binary & 0.840302 \\
ctriad & 0.839711 \\
ksctri & 0.837366 \\
waac_a & 0.834585 \\
aac & 0.831702 \\
fldpc_ & 0.828188 \\
ctdd & 0.827441 \\
cksaag & 0.823189 \\
gtpc & 0.816525 \\
ctdt & 0.815965 \\
gdpc & 0.806208 \\
ctdc & 0.804434 \\
nmbrot & 0.800000 \\
aainde & 0.799147 \\
fft_aa & 0.779505 \\
flgc_a & 0.773763 \\
cenact_nohyd & 0.773716 \\
cenact_hyd & 0.771042 \\
cenact_dd & 0.768236 \\
socnum & 0.762086 \\
geary_ & 0.745778 \\
cgr_re & 0.745628 \\
moran_ & 0.743752 \\
gaac & 0.734044 \\
egaac_ & 0.729960 \\
eaac_w & 0.729595 \\
blomap & 0.727430 \\
zscale & 0.724898 \\
blosum & 0.713963 \\
\bottomrule
\end{tabular}



In [13]:
print(df_sorted_structure_encodings.to_latex())

\begin{tabular}{lr}
\toprule
 & F1 \\
Encoding &  \\
\midrule
delaun & 0.789711 \\
qsar & 0.726743 \\
electr & 0.704956 \\
distan & 0.654590 \\
disord & 0.574943 \\
asa & 0.571013 \\
ta & 0.566491 \\
sseb & 0.565670 \\
ssec & 0.530964 \\
\bottomrule
\end{tabular}



## Aggregation on encodings per data set

In [14]:
df_combined_results.drop('type', axis=1, inplace=True)
df_combined_results

Unnamed: 0,Dataset,Encoding,F1
0,ace_vaxinpad,cenact_hyd,0.81676
1,acp_anticp,cenact_hyd,0.786909
2,acp_iacp,cenact_hyd,0.749565
3,acp_mlacp,cenact_hyd,0.589155
4,afp_amppred,cenact_hyd,0.769917
...,...,...,...
5175,tce_zhao,cksaag,0.414286
5176,tce_zhao,qsorde,0.333333
5177,tce_zhao,fldpc_,0.572727
5178,tce_zhao,ctdd,0.333333


In [15]:
df_sorted_per_dataset_top_1 = df_combined_results.sort_values(
    'F1', ascending=False).groupby(
    'Dataset', as_index=False).head(1).sort_values(
    'Dataset').reset_index(drop=True)

df_sorted_per_dataset_top_1

Unnamed: 0,Dataset,Encoding,F1
0,ace_vaxinpad,cksaap,0.95833
1,acp_anticp,apaac_,0.893867
2,acp_iacp,paac_l,0.901961
3,acp_mlacp,qsorde,0.755599
4,afp_amppred,cksaap,0.910436
...,...,...,...
57,sol_ecoli,cenact_dd,0.722274
58,tce_zhao,aainde,0.666667
59,toxinpred2,cenact_dd,0.747344
60,toxinpred_swissprot,cenact_nohyd,0.835506


In [16]:
print(df_sorted_per_dataset_top_1.to_latex())

\begin{tabular}{llll}
\toprule
 & Dataset & Encoding & F1 \\
\midrule
0 & ace_vaxinpad & cksaap & 0.958330 \\
1 & acp_anticp & apaac_ & 0.893867 \\
2 & acp_iacp & paac_l & 0.901961 \\
3 & acp_mlacp & qsorde & 0.755599 \\
4 & afp_amppred & cksaap & 0.910436 \\
5 & afp_antifp & dist_f & 0.895777 \\
6 & ai4avp_2 & cenact_hyd & 0.716022 \\
7 & aip_aippred & dde & 0.624602 \\
8 & aip_antiinflam & dde & 0.669810 \\
9 & amp_antibp & cksaap & 0.958332 \\
10 & amp_antibp2 & qsorde & 0.924050 \\
11 & amp_csamp & dist_f & 0.980392 \\
12 & amp_fernandes & cenact_dd & 0.822134 \\
13 & amp_gonzales & gdpc & 0.888889 \\
14 & amp_iamp2l & dist_f & 0.817127 \\
15 & amp_modlamp & cksaap & 0.934132 \\
16 & amy_albase & cenact_nohyd & 0.927475 \\
17 & amy_hex & cenact_hyd & 0.691899 \\
18 & atb_antitbp & cksaag & 0.832476 \\
19 & atb_iantitb & cksaap & 0.800000 \\
20 & avp_amppred & cksaap & 0.880951 \\
21 & avp_avppred & cksaap & 0.871587 \\
22 & bce_ibce & dde & 0.654593 \\
23 & c2pred & cenact_dd & 0.7

In [17]:
df_sorted_per_dataset_top_3 = df_combined_results.sort_values(
    'F1', ascending=False).groupby(
    'Dataset', as_index=False).head(3).sort_values(
    'Dataset').reset_index(drop=True)

df_sorted_per_dataset_top_3

Unnamed: 0,Dataset,Encoding,F1
0,ace_vaxinpad,cksaap,0.95833
1,ace_vaxinpad,dist_f,0.942619
2,ace_vaxinpad,cksaap,0.95833
3,acp_anticp,paac_l,0.890097
4,acp_anticp,apaac_,0.893867
...,...,...,...
181,toxinpred_swissprot,cenact_nohyd,0.835506
182,toxinpred_swissprot,cenact_dd,0.825976
183,toxinpred_trembl,cenact_nohyd,0.766152
184,toxinpred_trembl,cenact_dd,0.744815


In [18]:
df_sorted_per_dataset_top_3['Encoding_F1'] = df_sorted_per_dataset_top_3[
    'Encoding'] + ' ' + df_sorted_per_dataset_top_3['F1'].astype(str)

df_sorted_per_dataset_top_3 = df_sorted_per_dataset_top_3.groupby(
    'Dataset', as_index=False)['Encoding_F1'].apply(list).reset_index(drop=True)

df_sorted_per_dataset_top_3 = df_sorted_per_dataset_top_3['Dataset'].to_frame(
    ).join(
    pd.DataFrame(df_sorted_per_dataset_top_3['Encoding_F1'].to_list(),
                 columns=['1st Place', '2nd Place', '3rd Place']))

df_sorted_per_dataset_top_3

Unnamed: 0,Dataset,1st Place,2nd Place,3rd Place
0,ace_vaxinpad,cksaap 0.9583304396000001,dist_f 0.9426190956,cksaap 0.9583304396000001
1,acp_anticp,paac_l 0.8900966184,apaac_ 0.8938673342000001,apaac_ 0.8938673342000001
2,acp_iacp,dist_f 0.8979591837,paac_l 0.9019607843,paac_l 0.9019607843
3,acp_mlacp,waac_a 0.741025641,qsorde 0.7555994730000001,qsorde 0.7555994730000001
4,afp_amppred,dist_f 0.9025079148,cksaap 0.9104362061,cksaap 0.9104362061
...,...,...,...,...
57,sol_ecoli,cenact_hyd 0.7222739168362897,cenact_dd 0.7222739168362897,cenact_nohyd 0.714074074074074
58,tce_zhao,aainde 0.6666666667000001,aainde 0.6666666667000001,zscale 0.6076923077
59,toxinpred2,cenact_nohyd 0.7466785371866869,cenact_hyd 0.7473438956197576,cenact_dd 0.7473438956197576
60,toxinpred_swissprot,cenact_hyd 0.8317068077404983,cenact_nohyd 0.835505824274132,cenact_dd 0.825975884630356


In [19]:
df_sorted_per_dataset_top_3[
    ['1st Encoding', '1st F1']] = df_sorted_per_dataset_top_3[
    '1st Place'].str.split(' ', expand=True)
df_sorted_per_dataset_top_3[
    ['2nd Encoding', '2nd F1']] = df_sorted_per_dataset_top_3[
    '2nd Place'].str.split(' ', expand=True)
df_sorted_per_dataset_top_3[
    ['3rd Encoding', '3rd F1']] = df_sorted_per_dataset_top_3[
    '3rd Place'].str.split(' ', expand=True)
df_sorted_per_dataset_top_3

Unnamed: 0,Dataset,1st Place,2nd Place,3rd Place,1st Encoding,1st F1,2nd Encoding,2nd F1,3rd Encoding,3rd F1
0,ace_vaxinpad,cksaap 0.9583304396000001,dist_f 0.9426190956,cksaap 0.9583304396000001,cksaap,0.9583304396000001,dist_f,0.9426190956,cksaap,0.9583304396000001
1,acp_anticp,paac_l 0.8900966184,apaac_ 0.8938673342000001,apaac_ 0.8938673342000001,paac_l,0.8900966184,apaac_,0.8938673342000001,apaac_,0.8938673342000001
2,acp_iacp,dist_f 0.8979591837,paac_l 0.9019607843,paac_l 0.9019607843,dist_f,0.8979591837,paac_l,0.9019607843,paac_l,0.9019607843
3,acp_mlacp,waac_a 0.741025641,qsorde 0.7555994730000001,qsorde 0.7555994730000001,waac_a,0.741025641,qsorde,0.7555994730000001,qsorde,0.7555994730000001
4,afp_amppred,dist_f 0.9025079148,cksaap 0.9104362061,cksaap 0.9104362061,dist_f,0.9025079148,cksaap,0.9104362061,cksaap,0.9104362061
...,...,...,...,...,...,...,...,...,...,...
57,sol_ecoli,cenact_hyd 0.7222739168362897,cenact_dd 0.7222739168362897,cenact_nohyd 0.714074074074074,cenact_hyd,0.7222739168362897,cenact_dd,0.7222739168362897,cenact_nohyd,0.714074074074074
58,tce_zhao,aainde 0.6666666667000001,aainde 0.6666666667000001,zscale 0.6076923077,aainde,0.6666666667000001,aainde,0.6666666667000001,zscale,0.6076923077
59,toxinpred2,cenact_nohyd 0.7466785371866869,cenact_hyd 0.7473438956197576,cenact_dd 0.7473438956197576,cenact_nohyd,0.7466785371866869,cenact_hyd,0.7473438956197576,cenact_dd,0.7473438956197576
60,toxinpred_swissprot,cenact_hyd 0.8317068077404983,cenact_nohyd 0.835505824274132,cenact_dd 0.825975884630356,cenact_hyd,0.8317068077404983,cenact_nohyd,0.835505824274132,cenact_dd,0.825975884630356


In [20]:
df_sorted_per_dataset_top_3 = df_sorted_per_dataset_top_3.drop(
    ['1st Place', '2nd Place', '3rd Place'], axis=1)
df_sorted_per_dataset_top_3

Unnamed: 0,Dataset,1st Encoding,1st F1,2nd Encoding,2nd F1,3rd Encoding,3rd F1
0,ace_vaxinpad,cksaap,0.9583304396000001,dist_f,0.9426190956,cksaap,0.9583304396000001
1,acp_anticp,paac_l,0.8900966184,apaac_,0.8938673342000001,apaac_,0.8938673342000001
2,acp_iacp,dist_f,0.8979591837,paac_l,0.9019607843,paac_l,0.9019607843
3,acp_mlacp,waac_a,0.741025641,qsorde,0.7555994730000001,qsorde,0.7555994730000001
4,afp_amppred,dist_f,0.9025079148,cksaap,0.9104362061,cksaap,0.9104362061
...,...,...,...,...,...,...,...
57,sol_ecoli,cenact_hyd,0.7222739168362897,cenact_dd,0.7222739168362897,cenact_nohyd,0.714074074074074
58,tce_zhao,aainde,0.6666666667000001,aainde,0.6666666667000001,zscale,0.6076923077
59,toxinpred2,cenact_nohyd,0.7466785371866869,cenact_hyd,0.7473438956197576,cenact_dd,0.7473438956197576
60,toxinpred_swissprot,cenact_hyd,0.8317068077404983,cenact_nohyd,0.835505824274132,cenact_dd,0.825975884630356


In [21]:
print(df_sorted_per_dataset_top_3.to_latex())

\begin{tabular}{llllllll}
\toprule
 & Dataset & 1st Encoding & 1st F1 & 2nd Encoding & 2nd F1 & 3rd Encoding & 3rd F1 \\
\midrule
0 & ace_vaxinpad & cksaap & 0.9583304396000001 & dist_f & 0.9426190956 & cksaap & 0.9583304396000001 \\
1 & acp_anticp & paac_l & 0.8900966184 & apaac_ & 0.8938673342000001 & apaac_ & 0.8938673342000001 \\
2 & acp_iacp & dist_f & 0.8979591837 & paac_l & 0.9019607843 & paac_l & 0.9019607843 \\
3 & acp_mlacp & waac_a & 0.741025641 & qsorde & 0.7555994730000001 & qsorde & 0.7555994730000001 \\
4 & afp_amppred & dist_f & 0.9025079148 & cksaap & 0.9104362061 & cksaap & 0.9104362061 \\
5 & afp_antifp & dist_f & 0.8957765789000001 & dist_f & 0.8957765789000001 & cksaap & 0.8896599088 \\
6 & ai4avp_2 & cenact_nohyd & 0.7149224806201551 & cenact_dd & 0.7089169000933707 & cenact_hyd & 0.7160219103046902 \\
7 & aip_aippred & dde & 0.6246019108 & dde & 0.6246019108 & egaac_ & 0.5797962649 \\
8 & aip_antiinflam & dde & 0.6698097633 & dde & 0.6698097633 & waac_a & 0.61712

In [None]:

######################################################################
######################################################################
# TODO:
# Check why this table has 62 data sets while the previous one has 61
######################################################################
######################################################################
