In [35]:
import os
import numpy as np
import pandas as pd

datasets = {
    'c-binding': 'Protein-peptide binding affinity (canonical)',
    'binding': 'Protein-peptide binding affinity',
    'c-cpp': 'Cell penetration (canonical)',
    'cpp': 'Cell penetration',
    'nc-binding': 'Protein-peptide binding affinity (non-canonical)',
    'nc-cpp': 'Cell penetration (non-canonical)',
    'nc-antibacterial': "Antibacterial (non-canonical)",
    'antibacterial': "Antibacterial",
    'c-antibacterial': "Antibacterial (canonical)",
    'nc-antiviral': "Antiviral (non-canonical)",
    'antiviral': "Antiviral",
    'c-antiviral': "Antiviral (canonical)"
}
datasets_sim = [
    'c-binding', 'nc-binding', 'c-cpp',
    'nc-cpp', 'c-antibacterial', 'nc-antibacterial',
    'c-antiviral', 'nc-antiviral'
]
metrics_class = {
    'acc': 'Accuracy', 'f1_weighted': 'Weighted F1',
    'mcc': 'Matthew\'s correlation coefficient',
    'auroc': "Area under the ROC curve"
}
metrics_reg = {
    'pcc': 'Pearson\'s R', 'spcc': 'Spearman\'s R',
    'rmse': 'RMSE'
}
fancy_rep = {
    'esm2-8m': 'ESM2 8M',
    'esm2-150m': "ESM2 150M",
    'prot-t5-xl': "Prot-T5-XL",
    'molformer': 'Molformer-XL',
    'chemberta': 'ChemBERTa-2',
    'pepfunn': 'PepFuNN',
    'ecfp': "ECFP-16",
    'ecfp-count': "ECFP-16 count",
    'pepclm': "PeptideCLM",
    'pepland': "Pepland",
    ## Add new representations
    'ecfp-count': "ECFP-16 with counts"
}
metrics_fancy = metrics_class 
metrics_fancy.update(metrics_reg)
REGRESSION = ['c-binding', 'nc-binding']
DPI = 512
sorted(datasets_sim)

['c-antibacterial',
 'c-antiviral',
 'c-binding',
 'c-cpp',
 'nc-antibacterial',
 'nc-antiviral',
 'nc-binding',
 'nc-cpp']

In [36]:
from scipy.stats import kruskal, wilcoxon


def get_stats(df: pd.DataFrame, order: list):
    g_df = df.groupby('rep')
    groups = {}

    for n, mini_df in g_df:
        groups[n] = mini_df['GOOD'].to_numpy()

    names = {name: idx for idx, name in enumerate(order)}
    mtx = np.ones((len(groups), len(groups)))

    for pair1 in order:
        for pair2 in order:
            if pair1 == pair2:
                continue    
            if len(groups[pair1]) != len(groups[pair2]):
                print(pair1, pair2, len(groups[pair1]), len(groups[pair2]))
                # raise IndexError("All experiments from same dataset should have the same number of entries.")
            else:
                value = wilcoxon(
                    groups[pair1], groups[pair2],
                    alternative='greater'
                )[1]
            mtx[names[pair1], names[pair2]] = value
    return groups, mtx

def order_datasets(df: pd.DataFrame) -> list:
    names = [n for n, g in df.groupby('rep')]
    means = df.groupby('rep')['GOOD'].mean()
    order = np.argsort(means)
    names = [names[i] for i in order]
    return names

def define_table(df: pd.DataFrame) -> pd.DataFrame:
    df['rep'] = df.rep.map(fancy_rep)
    df = df[df['threshold'] != 'random']
    all_proto_df = []
    for idx, dataset in enumerate(df.dataset.unique()):
        tmp_df = df[df['dataset'] == dataset].copy()
        if not dataset.startswith('c-') and not dataset.startswith('nc-'):
            metric = 'GOOD'
        else:
            if dataset in REGRESSION:
                metric = 'spcc'
            else:
                metric = 'mcc'

        # for model in tmp_df.model.unique():
        #     for threshold in tmp_df.threshold.unique():
        #         for rep in tmp_df.rep.unique():
        #             tmp_df_2 = tmp_df[(tmp_df.model == model) & (tmp_df.threshold == threshold) & (tmp_df.rep == rep)]
        #             if len(tmp_df_2) < 1:
        #                 continue
        for idx, x in tmp_df.iterrows():
            entry = {
                'rep': x.rep, 'threshold': x.threshold,
                'model': x.model, 'GOOD': x[metric],
                'dataset': dataset
            }
            all_proto_df.append(entry)

    df = pd.DataFrame(all_proto_df)
    print("Standard deviation:", df.GOOD.std())

    names = order_datasets(df)
    groups, mtx = get_stats(df, names)

    # Kruskal-Wallis test
    p_value = kruskal(*list(groups.values()))[1]
    print(f"Kruskal-Wallis p: {p_value:.2g}")

    n = len(groups)
    alpha_adj = 0.05 * 2 / (n * (n - 1))
    ranks = []
    names.reverse()
    for idx, name in enumerate(names):
        if idx == 0:
            ranks.append(1)
            continue
        else:
            if mtx[idx, idx - 1] < alpha_adj:
                ranks.append(ranks[-1])
            else:
                ranks.append(ranks[-1] + 1)

    final_table = []
    for name, rank in zip(names, ranks):
        entry = {"Representation": name}
        for dataset in df.dataset.unique():
            tmp_df = df[(df['rep'] == name) & (df['dataset'] == dataset)].copy()
            sem = f"{tmp_df['GOOD'].sem():.2f}"
            mean = f"{tmp_df['GOOD'].mean():.2f}"
            entry[datasets[dataset]] = f"{mean} ± {sem}"
    
        tmp_df = df[df['rep'] == name].copy()
        sem = f"{tmp_df['GOOD'].sem():.2f}"
        mean = f"{tmp_df['GOOD'].mean():.2f}"

        entry['Average'] = f"{mean} ± {sem}"
        if p_value < 0.05:
            entry["Significant rank"] = f"**--{rank}--**" if rank < 2 else rank
        else:
            entry["Significant rank"] = "-"
        final_table.append(entry)

    table = pd.DataFrame(final_table)
    return table

In [37]:
dir = '../Results/no-generalisation'
df = pd.DataFrame()

for file in os.listdir(dir):

    path = os.path.join(dir, file)
    tmp_df = pd.read_csv(path)
    tmp_df['model'] = file.split('_')[1]
    tmp_df['pre_pca'] = float(file.split('_')[3])
    tmp_df['post_pca'] = float(file.split('_')[5])
    tmp_df['dataset'] = file.split('_')[0]
    tmp_df['rep'] = file.split('_')[6][:-4]
    df = pd.concat([df, tmp_df])

In [38]:
# Canonical SVM
print("Values for SVM Canonical")
df_c = df[df['dataset'].map(lambda x: x.startswith('c-'))].copy()
df_c = df_c[df_c['model'] == 'svm'].copy()
tab1 = define_table(df_c)
print(tab1.to_latex(index=False))

Values for SVM Canonical
Standard deviation: 0.2506314595593855
Kruskal-Wallis p: 5.4e-190
\begin{tabular}{lllllll}
\toprule
Representation & Protein-peptide binding affinity (canonical) & Antibacterial (canonical) & Antiviral (canonical) & Cell penetration (canonical) & Average & Significant rank \\
\midrule
ECFP-16 & 0.85 ± 0.01 & 0.69 ± 0.02 & 0.74 ± 0.01 & 0.92 ± 0.01 & 0.80 ± 0.01 & **--1--** \\
PepFuNN & 0.84 ± 0.01 & 0.64 ± 0.02 & 0.73 ± 0.01 & 0.88 ± 0.01 & 0.77 ± 0.01 & **--1--** \\
ECFP-16 with counts & 0.80 ± 0.01 & 0.64 ± 0.02 & 0.71 ± 0.01 & 0.95 ± 0.01 & 0.77 ± 0.01 & 2 \\
Prot-T5-XL & 0.84 ± 0.01 & 0.52 ± 0.02 & 0.75 ± 0.01 & 0.85 ± 0.01 & 0.75 ± 0.01 & 3 \\
ESM2 150M & 0.52 ± 0.01 & 0.31 ± 0.01 & 0.51 ± 0.01 & 0.72 ± 0.02 & 0.51 ± 0.01 & 3 \\
ESM2 8M & 0.52 ± 0.01 & 0.29 ± 0.01 & 0.45 ± 0.01 & 0.68 ± 0.01 & 0.48 ± 0.01 & 3 \\
Molformer-XL & 0.38 ± 0.01 & 0.19 ± 0.01 & 0.41 ± 0.02 & 0.73 ± 0.02 & 0.43 ± 0.02 & 3 \\
ChemBERTa-2 & 0.53 ± 0.01 & 0.25 ± 0.01 & 0.19 ± 0.01 & 

In [39]:
# Non-canonical SVM
print("Values for SVM Non-Canonical")
df_c = df[df['dataset'].map(lambda x: x.startswith('nc-'))].copy()
df_c = df_c[df_c['model'] == 'svm'].copy()
tab1 = define_table(df_c)
print(tab1.to_latex(index=False))

Values for SVM Non-Canonical
Standard deviation: 0.175098547748244
Kruskal-Wallis p: 9.6e-56
\begin{tabular}{lllllll}
\toprule
Representation & Antibacterial (non-canonical) & Antiviral (non-canonical) & Cell penetration (non-canonical) & Protein-peptide binding affinity (non-canonical) & Average & Significant rank \\
\midrule
Molformer-XL & 0.88 ± 0.01 & 0.87 ± 0.02 & 0.85 ± 0.01 & 0.74 ± 0.02 & 0.83 ± 0.01 & **--1--** \\
ECFP-16 with counts & 0.87 ± 0.01 & 0.83 ± 0.02 & 0.64 ± 0.02 & 0.86 ± 0.01 & 0.80 ± 0.01 & **--1--** \\
ECFP-16 & 0.89 ± 0.01 & 0.65 ± 0.03 & 0.66 ± 0.02 & 0.89 ± 0.01 & 0.77 ± 0.01 & 2 \\
PepFuNN & 0.87 ± 0.01 & 0.74 ± 0.02 & 0.37 ± 0.02 & 0.86 ± 0.02 & 0.71 ± 0.02 & 3 \\
PeptideCLM & 0.77 ± 0.02 & 0.75 ± 0.02 & 0.70 ± 0.01 & 0.62 ± 0.02 & 0.70 ± 0.01 & 3 \\
ChemBERTa-2 & 0.74 ± 0.01 & 0.79 ± 0.01 & 0.43 ± 0.02 & 0.72 ± 0.01 & 0.68 ± 0.01 & 4 \\
Pepland & 0.60 ± 0.01 & 0.66 ± 0.02 & 0.51 ± 0.02 & 0.50 ± 0.02 & 0.57 ± 0.01 & 5 \\
\bottomrule
\end{tabular}



In [40]:
# Canonical LightGBM
print("Values for LightGBM Canonical")
df_c = df[df['dataset'].map(lambda x: x.startswith('c-'))].copy()
df_c = df_c[df_c['model'] == 'lightgbm'].copy()
tab1 = define_table(df_c)
tab1

Values for LightGBM Canonical
Standard deviation: 0.11677337980803058
Kruskal-Wallis p: 1.7e-10


Unnamed: 0,Representation,Antiviral (canonical),Protein-peptide binding affinity (canonical),Cell penetration (canonical),Antibacterial (canonical),Average,Significant rank
0,ESM2 8M,0.78 ± 0.01,0.90 ± 0.01,0.91 ± 0.01,0.81 ± 0.02,0.85 ± 0.01,**--1--**
1,ECFP-16 with counts,0.75 ± 0.01,0.91 ± 0.01,0.94 ± 0.01,0.79 ± 0.02,0.84 ± 0.01,**--1--**
2,Prot-T5-XL,0.77 ± 0.01,0.90 ± 0.00,0.91 ± 0.01,0.81 ± 0.02,0.84 ± 0.01,**--1--**
3,ESM2 150M,0.76 ± 0.01,0.88 ± 0.01,0.91 ± 0.01,0.81 ± 0.02,0.83 ± 0.01,2
4,ECFP-16,0.74 ± 0.01,0.90 ± 0.01,0.92 ± 0.01,0.77 ± 0.02,0.83 ± 0.01,2
5,ChemBERTa-2,0.73 ± 0.01,0.89 ± 0.01,0.90 ± 0.01,0.80 ± 0.02,0.82 ± 0.01,2
6,PeptideCLM,0.71 ± 0.01,0.86 ± 0.00,0.90 ± 0.01,0.79 ± 0.02,0.81 ± 0.01,3
7,Pepland,0.70 ± 0.01,0.89 ± 0.01,0.88 ± 0.01,0.78 ± 0.02,0.81 ± 0.01,3
8,Molformer-XL,0.68 ± 0.02,0.88 ± 0.01,0.91 ± 0.01,0.77 ± 0.02,0.80 ± 0.01,4
9,PepFuNN,0.73 ± 0.01,0.76 ± 0.01,0.89 ± 0.01,0.68 ± 0.02,0.76 ± 0.01,5


In [41]:
# Non-canonical LightGBM
print("Values for LightGBM Non-canonical")
df_nc = df[df['dataset'].map(lambda x: x.startswith('nc-'))].copy()
df_nc = df_nc[df_nc['model'] == 'lightgbm']
tab2 = define_table(df_nc)
tab2

Values for LightGBM Non-canonical
Standard deviation: 0.14272781779176788
Kruskal-Wallis p: 1.5e-36


Unnamed: 0,Representation,Antiviral (non-canonical),Antibacterial (non-canonical),Protein-peptide binding affinity (non-canonical),Cell penetration (non-canonical),Average,Significant rank
0,Molformer-XL,0.91 ± 0.01,0.88 ± 0.01,0.85 ± 0.02,0.89 ± 0.01,0.88 ± 0.01,**--1--**
1,ChemBERTa-2,0.91 ± 0.01,0.87 ± 0.00,0.88 ± 0.01,0.84 ± 0.02,0.88 ± 0.01,**--1--**
2,ECFP-16,0.87 ± 0.01,0.90 ± 0.01,0.87 ± 0.01,0.71 ± 0.02,0.84 ± 0.01,**--1--**
3,PeptideCLM,0.83 ± 0.02,0.88 ± 0.00,0.85 ± 0.01,0.78 ± 0.01,0.83 ± 0.01,2
4,ECFP-16 with counts,0.87 ± 0.01,0.89 ± 0.01,0.86 ± 0.02,0.65 ± 0.04,0.82 ± 0.01,3
5,Pepland,0.78 ± 0.01,0.85 ± 0.01,0.83 ± 0.01,0.62 ± 0.02,0.77 ± 0.01,3
6,PepFuNN,0.74 ± 0.02,0.88 ± 0.01,0.73 ± 0.02,0.44 ± 0.01,0.70 ± 0.02,4


In [42]:
# Canonical to non-canonical
dir = '../Results/canonical'

data = []

for file in os.listdir(dir):
    dataset = file.split('_')[0]
    rep = file.split('_')[6][:-4]
    path = os.path.join(dir, file)
    experiment = dir.split("/")[-1][0].upper() + dir.split("/")[-1][1:]
    model = file.split('_')[1]

    if model != 'lightgbm':
        continue

    tmp_df = pd.read_csv(path)
    tmp_df['model'] = file.split('_')[1]
    tmp_df['pre_pca'] = float(file.split('_')[3])
    tmp_df['post_pca'] = float(file.split('_')[5])
    tmp_df['dataset'] = dataset
    tmp_df['rep'] = rep
    tmp_df['experiment'] = experiment

    if 'binding' in dataset:
        metric = 'spcc'
    else:
        metric = 'mcc'

    for m, th in zip(tmp_df[f'{metric}_nc'], tmp_df['threshold']):
        data.append({
            'dataset': dataset,
            "GOOD": m,
            "Test set": "Non-canonical",
            "model": "lightgbm",
            "rep": rep,
            'threshold': th

        })
    for m, th in zip(tmp_df[f'{metric}_c'], tmp_df['threshold']):
        data.append({
            'dataset': dataset,
            "GOOD": m,
            "Test set": "Canonical",
            "model": "lightgbm",
            "rep": rep,
            'threshold': th

        })

df = pd.DataFrame(data)

In [43]:
print("Canonical to non-canonical")
tab3 = define_table(df[(df['Test set'] == 'Non-canonical')].copy())
tab3

Canonical to non-canonical
Standard deviation: 0.17828515687698224
Kruskal-Wallis p: 6.9e-10


Unnamed: 0,Representation,Protein-peptide binding affinity,Antiviral,Antibacterial,Cell penetration,Average,Significant rank
0,ChemBERTa-2,0.15 ± 0.01,0.38 ± 0.02,0.27 ± 0.01,0.07 ± 0.01,0.22 ± 0.01,**--1--**
1,ECFP-16,0.05 ± 0.01,0.35 ± 0.02,0.32 ± 0.01,0.10 ± 0.02,0.20 ± 0.01,2
2,PeptideCLM,0.32 ± 0.01,0.16 ± 0.01,0.23 ± 0.01,-0.06 ± 0.02,0.16 ± 0.01,3
3,ECFP-16 with counts,0.06 ± 0.01,0.27 ± 0.02,0.32 ± 0.01,-0.02 ± 0.01,0.15 ± 0.01,4
4,PepFuNN,-0.17 ± 0.02,0.29 ± 0.01,0.38 ± 0.01,0.01 ± 0.02,0.11 ± 0.02,5
5,Molformer-XL,0.14 ± 0.01,0.11 ± 0.01,0.39 ± 0.01,-0.15 ± 0.02,0.11 ± 0.02,6
6,Pepland,0.05 ± 0.01,0.15 ± 0.02,-0.04 ± 0.01,0.20 ± 0.02,0.10 ± 0.01,7


In [44]:
print(tab1.to_markdown(index=False))

| Representation      | Antiviral (canonical)   | Protein-peptide binding affinity (canonical)   | Cell penetration (canonical)   | Antibacterial (canonical)   | Average     | Significant rank   |
|:--------------------|:------------------------|:-----------------------------------------------|:-------------------------------|:----------------------------|:------------|:-------------------|
| ESM2 8M             | 0.78 ± 0.01             | 0.90 ± 0.01                                    | 0.91 ± 0.01                    | 0.81 ± 0.02                 | 0.85 ± 0.01 | **--1--**          |
| ECFP-16 with counts | 0.75 ± 0.01             | 0.91 ± 0.01                                    | 0.94 ± 0.01                    | 0.79 ± 0.02                 | 0.84 ± 0.01 | **--1--**          |
| Prot-T5-XL          | 0.77 ± 0.01             | 0.90 ± 0.00                                    | 0.91 ± 0.01                    | 0.81 ± 0.02                 | 0.84 ± 0.01 | **--1--**          |
| ESM2 150M    

In [45]:
print(tab2.to_markdown(index=False))

| Representation      | Antiviral (non-canonical)   | Antibacterial (non-canonical)   | Protein-peptide binding affinity (non-canonical)   | Cell penetration (non-canonical)   | Average     | Significant rank   |
|:--------------------|:----------------------------|:--------------------------------|:---------------------------------------------------|:-----------------------------------|:------------|:-------------------|
| Molformer-XL        | 0.91 ± 0.01                 | 0.88 ± 0.01                     | 0.85 ± 0.02                                        | 0.89 ± 0.01                        | 0.88 ± 0.01 | **--1--**          |
| ChemBERTa-2         | 0.91 ± 0.01                 | 0.87 ± 0.00                     | 0.88 ± 0.01                                        | 0.84 ± 0.02                        | 0.88 ± 0.01 | **--1--**          |
| ECFP-16             | 0.87 ± 0.01                 | 0.90 ± 0.01                     | 0.87 ± 0.01                                        | 0.71 ± 

In [46]:
print(tab3.to_markdown(index=False))

| Representation      | Protein-peptide binding affinity   | Antiviral   | Antibacterial   | Cell penetration   | Average     | Significant rank   |
|:--------------------|:-----------------------------------|:------------|:----------------|:-------------------|:------------|:-------------------|
| ChemBERTa-2         | 0.15 ± 0.01                        | 0.38 ± 0.02 | 0.27 ± 0.01     | 0.07 ± 0.01        | 0.22 ± 0.01 | **--1--**          |
| ECFP-16             | 0.05 ± 0.01                        | 0.35 ± 0.02 | 0.32 ± 0.01     | 0.10 ± 0.02        | 0.20 ± 0.01 | 2                  |
| PeptideCLM          | 0.32 ± 0.01                        | 0.16 ± 0.01 | 0.23 ± 0.01     | -0.06 ± 0.02       | 0.16 ± 0.01 | 3                  |
| ECFP-16 with counts | 0.06 ± 0.01                        | 0.27 ± 0.02 | 0.32 ± 0.01     | -0.02 ± 0.01       | 0.15 ± 0.01 | 4                  |
| PepFuNN             | -0.17 ± 0.02                       | 0.29 ± 0.01 | 0.38 ± 0.01     | 0.01 ± 0.02  