In [11]:
import os
import numpy as np
import pandas as pd

datasets = {
    'c-binding': 'Protein-peptide binding affinity (canonical)',
    'binding': 'Protein-peptide binding affinity',
    'c-cpp': 'Cell penetration (canonical)',
    'cpp': 'Cell penetration',
    'nc-binding': 'Protein-peptide binding affinity (non-canonical)',
    'nc-cpp': 'Cell penetration (non-canonical)',
    'nc-antibacterial': "Antibacterial (non-canonical)",
    'antibacterial': "Antibacterial",
    'c-antibacterial': "Antibacterial (canonical)",
    'nc-antiviral': "Antiviral (non-canonical)",
    'antiviral': "Antiviral",
    'c-antiviral': "Antiviral (canonical)"
}
datasets_sim = [
    'c-binding', 'nc-binding', 'c-cpp',
    'nc-cpp', 'c-antibacterial', 'nc-antibacterial',
    'c-antiviral', 'nc-antiviral'
]
metrics_class = {
    'acc': 'Accuracy', 'f1_weighted': 'Weighted F1',
    'mcc': 'Matthew\'s correlation coefficient',
    'auroc': "Area under the ROC curve"
}
metrics_reg = {
    'pcc': 'Pearson\'s R', 'spcc': 'Spearman\'s R',
    'rmse': 'RMSE'
}
fancy_rep = {
    'esm2-8m': 'ESM2 8M',
    'esm2-150m': "ESM2 150M",
    'prot-t5-xl': "Prot-T5-XL",
    'molformer': 'Molformer-XL',
    'chemberta': 'ChemBERTa-2',
    'pepfunn': 'PepFuNN',
    'ecfp': "ECFP-16",
    'ecfp-count': "ECFP-16 count",
    'pepclm': "PeptideCLM",
    'pepland': "Pepland",
    ## Add new representations
    'ecfp-count': "ECFP-16 with counts"
}
metrics_fancy = metrics_class 
metrics_fancy.update(metrics_reg)
REGRESSION = ['c-binding', 'nc-binding']
DPI = 512
sorted(datasets_sim)

['c-antibacterial',
 'c-antiviral',
 'c-binding',
 'c-cpp',
 'nc-antibacterial',
 'nc-antiviral',
 'nc-binding',
 'nc-cpp']

In [12]:
from scipy.stats import kruskal, wilcoxon


def get_stats(df: pd.DataFrame, order: list):
    g_df = df.groupby('rep')
    groups = {}

    for n, mini_df in g_df:
        groups[n] = mini_df['GOOD'].to_numpy()

    names = {name: idx for idx, name in enumerate(order)}
    mtx = np.ones((len(groups), len(groups)))

    for pair1 in order:
        for pair2 in order:
            if pair1 == pair2:
                continue    
            if len(groups[pair1]) != len(groups[pair2]):
                value = 2
            else:
                value = wilcoxon(
                    groups[pair1], groups[pair2],
                    alternative='greater'
                )[1]
            mtx[names[pair1], names[pair2]] = value
    return groups, mtx

def order_datasets(df: pd.DataFrame) -> list:
    names = [n for n, g in df.groupby('rep')]
    means = df.groupby('rep')['GOOD'].mean()
    order = np.argsort(means)
    names = [names[i] for i in order]
    return names

def define_table(df: pd.DataFrame) -> pd.DataFrame:
    df['rep'] = df.rep.map(fancy_rep)
    df = df[df['threshold'] != 'random']
    all_proto_df = []
    for idx, dataset in enumerate(df.dataset.unique()):
        tmp_df = df[df['dataset'] == dataset].copy()
        if not dataset.startswith('c-') and not dataset.startswith('nc-'):
            metric = 'GOOD'
        else:
            if dataset in REGRESSION:
                metric = 'spcc'
            else:
                metric = 'mcc'

        for model in tmp_df.model.unique():
            for threshold in tmp_df.threshold.unique():
                for rep in tmp_df.rep.unique():
                    tmp_df_2 = tmp_df[(tmp_df.model == model) & (tmp_df.threshold == threshold) & (tmp_df.rep == rep)]
                    if len(tmp_df_2) < 1:
                        continue
                    entry = {
                        'rep': rep, 'threshold': threshold,
                        'model': model, 'GOOD': tmp_df_2[metric].mean(),
                        'dataset': dataset
                    }
                    all_proto_df.append(entry)

    df = pd.DataFrame(all_proto_df)
    print("Standard deviation:", df.GOOD.std())

    names = order_datasets(df)
    groups, mtx = get_stats(df, names)

    # Kruskal-Wallis test
    p_value = kruskal(*list(groups.values()))[1]
    print(f"Kruskal-Wallis p: {p_value:.1g}")

    n = len(groups)
    alpha_adj = 0.05 * 2 / (n * (n - 1))
    ranks = []
    names.reverse()
    for idx, name in enumerate(names):
        if idx == 0:
            ranks.append(1)
            continue
        else:
            if mtx[idx, idx - 1] < alpha_adj:
                ranks.append(ranks[-1])
            else:
                ranks.append(ranks[-1] + 1)

    final_table = []
    for name, rank in zip(names, ranks):
        entry = {"Representation": name}
        for dataset in df.dataset.unique():
            tmp_df = df[(df['rep'] == name) & (df['dataset'] == dataset)].copy()
            sem = f"{tmp_df['GOOD'].sem():.1g}"
            if len(sem) == 4:
                mean = f"{tmp_df['GOOD'].mean():.2f}"
            elif len(sem) == 3:
                mean = f"{tmp_df['GOOD'].mean():.1f}"
            else:
                mean = f"{tmp_df['GOOD'].mean():.3g}"
            entry[datasets[dataset]] = f"{mean} ± {sem}"
    
        tmp_df = df[df['rep'] == name].copy()
        sem = f"{tmp_df['GOOD'].sem():.1g}"
        if len(sem) == 4:
            mean = f"{tmp_df['GOOD'].mean():.2f}"
        elif len(sem) == 3:
            mean = f"{tmp_df['GOOD'].mean():.1f}"
        else:
            mean = f"{tmp_df['GOOD'].mean():.3g}"

        entry['Average'] = f"{mean} ± {sem}"
        entry["Significant rank"] = f"**--{rank}--**" if rank < 2 else rank
        final_table.append(entry)

    table = pd.DataFrame(final_table)
    return table

In [13]:
dir = '../Results/no-generalisation'
df = pd.DataFrame()

for file in os.listdir(dir):

    path = os.path.join(dir, file)
    tmp_df = pd.read_csv(path)
    tmp_df['model'] = file.split('_')[1]
    tmp_df['pre_pca'] = float(file.split('_')[3])
    tmp_df['post_pca'] = float(file.split('_')[5])
    tmp_df['dataset'] = file.split('_')[0]
    tmp_df['rep'] = file.split('_')[6][:-4]
    df = pd.concat([df, tmp_df])

In [14]:
# Canonical LightGBM
print("Values for LightGBM Canonical")
df_c = df[df['dataset'].map(lambda x: x.startswith('c-'))].copy()
df_c = df_c[df_c['model'] == 'lightgbm'].copy()
tab1 = define_table(df_c)
tab1

Values for LightGBM Canonical
Standard deviation: 0.11544005258082035
Kruskal-Wallis p: 0.02


Unnamed: 0,Representation,Antiviral (canonical),Protein-peptide binding affinity (canonical),Cell penetration (canonical),Antibacterial (canonical),Average,Significant rank
0,ECFP-16 with counts,0.75 ± 0.03,0.92 ± 0.01,0.93 ± 0.01,0.79 ± 0.03,0.86 ± 0.01,**--1--**
1,ESM2 8M,0.78 ± 0.03,0.90 ± 0.02,0.91 ± 0.02,0.81 ± 0.05,0.85 ± 0.02,2
2,Prot-T5-XL,0.77 ± 0.03,0.90 ± 0.01,0.91 ± 0.02,0.81 ± 0.05,0.84 ± 0.02,3
3,ESM2 150M,0.76 ± 0.03,0.88 ± 0.02,0.91 ± 0.02,0.81 ± 0.05,0.83 ± 0.02,4
4,ECFP-16,0.74 ± 0.03,0.90 ± 0.01,0.92 ± 0.02,0.77 ± 0.06,0.83 ± 0.02,4
5,ChemBERTa-2,0.73 ± 0.03,0.89 ± 0.01,0.90 ± 0.02,0.80 ± 0.05,0.82 ± 0.02,5
6,PeptideCLM,0.71 ± 0.03,0.86 ± 0.01,0.90 ± 0.02,0.79 ± 0.05,0.81 ± 0.02,6
7,Pepland,0.70 ± 0.03,0.89 ± 0.02,0.88 ± 0.03,0.78 ± 0.05,0.81 ± 0.02,7
8,Molformer-XL,0.68 ± 0.04,0.88 ± 0.02,0.91 ± 0.02,0.77 ± 0.05,0.80 ± 0.02,8
9,PepFuNN,0.73 ± 0.03,0.76 ± 0.01,0.89 ± 0.03,0.68 ± 0.05,0.76 ± 0.02,9


In [15]:
# Non-canonical LightGBM
print("Values for LightGBM Non-canonical")
df_nc = df[df['dataset'].map(lambda x: x.startswith('nc-'))].copy()
df_nc = df_nc[df_nc['model'] == 'lightgbm']
tab2 = define_table(df_nc)
tab2

Values for LightGBM Non-canonical
Standard deviation: 0.11042846704661881
Kruskal-Wallis p: 3e-10


Unnamed: 0,Representation,Antiviral (non-canonical),Antibacterial (non-canonical),Protein-peptide binding affinity (non-canonical),Cell penetration (non-canonical),Average,Significant rank
0,ECFP-16 with counts,0.86 ± 0.02,0.90 ± 0.01,0.87 ± 0.03,0.92 ± 0.02,0.88 ± 0.01,**--1--**
1,Molformer-XL,0.91 ± 0.03,0.88 ± 0.01,0.85 ± 0.04,0.89 ± 0.03,0.88 ± 0.02,2
2,ChemBERTa-2,0.91 ± 0.03,0.87 ± 0.01,0.88 ± 0.03,0.84 ± 0.04,0.88 ± 0.01,2
3,ECFP-16,0.87 ± 0.03,0.90 ± 0.01,0.87 ± 0.03,0.71 ± 0.03,0.84 ± 0.02,3
4,PeptideCLM,0.83 ± 0.02,0.88 ± 0.01,0.85 ± 0.03,0.78 ± 0.02,0.83 ± 0.01,4
5,Pepland,0.78 ± 0.03,0.85 ± 0.02,0.83 ± 0.03,0.62 ± 0.05,0.77 ± 0.02,5
6,PepFuNN,0.74 ± 0.03,0.88 ± 0.01,0.73 ± 0.05,0.62 ± 0.02,0.74 ± 0.02,6


In [16]:
# Canonical to non-canonical
print("Values for generalizing from canonical to non-canonical")
dir = '../Results/canonical'

data = []

for file in os.listdir(dir):
    dataset = file.split('_')[0]
    rep = file.split('_')[6][:-4]
    path = os.path.join(dir, file)
    experiment = dir.split("/")[-1][0].upper() + dir.split("/")[-1][1:]
    model = file.split('_')[1]
    if model != 'lightgbm':
        continue

    tmp_df = pd.read_csv(path)
    tmp_df['model'] = file.split('_')[1]
    tmp_df['pre_pca'] = float(file.split('_')[3])
    tmp_df['post_pca'] = float(file.split('_')[5])
    tmp_df['dataset'] = dataset
    tmp_df['rep'] = rep
    tmp_df['experiment'] = experiment

    if 'binding' in dataset:
        metric = 'spcc'
    else:
        metric = 'mcc'

    for m, th in zip(tmp_df[f'{metric}_nc'], tmp_df['threshold']):
        data.append({
            'dataset': dataset,
            "GOOD": m,
            "Test set": "Non-canonical",
            "model": "lightgbm",
            "rep": rep,
            'threshold': th

        })

df = pd.DataFrame(data)

Values for generalizing from canonical to non-canonical


In [17]:
tab3 = define_table(df)
tab3

Standard deviation: 0.17016033393341223
Kruskal-Wallis p: 0.03


Unnamed: 0,Representation,Protein-peptide binding affinity,Antiviral,Antibacterial,Cell penetration,Average,Significant rank
0,ChemBERTa-2,0.15 ± 0.02,0.38 ± 0.03,0.266 ± 0.009,0.07 ± 0.03,0.22 ± 0.03,**--1--**
1,ECFP-16,0.05 ± 0.03,0.35 ± 0.04,0.315 ± 0.008,0.05 ± 0.03,0.19 ± 0.03,2
2,PeptideCLM,0.32 ± 0.02,0.16 ± 0.02,0.23 ± 0.02,0.02 ± 0.04,0.19 ± 0.02,3
3,Molformer-XL,0.14 ± 0.02,0.11 ± 0.01,0.39 ± 0.02,-0.01 ± 0.04,0.15 ± 0.03,4
4,ECFP-16 with counts,0.06 ± 0.03,0.27 ± 0.03,0.319 ± 0.009,-0.17 ± 0.02,0.13 ± 0.04,5
5,PepFuNN,-0.17 ± 0.05,0.29 ± 0.02,0.38 ± 0.02,-0.02 ± 0.03,0.11 ± 0.05,6
6,Pepland,0.05 ± 0.02,0.15 ± 0.03,-0.04 ± 0.01,0.16 ± 0.04,0.09 ± 0.02,7


In [18]:
print(tab1.to_markdown(index=False))

| Representation      | Antiviral (canonical)   | Protein-peptide binding affinity (canonical)   | Cell penetration (canonical)   | Antibacterial (canonical)   | Average     | Significant rank   |
|:--------------------|:------------------------|:-----------------------------------------------|:-------------------------------|:----------------------------|:------------|:-------------------|
| ECFP-16 with counts | 0.75 ± 0.03             | 0.92 ± 0.01                                    | 0.93 ± 0.01                    | 0.79 ± 0.03                 | 0.86 ± 0.01 | **--1--**          |
| ESM2 8M             | 0.78 ± 0.03             | 0.90 ± 0.02                                    | 0.91 ± 0.02                    | 0.81 ± 0.05                 | 0.85 ± 0.02 | 2                  |
| Prot-T5-XL          | 0.77 ± 0.03             | 0.90 ± 0.01                                    | 0.91 ± 0.02                    | 0.81 ± 0.05                 | 0.84 ± 0.02 | 3                  |
| ESM2 150M    

In [19]:
print(tab2.to_markdown(index=False))

| Representation      | Antiviral (non-canonical)   | Antibacterial (non-canonical)   | Protein-peptide binding affinity (non-canonical)   | Cell penetration (non-canonical)   | Average     | Significant rank   |
|:--------------------|:----------------------------|:--------------------------------|:---------------------------------------------------|:-----------------------------------|:------------|:-------------------|
| ECFP-16 with counts | 0.86 ± 0.02                 | 0.90 ± 0.01                     | 0.87 ± 0.03                                        | 0.92 ± 0.02                        | 0.88 ± 0.01 | **--1--**          |
| Molformer-XL        | 0.91 ± 0.03                 | 0.88 ± 0.01                     | 0.85 ± 0.04                                        | 0.89 ± 0.03                        | 0.88 ± 0.02 | 2                  |
| ChemBERTa-2         | 0.91 ± 0.03                 | 0.87 ± 0.01                     | 0.88 ± 0.03                                        | 0.84 ± 

In [20]:
print(tab3.to_markdown(index=False))

| Representation      | Protein-peptide binding affinity   | Antiviral   | Antibacterial   | Cell penetration   | Average     | Significant rank   |
|:--------------------|:-----------------------------------|:------------|:----------------|:-------------------|:------------|:-------------------|
| ChemBERTa-2         | 0.15 ± 0.02                        | 0.38 ± 0.03 | 0.266 ± 0.009   | 0.07 ± 0.03        | 0.22 ± 0.03 | **--1--**          |
| ECFP-16             | 0.05 ± 0.03                        | 0.35 ± 0.04 | 0.315 ± 0.008   | 0.05 ± 0.03        | 0.19 ± 0.03 | 2                  |
| PeptideCLM          | 0.32 ± 0.02                        | 0.16 ± 0.02 | 0.23 ± 0.02     | 0.02 ± 0.04        | 0.19 ± 0.02 | 3                  |
| Molformer-XL        | 0.14 ± 0.02                        | 0.11 ± 0.01 | 0.39 ± 0.02     | -0.01 ± 0.04       | 0.15 ± 0.03 | 4                  |
| ECFP-16 with counts | 0.06 ± 0.03                        | 0.27 ± 0.03 | 0.319 ± 0.009   | -0.17 ± 0.02 