# Tests on statistical difference

This notebook tests whether the top-match results produced by the different models differ to a statistically significant degree.

- For top-match results (figure 2), use a two-sample proportions z-test on each pair of binary series.
- For efo-batet scores (figure 3), use a two-sample Kolmogorov–Smirnov test on each pair of continuous series.

In [2]:
from pathlib import Path

import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest
from itertools import combinations

In [3]:
# Load the wide score table: one column per model.
df_path = Path("all_top.csv")
assert df_path.exists(), "Input file not found: {}".format(df_path)
df0 = pd.read_csv(df_path)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df0.info()
df0

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1191 entries, 0 to 1190
Data columns (total 9 columns):
BLUEBERT-EFO    1191 non-null float64
BioBERT         1191 non-null float64
BioSentVec      1191 non-null float64
BlueBERT        1191 non-null float64
GUSE            1191 non-null float64
Spacy           1191 non-null float64
SciSpacy        1191 non-null float64
Zooma           1191 non-null float64
Levenshtein     1191 non-null float64
dtypes: float64(9)
memory usage: 83.9 KB
None


Unnamed: 0,BLUEBERT-EFO,BioBERT,BioSentVec,BlueBERT,GUSE,Spacy,SciSpacy,Zooma,Levenshtein
0,0.411765,0.190476,1.000000,0.400000,0.045455,0.053763,1.000000,0.000000,0.400000
1,0.185185,1.000000,1.000000,0.305556,0.173913,1.000000,1.000000,0.000000,0.900000
2,0.714286,0.076923,0.076923,0.076923,0.117647,0.222222,0.714286,0.076923,0.086957
3,0.071429,0.227273,0.105263,0.058824,0.105263,0.105263,0.105263,0.117647,0.117647
4,0.800000,0.833333,0.833333,0.833333,1.000000,0.833333,0.833333,0.500000,0.444444
...,...,...,...,...,...,...,...,...,...
1186,0.800000,0.333333,0.800000,0.888889,0.666667,0.350000,0.368421,0.470588,0.296296
1187,1.000000,0.408163,0.689655,0.666667,0.703704,0.666667,0.703704,0.178571,0.833333
1188,1.000000,0.555556,0.555556,0.555556,0.555556,0.555556,0.555556,0.555556,0.555556
1189,0.142857,0.312500,0.692308,1.000000,0.692308,0.692308,0.692308,1.000000,1.000000


In [4]:
# Reshape df0 to long format: one row per (id, model) pair, where `id` is the
# row label df0 had before the reset.
df1 = (
    df0
    .reset_index(drop=False)
    .rename(columns={"index": "id"})
    .melt(id_vars=["id"], var_name="model")
    # Binary indicator: 0.0 when the score is strictly below 1.0, else 1.0.
    .assign(value_map=lambda long_df: np.where(long_df["value"] < 1.0, 0.0, 1.0))
)
df1

Unnamed: 0,id,model,value,value_map
0,0,BLUEBERT-EFO,0.411765,0.0
1,1,BLUEBERT-EFO,0.185185,0.0
2,2,BLUEBERT-EFO,0.714286,0.0
3,3,BLUEBERT-EFO,0.071429,0.0
4,4,BLUEBERT-EFO,0.800000,0.0
...,...,...,...,...
10714,1186,Levenshtein,0.296296,0.0
10715,1187,Levenshtein,0.833333,0.0
10716,1188,Levenshtein,0.555556,0.0
10717,1189,Levenshtein,1.000000,1.0


In [5]:
# Per-model number and percentage of rows with value_map == 1.0.
# Named aggregation gives the result readable column labels instead of the
# auto-generated <lambda_0> / <lambda_1>.
df1.groupby("model")["value_map"].agg(
    n_top_match="sum",
    pct_top_match=lambda s: s.sum() / len(s) * 100,
)

Unnamed: 0_level_0,value_map,value_map
Unnamed: 0_level_1,<lambda_0>,<lambda_1>
model,Unnamed: 1_level_2,Unnamed: 2_level_2
BLUEBERT-EFO,462.0,38.790932
BioBERT,343.0,28.799328
BioSentVec,480.0,40.302267
BlueBERT,347.0,29.135181
GUSE,341.0,28.631402
Levenshtein,266.0,22.334173
SciSpacy,435.0,36.523929
Spacy,340.0,28.547439
Zooma,442.0,37.111671


In [6]:
# Every unordered pair of distinct model names, in order of first appearance.
model_names = df1["model"].unique().tolist()
combs = list(combinations(model_names, 2))

## tests on top matching

In [7]:
# https://www.statsmodels.org/devel/generated/statsmodels.stats.proportion.proportions_ztest.html
# Two-sample proportions z-test on the binary `value_map` series of every
# model pair in `combs`.
# NOTE(review): all models are scored on the same rows of df0, so the two
# series in a pair look paired rather than independent, and no
# multiple-comparison correction is applied across pairs -- confirm intended.

# Tally successes and observations once per model instead of re-filtering
# df1 four times for every pair.
_top_match_groups = df1.groupby("model")["value_map"]
_top_match_count = _top_match_groups.sum()
_top_match_nobs = _top_match_groups.size()


def _prop_ztest_record(series0, series1):
    """Return (series0, series1, z statistic, p-value) for one model pair."""
    pair = [series0, series1]
    test_stat, test_pval = proportions_ztest(
        count=_top_match_count.loc[pair].to_numpy(),
        nobs=_top_match_nobs.loc[pair].to_numpy(),
    )
    return series0, series1, test_stat, test_pval


df_prop_ztest = pd.DataFrame(
    [_prop_ztest_record(series0, series1) for series0, series1 in combs],
    # Explicit column list keeps the column order fixed.
    columns=["series0", "series1", "test_stat", "test_pval"],
).assign(
    pval_leq_005=lambda df: df["test_pval"].le(0.05),
    pval_leq_010=lambda df: df["test_pval"].le(0.10),
)
df_prop_ztest

Unnamed: 0,series0,series1,test_stat,test_pval,pval_leq_005,pval_leq_010
0,BLUEBERT-EFO,BioBERT,5.154708,2.540267e-07,True,True
1,BLUEBERT-EFO,BioSentVec,-0.754287,0.4506768,False,False
2,BLUEBERT-EFO,BlueBERT,4.975424,6.510497e-07,True,True
3,BLUEBERT-EFO,GUSE,5.24454,1.566726e-07,True,True
4,BLUEBERT-EFO,Spacy,5.289504,1.226482e-07,True,True
5,BLUEBERT-EFO,SciSpacy,1.141761,0.2535534,False,False
6,BLUEBERT-EFO,Zooma,0.844461,0.398412,False,False
7,BLUEBERT-EFO,Levenshtein,8.717531,2.843336e-18,True,True
8,BioBERT,BioSentVec,-5.902941,3.570772e-09,True,True
9,BioBERT,BlueBERT,-0.180678,0.85662,False,False


## tests on efo-batet scores

In [8]:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html#scipy.stats.ks_2samp
# Two-sample Kolmogorov-Smirnov test on the raw `value` scores of every model
# pair in `combs`.
# NOTE(review): the summary table above shows a large share of scores equal to
# exactly 1.0, so the samples contain many ties; the KS test is usually stated
# for continuous distributions -- confirm the p-values are fit for purpose.

# Extract each model's score array once instead of re-filtering df1 twice for
# every pair.
_scores_by_model = {
    model: scores.to_numpy()
    for model, scores in df1.groupby("model")["value"]
}


def _ks2samp_record(series0, series1):
    """Return (series0, series1, KS statistic, p-value) for one model pair."""
    test_res = stats.ks_2samp(
        _scores_by_model[series0],
        _scores_by_model[series1],
    )
    return series0, series1, test_res[0], test_res[1]


df_ks2samp = pd.DataFrame(
    [_ks2samp_record(series0, series1) for series0, series1 in combs],
    # Explicit column list keeps the column order fixed.
    columns=["series0", "series1", "test_stat", "test_pval"],
).assign(
    pval_leq_005=lambda df: df["test_pval"].le(0.05),
    pval_leq_010=lambda df: df["test_pval"].le(0.10),
)
df_ks2samp

Unnamed: 0,series0,series1,test_stat,test_pval,pval_leq_005,pval_leq_010
0,BLUEBERT-EFO,BioBERT,0.142737,5.384227e-11,True,True
1,BLUEBERT-EFO,BioSentVec,0.13602,5.076454e-10,True,True
2,BLUEBERT-EFO,BlueBERT,0.170445,1.610935e-15,True,True
3,BLUEBERT-EFO,GUSE,0.208228,5.229497e-23,True,True
4,BLUEBERT-EFO,Spacy,0.20403,4.229504e-22,True,True
5,BLUEBERT-EFO,SciSpacy,0.130143,3.305194e-09,True,True
6,BLUEBERT-EFO,Zooma,0.203191,6.391052e-22,True,True
7,BLUEBERT-EFO,Levenshtein,0.29639,1.55917e-46,True,True
8,BioBERT,BioSentVec,0.115029,2.784133e-07,True,True
9,BioBERT,BlueBERT,0.042821,0.2249447,False,False


## Display

In [21]:
def format_test_results(row):
    """Render one test result as ``"<stat><stars> (<p-value>)"``.

    Parameters
    ----------
    row : mapping
        Must provide ``"test_stat"`` and ``"test_pval"`` entries.

    Returns
    -------
    str
        The statistic with three decimals, followed by significance stars
        (``***`` for p <= 0.01, ``**`` for p <= 0.05, ``*`` for p <= 0.1,
        nothing otherwise) and the p-value in scientific notation.
    """
    p_value = row["test_pval"]
    statistic = row["test_stat"]
    # Ordered strictest-first so the first cutoff met decides the marker.
    significance_marks = ((0.01, "***"), (0.05, "**"), (0.1, "*"))
    stars = next(
        (mark for cutoff, mark in significance_marks if p_value <= cutoff),
        "",
    )
    return "{stat:.3f}{star} ({pval:.3e})".format(
        stat=statistic, star=stars, pval=p_value
    )

# Columns identifying a model pair; also used as the join key below.
pair_cols = ["series0", "series1"]

# Collapse each test table to one formatted string per model pair.
df_disp0 = df_prop_ztest[pair_cols].assign(
    disp=df_prop_ztest.apply(format_test_results, axis=1)
)

df_disp1 = df_ks2samp[pair_cols].assign(
    disp=df_ks2samp.apply(format_test_results, axis=1)
)

# Side-by-side view: one row per model pair, one column per test.
df_disp = pd.merge(
    df_disp0.rename(columns={"disp": "prop_ztest"}),
    df_disp1.rename(columns={"disp": "ks2samp"}),
    on=pair_cols,
)
df_disp

Unnamed: 0,series0,series1,prop_ztest,ks2samp
0,BLUEBERT-EFO,BioBERT,5.155*** (2.540e-07),0.143*** (5.384e-11)
1,BLUEBERT-EFO,BioSentVec,-0.754 (4.507e-01),0.136*** (5.076e-10)
2,BLUEBERT-EFO,BlueBERT,4.975*** (6.510e-07),0.170*** (1.611e-15)
3,BLUEBERT-EFO,GUSE,5.245*** (1.567e-07),0.208*** (5.229e-23)
4,BLUEBERT-EFO,Spacy,5.290*** (1.226e-07),0.204*** (4.230e-22)
5,BLUEBERT-EFO,SciSpacy,1.142 (2.536e-01),0.130*** (3.305e-09)
6,BLUEBERT-EFO,Zooma,0.844 (3.984e-01),0.203*** (6.391e-22)
7,BLUEBERT-EFO,Levenshtein,8.718*** (2.843e-18),0.296*** (1.559e-46)
8,BioBERT,BioSentVec,-5.903*** (3.571e-09),0.115*** (2.784e-07)
9,BioBERT,BlueBERT,-0.181 (8.566e-01),0.043 (2.249e-01)
