KSC corpora experiments for paper

In [None]:
import sys  
sys.path.insert(0, '../src')
sys.path.insert(0, '../../meme/src')
sys.path.insert(0, '../../comparing-corpora/src')

import numpy as np
import pandas as pd
from compcor.text_tokenizer_embedder import STTokenizerEmbedder
from compcor.corpus_metrics import *
import matplotlib.pyplot as plt
import seaborn as sns
import os
from utils import QUORA, load_paraphrases, common_tokens_matrix, sort_numsuffix
from binarize_metrics import *
from ksc_methods import plotKSC, runKSC, runKSC_fixed_sample

In [None]:
# generate base samples from QUORA of size subsample_size to ensure that C_A and C_B are paired paraphrases
np.random.seed(42)
subsample_size = 50
nrepetitions = 5  # number of random samplings of entire KSC set

res_out_dir = os.path.join(os.getcwd(), 'ksc_results', 'data')
fig_out_dir = os.path.join(os.getcwd(), 'ksc_results', 'figures')
# the figures below are written with plt.savefig into fig_out_dir, which
# raises if the directory does not exist yet, so create it up front
os.makedirs(fig_out_dir, exist_ok=True)

# generate a single sampling of subsample_size pairs of paraphrases, to use for the KSC corpora
corpus1, corpus2 = load_paraphrases(subsample=subsample_size)
# quick sanity check: show the first few items of each corpus
print(corpus1[:5])
print(corpus2[:5])

In [None]:
# plot pairs of paraphrases in embedding space
from sentence_transformers import SentenceTransformer
# embed and do TSNE transformation
from sklearn.manifold import TSNE
import umap # need to install umap-learn and not umap
import matplotlib.pyplot as plt
# from mycolorpy import colorlist as mcp
# import itertools
# name of the sentence-transformers model used to embed the documents
sentence_transformer_model = "all-MiniLM-L12-v2"

# 2-D projection methods, both driven by the same distance function
reducers = dict(
    TSNE=TSNE(n_components=2, metric=cosine_arccos_transform),
    UMAP=umap.UMAP(metric=cosine_arccos_transform),
)

# use the same model as the encoder, for consistency
def embed_sentences(sentences, normalize=False):
    """Embed sentences with the configured sentence-transformers model.

    Args:
        sentences: the texts to embed; passed straight to
            ``SentenceTransformer.encode``.
        normalize: when True, the embedding matrix is passed through
            scikit-learn's ``normalize`` (default settings) before returning.

    Returns:
        The embeddings produced by ``encode``, normalized if requested.
    """
    embedder = SentenceTransformer(sentence_transformer_model)
    vectors = embedder.encode(sentences, show_progress_bar=True)
    if not normalize:
        return vectors
    # Imported here (and aliased, since the parameter is also called
    # `normalize`) because this notebook never imports sklearn's
    # `preprocessing` explicitly; relying on a wildcard import to provide it
    # would make normalize=True fail with a NameError.
    from sklearn.preprocessing import normalize as sk_normalize
    return sk_normalize(vectors)

# embed each corpus separately; the list order is [corpus1, corpus2]
corpus_embeddings = [embed_sentences(sentences=corp) for corp in [corpus1, corpus2]]

lc = len(corpus1)
projection = {}
for method_name, reducer in reducers.items():
    # fit on both corpora stacked together so they share one projected space...
    coords = reducer.fit_transform(np.vstack(corpus_embeddings))
    # ...then split the rows back into the corpus1 part and the corpus2 part
    projection[method_name] = [coords[:lc, :], coords[lc:, :]]

In [None]:
# document distances: pairwise distance matrix over the stacked corpus embeddings
dmat = cosine_arccos_transform(c1=np.vstack(corpus_embeddings)).astype(float)
dim = dmat.shape[0]
halfdim = dim // 2

# heatmap
fig = plt.imshow(dmat, cmap='hot', vmin=0)
ax = plt.gca()
ax.set_title('A and B document cosine distances ' + r'$\delta$')

# unlabelled major ticks every halfdim cells (starting at -0.5) carry the grid lines
ticks = np.arange(-0.5, dim + 0.5, halfdim)
tick_labels = [""] * len(ticks)
for set_ticks in (ax.set_xticks, ax.set_yticks):
    set_ticks(ticks, labels=tick_labels, minor=False)
ax.grid(color='gray', linestyle='-', linewidth=0.5)

# minor ticks at a quarter and three quarters of the axis carry the corpus labels
label_locs = np.array([0.25, 0.75]) * dim
labels = ["A", "B"]
for set_ticks in (ax.set_xticks, ax.set_yticks):
    set_ticks(label_locs, labels=labels, minor=True)

plt.colorbar()
plt.savefig(os.path.join(fig_out_dir, 'cosine_distance_mat.pdf'))
plt.show()

# density plot
# distance between each document and itself (0 by definition)
equal_distances = np.diag(dmat).astype(float)
A_vs_B = dmat[0:halfdim, halfdim:]  # upper-right quadrant: first half of the rows vs second half of the columns
# the i-th document of each half forms a paraphrase pair, i.e. the diagonal of the quadrant
paraphrase_distances = np.diag(A_vs_B)
upper_idx = np.triu_indices(n=halfdim, k=1)
# Unlike the two within-corpus quadrants, A_vs_B is not symmetric: entry (i, j)
# and entry (j, i) are different document pairs. Every off-diagonal entry is
# therefore a distinct non-paraphrase pair, so take all of them, not just the
# upper triangle.
cross_idx = ~np.eye(halfdim, dtype=bool)
# take non-identical pairs within A and B, and non-paraphrases between A and B
nonparaphrase_distances = np.concatenate((dmat[:halfdim, :halfdim][upper_idx], dmat[halfdim:, halfdim:][upper_idx], A_vs_B[cross_idx]))


dist_type = ['paraphrases'] * len(paraphrase_distances) + ['identical'] * len(equal_distances) + ['non-paraphrases'] * len(nonparaphrase_distances)
Ddf = pd.DataFrame({"d": np.concatenate((paraphrase_distances, equal_distances, nonparaphrase_distances)),
                    "type": dist_type})
# add a small nonzero value so can plot densities
Ddf = pd.concat([Ddf, pd.DataFrame({'d': 0.0001, 'type': 'identical'}, index=[len(Ddf)])])


# one density curve per distance type; the legend is switched off (legend=False)
g = sns.kdeplot(
    data=Ddf,
    x='d',
    hue='type',
    common_norm=False,
    clip=[dmat.min(), dmat.max()],
    warn_singular=False,
    legend=False,
)

# cap the y-axis just above the tallest of the curves drawn as lines 0 and 2
# NOTE(review): presumably this skips the 'identical' curve, whose spike near 0
# would otherwise dominate the scale -- confirm the line order seaborn produces
plotted_lines = g.get_lines()
max_dens = 1.025 * max(plotted_lines[ii].get_data()[1].max() for ii in (0, 2))
g.set_ylim(top=max_dens)

for set_tick_style in (plt.xticks, plt.yticks):
    set_tick_style(fontsize='xx-small')
g.set_xlabel(xlabel='distance ' + r'$\delta$', fontsize='xx-small')
g.set_ylabel(ylabel='density', fontsize='xx-small')
plt.title('Document cosine distances ' + r'$\delta$', fontsize='small')
plt.savefig(os.path.join(fig_out_dir, 'distribution_cosine_distance_mat.pdf'))
plt.show()
	

In [None]:
# now plot
import seaborn as sns
import pandas as pd

colors = ['orange', 'blue']

# one figure per projection method
for pmethod, proj in projection.items():
    # first corpus: filled markers; second corpus: hollow markers
    for ii, cc in enumerate(proj):
        edge = colors[ii]
        face = 'none' if ii else edge
        plt.scatter(cc[:, 0], cc[:, 1], facecolors=face, edgecolors=edge,
                    label='corpus {}'.format(ii + 1))
    # arrow from each point of the first corpus to the same-index point of the second
    for start, end in zip(proj[0], proj[1]):
        plt.arrow(x=start[0], y=start[1], dx=end[0] - start[0], dy=end[1] - start[1])
    plt.legend(fontsize='xx-small')
    for set_tick_style in (plt.xticks, plt.yticks):
        set_tick_style(fontsize='xx-small')
    plt.title('{} projection'.format(pmethod))
    plt.savefig(os.path.join(fig_out_dir, '{}_paraphrase_projection.pdf'.format(pmethod)))
    plt.show()

In [None]:
# Illustrate how Directed Average Hausdorff Distance works

from illustrate_distributionality import *
from sklearn.metrics.pairwise import euclidean_distances

X0, X1 = gen_data(n_samples=15, draw=False, jitter_std=2.0)
X0 = X0.X
dmat = euclidean_distances(X=X0, Y=X1)
neibs0 = dmat.argmin(axis=1)  # nearest neighbor of each in X0
neibs1 = dmat.argmin(axis=0)  # nearest neighbor of each in X1
# displacement from every point to its nearest neighbor in the other sample
dxy0 = X1[neibs0, :] - X0
dxy1 = X0[neibs1, :] - X1

# average directional distance: mean length of the displacement vectors
dxy0_mean = np.sqrt(np.square(dxy0).sum(axis=1)).mean()
dxy1_mean = np.sqrt(np.square(dxy1).sum(axis=1)).mean()

plt.scatter(X0[:, 0], X0[:, 1], s=120, facecolors='none', edgecolors='green', label='sample 1', alpha=0.5)
plt.scatter(X1[:, 0], X1[:, 1], s=120, c='green', label='sample 2', alpha=0.5)

arrow_12 = dict(color='grey', linestyle="dotted", width=0.01, head_width=0.4, length_includes_head=True)
arrow_21 = dict(color='green', linestyle="solid", width=0.001, head_width=0.4, length_includes_head=True)

for xy, dxy in zip(X0, dxy0):
    plt.arrow(x=xy[0], y=xy[1], dx=dxy[0], dy=dxy[1], **arrow_12)
# draw the last arrow once more, this time labelled, so the legend gets a single entry
plt.arrow(x=xy[0], y=xy[1], dx=dxy[0], dy=dxy[1], label='neighbor 1->2', **arrow_12)
for xy, dxy in zip(X1, dxy1):
    plt.arrow(x=xy[0], y=xy[1], dx=dxy[0], dy=dxy[1], **arrow_21)
# same trick for the opposite direction
plt.arrow(x=xy[0], y=xy[1], dx=dxy[0], dy=dxy[1], label='neighbor 2->1', **arrow_21)


plt.title('AHD distance = ({} + {})/2'.format(np.round(dxy0_mean, 2), np.round(dxy1_mean, 2)))
plt.legend(fontsize='x-small')

# set aspect as equal
plt.gca().set_aspect(1.0)
plt.savefig(os.path.join(fig_out_dir, 'Directed_Average_Hausdorff.pdf'))
plt.show()




In [None]:
# define more specific metrics
def _pr_metric(k, **options):
    """Return a two-corpus callable that runs pr_distance with nearest_k=k."""
    return lambda c1, c2: pr_distance(corpus1=c1, corpus2=c2, nearest_k=k, **options)


def _dc_metric(k, **options):
    """Return a two-corpus callable that runs dc_distance with nearest_k=k."""
    return lambda c1, c2: dc_distance(corpus1=c1, corpus2=c2, nearest_k=k, **options)


# define specific values of k for PR (and DC), with the functions' default distance
PR1, PR2, PR5, PR10 = (_pr_metric(k) for k in (1, 2, 5, 10))
DC1, DC2, DC5, DC10 = (_dc_metric(k) for k in (1, 2, 5, 10))

# using cosine rather than euclidean
PR1_cos, PR2_cos, PR3_cos, PR5_cos, PR10_cos = (_pr_metric(k, cosine=True) for k in (1, 2, 3, 5, 10))
DC1_cos, DC2_cos, DC3_cos, DC5_cos, DC10_cos = (_dc_metric(k, cosine=True) for k in (1, 2, 3, 5, 10))

# create list of metrics and names for plotting (the two lists are index-aligned)
base_metrics = [
    fid_distance, mauve_distance, classifier_distance, IRPR_distance,
    DC1_cos, DC2_cos, DC3_cos, DC5_cos, DC10_cos,
    PR1_cos, PR2_cos, PR3_cos, PR5_cos, PR10_cos,
    Directed_Hausdorff_distance, Energy_distance,
]
base_metrics_names = [
    'FID', 'MAUVE', 'CLASSIFIER', 'IRPR',
    'DC_1', 'DC_2', 'DC_3', 'DC_5', 'DC_10',
    'PR_1', 'PR_2', 'PR_3', 'PR_5', 'PR_10',
    'HAUSDORFF', 'ENERGY',
]

In [None]:
# run KSC on original corpora
metrics_df, distances_df, _ = runKSC_fixed_sample(base_metrics, metric_names=base_metrics_names, corpus1=corpus1, corpus2=corpus2, repetitions=nrepetitions, n=subsample_size, k=15, coverage=True, output=None)

# the two baseline metrics are plotted separately from all the others
is_baseline = np.isin(distances_df['metric'], ['ENERGY', 'HAUSDORFF'])
plotKSC(distances_df.loc[is_baseline], boxplot=True, standardized=True, ncolumns=2, output=fig_out_dir, fname='KSC_standardized_baselines.pdf')
plotKSC(distances_df.loc[np.logical_not(is_baseline)], boxplot=True, standardized=True, ncolumns=8, output=fig_out_dir, fname='KSC_standardized_others.pdf')

# a smaller subset of the non-baseline metrics, plotted standardized and raw
selected = ['FID', 'MAUVE', 'IRPR', 'CLASSIFIER', 'DC_1', 'DC_2', 'DC_5', 'PR_1', 'PR_2', 'PR_5']
is_selected = np.isin(distances_df['metric'], selected)
plotKSC(distances_df.loc[is_selected], boxplot=True, standardized=True, ncolumns=5, output=fig_out_dir, fname='KSC_standardized_others_subset.pdf')
# the "raw" figure must not be standardized; with standardized=True it was an
# exact duplicate of the standardized subset figure above
plotKSC(distances_df.loc[is_selected], boxplot=True, standardized=False, ncolumns=5, output=fig_out_dir, fname='KSC_raw_others_subset.pdf')


Now try to classify each distance metric $d$ as more or less distributional, according to how closely it behaves like the baselines ENERGY and HAUSDORFF.

In [None]:
sys.path.insert(0, '../src')
from probability_functions import *
from itertools import combinations
from ndicts import NestedDict
from copy import deepcopy
from statsmodels.nonparametric.kde import KDEUnivariate
from collections import defaultdict, namedtuple
from ksc_methods import runKSC_fixed_sample, calcKSC_scores_fixed_sample, get_metric_dependant_data
from scipy.stats import anderson_ksamp, cramervonmises_2samp
from statsmodels.stats.multitest import multipletests

def est_density(x_true):
    """Fit a univariate kernel density estimate to x_true and return the fit result."""
    kde = KDEUnivariate(x_true)
    return kde.fit()

def obs2ell_dict(ells, d):
    """Group the observations in d by their paired label in ells.

    ells and d are iterated in lockstep. The result maps each distinct label
    (in order of first appearance) to a numpy array of the observations that
    carried it, in their original order.
    """
    grouped = {}
    for label, value in zip(ells, d):
        grouped.setdefault(label, []).append(value)
    return {label: np.array(values) for label, values in grouped.items()}

def _cramer_von_mises_pvalue(x, y):
    """p-value of the two-sample Cramer-von Mises test between x and y."""
    return cramervonmises_2samp(x, y).pvalue


def _anderson_darling_pvalue(x, y):
    """p-value of the k-sample Anderson-Darling test applied to the two samples x and y."""
    return anderson_ksamp(samples=[x, y]).pvalue


# two-sample goodness-of-fit tests, keyed by display name; each maps (x, y) -> p-value
gof_tests = {
    'Cramer Von-Mises': _cramer_von_mises_pvalue,
    'Anderson-Darling': _anderson_darling_pvalue,
}

# bound used when clipping infinite log-densities
inf_clip = 1e50


class KSC_probabilities:
    """Compare a metric's KSC distance profile with those of two baseline metrics.

    On construction, ``runKSC_fixed_sample`` is run on the two corpora with the
    baseline metrics (labelled 'Hausdorff' and 'Energy'). For every value of
    ``l`` in the returned distances, the ``distance_score`` values are pooled
    across repetitions and a kernel density estimate (KDE) is fitted to them.
    ``test_new_metric`` then evaluates another metric on the same KSC index
    sets and compares its per-``l`` scores with each baseline.
    """

    # evaluation grid for the numerical integral of squared KDE differences
    EVAL_STD_PTS = np.linspace(start=-8, stop=8, num=3000)
    # spacing of that grid
    EVAL_STD_DX = EVAL_STD_PTS[1] - EVAL_STD_PTS[0]

    def __init__(self, corpus1, corpus2, nrepetitions=5, k=5):
        """Run the baseline metrics and fit their per-ell densities.

        Args:
            corpus1, corpus2: the two corpora; must have the same length.
            nrepetitions: forwarded to ``runKSC_fixed_sample`` as ``repetitions``.
            k: forwarded to ``runKSC_fixed_sample`` after being cast to int and
                limited to the range [1, len(corpus1)].

        Raises:
            ValueError: if the corpora differ in length.
        """
        # check are the same length
        if len(corpus1) != len(corpus2):
            raise ValueError('corpora must have the same length')

        self.corpus1, self.corpus2 = corpus1, corpus2
        self.n = len(self.corpus1)
        self.k = max(1, min(self.n, int(k)))
        self.nrepetitions = nrepetitions
        self.like_res = namedtuple('like_res', 'metric_name full_lv full_decision trunc_lv trunc_decision')

        metrics = [Directed_Hausdorff_distance, Energy_distance]
        metrics_names = ['Hausdorff', 'Energy']

        _, sim_distances, self.ksc_indices = runKSC_fixed_sample(metrics, metric_names=metrics_names,
                                                                 corpus1=self.corpus1, corpus2=self.corpus2,
                                                                 repetitions=self.nrepetitions, n=self.n, k=self.k,
                                                                 coverage=True, output=None)
        self.ells = sim_distances['l'].unique().tolist()
        # use standardized distances
        # pool across repetitions for each ell
        self.base_metric_distances = {mn: obs2ell_dict(ells=df['l'], d=df['distance_score'])
                                      for mn, df in sim_distances.groupby('metric')}
        # how much to weight each ell's samples: its share of the observations,
        # counted on the first baseline metric
        ell_counts = {ell: len(vv) for ell, vv in self.base_metric_distances[metrics_names[0]].items()}
        sum_wts = np.sum(list(ell_counts.values()))
        self.ell_wts = {ell: cnt / sum_wts for ell, cnt in ell_counts.items()}

        self.base_metric_kdes = {mn: {ell: self._est_density(vals) for ell, vals in elldict.items()}
                                 for mn, elldict in self.base_metric_distances.items()}

    def _est_density(self, x_true):
        """Fit a univariate KDE to x_true and return the fit result.

        If all values are identical, one grid step is added on either side of
        the value so that the estimated bandwidth is not 0.
        """
        uv = np.unique(x_true)
        if len(uv) == 1:
            uv = uv[0]
            # add value on either side so bandwidth won't be 0
            x_true = np.concatenate((x_true, [uv - KSC_probabilities.EVAL_STD_DX, uv + KSC_probabilities.EVAL_STD_DX]))
        return KDEUnivariate(x_true).fit()

    def _loglike(self, densfunc, xs, as_sum=True):
        """Log-density of xs under densfunc, clipped to [-inf_clip, inf_clip].

        Returns the sum of the clipped values when as_sum is True, otherwise
        the array of per-point values.
        """
        res = np.clip(a=np.log(densfunc.evaluate(xs)), a_min=-1*inf_clip, a_max=inf_clip)
        return res.sum() if as_sum else res

    def _squared_discrepancy(self, base_dens_func, test_dens_func):
        """Approximate the integral of the squared difference of two densities.

        Both densities are evaluated on EVAL_STD_PTS and the squared
        differences are summed and multiplied by the grid spacing.
        """
        # Fan 1994 approximate test
        grid = KSC_probabilities.EVAL_STD_PTS
        diff = base_dens_func.evaluate(grid) - test_dens_func.evaluate(grid)
        return KSC_probabilities.EVAL_STD_DX * np.sum(np.square(diff))

    def _fill_missing_ells(self, distances):
        """Give every baseline ell an entry in distances (empty array if unobserved).

        The dict is modified in place and also returned. Entries for ells that
        are not among the baseline ells are left untouched.
        """
        for ell in self.ells:
            if ell not in distances:
                distances[ell] = np.array([]).astype(float)
        return distances

    def calc_metric_distances(self, metric, metric_name):
        """Score a metric on the stored KSC index sets, pooled per ell.

        Args:
            metric: the metric callable to evaluate.
            metric_name: label forwarded to ``calcKSC_scores_fixed_sample``.

        Returns:
            dict mapping ell (int) to an array of ``distance_score`` values
            pooled across repetitions. Rows with a NaN distance or score are
            dropped first; a baseline ell left without any observation maps to
            an empty array.
        """
        # test a new metric vs the two baselines
        c1 = get_metric_dependant_data(metric, self.corpus1)
        c2 = get_metric_dependant_data(metric, self.corpus2)

        distance_dfs = pd.concat([calcKSC_scores_fixed_sample(c1, c2, indices_from_each=ksc_idxs, metric=metric, metric_name=metric_name, rep=ii)[0]
                                  for ii, ksc_idxs in enumerate(self.ksc_indices)])
        # drop NaN values
        distance_dfs.dropna(subset=['distance', 'distance_score'], inplace=True)

        # combine across repetitions
        eval_metric_distances = obs2ell_dict(ells=distance_dfs['l'].astype(int), d=distance_dfs['distance_score'].astype(float))
        # The membership test used to be inverted: it blanked ells that WERE
        # observed but absent from the baseline, and never added the baseline
        # ells that were actually missing.
        return self._fill_missing_ells(eval_metric_distances)

    def test_new_metric(self, metric, metric_name, alpha=0.05, wtd=True):
        """Compare a metric's per-ell score distributions with both baselines.

        Args:
            metric: the metric callable to evaluate.
            metric_name: label for the metric.
            alpha: level passed to ``multipletests``.
            wtd: if True, weight each ell's squared discrepancy by ``ell_wts``;
                otherwise weight all ells equally.

        Returns:
            (mult_test_res, fan_test_stat) where ``mult_test_res`` maps
            test name -> baseline name -> the ``multipletests`` result over the
            per-ell p-values, and ``fan_test_stat`` maps baseline name -> the
            mean over ells of the weighted squared KDE discrepancy.

        Raises:
            ValueError: if the metric has no usable observation for some
                baseline ell, since neither the tests nor the KDE can be
                computed on an empty sample.
        """
        metric_distances = self.calc_metric_distances(metric, metric_name)
        empty_ells = [ell for ell in self.ells if len(metric_distances[ell]) == 0]
        if empty_ells:
            raise ValueError('metric {} has no non-NaN distances for l in {}'.format(metric_name, empty_ells))
        metric_distances_kdes = {ell: self._est_density(vals) for ell, vals in metric_distances.items()}

        pvalues = {tn: {bmn: np.array([tf(x=metric_distances[ell], y=ellvals) for ell, ellvals in bmvals.items()])
                        for bmn, bmvals in self.base_metric_distances.items()}
                   for tn, tf in gof_tests.items()}
        # use default Holm-Sidak
        mult_test_res = {tn: {bmn: multipletests(pvals=bpvals, alpha=alpha, returnsorted=True)
                              for bmn, bpvals in tpvals.items()}
                         for tn, tpvals in pvalues.items()}

        # calculate estimate of sum squared deviation betwen densities (take mean to adjust for number of ells)
        # NOTE(review): the per-ell weights already sum to 1, so taking the mean
        # divides by the number of ells a second time -- confirm this is intended
        fan_test_stat = {bmn: np.mean([(self.ell_wts[ell] if wtd else 1/len(self.ell_wts)) * self._squared_discrepancy(base_dens_func=bmnf, test_dens_func=metric_distances_kdes[ell])
                                       for ell, bmnf in bkdefuncs.items()])
                         for bmn, bkdefuncs in self.base_metric_kdes.items()}

        return mult_test_res, fan_test_stat

In [None]:
# test various metrics against this baseline
ksc_tester = KSC_probabilities(corpus1=corpus1, corpus2=corpus2, k=15)

# show KDEs of resulting baselines metric distributions

import matplotlib.pyplot as plt

eval_points = np.linspace(start=-4, stop=3, num=1000)
colors = ['red', 'blue']
ltys = ['solid', 'dashed']
kde_types = list(ksc_tester.base_metric_kdes)

# one stacked panel per ell; panel number i (counting from 1) shows the KDEs stored under key i
fig, axs = plt.subplots(nrows=ksc_tester.k - 1, ncols=1, sharex=True, sharey=False)
for ell, ax in enumerate(fig.axes, start=1):
    for kt, col, lty in zip(kde_types, colors, ltys):
        hts = ksc_tester.base_metric_kdes[kt][ell].evaluate(eval_points)
        ax.plot(eval_points, hts, color=col, linestyle=lty)
    # hide the density scale and use the y-label slot for the value of ell instead
    ax.yaxis.set_ticklabels([])
    ax.set_ylabel(ell, rotation=0, fontsize=10)
    ax.yaxis.set_label_coords(-.03, 0.2)

fig.suptitle("KDEs " + r'$\hat{f}_d^{\ell}$' + " of " + r'$\tilde{D}^p_{\ell}(A,B,d)$' + ' vs ' + r'$\ell$', fontsize=20)
fig.supylabel(r'$\ell$', fontsize=20, rotation=0)
plt.savefig(os.path.join(fig_out_dir, 'KSC_baseline_testing_KDEs.pdf'))
plt.show()


In [None]:
# perform testing on metrics to group them

# pair every metric with its display name once, then test each pair with and
# without per-ell weighting of the squared deviations
named_metrics = list(zip(base_metrics, base_metrics_names))
test_results_wtd = [
    ksc_tester.test_new_metric(metric=mm, metric_name=mn, wtd=True)
    for mm, mn in named_metrics
]
test_results_unwtd = [
    ksc_tester.test_new_metric(metric=mm, metric_name=mn, wtd=False)
    for mm, mn in named_metrics
]

In [None]:
# plot the deviations

# the second element of each test result holds the squared deviations per baseline
res_dfs = {
    label: pd.DataFrame([result[1] for result in results])
    for label, results in (('weighted', test_results_wtd), ('unweighted', test_results_unwtd))
}


for kk in list(res_dfs):
    df = res_dfs[kk]
    df['metric'] = base_metrics_names
    # the baselines themselves are not compared against the baselines
    not_baseline = ~np.isin(df['metric'], ['HAUSDORFF', 'ENERGY'])
    # long format: one row per (metric, baseline) pair
    df = df.loc[not_baseline].melt(id_vars=['metric'], value_vars=['Energy', 'Hausdorff'])
    df = df.rename(columns={'variable': 'type', 'value': 'squared deviation'})

    res_dfs[kk] = df

    g = sns.catplot(data=df, kind="bar",
                    y="metric", x="squared deviation", hue="type",
                    palette="dark", alpha=.6, orient='h', height=10, aspect=1)
    plt.title(r'Squared deviation of KDEs from baseline, $\ell$-{}'.format(kk))
    plt.savefig(os.path.join(fig_out_dir, 'KDE_deviations_{}.pdf'.format(kk)))
    plt.show()


In [None]:
# investigate effect of components of DC and PR
def _pr_components(k):
    """Return a two-corpus callable running cosine pr_distance with components=True."""
    return lambda c1, c2: pr_distance(corpus1=c1, corpus2=c2, nearest_k=k, components=True, cosine=True)


def _dc_components(k):
    """Return a two-corpus callable running cosine dc_distance with components=True."""
    return lambda c1, c2: dc_distance(corpus1=c1, corpus2=c2, nearest_k=k, components=True, cosine=True)


# neighbourhood sizes to compare
ks = [1,2,3,5,10]

PR_component_metrics = [_pr_components(k) for k in ks]
PR1_cos_comp, PR2_cos_comp, PR3_cos_comp, PR5_cos_comp, PR10_cos_comp = PR_component_metrics
PR_names = ['PR_{}'.format(ii) for ii in ks]

DC_component_metrics = [_dc_components(k) for k in ks]
DC1_cos_comp, DC2_cos_comp, DC3_cos_comp, DC5_cos_comp, DC10_cos_comp = DC_component_metrics
DC_names = ['DC_{}'.format(ii) for ii in ks]

# run KSC once with the PR component metrics
_, pr_distances_df, _ = runKSC_fixed_sample(PR_component_metrics, metric_names=PR_names, corpus1=corpus1, corpus2=corpus2, repetitions=1, n=subsample_size, k=15, coverage=True, output=None)
# expand each 'distance' entry into its own columns
# NOTE(review): presumably these are the precision/recall components plotted later -- confirm the column names
tmp = pd.DataFrame(pr_distances_df['distance'].tolist()) # extract the distance
pr_distances_df.drop(columns=['distance'], inplace=True)
pr_distances_df = pd.concat([pr_distances_df, tmp], axis=1)

# same for the DC component metrics; they must be labelled with DC_names
# (they were previously passed PR_names, so the DC rows were mislabelled 'PR_k')
_, dc_distances_df, _ = runKSC_fixed_sample(DC_component_metrics, metric_names=DC_names, corpus1=corpus1, corpus2=corpus2, repetitions=1, n=subsample_size, k=15, coverage=True, output=None)
tmp = pd.DataFrame(dc_distances_df['distance'].tolist()) # extract the distance tuples
dc_distances_df.drop(columns=['distance'], inplace=True)
dc_distances_df = pd.concat([dc_distances_df, tmp], axis=1)

In [None]:
# now plot the components

# recover the neighbourhood size k from the metric label ('<name>_<k>') and
# rename the score column to the label used in the plots
for frame in (dc_distances_df, pr_distances_df):
    frame['k'] = [int(vv.split('_')[1]) for vv in frame['metric']]
    frame.rename(columns={'distance_score': 'standardized distance'}, inplace=True)

# (data, columns to plot, figure title, output file) for each three-panel figure
panels = [
    (pr_distances_df, ['precision', 'recall', 'standardized distance'],
     'Precision-Recall (PR) by k neighbors', 'pr_distance_vs_k.pdf'),
    (dc_distances_df, ['density', 'coverage', 'standardized distance'],
     'Density-Coverage (DC) by k neighbors', 'dc_distance_vs_k.pdf'),
]

for frame, columns, figure_title, fname in panels:
    fig, axs = plt.subplots(1, 3, figsize=(10 * 3, 8))

    # one boxplot panel per column, grouped by l and coloured by k
    for ax, vv in zip(fig.axes, columns):
        g = sns.boxplot(x='l', y=vv, data=frame, ax=ax, hue='k')
        g.set(ylabel=None, xlabel=r'$\ell$')
        ax.set_title(vv, fontdict={'size': 'xx-large'})
    fig.suptitle(figure_title, fontsize='xx-large')
    plt.tight_layout()
    plt.savefig(os.path.join(fig_out_dir, fname))
    plt.show()