In [1]:
# Notebook setup: locate the project, load the generated responses, and build
# the metric objects used by the following cells.
import json
import os  # no longer used in this cell; kept in case other cells rely on it
import sys
from pathlib import Path

import pandas as pd

# The project root is taken to be the parent of the current working directory.
DIR_HOME = Path.cwd().parent
DIR_CONVERSATION = DIR_HOME / "data" / "conversations"

# Make the project root importable. This has to happen before the `src.*`
# imports below; the membership check keeps sys.path from growing when the
# cell is re-run.
if str(DIR_HOME) not in sys.path:
    sys.path.append(str(DIR_HOME))

# Read the responses with an explicit encoding so the result does not depend
# on the platform's locale default.
with open(DIR_CONVERSATION / "text-ada-001-single-response.json", encoding="utf-8") as f:
    responses = json.load(f)

from src.utils import cohen_d, norm_diff_stdev
from src.metrics import SentenceBERTDiversity, Length

# Metric objects applied to each response's completions in the next cell.
grp_metrics = [SentenceBERTDiversity("paraphrase-MiniLM-L3-v2")]
ind_metrics = [Length()]

In [2]:
# Score every response record in place: each metric's result is stored on the
# record under the metric's own name.
for response in responses:
    completions = response["completion"]
    # Group metrics: only the first element of the returned value is kept.
    response.update({metric.name: metric(completions)[0] for metric in grp_metrics})
    # Individual metrics: the returned value is stored unchanged.
    response.update({metric.name: metric(completions) for metric in ind_metrics})

# One row per record, addressed by the (qid, cid) pair.
frame = pd.DataFrame(responses)
df_responses = frame.set_index(["qid", "cid"])
df_responses.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,model,max_tokens,stop,n,temperature,top_p,frequency_penalty,presence_penalty,prompt,question,completion,sentencebert_diversity,length
qid,cid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,text-ada-001,128,"[Therapist:, Patient:]",8,0.5,0.7,1.0,1.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,"[I would feel embarrassed and humiliated., I w...",0.337541,"[6, 6, 6, 4, 6, 6, 10, 13]"
0,1,text-ada-001,128,"[Therapist:, Patient:]",8,0.1,0.7,1.0,1.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,"[I would feel embarrassed and humiliated., I w...",0.054004,"[6, 6, 6, 6, 6, 6, 6, 6]"
0,2,text-ada-001,128,"[Therapist:, Patient:]",8,1.0,0.7,1.0,1.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,[I would feel upset and would likely feel humi...,0.367895,"[9, 4, 17, 9, 6, 6, 13, 6]"
0,3,text-ada-001,128,"[Therapist:, Patient:]",8,0.5,0.5,1.0,1.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,"[I would feel embarrassed and humiliated., I w...",0.149487,"[6, 6, 6, 7, 10, 6, 6, 6]"
0,4,text-ada-001,128,"[Therapist:, Patient:]",8,0.5,0.9,1.0,1.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,"[I would feel angry and frustrated., I would f...",0.286294,"[6, 6, 6, 6, 6, 15, 6, 6]"


In [4]:
# Compare every test configuration (cid = base_cid + 1 .. last_cid) against the
# baseline configuration, using the cohen_d and norm_diff_stdev helpers.
base_cid = 0
last_cid = 10
last_qid = 23
test_cids = range(base_cid + 1, last_cid + 1)

print("\nDiversity comparison >>>")
# The baseline diversity values do not depend on the test configuration, so
# they are looked up once instead of on every loop iteration.
diversity_base = df_responses.xs(base_cid, level="cid")["sentencebert_diversity"].values
stats = []
for cid in test_cids:
    diversity_test = df_responses.xs(cid, level="cid")["sentencebert_diversity"].values
    # Scores are stored as 3-decimal strings, so the printed table shows
    # fixed precision (the columns are text, not numbers).
    stats.append({"base_cfg": base_cid,
                  "test_cfg": cid,
                  "cohen_d": f"{cohen_d(diversity_base, diversity_test):.3f}",
                  "norm_diff_stdev": f"{norm_diff_stdev(diversity_base, diversity_test):.3f}"})
print(pd.DataFrame(stats))

print("\nLength comparison >>>")
# The baseline `length` entry of each question is likewise independent of the
# test configuration: fetch it once per qid rather than once per (cid, qid).
length_base_by_qid = {
    qid: df_responses.xs((qid, base_cid), level=["qid", "cid"])["length"].values[0]
    for qid in range(last_qid + 1)
}
stats = []
for cid in test_cids:
    # Dicts preserve insertion order, so rows come out in ascending qid order
    # within each cid, exactly as with a nested range loop.
    for qid, length_base in length_base_by_qid.items():
        length_test = df_responses.xs((qid, cid), level=["qid", "cid"])["length"].values[0]

        stats.append({"base_cfg": base_cid,
                      "test_cfg": cid,
                      "qid": qid,
                      "cohen_d": f"{cohen_d(length_base, length_test):.3f}",
                      "norm_diff_stdev": f"{norm_diff_stdev(length_base, length_test):.3f}"})
print(pd.DataFrame(stats))


Diversity comparison >>>
   base_cfg  test_cfg cohen_d norm_diff_stdev
0         0         1   1.191           0.042
1         0         2  -0.373           0.224
2         0         3   0.482          -0.192
3         0         4   0.151           0.039
4         0         5   0.050          -0.133
5         0         6  -0.030           0.093
6         0         7   0.049          -0.085
7         0         8   0.113           0.026
8         0         9  -0.052          -0.000
9         0        10   0.159          -0.072

Length comparison >>>
     base_cfg  test_cfg  qid cohen_d norm_diff_stdev
0           0         1    0   0.549           1.414
1           0         1    1   0.368           0.558
2           0         1    2   0.048           0.371
3           0         1    3  -0.055          -0.485
4           0         1    4  -0.877           0.539
..        ...       ...  ...     ...             ...
235         0        10   19  -0.597          -0.404
236         0        