In [1]:
# --- Imports: standard library first, then third-party ---------------------
import json
import os
import sys
from pathlib import Path

import pandas as pd

# --- Paths -----------------------------------------------------------------
# The project root is taken to be the parent of the current working
# directory, so this notebook must be launched from a direct sub-folder of it.
DIR_HOME = Path(os.getcwd()).parent
DIR_CONVERSATION = DIR_HOME / "data" / "conversations"

# Make the project-local `src` package importable. The membership check keeps
# the cell idempotent: re-running it does not pile up duplicate path entries.
if str(DIR_HOME) not in sys.path:
    sys.path.append(str(DIR_HOME))

# Project-local imports (must come after the sys.path tweak above).
from src.utils import cohen_d, norm_diff_stdev
from src.metrics import SentenceBERTDiversity, Length

# --- Data ------------------------------------------------------------------
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default locale encoding.
with open(DIR_CONVERSATION / "text-davinci-003-single-response.json", encoding="utf-8") as f:
    responses = json.load(f)

# --- Metrics ---------------------------------------------------------------
# Two groups of metrics, kept separate because their return values are handled
# differently when they are applied to a record's completions.
grp_metrics = [SentenceBERTDiversity("paraphrase-MiniLM-L3-v2")]
ind_metrics = [Length()]

In [2]:
# Score every record and store the results on the record itself, keyed by the
# metric's name, so each score becomes a column of the DataFrame built below.
for response in responses:
    completions = response["completion"]

    # Group metrics: only the first element of the returned value is stored.
    for metric in grp_metrics:
        response[metric.name] = metric(completions)[0]

    # Individual metrics: the returned value is stored unchanged.
    for metric in ind_metrics:
        response[metric.name] = metric(completions)

# One row per record, addressed by the ("qid", "cid") pair.
df_responses = (
    pd.DataFrame(responses)
    .set_index(["qid", "cid"])
)
df_responses.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,model,max_tokens,stop,n,temperature,top_p,frequency_penalty,presence_penalty,prompt,question,completion,sentencebert_diversity,length
qid,cid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,text-davinci-003,128,"[Therapist:, Patient:]",8,0.5,0.7,1.0,1.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,[I would feel embarrassed and ashamed. I would...,0.254657,"[39, 41, 41, 41, 44, 42, 31, 43]"
0,1,text-davinci-003,128,"[Therapist:, Patient:]",8,0.1,0.7,1.0,1.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,[I would feel very hurt and angry. I would als...,0.166667,"[33, 32, 30, 33, 41, 30, 33, 33]"
0,2,text-davinci-003,128,"[Therapist:, Patient:]",8,1.0,0.7,1.0,1.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,[I would feel very upset and angry. I would al...,0.309026,"[36, 44, 45, 43, 44, 45, 33, 55]"
0,3,text-davinci-003,128,"[Therapist:, Patient:]",8,0.5,0.5,1.0,1.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,[I would feel very hurt and disrespected. I th...,0.342677,"[44, 46, 42, 29, 41, 40, 42, 37]"
0,4,text-davinci-003,128,"[Therapist:, Patient:]",8,0.5,0.9,1.0,1.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,[I would feel very hurt and angry. I would als...,0.311913,"[32, 43, 45, 47, 40, 42, 52, 39]"


In [3]:
# Compare every test configuration against a baseline configuration.
#   base_cid : cid used as the reference
#   last_cid : highest cid compared against the reference (inclusive)
#   last_qid : highest qid included in the per-question comparison (inclusive)
base_cid = 0
last_cid = 10
last_qid = 23

print("\nDiversity comparison >>>")
# The baseline cross-section does not depend on the test configuration, so it
# is extracted once rather than on every loop iteration.
diversity_base = df_responses.xs(base_cid, level="cid").sentencebert_diversity.values
stats = []
for cid in range((base_cid + 1), (last_cid + 1)):
    diversity_test = df_responses.xs(cid, level="cid").sentencebert_diversity.values
    # Effect sizes are stored as 3-decimal strings, purely for the printed table.
    stats.append({"base_cfg": base_cid,
                  "test_cfg": cid,
                  "cohen_d": f"{cohen_d(diversity_base, diversity_test):.3f}",
                  "norm_diff_stdev": f"{norm_diff_stdev(diversity_base, diversity_test):.3f}"})
print(pd.DataFrame(stats))

print("\nLength comparison >>>")
# Baseline `length` entry for each question, looked up once and reused for
# every test configuration instead of being re-fetched inside the nested loop.
length_base_by_qid = {
    qid: df_responses.xs((qid, base_cid), level=["qid", "cid"]).length.values[0]
    for qid in range(last_qid + 1)
}
stats = []
for cid in range((base_cid + 1), (last_cid + 1)):
    for qid in range(last_qid + 1):
        length_base = length_base_by_qid[qid]
        # `.values[0]` takes the `length` entry of the first row matching (qid, cid).
        length_test = df_responses.xs((qid, cid), level=["qid", "cid"]).length.values[0]
        stats.append({"base_cfg": base_cid,
                      "test_cfg": cid,
                      "qid": qid,
                      "cohen_d": f"{cohen_d(length_base, length_test):.3f}",
                      "norm_diff_stdev": f"{norm_diff_stdev(length_base, length_test):.3f}"})
print(pd.DataFrame(stats))


Diversity comparison >>>
   base_cfg  test_cfg cohen_d norm_diff_stdev
0         0         1   1.584           0.416
1         0         2  -0.128           0.167
2         0         3   0.176           0.099
3         0         4   0.073           0.055
4         0         5   0.321           0.155
5         0         6   0.290           0.194
6         0         7   0.018           0.006
7         0         8   0.209           0.030
8         0         9   0.150           0.194
9         0        10   0.050           0.058

Length comparison >>>
     base_cfg  test_cfg  qid cohen_d norm_diff_stdev
0           0         1    0   1.902           0.156
1           0         1    1   1.007           0.070
2           0         1    2  -0.063           0.521
3           0         1    3   0.238           1.137
4           0         1    4  -0.548           0.360
..        ...       ...  ...     ...             ...
235         0        10   19   0.568          -0.913
236         0        