In [1]:
# --- Standard library -------------------------------------------------------
import json
import os
import sys
from pathlib import Path

# --- Third-party ------------------------------------------------------------
import mysql.connector as connection
import pandas as pd
import sqlalchemy
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests

# --- Paths ------------------------------------------------------------------
# The project root is taken to be the parent of the current working directory,
# so the kernel must be started from a first-level subfolder of the project.
DIR_HOME = Path(os.getcwd()).parent
DIR_CONV = DIR_HOME / "data" / "conversations"
DIR_STAT = DIR_HOME / "data" / "statistics"

# --- Project-local imports --------------------------------------------------
# Make the project root importable so that the `src` package resolves.
# Guarded so that re-running this cell does not add duplicate entries.
if str(DIR_HOME) not in sys.path:
    sys.path.append(str(DIR_HOME))

from src.utils import cohen_d, norm_diff_stdev
from src.metrics import SentenceBERTDiversity, Length

# --- Data -------------------------------------------------------------------
# JSON is read explicitly as UTF-8: without `encoding`, open() falls back to
# the platform's locale encoding, which is not UTF-8 everywhere.
with open(DIR_CONV / "text-davinci-003-single-response.json", encoding="utf-8") as f:
    responses = json.load(f)

# --- Metrics ----------------------------------------------------------------
# grp_metrics: metrics whose result is indexed with [0] when applied to the
#              "completion" field of a record (see the next cell).
# ind_metrics: metrics whose result is stored unchanged.
# NOTE(review): the exact return types are defined in src.metrics - confirm there.
grp_metrics = [SentenceBERTDiversity("paraphrase-MiniLM-L3-v2")]
ind_metrics = [Length()]

In [2]:
# Attach every metric to its response record, stored under the metric's name.
for record in responses:
    completions = record["completion"]
    # Group metrics: only element [0] of the returned value is kept.
    record.update({m.name: m(completions)[0] for m in grp_metrics})
    # Individual metrics: the returned value is kept unchanged.
    record.update({m.name: m(completions) for m in ind_metrics})

# One row per record, addressed by (qid, cid).
df_responses = pd.DataFrame(responses).set_index(["qid", "cid"])
df_responses.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,model,max_tokens,stop,n,temperature,top_p,frequency_penalty,presence_penalty,prompt,question,completion,sentencebert_diversity,length
qid,cid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,text-davinci-003,128,"[Therapist:, Patient:]",8,1.0,1.0,0.0,0.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,[I would feel angry and disrespected. I think ...,0.30915,"[33, 41, 45, 44, 42, 43, 28, 50]"
0,1,text-davinci-003,128,"[Therapist:, Patient:]",8,0.1,1.0,0.0,0.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,[I would feel very hurt and angry. I would fee...,0.084255,"[29, 36, 29, 31, 41, 36, 29, 29]"
0,2,text-davinci-003,128,"[Therapist:, Patient:]",8,0.5,1.0,0.0,0.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,[I would feel very hurt and angry. I would fee...,0.292966,"[31, 32, 38, 24, 23, 37, 42, 32]"
0,3,text-davinci-003,128,"[Therapist:, Patient:]",8,1.5,1.0,0.0,0.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,[I would feel hurt and upset — likely to take ...,0.407062,"[25, 52, 59, 53, 30, 45, 48, 39]"
0,4,text-davinci-003,128,"[Therapist:, Patient:]",8,1.0,0.4,0.0,0.0,Below is a conversation between a patient and ...,How would you feel if someone called you a jerk?,[I would feel embarrassed and ashamed. I would...,0.221068,"[33, 43, 29, 35, 35, 29, 29, 28]"


In [3]:
# Compare every test configuration (cid = base_cid + 1 .. last_cid) against the
# baseline configuration `base_cid`.
#   grp_stats: one record per (test cid, group metric), computed across all qids.
#   ind_stats: one record per (test cid, qid, individual metric).
# cohen_d / norm_diff_stdev come from src.utils; ttest_ind is SciPy's two-sample
# t-test for independent samples, called with its default arguments.
# NOTE(review): the p-values stored here are not corrected for multiple
# comparisons in this cell, although `multipletests` is imported - confirm
# whether a correction is applied elsewhere.
base_cid = 0
last_cid = 12
last_qid = 23

grp_stats = []
ind_stats = []

# Baseline values do not depend on the test configuration, so they are
# extracted once here instead of being looked up again for every cid.
grp_base = {metric.name: df_responses.xs(base_cid, level="cid").loc[:, metric.name].values
            for metric in grp_metrics}
ind_base = {(qid, metric.name): df_responses.xs((qid, base_cid), level=["qid", "cid"])
                                            .loc[:, metric.name].values[0]
            for qid in range(last_qid + 1)
            for metric in ind_metrics}

for cid in range((base_cid + 1), (last_cid + 1)):

    # All rows of the current test configuration (one per qid).
    test_rows = df_responses.xs(cid, level="cid")

    for metric in grp_metrics:
        val_base = grp_base[metric.name]
        val_test = test_rows.loc[:, metric.name].values
        grp_stats.append({"base_cfg": base_cid,
                          "test_cfg": cid,
                          "metric": metric.name,
                          "cohen_d": cohen_d(val_base, val_test),
                          "norm_diff_stdev": norm_diff_stdev(val_base, val_test)})

    for qid in range(last_qid + 1):
        for metric in ind_metrics:
            val_base = ind_base[(qid, metric.name)]
            # .values[0] unwraps the single cell stored for this (qid, cid) pair.
            val_test = df_responses.xs((qid, cid), level=["qid", "cid"]).loc[:, metric.name].values[0]
            t_stats, p_val = ttest_ind(val_base, val_test)
            ind_stats.append({"base_cfg": base_cid,
                              "test_cfg": cid,
                              "qid": qid,
                              "metric": metric.name,
                              "cohen_d": cohen_d(val_base, val_test),
                              "norm_diff_stdev": norm_diff_stdev(val_base, val_test),
                              "t_stats": t_stats,
                              "p_val": p_val})

In [4]:
# Tabulate the group-metric comparisons: one row per (test_cfg, metric) record in grp_stats.
# NOTE(review): the saved output below stops at test_cfg 10, while the loop in
# In [3] runs up to last_cid = 12 - the output looks stale; re-run to confirm.
pd.DataFrame(grp_stats)

Unnamed: 0,base_cfg,test_cfg,metric,cohen_d,norm_diff_stdev
0,0,1,sentencebert_diversity,2.375071,0.255818
1,0,2,sentencebert_diversity,1.021291,0.146023
2,0,3,sentencebert_diversity,-1.006818,0.135542
3,0,4,sentencebert_diversity,1.443739,0.02521
4,0,5,sentencebert_diversity,0.895206,0.061441
5,0,6,sentencebert_diversity,0.611165,0.041811
6,0,7,sentencebert_diversity,-0.035742,0.054867
7,0,8,sentencebert_diversity,-0.141855,0.029493
8,0,9,sentencebert_diversity,-0.185723,0.013732
9,0,10,sentencebert_diversity,0.056167,0.078281


In [5]:
# Tabulate the per-question comparisons: one row per (test_cfg, qid, metric) record in ind_stats.
pd.DataFrame(ind_stats)

Unnamed: 0,base_cfg,test_cfg,qid,metric,cohen_d,norm_diff_stdev,t_stats,p_val
0,0,1,0,length,1.392375,0.406230,2.784750,0.014612
1,0,1,1,length,2.493999,0.530689,4.987998,0.000199
2,0,1,2,length,-0.089211,0.692073,-0.178422,0.860948
3,0,1,3,length,1.024154,0.541975,2.048307,0.059765
4,0,1,4,length,-0.646181,0.574595,-1.292363,0.217156
...,...,...,...,...,...,...,...,...
283,0,12,19,length,-0.567204,-0.316047,-1.134408,0.275674
284,0,12,20,length,-0.095164,-0.727292,-0.190327,0.851785
285,0,12,21,length,-0.433698,-0.164585,-0.867396,0.400348
286,0,12,22,length,0.549720,0.141331,1.099440,0.290123


## __Compute LIWC2015 Metrics with `DLATK`__

In [6]:
# Name of the MySQL table that will hold one row per individual message.
table_name = "davinci_single_response"

# Long format: keep only the identifiers and the completions, call the
# completions "message", and expand each list so every element gets its own row.
# NOTE(review): this rebinds `df_responses`, which In [2] defined as a
# (qid, cid)-indexed frame; In [3] cannot be re-run after this cell without
# re-running In [2] first.
df_responses = (
    pd.DataFrame(responses)[["qid", "cid", "completion"]]
    .rename(columns={"completion": "message"})
    .explode("message")
)

# DDL statements; they are executed in a later cell.
sql_drop_table = f"""DROP TABLE IF EXISTS {table_name};"""
sql_create_table = f"""
CREATE TABLE {table_name} (
    message_id INT(11) NOT NULL AUTO_INCREMENT PRIMARY KEY,
    qid INT(10) UNSIGNED NOT NULL,
    cid INT(10) UNSIGNED NOT NULL,
    message TEXT
);
"""

In [7]:
# NOTE(review): host, schema and user are hard-coded and no password is passed;
# consider reading them from configuration before sharing this notebook.
db_settings = {"host": "localhost", "database": "wangy49", "user": "wangy49"}
cnx = connection.connect(**db_settings)
cursor = cnx.cursor()


def _shared_connection():
    """Return the already-open MySQL connection instead of opening a new one."""
    return cnx


# SQLAlchemy engine whose connections are supplied by the callable above, so
# work done through the engine goes over the same connection as `cursor`.
engine = sqlalchemy.create_engine('mysql+mysqlconnector://', creator=_shared_connection)

In [8]:
# Rebuild the target table from scratch: drop any previous copy, then create it.
for ddl in (sql_drop_table, sql_create_table):
    cursor.execute(ddl)
cnx.commit()

In [9]:
# Append the exploded messages to the freshly created table. The DataFrame
# index is not written, so message_id is filled by the table's AUTO_INCREMENT.
# The call's return value is left as the cell's displayed result.
df_responses.to_sql(name=table_name, con=engine, index=False, if_exists="append")

2496

In [10]:
# Close the MySQL connection opened in In [7]. `cursor` was created from it and
# `engine` hands out this same connection, so neither should be used afterwards.
cnx.close()

In [14]:
# Shell commands kept for reference only (commented out, not executed by this cell):
# the first starts the MySQL service, the second dumps the `wangy49` database to ../db/.
# NOTE(review): this cell's execution count (In [14]) is out of sequence with its
# neighbours, so the saved output may come from an earlier version of the cell.
# !sudo service mysql start
# !mysqldump -u wangy49 wangy49 > ../db/wangy49.sql

# The bare string below is never sent to the database; as the last expression of
# the cell it is only echoed back as output. It documents SQL that joins a LIWC2015
# feature table to the message table (group_id = message_id) and stores
# message_id, qid, cid, feat, value and group_norm in `davinci_single_resp_liwc`.
# NOTE(review): it refers to `davinci_single_resp`, whereas In [6] creates
# `davinci_single_response` - confirm which table name is the right one.
# NOTE(review): presumably the CSV read in In [12] is an export of this table - confirm.
"""
DROP TABLE IF EXISTS davinci_single_resp_liwc; 
CREATE TABLE davinci_single_resp_liwc AS 
    SELECT 
        message_id, 
        qid, 
        cid, 
        feat, 
        value, 
        group_norm 
    FROM 
        feat$cat_LIWC2015$davinci_single_resp$message_id$1gra AS f 
        INNER JOIN (SELECT qid, cid, message_id FROM davinci_single_resp) AS i 
        ON f.group_id = i.message_id;
"""

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'\nDROP TABLE IF EXISTS davinci_single_resp_liwc; \nCREATE TABLE davinci_single_resp_liwc AS \n    SELECT \n        message_id, \n        qid, \n        cid, \n        feat, \n        value, \n        group_norm \n    FROM \n        feat$cat_LIWC2015$davinci_single_resp$message_id$1gra AS f \n        INNER JOIN (SELECT qid, cid, message_id FROM davinci_single_resp) AS i \n        ON f.group_id = i.message_id;\n'

In [12]:
# LIWC categories retained for the analysis.
feat_keep = {"FEEL", "NEGEMO", "POSEMO", "SAD", "SOCIAL"}

# Long-format LIWC table, restricted to the retained categories.
liwc_long = pd.read_csv(DIR_STAT / "text_davinci_003_single_response_liwc.csv")
liwc_kept = liwc_long.loc[liwc_long.feat.isin(feat_keep), :]

# Wide format: one row per message, one column per category holding its
# group_norm; a category with no row for a message is filled with 0.
liwc_wide = liwc_kept.pivot(index=["message_id", "qid", "cid"],
                            columns="feat",
                            values="group_norm").fillna(0)

# For every (qid, cid) pair, gather the per-message values of each category
# into a list.
df_liwc_stats = (liwc_wide
                 .reset_index()
                 .set_index("message_id")
                 .groupby(["qid", "cid"])
                 .agg(list))
df_liwc_stats.head()

Unnamed: 0_level_0,feat,FEEL,NEGEMO,POSEMO,SAD,SOCIAL
qid,cid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,"[0.0465116279069767, 0.0444444444444444, 0.044...","[0.0930232558139536, 0.111111111111111, 0.0666...","[0.0, 0.0, 0.0, 0.0222222222222222, 0.0, 0.0, ...","[0.0, 0.0, 0.0222222222222222, 0.0222222222222...","[0.162790697674419, 0.133333333333333, 0.11111..."
0,1,"[0.0810810810810811, 0.114285714285714, 0.0909...","[0.054054054054054, 0.0571428571428572, 0.0909...","[0.081081081081081, 0.0, 0.0303030303030303, 0...","[0.027027027027027, 0.0285714285714286, 0.0303...","[0.027027027027027, 0.114285714285714, 0.06060..."
0,2,"[0.05, 0.0625, 0.0204081632653061, 0.043478260...","[0.075, 0.0624999999999999, 0.122448979591837,...","[0.025, 0.0208333333333333, 0.0, 0.0, 0.0, 0.0...","[0.0, 0.0208333333333333, 0.0, 0.0217391304347...","[0.05, 0.145833333333333, 0.0408163265306122, ..."
0,3,"[0.0416666666666666, 0.06, 0.0652173913043478,...","[0.0416666666666666, 0.06, 0.0869565217391304,...","[0.0416666666666666, 0.02, 0.0434782608695652,...","[0.0208333333333333, 0.02, 0.0217391304347826,...","[0.125, 0.16, 0.130434782608696, 0.03125, 0.15..."
0,4,"[0.114285714285714, 0.0425531914893618, 0.0612...","[0.0571428571428572, 0.0851063829787236, 0.102...","[0.0, 0.0, 0.0204081632653061, 0.0196078431372...","[0.0285714285714286, 0.0212765957446809, 0.020...","[0.0857142857142858, 0.0851063829787236, 0.163..."


In [13]:
# Compare the LIWC category values of every test configuration
# (cid = base_cid + 1 .. last_cid) against the baseline configuration, separately
# for each question (qid) and category. One record per (cid, qid, category).
# NOTE(review): last_cid is 10 here but 12 in In [3] - confirm that leaving out
# configurations 11 and 12 is intended.
base_cid = 0
last_cid = 10
last_qid = 23

# `feat_keep` is a set of strings, whose iteration order can change from one
# interpreter session to the next; a sorted list keeps the row order of the
# result reproducible across kernel restarts.
feat_order = sorted(feat_keep)

# Baseline values do not depend on the test configuration: extract them once.
# .values[0] unwraps the single list stored for a (qid, cid) pair.
liwc_base = {(qid, feat): df_liwc_stats.xs((qid, base_cid), level=["qid", "cid"])
                                       .loc[:, feat].values[0]
             for qid in range(last_qid + 1)
             for feat in feat_order}

liwc_stats = []
for cid in range((base_cid + 1), (last_cid + 1)):
    for qid in range(last_qid + 1):
        # Single-row frame holding every category list of this (qid, cid) pair.
        test_row = df_liwc_stats.xs((qid, cid), level=["qid", "cid"])
        for feat in feat_order:
            val_base = liwc_base[(qid, feat)]
            val_test = test_row.loc[:, feat].values[0]
            t_stats, p_val = ttest_ind(val_base, val_test)
            liwc_stats.append({"base_cfg": base_cid,
                               "test_cfg": cid,
                               "qid": qid,
                               "metric": feat,
                               "cohen_d": cohen_d(val_base, val_test),
                               "norm_diff_stdev": norm_diff_stdev(val_base, val_test),
                               "t_stats": t_stats,
                               "p_val": p_val})
pd.DataFrame(liwc_stats)

Unnamed: 0,base_cfg,test_cfg,qid,metric,cohen_d,norm_diff_stdev,t_stats,p_val
0,0,1,0,NEGEMO,0.957413,0.194188,1.914826,0.076173
1,0,1,0,FEEL,-1.296219,0.769431,-2.592437,0.021288
2,0,1,0,SOCIAL,1.892809,0.264155,3.785617,0.002007
3,0,1,0,SAD,-1.275073,1.079959,-2.550147,0.023111
4,0,1,0,POSEMO,-1.511432,-0.905179,-3.022865,0.009128
...,...,...,...,...,...,...,...,...
1195,0,10,23,NEGEMO,-1.303860,-0.863168,-2.607720,0.020664
1196,0,10,23,FEEL,-0.207689,-0.037724,-0.415377,0.684162
1197,0,10,23,SOCIAL,0.621753,-0.029023,1.243506,0.234102
1198,0,10,23,SAD,0.500000,1.414214,1.000000,0.334282
