In [1]:
import numpy as np
from tabulate import tabulate
import os
import pandas as pd
import json
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt


In [2]:
def get_empprompt_index(
    previous_prompt="You 're at a parade where armed war veterans wave from a passing military float . Suddenly , a shrill noise rings out , and the vets simultaneously fire their weapons into the crowds .",
    gen_story_path="/ssd005/projects/story_bias/human_story_prompt/gen_story_mapping.json",
):
    """Return the position right after ``previous_prompt`` among the mapping's keys.

    The JSON object stored at ``gen_story_path`` is loaded and its top-level
    keys are taken in file order. The function looks up ``previous_prompt``
    among those keys and returns its 0-based position plus one.

    NOTE(review): presumably the returned value identifies the "empty" prompt
    in the score files' prompt-index column -- confirm against the code that
    writes those files.

    Args:
        previous_prompt: Key to search for. Defaults to the prompt that was
            previously hard-coded in this function.
        gen_story_path: Path of the JSON mapping file. Defaults to the path
            that was previously hard-coded in this function.

    Returns:
        int: 0-based position of ``previous_prompt`` in the key list, plus one.

    Raises:
        ValueError: if ``previous_prompt`` is not a key of the mapping.
        OSError: if the file cannot be opened.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(gen_story_path, 'r', encoding='utf-8') as file:
        prompt_mapping = json.load(file)
    # dicts preserve insertion order, so the key list follows the file order.
    ordered_prompts = list(prompt_mapping.keys())
    return ordered_prompts.index(previous_prompt) + 1

# Evaluated once when this cell runs (reads the JSON mapping from disk).
# get_data compares the 'pind' column against this value and drops the
# matching rows when it is called with remove_empty=True.
EMPTY_INDEX = get_empprompt_index()

In [3]:
def get_data(scorepath, remove_empty):
    """Read ``<scorepath>/out_scores.csv`` and return its usable rows.

    The CSV is read without a header row and its three columns are named
    ``pind``, ``sind`` and ``score``.

    Args:
        scorepath: Directory that contains ``out_scores.csv``.
        remove_empty: When truthy, rows whose ``pind`` equals the module-level
            ``EMPTY_INDEX`` are discarded.

    Returns:
        pandas.DataFrame: the remaining rows with missing ``score`` values
        dropped; the original row index is kept.
    """
    frame = pd.read_csv(f"{scorepath}/out_scores.csv", header=None)
    frame.columns = ["pind", "sind", "score"]
    if remove_empty:
        keep_mask = frame['pind'] != EMPTY_INDEX
        frame = frame[keep_mask]
    return frame.dropna(subset=['score'])


def get_all_data(emotions, attributes, scores, human_flag, remove_empty=True, return_df=False):
    """Collect score data for every existing emotion/attribute/score directory.

    For each combination, the directory ``<root>/<emotion>/<attribute>/<score>``
    is loaded with ``get_data`` if it exists; combinations whose directory is
    missing at any level are silently skipped.

    Args:
        emotions: Names of the first-level sub-directories to visit.
        attributes: Names of the second-level sub-directories to visit.
        scores: Names of the third-level sub-directories to visit.
        human_flag: Truthy selects the ``outputs/kp`` root, falsy the
            ``gen_outputs/kp`` root.
        remove_empty: Forwarded to ``get_data``.
        return_df: When truthy store the whole frame, otherwise only its
            ``score`` column.

    Returns:
        dict: maps the part of the directory path after ``kp/`` (e.g.
        ``"arousal/comet/sim"``) to the loaded frame or score column.
    """
    root_directory = (
        '/ssd005/projects/story_bias/human_story_prompt/outputs/kp'
        if human_flag
        else '/ssd005/projects/story_bias/human_story_prompt/gen_outputs/kp'
    )

    collected = {}
    for emotion_name in emotions:
        emotion_dir = os.path.join(root_directory, emotion_name)
        if not os.path.isdir(emotion_dir):
            continue
        for attribute_name in attributes:
            attribute_dir = os.path.join(emotion_dir, attribute_name)
            if not os.path.isdir(attribute_dir):
                continue
            for score_name in scores:
                score_dir = os.path.join(attribute_dir, score_name)
                if not os.path.isdir(score_dir):
                    continue
                frame = get_data(score_dir, remove_empty)
                # Key is everything after the root's trailing "kp/" segment.
                label = score_dir.split('kp/')[-1]
                collected[label] = frame if return_df else frame['score']
    return collected
    

In [4]:
# Directory names to visit: get_all_data looks for <emotion>/<attribute>/<score>
# under its root directory and skips combinations that do not exist on disk.
emotions = ['arousal','valence','intellect', 'appearance','power', 'dominance']
attributes=["all", "full", "comet", "sub"]
scores=['avg','sim', 'axis']
# Both results are dicts keyed by "emotion/attribute/score"; with return_df=False
# each value is only the 'score' column of the loaded file.
human_data = get_all_data(emotions, attributes, scores, human_flag=True, remove_empty=True, return_df = False)
generated_data = get_all_data(emotions, attributes, scores, human_flag=False, remove_empty=True, return_df = False)

In [5]:
# Sanity check: list which emotion/attribute/score keys were actually loaded,
# then show how many scores each dataset holds for one of those keys.
print(human_data.keys())
print(len(human_data['arousal/comet/sim']))
print(len(generated_data['arousal/comet/sim']))

dict_keys(['arousal/all/avg', 'arousal/all/sim', 'arousal/all/axis', 'arousal/full/avg', 'arousal/full/sim', 'arousal/full/axis', 'arousal/comet/avg', 'arousal/comet/sim', 'arousal/comet/axis', 'arousal/sub/avg', 'arousal/sub/sim', 'arousal/sub/axis', 'valence/all/avg', 'valence/all/sim', 'valence/all/axis', 'valence/full/avg', 'valence/full/sim', 'valence/full/axis', 'valence/comet/avg', 'valence/comet/sim', 'valence/comet/axis', 'valence/sub/avg', 'valence/sub/sim', 'valence/sub/axis', 'intellect/all/avg', 'intellect/all/sim', 'intellect/full/avg', 'intellect/full/sim', 'intellect/comet/avg', 'intellect/comet/sim', 'intellect/comet/axis', 'intellect/sub/avg', 'intellect/sub/sim', 'appearance/all/avg', 'appearance/all/sim', 'appearance/full/avg', 'appearance/full/sim', 'appearance/comet/avg', 'appearance/comet/sim', 'appearance/comet/axis', 'appearance/sub/avg', 'appearance/sub/sim', 'power/all/avg', 'power/all/sim', 'power/all/axis', 'power/full/avg', 'power/full/sim', 'power/full/ax

# Summary Stats

In [21]:
def single_summary_stats(result_series):
    """Compute mean, median and standard deviation of a collection of scores.

    The input is materialised as a list first, so any iterable of numbers is
    accepted. The standard deviation uses numpy's default (``ddof=0``).

    Returns:
        list: ``[mean, median, standard_deviation]``.
    """
    values = np.asarray(list(result_series))
    return [reducer(values) for reducer in (np.mean, np.median, np.std)]

def human_gpt_summary_stats(human_data, generated_data, emotion, embed_type, score_type):
    """Summarise one emotion/embedding/score combination for both datasets.

    Args:
        human_data: Mapping from ``"emotion/embed_type/score_type"`` to scores.
        generated_data: Mapping with the same key layout.
        emotion: First component of the lookup key.
        embed_type: Second component of the lookup key
            (comet, or sub for the spacy clause based variant).
        score_type: Third component of the lookup key (axis or sim).

    Returns:
        tuple: ``(human_stats, generated_stats)``, each being the
        ``[mean, median, std]`` list produced by ``single_summary_stats``.

    Raises:
        KeyError: if either mapping has no entry for the combined key.
    """
    lookup = "/".join((emotion, embed_type, score_type))
    human_scores, generated_scores = human_data[lookup], generated_data[lookup]
    return single_summary_stats(human_scores), single_summary_stats(generated_scores)

In [22]:
def tabulate_summary_stats(human_data, generated_data, embed_type, score_type, emotions=None):
    """Print mean, median and standard-deviation tables, human vs. generated.

    One row is produced per emotion; the three tables are printed in the
    order mean, median, standard deviation using tabulate's "grid" format,
    with every value formatted to three decimals.

    Args:
        human_data: Mapping from ``"emotion/embed_type/score_type"`` to scores.
        generated_data: Mapping with the same key layout.
        embed_type: Embedding component of the key (e.g. ``"comet"``).
        score_type: Score component of the key (e.g. ``"sim"``).
        emotions: Optional iterable of emotion names to tabulate. Defaults to
            the six names this function always used.

    Emotions for which either dataset has no entry are skipped with a printed
    notice instead of aborting the whole table with a ``KeyError``; not every
    emotion/embedding/score combination is present in the loaded data.
    """
    if emotions is None:
        emotions = ['arousal','valence','intellect', 'appearance','power', 'dominance']

    mean_lst = []
    median_lst = []
    stdev_lst = []
    for emotion in emotions:
        try:
            human_stats, gen_stats = human_gpt_summary_stats(
                human_data, generated_data, emotion, embed_type, score_type
            )
        except KeyError as missing_key:
            print(f"Skipping {emotion}: no scores stored under {missing_key}")
            continue
        human_mean, human_median, human_std = human_stats
        gen_mean, gen_median, gen_std = gen_stats

        mean_lst.append([emotion, f"{human_mean:.3f}", f"{gen_mean:.3f}"])
        median_lst.append([emotion, f"{human_median:.3f}", f"{gen_median:.3f}"])
        stdev_lst.append([emotion, f"{human_std:.3f}", f"{gen_std:.3f}"])

    headers_mean = ["Attribute", "Human Mean", "Generated Mean"]
    headers_median = ["Attribute", "Human Median", "Generated Median"]
    headers_stdev = ["Attribute", "Human Standard Deviation", "Generated Standard Deviation"]
    print(tabulate(mean_lst, headers=headers_mean, tablefmt="grid"))
    print(tabulate(median_lst, headers=headers_median, tablefmt="grid"))
    print(tabulate(stdev_lst, headers=headers_stdev, tablefmt="grid"))


In [23]:
# Summary tables for the "comet" embedding with the "sim" score.
tabulate_summary_stats(
    human_data,
    generated_data,
    embed_type="comet",
    score_type="sim",
)

+-------------+--------------+------------------+
| Attribute   |   Human Mean |   Generated Mean |
| arousal     |        0.087 |            0.081 |
+-------------+--------------+------------------+
| valence     |        0.082 |            0.082 |
+-------------+--------------+------------------+
| intellect   |        0.095 |            0.091 |
+-------------+--------------+------------------+
| appearance  |        0.089 |            0.085 |
+-------------+--------------+------------------+
| power       |        0.102 |            0.098 |
+-------------+--------------+------------------+
| dominance   |        0.079 |            0.079 |
+-------------+--------------+------------------+
+-------------+----------------+--------------------+
| Attribute   |   Human Median |   Generated Median |
| arousal     |          0.086 |              0.08  |
+-------------+----------------+--------------------+
| valence     |          0.082 |              0.082 |
+-------------+---------------

# T-test

In [11]:
def ttest_gen_v_human(human_data, generated_data, num_samples, sample_size, random_seed):
    """Print a table of t-test p-values comparing human and generated scores.

    Only keys that contain both "sim" and "comet" are compared. For each such
    key two values are reported:

    * the p-value of an independent two-sample t-test on the full score sets;
    * the mean of ``num_samples`` Bonferroni-adjusted p-values, each obtained
      from a t-test on ``sample_size`` scores drawn without replacement from
      each dataset.

    NOTE(review): the Bonferroni correction is applied across the repeated
    resamples of a single key and the adjusted values are then averaged, while
    the column header only says "Sample t-test p-value" -- confirm this is the
    statistic that is meant to be reported.

    Args:
        human_data: Mapping from key to the human scores.
        generated_data: Mapping from key to the generated scores.
        num_samples: Number of resampling rounds per key.
        sample_size: Number of scores drawn per dataset in each round; drawing
            is without replacement, so ``np.random.choice`` raises
            ``ValueError`` if a dataset holds fewer scores than this.
        random_seed: Seed passed to ``np.random.seed`` (this reseeds numpy's
            global random state).

    Keys missing from ``generated_data`` and keys whose full-data p-value is
    NaN are left out of the table.
    """
    np.random.seed(random_seed)

    table_data = []
    for key in human_data:
        if "sim" not in key or "comet" not in key:
            continue
        if key not in generated_data:
            # Nothing to compare against; previously this raised KeyError.
            continue

        human_scores = human_data[key]
        generated_scores = generated_data[key]
        _, p_value = ttest_ind(human_scores, generated_scores)

        p_samples = []
        for _ in range(num_samples):
            # Sample from each dataset
            human_sample = np.random.choice(human_scores, size=sample_size, replace=False)
            generated_sample = np.random.choice(generated_scores, size=sample_size, replace=False)

            # Perform t-test for each pair of samples; only the p-value is used
            _, p_value_sample = ttest_ind(human_sample, generated_sample)
            p_samples.append(p_value_sample)

        # Apply multiple comparison correction (Bonferroni) and average it
        adjusted_p_values_sample = multipletests(p_samples, method='bonferroni')[1]
        average_adjusted_p_values = np.mean(adjusted_p_values_sample, axis=0)

        # Checked after the sampling loop so that the random draws consumed
        # per key, and therefore the results of later keys, stay the same.
        if np.isnan(p_value):
            continue
        table_data.append([key, p_value, average_adjusted_p_values])

    # No figure is drawn here, so no pyplot calls are made: the former
    # plt.tight_layout()/plt.show() pair only emitted an empty figure.
    custom_headers = ["Dimension", "Population t-test p-value", "Sample t-test p-value"]

    print(tabulate(table_data, headers=custom_headers, tablefmt="grid"))


In [12]:
# Five resampling rounds of 3000 scores per dataset, seeded for repeatability.
ttest_gen_v_human(
    human_data,
    generated_data,
    num_samples=5,
    sample_size=3000,
    random_seed=999,
)

<Figure size 640x480 with 0 Axes>

+----------------------+-----------------------------+-------------------------+
| Dimension            |   Population t-test p-value |   Sample t-test p-value |
| arousal/comet/sim    |                 0           |            1.18072e-190 |
+----------------------+-----------------------------+-------------------------+
| valence/comet/sim    |                 2.31702e-06 |            1            |
+----------------------+-----------------------------+-------------------------+
| intellect/comet/sim  |                 0           |            3.51014e-100 |
+----------------------+-----------------------------+-------------------------+
| appearance/comet/sim |                 0           |            2.04207e-51  |
+----------------------+-----------------------------+-------------------------+
| power/comet/sim      |                 0           |            4.56721e-76  |
+----------------------+-----------------------------+-------------------------+
| dominance/comet/sim  |    

# Cohen's D

In [13]:
def cohen_d(group1, group2):
    mean1, mean2 = np.mean(group1), np.mean(group2)
    std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
    n1, n2 = len(group1), len(group2)

    pooled_std = np.sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2))

    cohens_d = (mean1 - mean2) / pooled_std

    return cohens_d

In [14]:
def cohend_gpt_v_human(human_data, generated_data):
    """Print a grid table of Cohen's d (human vs. generated) per key.

    Only keys of ``human_data`` that contain both "sim" and "comet" are
    included; the effect size is computed with ``cohen_d`` with the human
    scores as the first group.

    Args:
        human_data: Mapping from key to the human scores.
        generated_data: Mapping from key to the generated scores; must hold
            every compared key.
    """
    rows = [
        [name, cohen_d(human_data[name], generated_data[name])]
        for name in human_data.keys()
        if "sim" in name and "comet" in name
    ]
    print(tabulate(rows, headers=["Dimension", "Cohen's D Effect Size"], tablefmt="grid"))

In [15]:
# Effect sizes with the human scores as the first group.
cohend_gpt_v_human(human_data=human_data, generated_data=generated_data)

+----------------------+-------------------------+
| Dimension            |   Cohen's D Effect Size |
| arousal/comet/sim    |               0.804571  |
+----------------------+-------------------------+
| valence/comet/sim    |               0.0162609 |
+----------------------+-------------------------+
| intellect/comet/sim  |               0.586079  |
+----------------------+-------------------------+
| appearance/comet/sim |               0.419092  |
+----------------------+-------------------------+
| power/comet/sim      |               0.54078   |
+----------------------+-------------------------+
| dominance/comet/sim  |              -0.0367103 |
+----------------------+-------------------------+
