In [2]:
import pandas as pd
import numpy as np
np.seterr(divide='warn', invalid='warn')
from scipy.spatial import distance
from tqdm.notebook import tqdm

queries = pd.read_csv("../Results/query_EP.csv")
gemma_OG = pd.read_csv("../Results/gemma_EP.csv")
gemma_RQ = pd.read_csv("../Results/gemma_RQ_EP.csv")
gpt_OG = pd.read_csv("../Results/gpt_EP.csv")
gpt_RQ = pd.read_csv("../Results/gpt_RQ_EP.csv")

In [10]:
# def RQ_avg_EP(RQ_df, LLM):
#     RQ_avg = []
#     for query in queries["query"]:
#         result = [query, LLM]
#         RQ_row = RQ_df.loc[RQ_df["query"]==query]
#         EP_avg = list(np.average(RQ_row[["positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], axis=0))
#         result += EP_avg
#         RQ_avg.append(result)
#     RQ_avg_df = pd.DataFrame(RQ_avg, columns=["query", "LLM", "positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"])
#     RQ_avg_df.to_csv("../Results/" + LLM + "_RQ_avg_EP.csv", index=False)

# RQ_avg_EP(gpt_RQ, "gpt")
# RQ_avg_EP(gemma_RQ, "gemma")

In [11]:
# gemma_RQ_avg = pd.read_csv("../Results/gemma_RQ_avg_EP.csv")
# gpt_RQ_avg = pd.read_csv("../Results/gpt_RQ_avg_EP.csv")

## JSD

### JSD between LLMS

In [12]:
LLM_JSD = []

for query in tqdm(queries["query"]):
    gpt_sentiment = np.array(gpt_OG.loc[gpt_OG["query"]==query].iloc[0][["positive", "negative", "neutral"]], dtype=float)
    gemma_sentiment = np.array(gemma_OG.loc[gemma_OG["query"]==query].iloc[0][["positive", "negative", "neutral"]], dtype=float)
    sentiment_JSD = distance.jensenshannon(gpt_sentiment, gemma_sentiment)

    gpt_emotion = np.array(gpt_OG.loc[gpt_OG["query"]==query].iloc[0][["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], dtype=float)
    gemma_emotion = np.array(gemma_OG.loc[gemma_OG["query"]==query].iloc[0][["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], dtype=float)
    emotion_JSD = distance.jensenshannon(gpt_emotion, gemma_emotion)

    LLM_JSD.append([query, sentiment_JSD, emotion_JSD])

LLM_JSD_df = pd.DataFrame(LLM_JSD, columns=["query", "sentiment_JSD", "emotion_JSD"])
LLM_JSD_df.to_csv("../Results/LLM_JSD.csv", index=False)

  0%|          | 0/110 [00:00<?, ?it/s]

  p = p / np.sum(p, axis=axis, keepdims=True)


### JSD between query-response pairs

In [13]:
test = np.array(queries[["positive", "negative", "neutral"]].iloc[[0,1]])
distance.jensenshannon(test[0], test[1]) # lesser the value, higher the similarity

0.027487692136033515

In [3]:
def compute_EP_JSD(row1, row2):
    r1_sentiments = np.array(row1[["positive", "negative", "neutral"]], dtype=float)
    r2_sentiments = np.array(row2[["positive", "negative", "neutral"]], dtype=float)
    # print(r1_sentiments, r2_sentiments)
    
    r1_emotions = np.array(row1[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], dtype=float)
    r2_emotions = np.array(row2[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], dtype=float)
    
    sentiment_JSD = distance.jensenshannon(r1_sentiments, r2_sentiments)
    emotion_JSD = distance.jensenshannon(r1_emotions, r2_emotions)

    return sentiment_JSD, emotion_JSD

In [4]:
def compute_JSD(OG_resp_df, RQ_resp_df, LLM):

    query_resp_JSD = []

    for _, row in tqdm(queries.iterrows(), total=len(queries)):
        query = row["query"]
        OG_resp = OG_resp_df.loc[OG_resp_df["query"]==query].iloc[0]
        OG_resp_JSD = compute_EP_JSD(row, OG_resp) # JSD between OG query and OG response pairs

        RQ_male_resp = RQ_resp_df.loc[(RQ_resp_df["query"]==query) & (RQ_resp_df["gender"]=="male")].iloc[0]
        RQ_male_JSD = compute_EP_JSD(row, RQ_male_resp) # JSD between OG query and response to reformulated query (male version)

        RQ_female_resp = RQ_resp_df.loc[(RQ_resp_df["query"]==query) & (RQ_resp_df["gender"]=="female")].iloc[0]
        RQ_female_JSD = compute_EP_JSD(row, RQ_female_resp)

        RQ_neutral_resp = RQ_resp_df.loc[(RQ_resp_df["query"]==query) & (RQ_resp_df["gender"]=="neutral")].iloc[0]
        RQ_neutral_JSD = compute_EP_JSD(row, RQ_neutral_resp)


        query_resp_JSD.append([query, OG_resp_JSD[0], OG_resp_JSD[1], RQ_male_JSD[0], RQ_male_JSD[1], RQ_female_JSD[0], RQ_female_JSD[1], RQ_neutral_JSD[0], RQ_neutral_JSD[1]])

    query_resp_df = pd.DataFrame(query_resp_JSD, columns=["query", "OG_sentiment_JSD", "OG_emotion_JSD", "RQ_male_sentiment_JSD", "RQ_male_emotion_JSD", "RQ_female_sentiment_JSD", "RQ_female_emotion_JSD", "RQ_neutral_sentiment_JSD", "RQ_neutral_emotion_JSD"])
    query_resp_df.to_csv("../Results/query_resp_JSD_" + LLM + ".csv", index=False)
    # print(query_resp_df.head())

In [5]:
compute_JSD(OG_resp_df = gpt_OG, RQ_resp_df = gpt_RQ, LLM="gpt")
compute_JSD(OG_resp_df = gemma_OG, RQ_resp_df = gemma_RQ, LLM="gemma")

  0%|          | 0/110 [00:00<?, ?it/s]

  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, kee

  0%|          | 0/110 [00:00<?, ?it/s]

  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, kee

### JSD from average vector

In [17]:
avg_sentiment = np.average(np.array(queries[["positive", "negative", "neutral"]], dtype=float), axis=0)
avg_emotion = np.average(np.array(queries[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], dtype=float), axis=0)
distance.jensenshannon(avg_sentiment, np.array(queries.iloc[0][["positive", "negative", "neutral"]], dtype=float))

0.017489904864432113

In [18]:
queries_JSD = []
for _, row in tqdm(queries.iterrows(), total=len(queries)):
    sentiment = np.array(row[["positive", "negative", "neutral"]], dtype=float)
    emotion = np.array(row[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], dtype=float)
    sentiment_JSD = distance.jensenshannon(avg_sentiment, sentiment)
    emotion_JSD = distance.jensenshannon(avg_emotion, emotion)
    queries_JSD.append([row["query"], sentiment_JSD, emotion_JSD])

queries_JSD_df = pd.DataFrame(queries_JSD, columns=["query", "sentiment_JSD", "emotion_JSD"])
queries_JSD_df.to_csv("../Results/query_avg_jsd.csv", index=False)

  0%|          | 0/110 [00:00<?, ?it/s]

  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)


In [6]:
def compute_JSD_from_avg(LLM, OG_resp, RQ_resp):
    # Response to OG query
    OG_avg_sentiment = np.average(np.array(OG_resp[["positive", "negative", "neutral"]], dtype=float), axis=0)
    OG_avg_emotion = np.average(np.array(OG_resp[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], dtype=float), axis=0)
   
    # Response to male queries
    RQ_male = RQ_resp.loc[RQ_resp["gender"]=="male"]
    RQ_male_avg_sentiment = np.average(np.array(RQ_male[["positive", "negative", "neutral"]], dtype=float), axis=0)
    RQ_male_avg_emotion = np.average(np.array(RQ_male[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], dtype=float), axis=0)

    # Response to female queries
    RQ_female = RQ_resp.loc[RQ_resp["gender"]=="female"]
    RQ_female_avg_sentiment = np.average(np.array(RQ_female[["positive", "negative", "neutral"]], dtype=float), axis=0)
    RQ_female_avg_emotion = np.average(np.array(RQ_female[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], dtype=float), axis=0)

    # Response to neutral queries
    RQ_neutral = RQ_resp.loc[RQ_resp["gender"]=="neutral"]
    RQ_neutral_avg_sentiment = np.average(np.array(RQ_neutral[["positive", "negative", "neutral"]], dtype=float), axis=0)
    RQ_neutral_avg_emotion = np.average(np.array(RQ_neutral[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], dtype=float), axis=0)

    avg_JSD = []

    for query in tqdm(queries["query"]):
        OG_sentiment = np.array(OG_resp.loc[OG_resp["query"]==query].iloc[0][["positive", "negative", "neutral"]], dtype=float)
        OG_emotion = np.array(OG_resp.loc[OG_resp["query"]==query].iloc[0][["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], dtype=float)
        OG_sentiment_JSD = distance.jensenshannon(OG_avg_sentiment, OG_sentiment)
        OG_emotion_JSD = distance.jensenshannon(OG_avg_emotion, OG_emotion)

        RQ_male_sentiment = np.array(RQ_male.loc[RQ_male["query"]==query].iloc[0][["positive", "negative", "neutral"]], dtype=float)
        RQ_male_emotion = np.array(RQ_male.loc[RQ_male["query"]==query].iloc[0][["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], dtype=float)
        RQ_male_sentiment_JSD = distance.jensenshannon(RQ_male_avg_sentiment, RQ_male_sentiment)
        RQ_male_emotion_JSD = distance.jensenshannon(RQ_male_avg_emotion, RQ_male_emotion)

        RQ_female_sentiment = np.array(RQ_female.loc[RQ_female["query"]==query].iloc[0][["positive", "negative", "neutral"]], dtype=float)
        RQ_female_emotion = np.array(RQ_female.loc[RQ_female["query"]==query].iloc[0][["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], dtype=float)
        RQ_female_sentiment_JSD = distance.jensenshannon(RQ_female_avg_sentiment, RQ_female_sentiment)
        RQ_female_emotion_JSD = distance.jensenshannon(RQ_female_avg_emotion, RQ_female_emotion)

        RQ_neutral_sentiment = np.array(RQ_neutral.loc[RQ_neutral["query"]==query].iloc[0][["positive", "negative", "neutral"]], dtype=float)
        RQ_neutral_emotion = np.array(RQ_neutral.loc[RQ_neutral["query"]==query].iloc[0][["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]], dtype=float)
        RQ_neutral_sentiment_JSD = distance.jensenshannon(RQ_neutral_avg_sentiment, RQ_neutral_sentiment)
        RQ_neutral_emotion_JSD = distance.jensenshannon(RQ_neutral_avg_emotion, RQ_neutral_emotion)

        avg_JSD.append([query, OG_sentiment_JSD, OG_emotion_JSD, RQ_male_sentiment_JSD, RQ_male_emotion_JSD, RQ_female_sentiment_JSD, RQ_female_emotion_JSD, RQ_neutral_sentiment_JSD, RQ_neutral_emotion_JSD])

    avg_JSD_df = pd.DataFrame(avg_JSD, columns=["query", "OG_sentiment_JSD", "OG_emotion_JSD", "RQ_male_sentiment_JSD", "RQ_male_emotion_JSD", "RQ_female_sentiment_JSD", "RQ_female_emotion_JSD", "RQ_neutral_sentiment_JSD", "RQ_neutral_emotion_JSD"])
    avg_JSD_df.to_csv("../Results/" + LLM + "_avg_JSD.csv", index=False)

In [7]:
compute_JSD_from_avg(LLM="gemma", OG_resp=gemma_OG, RQ_resp=gemma_RQ)
compute_JSD_from_avg(LLM="gpt", OG_resp=gpt_OG, RQ_resp=gpt_RQ)

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)


## Stats test

In [9]:
import pandas as pd

query_EP = pd.read_csv("../Results/query_EP.csv")

gemma_OG_EP = pd.read_csv("../Results/gemma_EP.csv")
gemma_RQ_EP = pd.read_csv("../Results/gemma_RQ_EP.csv")
gemma_RQ_male_EP = gemma_RQ_EP.loc[gemma_RQ_EP["gender"]=="male"]
gemma_RQ_female_EP = gemma_RQ_EP.loc[gemma_RQ_EP["gender"]=="female"]
gemma_RQ_neutral_EP = gemma_RQ_EP.loc[gemma_RQ_EP["gender"]=="neutral"]

gpt_OG_EP = pd.read_csv("../Results/gpt_EP.csv")
gpt_RQ_EP = pd.read_csv("../Results/gpt_RQ_EP.csv")
gpt_RQ_male_EP = gpt_RQ_EP.loc[gpt_RQ_EP["gender"]=="male"]
gpt_RQ_female_EP = gpt_RQ_EP.loc[gpt_RQ_EP["gender"]=="female"]
gpt_RQ_neutral_EP = gpt_RQ_EP.loc[gpt_RQ_EP["gender"]=="neutral"]

### Feature wise

In [10]:
## descriptive stats for EP

emo_features = ["positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]

query_EP_avg = ["Prompts"] + [round(np.median(query_EP[emo]), 2) for emo in emo_features]

gemma_EP_avg = ["Gemma_OG"] + [round(np.median(gemma_OG_EP[emo]), 2) for emo in emo_features]
gemma_male_avg = ["Gemma_RQ_Male"] + [round(np.median(gemma_RQ_male_EP[emo]), 2) for emo in emo_features]
gemma_female_avg = ["Gemma_RQ_Female"] + [round(np.median(gemma_RQ_female_EP[emo]), 2) for emo in emo_features]
gemma_neutral_avg = ["Gemma_RQ_Neutral"] + [round(np.median(gemma_RQ_neutral_EP[emo]), 2) for emo in emo_features]

gpt_EP_avg = ["GPT_OG"] + [round(np.median(gpt_OG_EP[emo]), 2) for emo in emo_features]
gpt_male_avg = ["GPT_RQ_Male"] + [round(np.median(gpt_RQ_male_EP[emo]), 2) for emo in emo_features]
gpt_female_avg = ["GPT_RQ_Female"] + [round(np.median(gpt_RQ_female_EP[emo]), 2) for emo in emo_features]
gpt_neutral_avg = ["GPT_RQ_Neutral"] + [round(np.median(gpt_RQ_female_EP[emo]), 2) for emo in emo_features]

avg_EP = [
    query_EP_avg,
    gemma_EP_avg,
    gemma_male_avg,
    gemma_female_avg,
    gemma_neutral_avg,
    gpt_EP_avg,
    gpt_male_avg,
    gpt_female_avg,
    gpt_neutral_avg
]
avg_EP_df = pd.DataFrame(avg_EP, columns=["EP category"]+emo_features)
avg_EP_df.to_csv("../Stats/avg_EP.csv", index=False)
avg_EP_df


Unnamed: 0,EP category,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,Prompts,26.08,27.65,43.97,12.29,12.38,12.22,12.17,12.74,12.3,12.59,12.52
1,Gemma_OG,28.12,25.86,45.29,12.34,12.43,12.48,12.1,12.7,12.36,13.04,12.64
2,Gemma_RQ_Male,30.35,26.72,42.47,12.26,12.5,12.4,11.99,12.61,12.36,13.24,12.62
3,Gemma_RQ_Female,31.13,26.76,41.44,12.16,12.44,12.47,11.87,12.53,12.31,13.38,12.78
4,Gemma_RQ_Neutral,32.49,26.32,40.03,12.26,12.56,12.34,12.13,12.52,12.3,13.23,12.52
5,GPT_OG,29.8,26.8,43.68,12.32,12.42,12.43,12.12,12.82,12.43,12.86,12.58
6,GPT_RQ_Male,31.32,27.33,40.75,12.27,12.4,12.38,12.05,12.78,12.48,13.0,12.54
7,GPT_RQ_Female,30.9,27.46,40.42,12.26,12.42,12.41,12.06,12.74,12.5,13.08,12.55
8,GPT_RQ_Neutral,30.9,27.46,40.42,12.26,12.42,12.41,12.06,12.74,12.5,13.08,12.55


In [11]:
from scipy.stats import ttest_ind, mannwhitneyu, shapiro, f_oneway, kruskal

def emo_feature_significance_test(group_name, dfs, num_dfs):
    emo_features = ["positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]
    signficance_test = []

    if num_dfs == 2:
        df1 = dfs[0]
        df2 = dfs[1]
        for emo in emo_features:
            shapiro_1 = shapiro(df1[emo])[1]
            shapiro_2 = shapiro(df2[emo])[1]
            if shapiro_1 < 0.05 or shapiro_2 < 0.05:
                test_name = "Mann-Whitney U"
                test_stat, p_val = mannwhitneyu(df1[emo], df2[emo])
                difference = round(np.median(df1[emo]) - np.median(df2[emo]), 2)
            else:
                test_name = "T-Test"
                test_stat, p_val = ttest_ind(df1[emo], df2[emo])
                difference = round(np.mean(df1[emo]) - np.mean(df2[emo]), 2)
            signficance_test.append([group_name, emo, test_name, test_stat, difference, p_val, p_val < 0.05])
    elif num_dfs == 3:
        df1 = dfs[0]
        df2 = dfs[1]
        df3 = dfs[2]
        for emo in emo_features:
            shapiro_1 = shapiro(df1[emo])[1]
            shapiro_2 = shapiro(df2[emo])[1]
            shapiro_3 = shapiro(df3[emo])[1]
            if any(shapiro_p < 0.05 for shapiro_p in [shapiro_1, shapiro_2, shapiro_3]):
                test_name = "Kruskal-Wallis H"
                test_stat, p_val = kruskal(df1[emo], df2[emo], df3[emo])
                difference = None
            else:
                test_name = "ANOVA"
                test_stat, p_val = f_oneway(df1[emo], df2[emo], df3[emo])
                difference = None
            signficance_test.append([group_name, emo, test_name, test_stat, difference, p_val, p_val < 0.05])

    elif num_dfs == 4:
        df1 = dfs[0]
        df2 = dfs[1]
        df3 = dfs[2]
        df4 = dfs[3]
        for emo in emo_features:
            shapiro_1 = shapiro(df1[emo])[1]
            shapiro_2 = shapiro(df2[emo])[1]
            shapiro_3 = shapiro(df3[emo])[1]
            shapiro_4 = shapiro(df4[emo])[1]

            if any(shapiro_p < 0.05 for shapiro_p in [shapiro_1, shapiro_2, shapiro_3, shapiro_4]):
                test_name = "Kruskal-Wallis H"
                test_stat, p_val = kruskal(df1[emo], df2[emo], df3[emo], df4[emo])
                difference = None
            else:
                test_name = "ANOVA"
                test_stat, p_val = f_oneway(df1[emo], df2[emo], df3[emo], df4[emo])
                difference = None
            signficance_test.append([group_name, emo, test_name, test_stat, difference, p_val, p_val < 0.05])

    return signficance_test

Gemma_GPT = emo_feature_significance_test("Gemma_GPT", [gemma_OG_EP, gpt_OG_EP], num_dfs=2)
Gemma_OG_RQ = emo_feature_significance_test("Gemma_OG_RQ", [gemma_OG_EP, gemma_RQ_neutral_EP, gemma_RQ_male_EP, gemma_RQ_female_EP], num_dfs=4)
Gemma_RQ = emo_feature_significance_test("Gemma_RQ", [gemma_RQ_male_EP, gemma_RQ_female_EP, gemma_RQ_neutral_EP], num_dfs=3)
GPT_OG_RQ = emo_feature_significance_test("GPT_OG_RQ", [gpt_OG_EP, gpt_RQ_neutral_EP, gpt_RQ_male_EP, gpt_RQ_female_EP], num_dfs=4)
GPT_RQ = emo_feature_significance_test("GPT_RQ", [gpt_RQ_male_EP, gpt_RQ_female_EP, gpt_RQ_neutral_EP], num_dfs=3)

all_tests = Gemma_GPT + Gemma_OG_RQ + Gemma_RQ + GPT_OG_RQ + GPT_RQ

signficance_test_df = pd.DataFrame(all_tests, columns=["comparison_pair", "feature", "test_name", "test_stat", "central_value_diff", "p_val", "is_significant"])
signficance_test_df.to_csv("../Stats/significance_tests_features.csv", index=False)
signficance_test_df

Unnamed: 0,comparison_pair,feature,test_name,test_stat,central_value_diff,p_val,is_significant
0,Gemma_GPT,positive,Mann-Whitney U,4861.0,-1.68,0.01181247,True
1,Gemma_GPT,negative,Mann-Whitney U,5452.5,-0.93,0.2059873,False
2,Gemma_GPT,neutral,Mann-Whitney U,7341.0,1.6,0.006261328,True
3,Gemma_GPT,joy,Mann-Whitney U,6581.0,0.02,0.2610148,False
4,Gemma_GPT,anger,Mann-Whitney U,6504.5,0.0,0.3360968,False
5,Gemma_GPT,surprise,Mann-Whitney U,6370.0,0.05,0.4984625,False
6,Gemma_GPT,disgust,Mann-Whitney U,5797.0,-0.02,0.5926544,False
7,Gemma_GPT,fear,Mann-Whitney U,5103.0,-0.12,0.044922,True
8,Gemma_GPT,sadness,Mann-Whitney U,4667.0,-0.07,0.003396053,True
9,Gemma_GPT,trust,Mann-Whitney U,6497.5,0.18,0.3436532,False


### JSD

In [17]:
import pandas as pd
import numpy as np

llm_JSD = pd.read_csv("../Results/LLM_JSD.csv") # JSD between EP of Gemma responses and EP of GPT responses to OG prompts
gpt_avg_JSD = pd.read_csv("../Results/gpt_avg_JSD.csv") # JSD between average vector GPT response and each individual response for OG, male, female, and neutral prompts
gemma_avg_JSD = pd.read_csv("../Results/gemma_avg_JSD.csv") # JSD between average vector Gemma response and each individual response for OG, male, female, and neutral prompts
query_gpt_JSD = pd.read_csv("../Results/query_resp_JSD_gpt.csv") # JSD between prompt - GPT response pair for OG prompt and corresponding response for OG, male, female, and neutral prompts
query_gemma_JSD = pd.read_csv("../Results/query_resp_JSD_gemma.csv") # JSD between prompt - Gemma response pair for OG prompt and corresponding response forOG, male, female, and neutral prompts

In [18]:
query_gpt_JSD.head()

Unnamed: 0,query,OG_sentiment_JSD,OG_emotion_JSD,RQ_male_sentiment_JSD,RQ_male_emotion_JSD,RQ_female_sentiment_JSD,RQ_female_emotion_JSD,RQ_neutral_sentiment_JSD,RQ_neutral_emotion_JSD
0,Chi era il padre di Micerino ?,0.0,0.0,0.0,0.0,0.076109,0.01206,0.073793,0.014567
1,Il figlio di Chefren,0.107161,0.010153,0.083617,0.016408,0.109248,0.011257,0.093599,0.012588
2,Il padre di Micerino,0.092749,0.012807,0.065739,0.013487,0.0,0.0,0.054627,0.015039
3,Micerino,,,,,,,,
4,Quanto e alta la tomba di Cheope?,0.04799,0.008433,0.051338,0.013408,0.025928,0.009047,0.051803,0.012527


In [19]:
# from scipy.stats import ttest_ind, mannwhitneyu, shapiro, f_oneway, kruskal, ttest_1samp
# from statsmodels.stats.descriptivestats import sign_test

# def JSD_significance_test(distributions, group_name, num_groups):
#     if num_groups == 1:
#         shapiro_pval = shapiro(distributions[0])[1]
#         if shapiro_pval < 0.05:
#             test_val, p_val = sign_test(distributions[0], 0)
#             test_name = "sign_test"
#         else:
#             test_val, p_val = ttest_1samp(distributions[0], 0)
#             test_name = "1 sample T-Test"
#         median_diff = np.nan
#     else:
#         shapiro_1 = shapiro(distributions[0])[1]
#         shapiro_2 = shapiro(distributions[1])[1]
#         if shapiro_1 < 0.05 or shapiro_2 < 0.05:
#             test_val, p_val = mannwhitneyu(distributions[0], distributions[1])
#             test_name = "Mann Whitney U"
#         else:
#             test_val, p_val = ttest_ind(distributions[0], distributions[1])
#             test_name = "T-Test"
#         median_diff = round(np.nanmedian(distributions[0]) - np.nanmedian(distributions[1]), 2)

#     return [group_name, "JSD", test_name, test_val, median_diff, p_val, p_val < 0.05]

In [20]:
# # testing if gemma and gpt EP sentiment/emotion vectors are significantly distant from each other
# llm_sent_JSD_sig = JSD_significance_test(distributions=[llm_JSD["sentiment_JSD"]], group_name="LLM_sentiment_JSD", num_groups = 1)
# llm_emo_JSD_sig = JSD_significance_test(distributions=[llm_JSD["emotion_JSD"]], group_name="LLM_emotion_JSD", num_groups=1)

# # testing if gemma and gpt EP sentiment/emotion vectors are significantly distant from their respective average vectors (for OG and RQ prompts (all versions))
# gpt_avg_JSD_sig = [JSD_significance_test(distributions=[gpt_avg_JSD[col]], group_name="GPT_avg_"+col, num_groups=1) for col in gpt_avg_JSD.drop(columns=["query"]).columns]
# gemma_avg_JSD_sig = [JSD_significance_test(distributions=[gemma_avg_JSD[col]], group_name="Gemma_avg_"+col, num_groups=1) for col in gemma_avg_JSD.drop(columns=["query"]).columns]

# # testing if the distributions of GPT query-response JSD of different query formulations (OG vs RQ, Male vs Female, etc.) are significantly different from each other
# query_gpt_JSD_OG_RQ_sentiment_sig = JSD_significance_test(distributions=[query_gpt_JSD["OG_sentiment_JSD"], query_gpt_JSD["RQ_avg_sentiment_JSD"]], group_name="GPT_OG_RQ_sentiment_JSD", num_groups=2)
# query_gpt_JSD_OG_RQ_emotion_sig = JSD_significance_test(distributions=[query_gpt_JSD["OG_emotion_JSD"], query_gpt_JSD["RQ_avg_emotion_JSD"]], group_name="GPT_OG_RQ_emotion_JSD", num_groups=2)
# query_gpt_JSD_RQ_sentiment_sig = JSD_significance_test(distributions=[query_gpt_JSD["RQ_male_sentiment_JSD"], query_gpt_JSD["RQ_female_sentiment_JSD"]], group_name="GPT_male_female_sentiment_JSD", num_groups=2)
# query_gpt_JSD_RQ_emotion_sig = JSD_significance_test(distributions=[query_gpt_JSD["RQ_male_emotion_JSD"], query_gpt_JSD["RQ_female_emotion_JSD"]], group_name="GPT_male_female_emotion_JSD", num_groups=2)

# # testing if the distributions of Gemma query-response JSD of different query formulations (OG vs RQ, Male vs Female, etc.) are significantly different from each other

# query_gemma_JSD_OG_RQ_sentiment_sig = JSD_significance_test(distributions=[query_gemma_JSD["OG_sentiment_JSD"], query_gemma_JSD["RQ_avg_sentiment_JSD"]], group_name="Gemma_OG_RQ_sentiment_JSD", num_groups=2)
# query_gemma_JSD_OG_RQ_emotion_sig = JSD_significance_test(distributions=[query_gemma_JSD["OG_emotion_JSD"], query_gemma_JSD["RQ_avg_emotion_JSD"]], group_name="Gemma_OG_RQ_emotion_JSD", num_groups=2)
# query_gemma_JSD_RQ_sentiment_sig = JSD_significance_test(distributions=[query_gemma_JSD["RQ_male_sentiment_JSD"], query_gemma_JSD["RQ_female_sentiment_JSD"]], group_name="Gemma_male_female_sentiment_JSD", num_groups=2)
# query_gemma_JSD_RQ_emotion_sig = JSD_significance_test(distributions=[query_gemma_JSD["RQ_male_emotion_JSD"], query_gemma_JSD["RQ_female_emotion_JSD"]], group_name="Gemma_male_female_emotion_JSD", num_groups=2)

## putting all the JSD together

# JSD_sig = [llm_sent_JSD_sig, llm_emo_JSD_sig] + gpt_avg_JSD_sig + gemma_avg_JSD_sig + [query_gpt_JSD_OG_RQ_sentiment_sig, query_gpt_JSD_OG_RQ_emotion_sig, query_gpt_JSD_RQ_sentiment_sig, query_gpt_JSD_RQ_emotion_sig, query_gemma_JSD_OG_RQ_sentiment_sig, query_gemma_JSD_OG_RQ_emotion_sig, query_gemma_JSD_RQ_sentiment_sig, query_gemma_JSD_RQ_emotion_sig]

# JSD_sig_df = pd.DataFrame(JSD_sig, columns=["comparison_pair", "feature", "test_name", "test_stat", "central_value_diff", "p_val", "is_significant"])
# JSD_sig_df.to_csv("../Stats/significance_tests_JSD.csv", index=False)
# JSD_sig_df

In [30]:
from scipy.stats import shapiro, ttest_1samp
from statsmodels.stats.descriptivestats import sign_test

def JSD_significance_test(JSD, group_name):
    shapiro_pval = shapiro(JSD)[1]
    central_val = 0
    if shapiro_pval < 0.05:
        test_val, p_val = sign_test(JSD, central_val)
        test_name = "sign_test"
        med = np.nanmedian(JSD)
    else:
        test_val, p_val = ttest_1samp(JSD, central_val, nan_policy="omit")
        test_name = "1 sample T-Test"
        med = np.average(JSD)

    return [group_name, "JSD", med, test_name, test_val, p_val, p_val < 0.05]

In [32]:
# testing if gemma and gpt sentiment/emotion vectors are significantly distant from each other
llm_sent_JSD_sig = JSD_significance_test(JSD=llm_JSD["sentiment_JSD"], group_name="LLM_sentiment_JSD")
llm_emo_JSD_sig = JSD_significance_test(JSD=llm_JSD["emotion_JSD"], group_name="LLM_emotion_JSD")

# testing if LLM responses (for OG and RQ (all versions)) sentiment/emotion vectors are significantly distant from the sentiment/emotion vector of the OG prompt
gemma_query_resp_JSD_sig = [JSD_significance_test(JSD = query_gemma_JSD[col], group_name="Gemma_query_resp_"+col) for col in query_gemma_JSD.drop(columns=["query"]).columns]
gpt_query_resp_JSD_sig = [JSD_significance_test(JSD = query_gpt_JSD[col], group_name="GPT_query_resp_"+col) for col in query_gpt_JSD.drop(columns=["query"]).columns]

# testing if LLM responses (for OG and RQ (all versions)) sentiment/emotion vectors are significantly distant from their respective average vectors
gemma_avg_JSD_sig = [JSD_significance_test(JSD=gemma_avg_JSD[col], group_name="Gemma_avg_"+col) for col in gemma_avg_JSD.drop(columns=["query"]).columns]
gpt_avg_JSD_sig = [JSD_significance_test(JSD=gpt_avg_JSD[col], group_name="GPT_avg_"+col) for col in gpt_avg_JSD.drop(columns=["query"]).columns]

JSD_sig = [llm_sent_JSD_sig, llm_emo_JSD_sig] + gemma_query_resp_JSD_sig + gpt_query_resp_JSD_sig + gemma_avg_JSD_sig + gpt_avg_JSD_sig

JSD_sig_df = pd.DataFrame(JSD_sig, columns=["group_name", "feature", "central_val", "test_name", "test_val", "p_val", "is_significant"])
JSD_sig_df.to_csv("../Stats/significance_tests_JSD.csv", index=False)
JSD_sig_df

Unnamed: 0,group_name,feature,central_val,test_name,test_val,p_val,is_significant
0,LLM_sentiment_JSD,JSD,0.050267,sign_test,55.0,1.5407440000000002e-33,True
1,LLM_emotion_JSD,JSD,,1 sample T-Test,20.809057,1.3902590000000001e-39,True
2,Gemma_query_resp_OG_sentiment_JSD,JSD,,1 sample T-Test,17.859596,9.721327e-34,True
3,Gemma_query_resp_OG_emotion_JSD,JSD,,1 sample T-Test,22.885843,5.561727e-40,True
4,Gemma_query_resp_RQ_male_sentiment_JSD,JSD,,1 sample T-Test,19.478059,8.295034e-37,True
5,Gemma_query_resp_RQ_male_emotion_JSD,JSD,,1 sample T-Test,24.755962,1.036643e-42,True
6,Gemma_query_resp_RQ_female_sentiment_JSD,JSD,,1 sample T-Test,19.438319,9.827948e-37,True
7,Gemma_query_resp_RQ_female_emotion_JSD,JSD,,1 sample T-Test,21.186307,2.318456e-37,True
8,Gemma_query_resp_RQ_neutral_sentiment_JSD,JSD,,1 sample T-Test,22.558012,2.913681e-42,True
9,Gemma_query_resp_RQ_neutral_emotion_JSD,JSD,,1 sample T-Test,25.90664,2.574101e-44,True


### Query categories

In [28]:
import pandas as pd

queries = pd.read_csv("../Data/Queries_IT_final.csv")
query_EP_merged = pd.merge(queries, query_EP.drop(columns=["source"]), on="query")
query_EP_merged.head()

Unnamed: 0,query,source,query_len,topic,search_type,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,Chi era il padre di Micerino ?,IT-GUI,6,history,multisteps,25.31,30.15,44.54,12.53,13.28,12.68,11.67,12.12,12.82,13.11,11.81
1,Il figlio di Chefren,IT-GUI,4,history,multisteps,26.92,32.38,40.7,12.59,13.23,12.38,12.15,12.43,12.67,12.55,12.0
2,Il padre di Micerino,IT-GUI,4,history,multisteps,25.31,30.15,44.54,12.53,13.28,12.68,11.67,12.12,12.82,13.11,11.81
3,Micerino,IT-GUI,1,history,multisteps,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Quanto e alta la tomba di Cheope?,IT-GUI,7,history,fact,28.06,31.37,40.57,12.56,12.96,12.69,12.17,12.33,12.77,12.19,12.32


In [29]:
query_EP_merged["query_len_type"] = ["short" if query_len <= 6 else "long" for query_len in query_EP_merged["query_len"]]

ls = []
for _, row in query_EP_merged.iterrows():
    pos = row["positive"]
    neg = row["negative"]
    neu = row["neutral"]

    ind = [pos, neg, neu].index(max([pos, neg, neu]))
    if ind == 0:
        ls.append("positive")
    elif ind == 1:
        ls.append("negative")
    elif ind == 2:
        ls.append("neutral")

query_EP_merged["query_sentiment_type"] = ls
query_EP_merged = query_EP_merged[["query", "source", "query_len_type", "query_len", "search_type", "query_sentiment_type", "topic", "positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]]
query_EP_merged.to_csv("../Results/query_EP_categorised.csv", index=False)
query_EP_merged.head()

Unnamed: 0,query,source,query_len_type,query_len,search_type,query_sentiment_type,topic,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,Chi era il padre di Micerino ?,IT-GUI,short,6,multisteps,neutral,history,25.31,30.15,44.54,12.53,13.28,12.68,11.67,12.12,12.82,13.11,11.81
1,Il figlio di Chefren,IT-GUI,short,4,multisteps,neutral,history,26.92,32.38,40.7,12.59,13.23,12.38,12.15,12.43,12.67,12.55,12.0
2,Il padre di Micerino,IT-GUI,short,4,multisteps,neutral,history,25.31,30.15,44.54,12.53,13.28,12.68,11.67,12.12,12.82,13.11,11.81
3,Micerino,IT-GUI,short,1,multisteps,positive,history,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Quanto e alta la tomba di Cheope?,IT-GUI,long,7,fact,neutral,history,28.06,31.37,40.57,12.56,12.96,12.69,12.17,12.33,12.77,12.19,12.32


In [30]:
query_EP_merged["query_sentiment_type"].value_counts()

query_sentiment_type
neutral     87
negative    12
positive    11
Name: count, dtype: int64

In [31]:
import pandas as pd

queries = pd.read_csv("../Results/query_EP_categorised.csv")
gpt_OG_EP = pd.read_csv("../Results/gpt_EP.csv")
gemma_OG_EP = pd.read_csv("../Results/gemma_EP.csv")


gpt_OG_EP_categorised = pd.merge(queries[["query", "query_len_type", "query_sentiment_type", "search_type"]], gpt_OG_EP.drop(columns=["response"]), on="query")
gpt_OG_EP.head()
gemma_OG_EP_categorised = pd.merge(queries[["query", "query_len_type", "query_sentiment_type", "search_type"]], gemma_OG_EP.drop(columns=["response"]), on="query")
OG_EP_categorised = pd.concat([gpt_OG_EP_categorised, gemma_OG_EP_categorised])
OG_EP_categorised.to_csv("../Results/OG_EP_categorised.csv", index=False)
OG_EP_categorised.head()

Unnamed: 0,query,query_len_type,query_sentiment_type,search_type,LLM,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,Chi era il padre di Micerino ?,short,neutral,multisteps,GPT,25.31,30.15,44.54,12.53,13.28,12.68,11.67,12.12,12.82,13.11,11.81
1,Il figlio di Chefren,short,neutral,multisteps,GPT,25.94,20.34,53.72,12.37,12.61,12.41,12.34,12.59,12.33,12.72,12.63
2,Il padre di Micerino,short,neutral,multisteps,GPT,34.52,20.13,45.35,12.83,12.72,12.41,12.22,12.33,12.43,12.61,12.45
3,Micerino,short,positive,multisteps,GPT,26.45,22.48,51.07,12.37,12.5,12.3,12.43,12.61,12.26,12.88,12.66
4,Quanto e alta la tomba di Cheope?,long,neutral,fact,GPT,24.58,28.13,47.29,12.47,12.71,13.28,11.8,12.35,12.5,12.48,12.4


In [32]:
import numpy as np

emo_features = ["positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]

prompt_len_GPT = [["prompt length", "GPT", len + " prompts"] + list(np.round(np.median(OG_EP_categorised.loc[(OG_EP_categorised["query_len_type"]==len)&(OG_EP_categorised["LLM"]=="GPT")][emo_features], axis=0), 2)) for len in ["short", "long"]]
prompt_len_Gemma = [["prompt length", "Gemma", len + " prompts"] + list(np.round(np.median(OG_EP_categorised.loc[(OG_EP_categorised["query_len_type"]==len)&(OG_EP_categorised["LLM"]=="Gemma")][emo_features], axis=0), 2)) for len in ["short", "long"]]

prompt_sent_GPT = [["prompt sentiment", "GPT", sent + " prompts"] + list(np.round(np.median(OG_EP_categorised.loc[(OG_EP_categorised["query_sentiment_type"]==sent)&(OG_EP_categorised["LLM"]=="GPT")][emo_features], axis=0), 2)) for sent in ["positive", "negative", "neutral"]]
prompt_sent_Gemma = [["prompt sentiment", "Gemma", sent + " prompts"] + list(np.round(np.median(OG_EP_categorised.loc[(OG_EP_categorised["query_sentiment_type"]==sent)&(OG_EP_categorised["LLM"]=="Gemma")][emo_features], axis=0), 2)) for sent in ["positive", "negative", "neutral"]]

search_type_GPT = [["search type", "GPT", search + " search", ] + list(np.round(np.median(OG_EP_categorised.loc[(OG_EP_categorised["search_type"]==search)&(OG_EP_categorised["LLM"]=="GPT")][emo_features], axis=0), 2)) for search in queries.search_type.unique()]
search_type_Gemma = [["search type", "Gemma", search + " search"] + list(np.round(np.median(OG_EP_categorised.loc[(OG_EP_categorised["search_type"]==search)&(OG_EP_categorised["LLM"]=="Gemma")][emo_features], axis=0), 2)) for search in queries.search_type.unique()]

query_cat_stats = prompt_len_Gemma + prompt_len_GPT + prompt_sent_Gemma + prompt_sent_GPT + search_type_Gemma + search_type_GPT

query_cat_stats_df = pd.DataFrame(query_cat_stats, columns=["query_categorisation", "LLM", "category"]+emo_features)
query_cat_stats_df.to_csv("../Stats/query_cat_avg_EP.csv", index=False)
query_cat_stats_df


Unnamed: 0,query_categorisation,LLM,category,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,prompt length,Gemma,short prompts,28.1,25.57,47.35,12.37,12.43,12.48,12.15,12.69,12.39,12.91,12.64
1,prompt length,Gemma,long prompts,28.14,26.43,43.88,12.29,12.43,12.49,12.04,12.7,12.32,13.12,12.63
2,prompt length,GPT,short prompts,30.23,26.83,42.87,12.34,12.43,12.46,12.12,12.84,12.41,12.84,12.6
3,prompt length,GPT,long prompts,29.5,26.77,44.91,12.26,12.42,12.43,12.12,12.8,12.44,12.98,12.56
4,prompt sentiment,Gemma,positive prompts,29.99,24.19,44.44,12.4,12.42,12.58,12.14,12.74,12.33,12.68,12.79
5,prompt sentiment,Gemma,negative prompts,28.02,26.44,46.5,12.24,12.36,12.17,12.2,12.73,12.25,13.26,12.68
6,prompt sentiment,Gemma,neutral prompts,28.11,26.05,45.61,12.34,12.45,12.48,12.08,12.69,12.39,13.06,12.6
7,prompt sentiment,GPT,positive prompts,33.07,24.06,40.73,12.22,12.35,12.43,11.96,12.78,12.41,12.88,12.69
8,prompt sentiment,GPT,negative prompts,29.98,28.52,42.08,12.17,12.22,12.14,12.13,12.83,12.48,13.28,12.61
9,prompt sentiment,GPT,neutral prompts,29.32,26.96,44.54,12.36,12.46,12.47,12.13,12.82,12.43,12.77,12.56


In [33]:
from scipy.stats import ttest_ind, mannwhitneyu, shapiro, f_oneway, kruskal, ttest_1samp
import pandas as pd

def compute_significance(distributions):
    try:
        shapiros = [shapiro(distribution)[1] for distribution in distributions]
        if any(s < 0.05 for s in shapiros):
            if len(shapiros) == 2:
                test_stat, p_val = mannwhitneyu(distributions[0], distributions[1])
                test_name = "Mann Whitney U"
            elif len(shapiros) == 3:
                test_stat, p_val = kruskal(distributions[0], distributions[1], distributions[2])
                test_name = "Kruskal Wallis H"
            else:
                test_stat, p_val = kruskal(distributions[0], distributions[1], distributions[2], distributions[3], distributions[4])
                test_name = "Kruskal Wallis H"
        else:
            if len(shapiros) == 2:
                test_stat, p_val = ttest_ind(distributions[0], distributions[1])
                test_name = "T Test"
            elif len(shapiros) == 3:
                test_stat, p_val = f_oneway(distributions[0], distributions[1], distributions[2])
                test_name = "ANOVA"
            else:
                test_stat, p_val = f_oneway(distributions[0], distributions[1], distributions[2], distributions[3], distributions[4])
                test_name = "ANOVA"
        return test_name, test_stat, p_val, p_val < 0.05
    except:
        return None, None, None, None


OG_EP_categorised = pd.read_csv("../Results/OG_EP_categorised.csv")
Gemma_EP_categorised = OG_EP_categorised.loc[OG_EP_categorised["LLM"]=="Gemma"]
GPT_EP_categorised = OG_EP_categorised.loc[OG_EP_categorised["LLM"]=="GPT"]

results = []
emo_features = ["positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]

# prompt length
for emo in emo_features:
    test_name, test_stat, p_val, is_significant = compute_significance([Gemma_EP_categorised.loc[Gemma_EP_categorised["query_len_type"]=="short"][emo], Gemma_EP_categorised.loc[Gemma_EP_categorised["query_len_type"]=="long"][emo]])
    results.append(["Gemma_prompt_len", emo, test_name, test_stat, p_val, is_significant])

    test_name, test_stat, p_val, is_significant = compute_significance([GPT_EP_categorised.loc[GPT_EP_categorised["query_len_type"]=="short"][emo], GPT_EP_categorised.loc[GPT_EP_categorised["query_len_type"]=="long"][emo]])
    results.append(["GPT_prompt_len", emo, test_name, test_stat, p_val, is_significant])

    test_name, test_stat, p_val, is_significant = compute_significance([Gemma_EP_categorised.loc[Gemma_EP_categorised["query_len_type"]=="short"][emo], GPT_EP_categorised.loc[GPT_EP_categorised["query_len_type"]=="short"][emo]])
    results.append(["LLM_short_prompt", emo, test_name, test_stat, p_val, is_significant])

    test_name, test_stat, p_val, is_significant = compute_significance([Gemma_EP_categorised.loc[Gemma_EP_categorised["query_len_type"]=="long"][emo], GPT_EP_categorised.loc[GPT_EP_categorised["query_len_type"]=="long"][emo]])
    results.append(["LLM_long_prompt", emo, test_name, test_stat, p_val, is_significant])

# prompt sentiment
for emo in emo_features:
    test_name, test_stat, p_val, is_significant = compute_significance([Gemma_EP_categorised.loc[Gemma_EP_categorised["query_sentiment_type"]=="positive"][emo], Gemma_EP_categorised.loc[Gemma_EP_categorised["query_sentiment_type"]=="negative"][emo], Gemma_EP_categorised.loc[Gemma_EP_categorised["query_sentiment_type"]=="neutral"][emo]])
    results.append(["Gemma_prompt_sentiment", emo, test_name, test_stat, p_val, is_significant])

    test_name, test_stat, p_val, is_significant = compute_significance([GPT_EP_categorised.loc[GPT_EP_categorised["query_sentiment_type"]=="positive"][emo], GPT_EP_categorised.loc[GPT_EP_categorised["query_sentiment_type"]=="negative"][emo], GPT_EP_categorised.loc[GPT_EP_categorised["query_sentiment_type"]=="neutral"][emo]])
    results.append(["GPT_prompt_sentiment", emo, test_name, test_stat, p_val, is_significant])

    for sent in ["positive", "negative", "neutral"]:
        test_name, test_stat, p_val, is_significant = compute_significance([Gemma_EP_categorised.loc[Gemma_EP_categorised["query_sentiment_type"]==sent][emo], GPT_EP_categorised.loc[GPT_EP_categorised["query_sentiment_type"]==sent][emo]])
        results.append(["LLM_"+sent+"_prompt", emo, test_name, test_stat, p_val, is_significant])

# search type
for emo in emo_features:
    test_name, test_stat, p_val, is_significant = compute_significance([Gemma_EP_categorised.loc[Gemma_EP_categorised["search_type"]==search][emo] for search in Gemma_EP_categorised["search_type"].unique()])
    results.append(["Gemma_search_type", emo, test_name, test_stat, p_val, is_significant])

    test_name, test_stat, p_val, is_significant = compute_significance([GPT_EP_categorised.loc[GPT_EP_categorised["search_type"]==search][emo] for search in GPT_EP_categorised["search_type"].unique()])
    results.append(["GPT_search_type", emo, test_name, test_stat, p_val, is_significant])

    for search in OG_EP_categorised["search_type"].unique():
        test_name, test_stat, p_val, is_significant = compute_significance([Gemma_EP_categorised.loc[Gemma_EP_categorised["search_type"]==search][emo], GPT_EP_categorised.loc[GPT_EP_categorised["search_type"]==search][emo]])
        results.append(["LLM_"+search+"_search", emo, test_name, test_stat, p_val, is_significant])

results_df = pd.DataFrame(results, columns=["Group_name", "feature", "test_name", "test_stat", "p_val", "is_significant"])
results_df.to_csv("../Stats/significance_tests_query_cats.csv", index=False)
results_df
    

Unnamed: 0,Group_name,feature,test_name,test_stat,p_val,is_significant
0,Gemma_prompt_len,positive,Mann Whitney U,1343.000000,0.363811,False
1,GPT_prompt_len,positive,Mann Whitney U,1619.000000,0.455816,False
2,LLM_short_prompt,positive,Mann Whitney U,1405.500000,0.019952,True
3,LLM_long_prompt,positive,Mann Whitney U,1039.500000,0.254115,False
4,Gemma_prompt_len,negative,Mann Whitney U,1244.500000,0.133477,False
...,...,...,...,...,...,...
171,LLM_multisteps_search,anticipation,Mann Whitney U,670.500000,0.499298,False
172,LLM_fact_search,anticipation,T Test,-0.266888,0.790152,False
173,LLM_opensearch_search,anticipation,T Test,-0.374514,0.711964,False
174,LLM_unknown_search,anticipation,,,,
