In [40]:
import pandas as pd
import numpy as np
np.seterr(divide='warn', invalid='warn')
from scipy.spatial import distance
from tqdm.notebook import tqdm

# Emotion-profile (EP) tables: the prompts themselves, then, per model, the
# responses to the original (OG) prompts and to the reformulated (RQ) prompts.
queries = pd.read_csv("../Results/query_EP.csv")
gemma_OG, gemma_RQ = (
    pd.read_csv("../Results/gemma_EP.csv"),
    pd.read_csv("../Results/gemma_RQ_EP.csv"),
)
gpt_OG, gpt_RQ = (
    pd.read_csv("../Results/gpt_EP.csv"),
    pd.read_csv("../Results/gpt_RQ_EP.csv"),
)

In [49]:
def RQ_avg_EP(RQ_df, LLM):
    """Average each query's reformulated-query (RQ) score rows and save them.

    For every entry of the notebook-level ``queries["query"]`` column, the rows
    of ``RQ_df`` whose ``query`` value matches are averaged column-wise over the
    eleven sentiment/emotion score columns. One output row per entry is written
    to ``../Results/<LLM>_RQ_avg_EP.csv``.

    Parameters
    ----------
    RQ_df : pandas.DataFrame
        Must contain a ``query`` column and the eleven score columns.
    LLM : str
        Stored in the ``LLM`` column and used as the output file-name prefix.

    Returns
    -------
    None
        The result is only written to disk.
    """
    score_cols = [
        "positive", "negative", "neutral",
        "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations",
    ]

    # One record per prompt: [query, LLM, <column-wise mean of its matching rows>].
    records = [
        [q, LLM, *np.average(RQ_df.loc[RQ_df["query"] == q][score_cols], axis=0)]
        for q in queries["query"]
    ]

    averaged = pd.DataFrame(records, columns=["query", "LLM"] + score_cols)
    averaged.to_csv("../Results/" + LLM + "_RQ_avg_EP.csv", index=False)

# Write the per-query RQ averages for both models (GPT first, then Gemma).
for llm_label, rq_frame in (("gpt", gpt_RQ), ("gemma", gemma_RQ)):
    RQ_avg_EP(rq_frame, llm_label)

In [53]:
# Reload the per-query averaged RQ profiles from disk (the files RQ_avg_EP writes).
gemma_RQ_avg, gpt_RQ_avg = (
    pd.read_csv("../Results/gemma_RQ_avg_EP.csv"),
    pd.read_csv("../Results/gpt_RQ_avg_EP.csv"),
)

## JSD

### JSD between LLMs

In [54]:
# For every prompt, compare the GPT and Gemma responses to the original (OG)
# prompt: one Jensen-Shannon distance over the sentiment scores and one over
# the emotion scores. Results are saved to ../Results/LLM_JSD.csv.
sentiment_cols = ["positive", "negative", "neutral"]
emotion_cols = ["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]

LLM_JSD = []

for query in tqdm(queries["query"]):
    # First matching response row per model, reused for both column groups.
    gpt_row = gpt_OG.loc[gpt_OG["query"] == query].iloc[0]
    gemma_row = gemma_OG.loc[gemma_OG["query"] == query].iloc[0]

    gpt_sentiment = np.array(gpt_row[sentiment_cols], dtype=float)
    gemma_sentiment = np.array(gemma_row[sentiment_cols], dtype=float)
    sentiment_JSD = distance.jensenshannon(gpt_sentiment, gemma_sentiment)

    gpt_emotion = np.array(gpt_row[emotion_cols], dtype=float)
    gemma_emotion = np.array(gemma_row[emotion_cols], dtype=float)
    emotion_JSD = distance.jensenshannon(gpt_emotion, gemma_emotion)

    LLM_JSD.append([query, sentiment_JSD, emotion_JSD])

LLM_JSD_df = pd.DataFrame(LLM_JSD, columns=["query", "sentiment_JSD", "emotion_JSD"])
LLM_JSD_df.to_csv("../Results/LLM_JSD.csv", index=False)

  0%|          | 0/110 [00:00<?, ?it/s]

  p = p / np.sum(p, axis=axis, keepdims=True)


### JSD between query-response pairs

In [55]:
# Sanity check on the sentiment scores of the first two prompts.
# The smaller the Jensen-Shannon distance, the more alike the two score vectors are.
test = np.array(queries[["positive", "negative", "neutral"]].iloc[[0, 1]])
first_scores, second_scores = test
distance.jensenshannon(first_scores, second_scores)

0.02750887863789107

In [56]:
def compute_EP_JSD(row1, row2):
    """Jensen-Shannon distances between two emotion-profile rows.

    Parameters
    ----------
    row1, row2 : pandas.Series (or any mapping indexable by a list of labels)
        Each must provide the three sentiment columns and the eight emotion
        columns; the selected values are converted to float arrays.

    Returns
    -------
    tuple
        ``(sentiment_JSD, emotion_JSD)`` as returned by
        ``scipy.spatial.distance.jensenshannon`` for each column group.
    """
    column_groups = (
        ["positive", "negative", "neutral"],
        ["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"],
    )

    # Extract every vector first, then measure, so a missing column fails
    # before any distance is computed.
    vector_pairs = [
        (np.array(row1[cols], dtype=float), np.array(row2[cols], dtype=float))
        for cols in column_groups
    ]

    return tuple(distance.jensenshannon(p, q) for p, q in vector_pairs)

In [73]:
def compute_JSD(OG_resp_df, RQ_avg_df, RQ_resp_df, LLM):
    """Compute prompt-vs-response Jensen-Shannon distances and save them.

    For each row of the notebook-level ``queries`` frame, the prompt's profile
    is compared (via ``compute_EP_JSD``) with four response rows that share the
    same ``query`` value:

    * the response to the original (OG) prompt,
    * the male/female-averaged response to the reformulated (RQ) prompt,
    * the response to the male RQ prompt,
    * the response to the female RQ prompt.

    The table is written to ``../Results/query_resp_JSD_<LLM>.csv``.

    Parameters
    ----------
    OG_resp_df, RQ_avg_df : pandas.DataFrame
        Frames with a ``query`` column and the score columns.
    RQ_resp_df : pandas.DataFrame
        As above, plus a ``gender`` column holding ``"male"`` / ``"female"``.
    LLM : str
        Suffix of the output file name.

    Returns
    -------
    None
    """

    def first_match(frame, mask):
        # Only the first matching row is used; .iloc[0] fails if none matches.
        return frame.loc[mask].iloc[0]

    records = []

    for _, prompt in tqdm(queries.iterrows(), total=len(queries)):
        query = prompt["query"]

        # Prompt vs. response to the original prompt.
        og_pair = compute_EP_JSD(
            prompt, first_match(OG_resp_df, OG_resp_df["query"] == query)
        )

        # Prompt vs. averaged (male + female) response to the reformulated prompt.
        avg_pair = compute_EP_JSD(
            prompt, first_match(RQ_avg_df, RQ_avg_df["query"] == query)
        )

        # Prompt vs. response to the male reformulation.
        male_pair = compute_EP_JSD(
            prompt,
            first_match(RQ_resp_df, (RQ_resp_df["query"] == query) & (RQ_resp_df["gender"] == "male")),
        )

        # Prompt vs. response to the female reformulation.
        female_pair = compute_EP_JSD(
            prompt,
            first_match(RQ_resp_df, (RQ_resp_df["query"] == query) & (RQ_resp_df["gender"] == "female")),
        )

        # Each pair is (sentiment_JSD, emotion_JSD).
        records.append([query, *og_pair, *avg_pair, *male_pair, *female_pair])

    out_columns = [
        "query",
        "OG_sentiment_JSD", "OG_emotion_JSD",
        "RQ_avg_sentiment_JSD", "RQ_avg_emotion_JSD",
        "RQ_male_sentiment_JSD", "RQ_male_emotion_JSD",
        "RQ_female_sentiment_JSD", "RQ_female_emotion_JSD",
    ]
    query_resp_df = pd.DataFrame(records, columns=out_columns)
    query_resp_df.to_csv("../Results/query_resp_JSD_" + LLM + ".csv", index=False)

In [74]:
# Prompt-vs-response distances for each model (Gemma first, then GPT).
for llm_label, og_frame, rq_avg_frame, rq_frame in (
    ("gemma", gemma_OG, gemma_RQ_avg, gemma_RQ),
    ("gpt", gpt_OG, gpt_RQ_avg, gpt_RQ),
):
    compute_JSD(OG_resp_df=og_frame, RQ_avg_df=rq_avg_frame, RQ_resp_df=rq_frame, LLM=llm_label)

  0%|          | 0/110 [00:00<?, ?it/s]

  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, kee

  0%|          | 0/110 [00:00<?, ?it/s]

  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, kee

### JSD from average vector

In [75]:
# Column-wise mean sentiment and emotion vectors over all prompts.
sentiment_scores = np.array(queries[["positive", "negative", "neutral"]], dtype=float)
emotion_scores = np.array(
    queries[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]],
    dtype=float,
)
avg_sentiment = np.average(sentiment_scores, axis=0)
avg_emotion = np.average(emotion_scores, axis=0)

# Spot check: distance between the mean sentiment vector and the first prompt's.
distance.jensenshannon(avg_sentiment, sentiment_scores[0])

0.017470699275533706

In [76]:
# Distance of every prompt's sentiment / emotion vector from the corresponding
# mean vector over all prompts (avg_sentiment / avg_emotion).
sentiment_cols = ["positive", "negative", "neutral"]
emotion_cols = ["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]

queries_JSD = []
for _, row in tqdm(queries.iterrows(), total=len(queries)):
    sentiment, emotion = (
        np.array(row[cols], dtype=float) for cols in (sentiment_cols, emotion_cols)
    )
    sentiment_JSD = distance.jensenshannon(avg_sentiment, sentiment)
    emotion_JSD = distance.jensenshannon(avg_emotion, emotion)
    queries_JSD.append([row["query"], sentiment_JSD, emotion_JSD])

queries_JSD_df = pd.DataFrame(queries_JSD, columns=["query", "sentiment_JSD", "emotion_JSD"])
queries_JSD_df.to_csv("../Results/query_avg_jsd.csv", index=False)

  0%|          | 0/110 [00:00<?, ?it/s]

  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)


In [77]:
def compute_JSD_from_avg(LLM, OG_resp, RQ_avg_resp, RQ_resp):
    """Measure how far each response lies from its group's mean profile.

    Four response groups are considered: responses to the original (OG)
    prompts, the male/female-averaged responses to reformulated (RQ) prompts,
    and the male and female RQ responses (rows of ``RQ_resp`` selected by the
    ``gender`` column). For each group the column-wise mean sentiment and
    emotion vectors are computed. Then, for every entry of the notebook-level
    ``queries["query"]`` column, the Jensen-Shannon distance between the group
    mean and the first row of that group with the same ``query`` is computed.

    The table is written to ``../Results/<LLM>_avg_JSD.csv``.

    Parameters
    ----------
    LLM : str
        Prefix of the output file name.
    OG_resp, RQ_avg_resp : pandas.DataFrame
        Frames with a ``query`` column and the score columns.
    RQ_resp : pandas.DataFrame
        As above, plus a ``gender`` column holding ``"male"`` / ``"female"``.

    Returns
    -------
    None
    """
    sentiment_cols = ["positive", "negative", "neutral"]
    emotion_cols = ["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]
    column_groups = (sentiment_cols, emotion_cols)

    def mean_vectors(frame):
        # (mean sentiment vector, mean emotion vector) over all rows of `frame`.
        return tuple(
            np.average(np.array(frame[cols], dtype=float), axis=0)
            for cols in column_groups
        )

    def jsd_from_mean(frame, means, query):
        # (sentiment_JSD, emotion_JSD) between the group means and the first
        # row of `frame` for this query; .iloc[0] fails if no row matches.
        row = frame.loc[frame["query"] == query].iloc[0]
        return tuple(
            distance.jensenshannon(mean, np.array(row[cols], dtype=float))
            for mean, cols in zip(means, column_groups)
        )

    RQ_male = RQ_resp.loc[RQ_resp["gender"] == "male"]
    RQ_female = RQ_resp.loc[RQ_resp["gender"] == "female"]

    # Order here fixes the order of the output columns below.
    groups = [OG_resp, RQ_avg_resp, RQ_male, RQ_female]
    group_means = [mean_vectors(frame) for frame in groups]

    avg_JSD = []
    for query in tqdm(queries["query"]):
        record = [query]
        for frame, means in zip(groups, group_means):
            record.extend(jsd_from_mean(frame, means, query))
        avg_JSD.append(record)

    # Column names must carry no surrounding whitespace ("RQ_avg_emotion_JSD",
    # not " RQ_avg_emotion_JSD"), so the header can be addressed by its plain
    # name when the CSV is read back.
    out_columns = [
        "query",
        "OG_sentiment_JSD", "OG_emotion_JSD",
        "RQ_avg_sentiment_JSD", "RQ_avg_emotion_JSD",
        "RQ_male_sentiment_JSD", "RQ_male_emotion_JSD",
        "RQ_female_sentiment_JSD", "RQ_female_emotion_JSD",
    ]
    avg_JSD_df = pd.DataFrame(avg_JSD, columns=out_columns)
    avg_JSD_df.to_csv("../Results/" + LLM + "_avg_JSD.csv", index=False)

In [79]:
# Distance-from-mean tables for each model (Gemma first, then GPT).
for llm_label, og_frame, rq_avg_frame, rq_frame in (
    ("gemma", gemma_OG, gemma_RQ_avg, gemma_RQ),
    ("gpt", gpt_OG, gpt_RQ_avg, gpt_RQ),
):
    compute_JSD_from_avg(LLM=llm_label, OG_resp=og_frame, RQ_avg_resp=rq_avg_frame, RQ_resp=rq_frame)

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)


## Stats test

In [84]:
import pandas as pd

# Emotion profiles (EP) of the prompts themselves.
query_EP = pd.read_csv("../Results/query_EP.csv")

# Gemma: responses to the original (OG) prompts, the per-query male/female
# average for the reformulated (RQ) prompts, and the RQ rows split by gender.
gemma_OG_EP = pd.read_csv("../Results/gemma_EP.csv")
gemma_RQ_avg_EP = pd.read_csv("../Results/gemma_RQ_avg_EP.csv")
gemma_RQ_EP = pd.read_csv("../Results/gemma_RQ_EP.csv")
gemma_RQ_male_EP = gemma_RQ_EP[gemma_RQ_EP["gender"] == "male"]
gemma_RQ_female_EP = gemma_RQ_EP[gemma_RQ_EP["gender"] == "female"]

# GPT: same layout as the Gemma tables.
gpt_OG_EP = pd.read_csv("../Results/gpt_EP.csv")
gpt_RQ_avg_EP = pd.read_csv("../Results/gpt_RQ_avg_EP.csv")
gpt_RQ_EP = pd.read_csv("../Results/gpt_RQ_EP.csv")
gpt_RQ_male_EP = gpt_RQ_EP[gpt_RQ_EP["gender"] == "male"]
gpt_RQ_female_EP = gpt_RQ_EP[gpt_RQ_EP["gender"] == "female"]

# JSD between the Gemma and GPT responses to the OG prompts.
llm_JSD = pd.read_csv("../Results/LLM_JSD.csv")

# JSD between each individual response and the mean response vector of its
# group (OG, male/female average, male, female), per model.
gpt_avg_JSD = pd.read_csv("../Results/gpt_avg_JSD.csv")
gemma_avg_JSD = pd.read_csv("../Results/gemma_avg_JSD.csv")

# JSD between each OG prompt and the corresponding response (to the OG,
# male/female-average, male, and female prompts), per model.
query_gpt_JSD = pd.read_csv("../Results/query_resp_JSD_gpt.csv")
query_gemma_JSD = pd.read_csv("../Results/query_resp_JSD_gemma.csv")


In [85]:
## Descriptive stats for EP
# NOTE: despite the "avg" in the variable and file names, every value in this
# table is a per-column median, rounded to two decimals.

emo_features = ["positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]


def _median_row(label, ep_df):
    """Return ``[label, median of each emo_features column of ep_df (2 dp)]``."""
    return [label] + [round(np.median(ep_df[emo]), 2) for emo in emo_features]


query_EP_avg = _median_row("Prompts", query_EP)

gemma_EP_avg = _median_row("Gemma_OG", gemma_OG_EP)
gemma_RQ_avg_EP_avg = _median_row("Gemma_RQ_avg", gemma_RQ_avg_EP)
gemma_male_avg = _median_row("Gemma_RQ_Male", gemma_RQ_male_EP)
gemma_female_avg = _median_row("Gemma_RQ_Female", gemma_RQ_female_EP)

gpt_EP_avg = _median_row("GPT_OG", gpt_OG_EP)
gpt_RQ_avg_EP_avg = _median_row("GPT_RQ_avg", gpt_RQ_avg_EP)
gpt_male_avg = _median_row("GPT_RQ_Male", gpt_RQ_male_EP)
gpt_female_avg = _median_row("GPT_RQ_Female", gpt_RQ_female_EP)

# One row per group: prompts first, then the four Gemma rows, then the four GPT rows.
avg_EP = [
    query_EP_avg,
    gemma_EP_avg, gemma_RQ_avg_EP_avg, gemma_male_avg, gemma_female_avg,
    gpt_EP_avg, gpt_RQ_avg_EP_avg, gpt_male_avg, gpt_female_avg,
]
avg_EP_df = pd.DataFrame(avg_EP, columns=["EP category"] + emo_features)
avg_EP_df.to_csv("../Stats/avg_EP.csv", index=False)
avg_EP_df


Unnamed: 0,EP category,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,expectations
0,Prompts,26.08,27.65,43.97,62.46,61.83,61.91,61.44,63.81,62.7,63.88,62.61
1,Gemma_OG,28.12,25.86,45.29,63.67,64.04,63.62,61.98,65.57,63.48,66.87,65.19
2,Gemma_RQ_avg,30.69,26.47,42.43,64.14,65.27,65.35,62.73,66.0,64.81,69.98,66.76
3,Gemma_RQ_Male,30.35,26.73,42.48,64.07,65.12,64.58,62.31,65.8,64.49,68.96,66.0
4,Gemma_RQ_Female,31.13,26.75,41.43,64.19,65.11,65.5,63.28,66.16,65.09,70.5,67.67
5,GPT_OG,29.81,26.8,43.68,63.42,63.7,63.81,61.79,65.55,63.96,66.44,64.82
6,GPT_RQ_avg,31.08,27.76,40.72,63.56,64.15,64.15,62.02,65.84,64.54,67.27,65.32
7,GPT_RQ_Male,31.32,27.32,40.76,63.49,64.3,63.91,62.06,65.98,64.42,67.26,64.89
8,GPT_RQ_Female,30.9,27.46,40.43,63.63,64.11,64.32,62.22,65.89,64.7,67.83,65.33


In [83]:
from scipy.stats import ttest_ind, mannwhitneyu, shapiro, f_oneway, kruskal, ttest_1samp

def emo_feature_significance_test(group_name, df1, df2, alpha=0.05, normality_alpha=0.05):
    """Test, feature by feature, whether two groups' scores differ.

    For each of the eleven sentiment/emotion columns, both samples are checked
    with the Shapiro-Wilk test. If either sample's Shapiro p-value is below
    ``normality_alpha``, a Mann-Whitney U test is used and the reported
    difference is the difference of medians. Otherwise an independent
    two-sample t-test is used and the difference of means is reported.

    Parameters
    ----------
    group_name : str
        Label copied into every result row.
    df1, df2 : pandas.DataFrame
        Frames holding the eleven score columns.
    alpha : float, default 0.05
        A result is flagged significant when ``p_val < alpha``.
    normality_alpha : float, default 0.05
        Shapiro-Wilk threshold below which a sample is treated as non-normal.

    Returns
    -------
    list of list
        One row per feature:
        ``[group_name, feature, test_name, test_stat, difference, p_val, is_significant]``,
        where ``difference`` is ``df1`` minus ``df2``, rounded to two decimals.

    Notes
    -----
    Both tests treat the samples as independent, and p-values are not adjusted
    for the number of features tested.
    NOTE(review): the male/female groups appear to cover the same queries; if
    so, a paired test may be more appropriate - confirm.
    """
    emo_features = ["positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]
    significance_rows = []

    for emo in emo_features:
        sample_1, sample_2 = df1[emo], df2[emo]
        shapiro_1 = shapiro(sample_1)[1]
        shapiro_2 = shapiro(sample_2)[1]

        if shapiro_1 < normality_alpha or shapiro_2 < normality_alpha:
            # At least one sample looks non-normal: rank-based test, compare medians.
            test_name = "Mann-Whitney U"
            test_stat, p_val = mannwhitneyu(sample_1, sample_2)
            difference = round(np.median(sample_1) - np.median(sample_2), 2)
        else:
            test_name = "T-Test"
            test_stat, p_val = ttest_ind(sample_1, sample_2)
            difference = round(np.mean(sample_1) - np.mean(sample_2), 2)

        significance_rows.append([group_name, emo, test_name, test_stat, difference, p_val, p_val < alpha])

    return significance_rows

# Per-feature significance tests for each comparison pair.
Gemma_GPT = emo_feature_significance_test(group_name="Gemma_GPT", df1=gemma_OG_EP, df2=gpt_OG_EP)
Gemma_OG_RQ = emo_feature_significance_test(group_name="Gemma_OG_RQ", df1=gemma_OG_EP, df2=gemma_RQ_avg_EP)
Gemma_RQ = emo_feature_significance_test(group_name="Gemma_RQ_male_female", df1=gemma_RQ_male_EP, df2=gemma_RQ_female_EP)
GPT_OG_RQ = emo_feature_significance_test(group_name="GPT_OG_RQ", df1=gpt_OG_EP, df2=gpt_RQ_avg_EP)
GPT_RQ = emo_feature_significance_test(group_name="GPT_RQ_male_female", df1=gpt_RQ_male_EP, df2=gpt_RQ_female_EP)

# Stack the result rows of all comparisons, in the order run above.
all_tests = [*Gemma_GPT, *Gemma_OG_RQ, *Gemma_RQ, *GPT_OG_RQ, *GPT_RQ]

result_columns = [
    "comparison_pair", "feature", "test_name", "test_stat",
    "central_value_diff", "p_val", "is_significant",
]
signficance_test_df = pd.DataFrame(all_tests, columns=result_columns)
signficance_test_df.to_csv("../Stats/significance_tests_features.csv", index=False)
signficance_test_df

Unnamed: 0,comparison_pair,feature,test_name,test_stat,central_value_diff,p_val,is_significant
0,Gemma_GPT,positive,Mann-Whitney U,4859.0,-1.68,0.01167157,True
1,Gemma_GPT,negative,Mann-Whitney U,5449.0,-0.94,0.2033439,False
2,Gemma_GPT,neutral,Mann-Whitney U,7342.0,1.61,0.006221426,True
3,Gemma_GPT,joy,Mann-Whitney U,6115.5,0.25,0.8904819,False
4,Gemma_GPT,anger,Mann-Whitney U,6391.5,0.34,0.4700693,False
5,Gemma_GPT,surprise,Mann-Whitney U,5885.5,-0.19,0.7282805,False
6,Gemma_GPT,disgust,Mann-Whitney U,5742.5,0.2,0.5154716,False
7,Gemma_GPT,fear,Mann-Whitney U,5666.5,0.02,0.4171715,False
8,Gemma_GPT,sadness,Mann-Whitney U,5217.5,-0.48,0.07798667,False
9,Gemma_GPT,trust,Mann-Whitney U,6329.5,0.43,0.5545023,False
