In [9]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from tqdm.notebook import tqdm

queries = pd.read_csv("../Results/query_EP.csv")
gemma_OG = pd.read_csv("../Results/gemma_EP.csv")
gemma_RQ = pd.read_csv("../Results/gemma_RQ_EP.csv")
gpt_OG = pd.read_csv("../Results/gpt_EP.csv")
gpt_RQ = pd.read_csv("../Results/gpt_RQ_EP.csv")

### JSD between query-response pairs

In [44]:
test = np.array(queries[["positive", "negative", "neutral"]].iloc[[0,1]])
distance.jensenshannon(test[0], test[1]) # lesser the value, higher the similarity

0.0275088786378906

In [14]:
np.seterr(divide='warn', invalid='warn')
def compute_EP_JSD(row1, row2):
    r1_sentiments = np.array(row1[["positive", "negative", "neutral"]], dtype=float)
    r2_sentiments = np.array(row2[["positive", "negative", "neutral"]], dtype=float)
    # print(r1_sentiments, r2_sentiments)
    
    r1_emotions = np.array(row1[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]], dtype=float)
    r2_emotions = np.array(row2[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]], dtype=float)
    
    sentiment_JSD = distance.jensenshannon(r1_sentiments, r2_sentiments)
    emotion_JSD = distance.jensenshannon(r1_emotions, r2_emotions)

    return sentiment_JSD, emotion_JSD

In [15]:
def compute_JSD(OG_resp_df, RQ_resp_df, LLM):

    query_resp_JSD = []

    for _, row in tqdm(queries.iterrows(), total=len(queries)):
        query = row["query"]
        OG_resp = OG_resp_df.loc[OG_resp_df["query"]==query].iloc[0]
        OG_resp_JSD = compute_EP_JSD(row, OG_resp) # JSD between OG query and OG response pairs

        RQ_male_resp = RQ_resp_df.loc[(RQ_resp_df["query"]==query) & (RQ_resp_df["gender"]=="male")].iloc[0]
        RQ_male_JSD = compute_EP_JSD(row, RQ_male_resp) # JSD between OG query and response to reformulated query (male version)

        RQ_female_resp = RQ_resp_df.loc[(RQ_resp_df["query"]==query) & (RQ_resp_df["gender"]=="female")].iloc[0]
        RQ_female_JSD = compute_EP_JSD(row, RQ_female_resp)

        query_resp_JSD.append([query, OG_resp_JSD[0], OG_resp_JSD[1], RQ_male_JSD[0], RQ_male_JSD[1], RQ_female_JSD[0], RQ_female_JSD[1]])

    query_resp_df = pd.DataFrame(query_resp_JSD, columns=["query", "OG_sentiment_JSD", "OG_emotion_JSD", "RQ_male_sentiment_JSD", "RQ_male_emotion_JSD", "RQ_female_sentiment_JSD", "RQ_female_emotion_JSD"])
    query_resp_df.to_csv("../Results/query_resp_JSD_" + LLM + ".csv", index=False)
    # print(query_resp_df.head())

In [16]:
compute_JSD(OG_resp_df=gemma_OG, RQ_resp_df=gemma_RQ, LLM="gemma")
compute_JSD(OG_resp_df=gpt_OG, RQ_resp_df=gpt_RQ, LLM="gpt")

  0%|          | 0/110 [00:00<?, ?it/s]

  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, kee

  0%|          | 0/110 [00:00<?, ?it/s]

  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, keepdims=True)
  p = p / np.sum(p, axis=axis, kee

### JSD from average vector

In [7]:
avg_sentiment = np.average(np.array(queries[["positive", "negative", "neutral"]], dtype=float), axis=0)
avg_emotion = np.average(np.array(queries[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]], dtype=float), axis=0)
distance.jensenshannon(avg_sentiment, np.array(queries.iloc[0][["positive", "negative", "neutral"]], dtype=float))

0.01747069927553432

In [17]:
queries_JSD = []
for _, row in tqdm(queries.iterrows(), total=len(queries)):
    sentiment = np.array(row[["positive", "negative", "neutral"]], dtype=float)
    emotion = np.array(row[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]], dtype=float)
    sentiment_JSD = distance.jensenshannon(avg_sentiment, sentiment)
    emotion_JSD = distance.jensenshannon(avg_emotion, emotion)
    queries_JSD.append([row["query"], sentiment_JSD, emotion_JSD])

queries_JSD_df = pd.DataFrame(queries_JSD, columns=["query", "sentiment_JSD", "emotion_JSD"])
queries_JSD_df.to_csv("../Results/query_avg_jsd.csv", index=False)

  0%|          | 0/110 [00:00<?, ?it/s]

  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)


In [29]:
def compute_JSD_from_avg(LLM, OG_resp, RQ_resp):
    OG_avg_sentiment = np.average(np.array(OG_resp[["positive", "negative", "neutral"]], dtype=float), axis=0)
    OG_avg_emotion = np.average(np.array(OG_resp[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]], dtype=float), axis=0)
    
    RQ_male = RQ_resp.loc[RQ_resp["gender"]=="male"]
    RQ_male_avg_sentiment = np.average(np.array(RQ_male[["positive", "negative", "neutral"]], dtype=float), axis=0)
    RQ_male_avg_emotion = np.average(np.array(RQ_male[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]], dtype=float), axis=0)

    RQ_female = RQ_resp.loc[RQ_resp["gender"]=="female"]
    RQ_female_avg_sentiment = np.average(np.array(RQ_female[["positive", "negative", "neutral"]], dtype=float), axis=0)
    RQ_female_avg_emotion = np.average(np.array(RQ_female[["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]], dtype=float), axis=0)

    avg_JSD = []

    for query in tqdm(queries["query"]):
        OG_sentiment = np.array(OG_resp.loc[OG_resp["query"]==query].iloc[0][["positive", "negative", "neutral"]], dtype=float)
        OG_emotion = np.array(OG_resp.loc[OG_resp["query"]==query].iloc[0][["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]], dtype=float)
        OG_sentiment_JSD = distance.jensenshannon(OG_avg_sentiment, OG_sentiment)
        OG_emotion_JSD = distance.jensenshannon(OG_avg_emotion, OG_emotion)

        RQ_male_sentiment = np.array(RQ_male.loc[RQ_male["query"]==query].iloc[0][["positive", "negative", "neutral"]], dtype=float)
        RQ_male_emotion = np.array(RQ_male.loc[RQ_male["query"]==query].iloc[0][["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]], dtype=float)
        RQ_male_sentiment_JSD = distance.jensenshannon(RQ_male_avg_sentiment, RQ_male_sentiment)
        RQ_male_emotion_JSD = distance.jensenshannon(RQ_male_avg_emotion, RQ_male_emotion)

        RQ_female_sentiment = np.array(RQ_female.loc[RQ_female["query"]==query].iloc[0][["positive", "negative", "neutral"]], dtype=float)
        RQ_female_emotion = np.array(RQ_female.loc[RQ_female["query"]==query].iloc[0][["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]], dtype=float)
        RQ_female_sentiment_JSD = distance.jensenshannon(RQ_female_avg_sentiment, RQ_female_sentiment)
        RQ_female_emotion_JSD = distance.jensenshannon(RQ_female_avg_emotion, RQ_female_emotion)

        avg_JSD.append([query, OG_sentiment_JSD, OG_emotion_JSD, RQ_male_sentiment_JSD, RQ_male_emotion_JSD, RQ_female_sentiment_JSD, RQ_female_emotion_JSD])

    avg_JSD_df = pd.DataFrame(avg_JSD, columns=["query", "OG_sentiment_JSD", "OG_emotion_JSD", "RQ_male_sentiment_JSD", "RQ_male_emotion_JSD", "RQ_female_sentiment_JSD", "RQ_female_emotion_JSD"])
    avg_JSD_df.to_csv("../Results/" + LLM + "_avg_JSD.csv", index=False)

In [30]:
compute_JSD_from_avg("gemma", gemma_OG, gemma_RQ)
compute_JSD_from_avg("gpt", gpt_OG, gpt_RQ)

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)
