In [None]:
from argparse import ArgumentParser
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tqdm import tqdm
import json
import statistics
import tqdm
import gzip
import pickle
import os

# Start both RankLlama score maps as independent, empty two-level mappings
# (a missing outer key yields a fresh dict). Both names are reassigned further
# down once the score files have been loaded.
rankllamascore, rankllamascore_affine = (defaultdict(dict) for _ in range(2))

def load_jsonl(file_path):
    """Load a gzip-compressed pickle of scores into a nested ``str -> str -> float`` map.

    Despite its name, this function does not read JSON Lines: it opens
    ``file_path`` with gzip and unpickles a single object, which must be a
    mapping of ``qid -> {did: score}``.

    Query ids and document ids are normalised to ``str`` and scores to
    ``float``. If several raw entries collapse onto the same ``(qid, did)``
    pair after normalisation (e.g. ``1`` and ``"1"``), the stored value is the
    arithmetic mean of all of them.

    Args:
        file_path: Path to the ``.pkl.gz`` file.

    Returns:
        A ``defaultdict(dict)`` mapping ``str(qid)`` to ``{str(did): float}``.
    """
    hard_negatives_scores = defaultdict(dict)
    # How many raw scores have been folded into a (qid, did) entry so far.
    # Only populated for pairs that actually occur more than once.
    merged_counts = {}

    with gzip.open(file_path, "rb") as f:
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        data = pickle.load(f)

    for qid, scores in tqdm.tqdm(data.items(), desc="Processing CE scores"):
        qid_key = str(qid)
        for did, score in scores.items():
            did_key = str(did)
            value = float(score)
            doc_scores = hard_negatives_scores[qid_key]
            if did_key in doc_scores:
                # Incremental mean update: every duplicate carries equal weight.
                count = merged_counts.get((qid_key, did_key), 1) + 1
                doc_scores[did_key] += (value - doc_scores[did_key]) / count
                merged_counts[(qid_key, did_key)] = count
            else:
                doc_scores[did_key] = value
    return hard_negatives_scores

# Directory holding the pickled score files. Defaults to the original cluster
# location; set HARD_NEGATIVES_SCORES_DIR to run the notebook somewhere else.
SCORES_DIR = os.environ.get(
    "HARD_NEGATIVES_SCORES_DIR",
    "/ivi/ilps/personal/jqiao/lsr_eval/data/msmarco/hard_negatives_scores",
)

MiniLM_scores = load_jsonl(os.path.join(SCORES_DIR, "cross-encoder-ms-marco-MiniLM-L-6-v2-scores.pkl.gz"))
rankllamascore = load_jsonl(os.path.join(SCORES_DIR, "rankllama-13b-ms-marco-scores.pkl.gz"))
rankllamascore_affine = load_jsonl(os.path.join(SCORES_DIR, "rankllama-13b-ms-marco-scores-corpus-affine.pkl.gz"))


def _flatten_scores(score_map):
    """Return every score in a ``qid -> {did: score}`` map as one flat list."""
    return [score for doc_scores in tqdm.tqdm(score_map.values()) for score in doc_scores.values()]


# Flat lists of all scores per model; these are what the histograms are drawn from.
MiniLM_scores_list = _flatten_scores(MiniLM_scores)
rankllamascore_list = _flatten_scores(rankllamascore)
rankllamascore_affine_list = _flatten_scores(rankllamascore_affine)

# Overlay the density histograms (with KDE curves) of the three score lists on
# a single set of axes. Each series gets its own colour so the overlaid
# distributions and their legend entries can be told apart.
fig, ax = plt.subplots(figsize=(12, 8))
for series, series_color, series_label in (
    (MiniLM_scores_list, "blue", "MiniLM Scores"),
    (rankllamascore_list, "orange", "RankLlama Scores"),
    (rankllamascore_affine_list, "green", "RankLlama affine Scores"),
):
    sns.histplot(series, color=series_color, label=series_label, kde=True, stat="density", bins=30, ax=ax)

ax.set_title('Comparison of Score Distributions')
ax.set_xlabel('Score')
ax.set_ylabel('Density')
ax.legend()
# Save before showing so the file is written from the fully drawn figure.
fig.savefig('score_distribution_plot.png')
plt.show()

Processing CE scores: 100%|██████████| 808731/808731 [01:54<00:00, 7047.73it/s]
Processing CE scores: 100%|██████████| 808731/808731 [01:12<00:00, 11084.34it/s]
Processing CE scores: 100%|██████████| 808731/808731 [01:15<00:00, 10754.21it/s]
100%|██████████| 808731/808731 [00:11<00:00, 70335.87it/s]
100%|██████████| 808731/808731 [00:07<00:00, 103158.60it/s]
100%|██████████| 808731/808731 [00:07<00:00, 108293.66it/s]


In [None]:
# Redraw the score-distribution figure from the score lists built in the
# previous cell (no reloading of the score files). Each series gets its own
# colour so the overlaid histograms and legend entries are distinguishable.
fig, ax = plt.subplots(figsize=(12, 8))
for series, series_color, series_label in (
    (MiniLM_scores_list, "blue", "MiniLM Scores"),
    (rankllamascore_list, "orange", "RankLlama Scores"),
    (rankllamascore_affine_list, "green", "RankLlama affine Scores"),
):
    sns.histplot(series, color=series_color, label=series_label, kde=True, stat="density", bins=30, ax=ax)

ax.set_title('Comparison of Score Distributions')
ax.set_xlabel('Score')
ax.set_ylabel('Density')
ax.legend()
# Writes to the same file name as the previous cell, replacing that image.
fig.savefig('score_distribution_plot.png')
plt.show()