In [121]:
import pandas as pd
import json
from tqdm import tqdm
import numpy as np
from scipy.stats import kendalltau
from sklearn.metrics import ndcg_score
from openai import OpenAI

In [122]:
# Corpus passages and queries, each re-indexed by its "_id" field.
corpus = pd.read_json("corpus.jsonl", lines=True).set_index("_id")
queries = pd.read_json("queries.jsonl", lines=True).set_index("_id")

# Relevance judgments for the dev split, read as a tab-separated table.
qrels = pd.read_csv("dev.tsv", sep="\t")
qrels

Unnamed: 0,query-id,corpus-id,score
0,0,620,2
1,0,621,2
2,0,622,2
3,0,616,1
4,0,617,1
...,...,...,...
4993,557,188393,1
4994,557,188394,1
4995,558,188507,2
4996,558,188508,2


In [123]:
# Retrieval scores loaded as a matrix. The preview below suggests rows are
# corpus ids and columns are query ids, with NaN where no score was stored.
e5_results = pd.read_json("results_wikifacts-window_3_v0_dev.json")
e5_results.head()

Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,549,550,551,552,553,554,555,556,557,558
599,83.478034,,,,,,,,,,...,,,,,,,,,,
91397,83.293903,,,,,,,,,,...,,,,,,,,,,
140938,83.293903,,,,,,,,,,...,,,,,,,,,,
9725,83.293903,,,,,,,,,,...,,,,,,,,,,
598,83.043307,,,,,,,,,,...,,,,,,,,,,


In [124]:
def get_top_k_nlargest(df, k=20):
    """
    Collect the k highest-scoring rows for every column of a score matrix.

    Each column label of ``df`` is emitted as a "query-id" and each index
    label as a "corpus-id". Missing scores (NaN) are never returned, so a
    column with fewer than ``k`` non-null scores contributes fewer rows.

    Args:
        df: DataFrame of numeric scores (columns = queries, index = documents).
        k: maximum number of rows kept per column.

    Returns:
        DataFrame with columns "query-id", "corpus-id" and "e5-score".
        Rows are grouped by column in ``df.columns`` order and sorted by
        descending score within each group.
    """
    results = {
        "query-id": [],
        "corpus-id": [],
        "e5-score": [],
    }

    for query_id in df.columns:
        # Drop NaN explicitly: depending on the pandas version, nlargest may
        # pad its result with NaN entries when fewer than k scores exist.
        top_k = df[query_id].dropna().nlargest(k)
        results["query-id"].extend([query_id] * len(top_k))
        results["corpus-id"].extend(top_k.index.tolist())
        results["e5-score"].extend(top_k.tolist())
    return pd.DataFrame(results)

# Flatten the score matrix into one row per (query, document), using the default k=20.
e5_df = get_top_k_nlargest(e5_results)

In [125]:
# NOTE(review): this is not the last expression in the cell, so its result is not displayed.
e5_df.head(20)
def merge_scores(e5_df, qrels):
  """Attach the "score" column from qrels to the rows of e5_df.

  The frames are left-joined on ("query-id", "corpus-id"), so every row of
  e5_df is kept. Rows with no match in qrels get score 0, and the "score"
  column is cast to int.
  """
  joined = e5_df.merge(qrels, how='left', on=['query-id', 'corpus-id'])

  # Unmatched pairs come out of the join as NaN; treat them as score 0.
  labels = joined['score'].fillna(0)
  joined['score'] = labels.astype(int)

  return joined
# Label each retrieved pair with its score from qrels (0 when the pair is not in qrels).
top_qrels = merge_scores(e5_df, qrels)

In [126]:
# Rows of qrels for query 0.
qrels[qrels['query-id'] == 0]

Unnamed: 0,query-id,corpus-id,score
0,0,620,2
1,0,621,2
2,0,622,2
3,0,616,1
4,0,617,1
5,0,618,1
6,0,597,1
7,0,598,1
8,0,599,1


In [127]:
# Top-scoring retrieved documents for query 0.
e5_df[(e5_df['query-id'] == 0)]

Unnamed: 0,query-id,corpus-id,e5-score
0,0,599,83.478034
1,0,91397,83.293903
2,0,140938,83.293903
3,0,9725,83.293903
4,0,598,83.043307
5,0,9568,82.797515
6,0,140781,82.797515
7,0,91240,82.797515
8,0,616,82.767916
9,0,618,82.687384


In [128]:
# Retrieved documents for query 0 together with their merged qrels score.
top_qrels[(top_qrels['query-id'] == 0)]

Unnamed: 0,query-id,corpus-id,e5-score,score
0,0,599,83.478034,1
1,0,91397,83.293903,0
2,0,140938,83.293903,0
3,0,9725,83.293903,0
4,0,598,83.043307,1
5,0,9568,82.797515,0
6,0,140781,82.797515,0
7,0,91240,82.797515,0
8,0,616,82.767916,1
9,0,618,82.687384,1


In [129]:
# Tab-separated table; the preview below shows llm-score and human-score
# per (query-id, corpus-id) pair.
deepseek = pd.read_csv("deepseek_bing.tsv", sep="\t")
deepseek[deepseek['query-id'] == 0]

Unnamed: 0,llm-score,human-score,query-id,corpus-id
0,2,2,0,620
1,2,2,0,621
2,2,2,0,622
3,2,1,0,616
4,2,1,0,617
5,2,1,0,618
6,2,1,0,597
7,2,1,0,598
8,2,1,0,599


In [133]:

def merge_scores_llm(e5_df, llm):
  """Left-join the columns of llm onto e5_df, then discard "human-score".

  The join key is ("query-id", "corpus-id"). Every row of e5_df is kept;
  rows with no match in llm hold NaN in the columns that come from llm.
  """
  joined = e5_df.merge(llm, how='left', on=['query-id', 'corpus-id'])
  return joined.drop(columns="human-score")
# Add llm-score to the labelled retrieval results; pairs absent from the
# deepseek table keep NaN in that column.
ftop_qrels = merge_scores_llm(top_qrels, deepseek)
print(len(ftop_qrels))
ftop_qrels[ftop_qrels['query-id'] == 0]

10680


Unnamed: 0,query-id,corpus-id,e5-score,score,llm-score
0,0,599,83.478034,1,2.0
1,0,91397,83.293903,0,
2,0,140938,83.293903,0,
3,0,9725,83.293903,0,
4,0,598,83.043307,1,2.0
5,0,9568,82.797515,0,
6,0,140781,82.797515,0,
7,0,91240,82.797515,0,
8,0,616,82.767916,1,2.0
9,0,618,82.687384,1,2.0


In [131]:
# Print the text of query 0 next to passage 91397, which ranks second for
# that query but has score 0 after the qrels merge.
print(queries.loc[0]["text"])
print(corpus.loc[91397]["text"])

Дядя Булгакова так и не узнал, что является прототипом профессора Преображенского.
Пётр Иванович Булгаков — дядя, священник, автор коротких рассказов («Лукавый» и др.). Михаил Михайлович Покровский (1882—1943) — дядя, врач-терапевт. Николай Михайлович Покровский (1878—1942) — дядя, врач-гинеколог, основной прототип профессора Филиппа Филипповича Преображенского из повести «Собачье сердце».


In [132]:
# NOTE(review): this saves top_qrels, which has no llm-score column;
# ftop_qrels is never written — confirm that is intended.
top_qrels.to_csv('top_qrels.tsv', sep='\t', index=False)