In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity



In [None]:


class CFG:
    """Static settings: input CSV paths, cutoff value and embedding/reranker model locations."""

    # Directory holding the CSV inputs; every file path below is derived from it.
    input_path = "/content/drive/MyDrive/RetrieverNLP/resource/input/"
    train_path = input_path + "train.csv"
    test_path = input_path + "test.csv"
    misc_path = input_path + "misconception_mapping.csv"
    samp_path = input_path + "sample_submission.csv"

    # Values tried in earlier runs: 50 (V1), 100 (V2), 50 (v3); 150 is the active one.
    max_cutoff = 150

    # True selects the online model identifiers, False the offline /kaggle/input copies.
    is_train = False
    embd_name = (
        "BAAI/bge-large-en-v1.5"  # online
        if is_train
        else "/kaggle/input/bge-large-en-v1-5/bge-large-en-v1.5"  # offline
    )
    rerank_na = (
        "BAAI/bge-reranker-large"  # online
        if is_train
        else "/kaggle/input/bge-reranker-large"  # offline
    )

    with_fineture_reranker = True
    # Alternative checkpoint kept for reference: "/kaggle/input/bge-reranker-ft-v3"
    reranker_fineture_path = "/kaggle/input/bge-reranker-ft-v2"


cfg = CFG()


# Read the training questions and the misconception lookup table from the
# CSV locations held in the configuration object.
train = pd.read_csv(filepath_or_buffer=cfg.train_path)
misconception_mapping = pd.read_csv(filepath_or_buffer=cfg.misc_path)



def make_all_question_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``all_question_text`` column joining construct name and question text.

    The column is written onto ``df`` itself (the input is modified in place)
    and the same frame is returned for convenience.

    Args:
        df: Frame with ``ConstructName`` and ``QuestionText`` columns.

    Returns:
        The input frame, now carrying ``all_question_text``.
    """
    construct_names = df["ConstructName"]
    question_texts = df["QuestionText"]
    # A single space separates the two parts.
    df["all_question_text"] = construct_names.add(" ").add(question_texts)
    return df
# Attach the combined construct + question text column to the training frame.
train = make_all_question_text(train)


import pandas as pd

def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape a one-row-per-question frame into one row per answer option.

    Args:
        df: Wide frame with ``QuestionId``, ``all_question_text``,
            ``CorrectAnswer``, the four ``Answer{A-D}Text`` columns and the
            four ``Misconception{A-D}Id`` columns.

    Returns:
        A long frame with columns ``QuestionId``, ``all_question_text``,
        ``CorrectAnswer``, ``Answer`` (the option letter), ``AnswerText`` and
        ``MisconceptionId``.
    """
    option_letters = ("A", "B", "C", "D")

    # Step 1: stack the answer-text columns, one row per (question, option).
    answers = df.melt(
        id_vars=["QuestionId", "all_question_text", "CorrectAnswer"],
        value_vars=[f"Answer{letter}Text" for letter in option_letters],
        var_name="Answer",
        value_name="AnswerText",
    )
    # Keep only the option letter, e.g. "AnswerAText" -> "A".
    answers["Answer"] = answers["Answer"].str.slice(len("Answer"), -len("Text"))

    # Step 2: stack the misconception-id columns the same way.
    misconceptions = df.melt(
        id_vars=["QuestionId"],
        value_vars=[f"Misconception{letter}Id" for letter in option_letters],
        var_name="Answer",
        value_name="MisconceptionId",
    )
    # Keep only the option letter, e.g. "MisconceptionAId" -> "A".
    misconceptions["Answer"] = misconceptions["Answer"].str.slice(
        len("Misconception"), -len("Id")
    )

    # Step 3: attach each option's misconception id to its answer text.
    return answers.merge(misconceptions, on=["QuestionId", "Answer"], how="left")

# Reshape the training frame to one row per (question, answer option).
train_long = wide_to_long(train)




import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

# ======================================================
# Configuration (adjust as needed)
# ======================================================
# Per-model settings read by the evaluation code:
#   model_name        - checkpoint identifier passed to the model loader
#   query_instruction - prefix prepended to query texts ("" disables it)
#   normalize         - L2-normalise embeddings before inner-product search
#   trust_remote_code - whether the checkpoint ships custom modelling code
MODELS_TO_TEST = {
    "NV-Embed-v2": {
        "model_name": "nvidia/NV-Embed-v2",
        "query_instruction": "",  # no instruction prefix is applied for NV-Embed
        "normalize": True,
        "trust_remote_code": True,
    },
    "BGE-large-en": {
        "model_name": "BAAI/bge-large-en-v1.5",
        "query_instruction": "Represent this sentence for searching relevant passages: ",
        "normalize": True,
        "trust_remote_code": False,
    },
    "BGE-M3": {
        "model_name": "BAAI/bge-m3",
        "query_instruction": "",
        # The model is loaded with plain AutoModel and the raw CLS hidden state
        # is used, so nothing normalises the vectors for us. Normalise
        # explicitly so inner-product ranking equals cosine ranking; this is a
        # no-op if the vectors are already unit length.
        "normalize": True,
        "trust_remote_code": False,
    },
}

# Run on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Number of texts encoded per forward pass.
PER_GPU_BATCH_SIZE = 32

# ======================================================
# Core evaluation function
# ======================================================
def _collect_labelled_queries(train_data):
    """Return parallel lists of query texts and misconception ids.

    ``train_data`` may be a DataFrame or an iterable of mappings with
    ``all_text`` and ``MisconceptionId`` entries. Rows whose
    ``MisconceptionId`` is missing (None/NaN) carry no label and are skipped.
    """
    if isinstance(train_data, pd.DataFrame):
        frame = train_data
        if "all_text" not in frame.columns:
            # The long-format frame built in this file has no "all_text"
            # column. NOTE(review): this assumes the intended query is the
            # question text followed by the answer text - confirm.
            frame = frame.assign(
                all_text=frame["all_question_text"] + " " + frame["AnswerText"]
            )
        frame = frame.dropna(subset=["all_text", "MisconceptionId"])
        # Iterating a DataFrame yields column names, so convert to row dicts.
        records = frame[["all_text", "MisconceptionId"]].to_dict("records")
    else:
        records = train_data

    texts, ids = [], []
    for item in records:
        misconception_id = item["MisconceptionId"]
        if pd.isna(misconception_id):
            continue
        texts.append(item["all_text"])
        ids.append(misconception_id)
    return texts, ids


def _recall_at_k(sent_emb, query_emb, gt_indices, top_k=50):
    """Return the fraction of queries whose target is among the top-k candidates.

    Candidates are ranked by exact inner product, computed with NumPy.
    """
    candidates = np.asarray(sent_emb, dtype=np.float32)
    queries = np.asarray(query_emb, dtype=np.float32)
    targets = np.asarray(gt_indices)
    if queries.shape[0] == 0 or candidates.shape[0] == 0:
        return 0.0

    scores = queries @ candidates.T
    k = min(top_k, candidates.shape[0])
    # argpartition places the k best-scoring candidates (unordered) first,
    # which is all a membership test needs.
    top_indices = np.argpartition(-scores, k - 1, axis=1)[:, :k]
    hits = (top_indices == targets[:, None]).any(axis=1)
    return float(hits.mean())


def evaluate_model(model_config, misconception_mapping, train_data):
    """Evaluate a single embedding model and return its Recall@50.

    Every misconception name is embedded as a candidate, every labelled query
    is embedded, and the function measures how often the true misconception is
    among the 50 candidates with the highest inner product.

    Args:
        model_config: Mapping with ``model_name``, ``query_instruction``,
            ``normalize`` and, optionally, ``trust_remote_code``.
        misconception_mapping: Frame with ``MisconceptionId`` and
            ``MisconceptionName`` columns (the candidate pool).
        train_data: Either an iterable of mappings with ``all_text`` and
            ``MisconceptionId``, or a DataFrame. A DataFrame without an
            ``all_text`` column must have ``all_question_text`` and
            ``AnswerText``. Rows with a missing ``MisconceptionId`` are
            ignored.

    Returns:
        Recall@50 as a float in ``[0, 1]``.

    Raises:
        KeyError: If a labelled row references an id absent from
            ``misconception_mapping``.
        ValueError: If there is nothing to encode (no candidates or no
            labelled queries).
    """
    # Load the model and tokenizer. Only the model is moved to the device:
    # tokenizers have no ``.to()``; the tensors they produce are moved instead.
    model_name = model_config["model_name"]
    if model_name == "nvidia/NV-Embed-v2":
        # NV-Embed-v2 is read from a local snapshot rather than downloaded.
        model_path = "./nv-embed-v2"
        model = AutoModel.from_pretrained(
            model_path,
            trust_remote_code=True,
            local_files_only=True
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            local_files_only=True
        )
    else:
        model = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=model_config.get("trust_remote_code", False)
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    model.eval()

    # Map each misconception id to its row position in the candidate pool.
    misconception_ids = misconception_mapping['MisconceptionId'].values.tolist()
    misconception_texts = misconception_mapping['MisconceptionName'].values.tolist()
    id_to_index = {_id: idx for idx, _id in enumerate(misconception_ids)}

    def encode_texts(texts, is_query=False):
        """Embed ``texts`` in batches and return one row per text."""
        if len(texts) == 0:
            raise ValueError("encode_texts received no texts to encode")

        all_vectors = []
        for i in tqdm(range(0, len(texts), PER_GPU_BATCH_SIZE)):
            batch_texts = texts[i:i+PER_GPU_BATCH_SIZE]

            # Prepend the instruction prefix to queries (used by BGE).
            if is_query and model_config["query_instruction"]:
                batch_texts = [model_config["query_instruction"] + t for t in batch_texts]

            inputs = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt"
            ).to(DEVICE)

            with torch.no_grad():
                outputs = model(**inputs)
                # Use the first ([CLS]) token as the sentence embedding.
                # NOTE(review): this assumes the model output exposes
                # ``last_hidden_state``; confirm that holds for the
                # remote-code NV-Embed-v2 model.
                embeddings = outputs.last_hidden_state[:, 0]

            if model_config["normalize"]:
                embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

            all_vectors.append(embeddings.cpu().numpy())

        return np.concatenate(all_vectors, axis=0)

    # Resolve queries and labels before the expensive encoding so that bad
    # input fails fast.
    query_texts, query_ids = _collect_labelled_queries(train_data)
    if not query_texts:
        raise ValueError("train_data contains no rows with a MisconceptionId")
    ground_truth_indices = [id_to_index[_id] for _id in query_ids]

    # Encode the candidate pool, then the queries.
    sentence_embeddings = encode_texts(misconception_texts)
    query_embeddings = encode_texts(query_texts, is_query=True)

    return _recall_at_k(sentence_embeddings, query_embeddings, ground_truth_indices)


# ======================================================
# Run the evaluation
# ======================================================

    # Load the data (replace with the actual data)
    # misconception_mapping = ... 
    # train_data = ...


# Score every configured model, reporting each recall as soon as it is known.
results = {}
for label, settings in MODELS_TO_TEST.items():
    print(f"\n=== Evaluating {label} ===")
    score = evaluate_model(settings, misconception_mapping, train_long)
    results[label] = score
    print(f"{label} Recall@50: {score*100:.2f}%")

# Print the final comparison, best-scoring model first.
print("\n=== Final Results ===")
for label in sorted(results, key=results.get, reverse=True):
    print(f"{label:<15} | Recall@50: {results[label]*100:.2f}%")
