In [1]:
import re
import numpy as np
from typing import List
from concurrent.futures import ThreadPoolExecutor
from sentence_transformers import SentenceTransformer
from FlagEmbedding import BGEM3FlagModel
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
model_name_or_path = '/root/autodl-tmp/BAAI/bge-m3'
model = BGEM3FlagModel(model_name_or_path, use_fp16=False)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from rouge_score import rouge_scorer
from sklearn.metrics import accuracy_score, f1_score
from rouge_chinese import Rouge
import jieba
import string

bert_en_path='/root/autodl-tmp/google-bert/bert-base-uncased'

In [3]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# 提前加载模型和 tokenizer
bert_tokenizer_en = AutoTokenizer.from_pretrained(bert_en_path)
bert_model_en = AutoModel.from_pretrained(bert_en_path).eval()

def get_token_embeddings(sentence, model, tokenizer):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state.squeeze(0)
    tokens = tokenizer.tokenize(sentence)
    return hidden_states[1:1+len(tokens)], tokens  # 去除 [CLS]

def cosine_similarity_matrix(x, y):
    x_norm = F.normalize(x, dim=1)
    y_norm = F.normalize(y, dim=1)
    return torch.mm(x_norm, y_norm.transpose(0, 1))

def compute_bertscore(cand_emb, ref_emb):
    sim_matrix = cosine_similarity_matrix(cand_emb, ref_emb)
    precision = sim_matrix.max(dim=1)[0].mean()
    recall = sim_matrix.max(dim=0)[0].mean()
    f1 = 2 * precision * recall / (precision + recall + 1e-8)
    return precision.item(), recall.item(), f1.item()

def bertscore(cand, ref, tokenizer,model):
    cand_emb, _ = get_token_embeddings(cand, model, tokenizer)
    ref_emb, _ = get_token_embeddings(ref, model, tokenizer)
    return compute_bertscore(cand_emb, ref_emb)

In [None]:
import re
import torch
from typing import List
from concurrent.futures import ThreadPoolExecutor

class AnswerContentReward():
    def __call__(self, completions: List[str], **kwargs) -> List[float]:
        
        solution_data = kwargs['solution']
        # print(dataset)
        # print(lan)
        # print(solution_data)
        def cosine_similarity(a: torch.Tensor, b: torch.Tensor) -> float:
            a_norm = torch.norm(a)
            b_norm = torch.norm(b)
            if a_norm.item() == 0 or b_norm.item() == 0:
                return 0.0
            return torch.dot(a, b).item() / (a_norm.item() * b_norm.item())
        
        def normalize_answer(s):
            def white_space_fix(text):
                return ' '.join(text.split())
            def remove_punc(text):
                exclude = set(string.punctuation)
                return ''.join(ch for ch in text if ch not in exclude)
            def lower(text):
                return text.lower()
            return white_space_fix(remove_punc(lower(s)))

        def calculate_reward(index: int, completion: str, solution_data: str) -> (int, float):
            # try:
                # answer_match = re.search(r"<answer>(.*?)</answer>", completion, re.DOTALL)
                # if not answer_match:
                #     return index, 0.0
                # answer_text = answer_match.group(1).strip()
                answer_text=completion
                completion_emb = model.encode(answer_text)['dense_vecs']
                solution_emb = model.encode(solution_data)['dense_vecs']
                # 转为float32的tensor
                completion_emb = torch.tensor(completion_emb, dtype=torch.float16)
                solution_emb = torch.tensor(solution_emb, dtype=torch.float16)
                cos_sim = cosine_similarity(completion_emb, solution_emb)
                # print(cos_sim)
                scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
                score = scorer.score(solution_data, answer_text)
                rouge_l = score["rougeL"].fmeasure
                ref = solution_data
                hyp = answer_text
                P, R, F1=bertscore(hyp, ref, bert_tokenizer_en,bert_model_en)
                print(P, R, F1)
                reword=(rouge_l+cos_sim+P+R+F1)/5     
                return index,reword
            # except:    
            #     return index, 0.0
        the_score = [0.0] * len(completions)
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(calculate_reward, index, completion, solution_data): index
                for index, completion in enumerate(completions)
            }
            for future in futures:
                
                index, score = future.result()
                the_score[index] = score
        return the_score
    
one_data = {
    "id": "harvard_test_001",
    "dataset": "mramg",
    "lan": "en",
    "solution": "The patient underwent laser iridotomy in both eyes for glaucoma treatment.",
}
test_cases = [
    [   # Row 1 - 结构完整，各类评分例子
        # 🔵 高分：完全贴合
        "<answer>The patient underwent laser iridotomy in both eyes for glaucoma treatment.</answer>",
        # 🟡 中分：部分相关
        "<answer>The patient had a treatment related to eyes, possibly involving surgery.</answer>",
        # 🔴 低分：完全无关
        "<answer>I went to the park and saw ducks in the pond.</answer>",
        # ❌ 错误格式：没有 answer 标签
        "The patient underwent laser iridotomy in both eyes for glaucoma treatment.",
    ],
    [   # Row 2 - 中文 mramg 数据集
        "<answer>患者双眼接受了激光虹膜切开术，用于治疗青光眼。</answer>",      
        "<answer>患者接受了某种治疗眼睛的手术。</answer>",                          
        "<answer>我喜欢吃火锅和打篮球。</answer>",                                  
        "<answer>标签嵌套错误</answer>",                             
    ],
]
orm = AnswerContentReward()
for test_case in test_cases:
    # test_case是一个小批次的 completions（比如4条小completion）
    result = orm(test_case,**one_data)
    print(result)


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1.0 1.0 1.0
0.5552507638931274 0.601777970790863 0.5775789022445679


RuntimeError: Tensor.item() cannot be called on meta tensors