Judged by qwen2.5

In [1]:
# Prompt template for the LLM judge; it is filled in with str.format().
# Placeholders: {question} (the Q/A pair), {short_memory}/{short_score},
# {base_dialog}/{base_score} and {base_paragraph}/{base_p_score} (three already-scored
# reference systems A-C), and {long_memory}/{long_recall_memory} (systems D and E, the
# two memories to be scored). The doubled braces {{ }} are str.format escapes that
# produce the literal braces of the JSON answer skeleton.
evaluation_prompt = """You will act as a reviewer to score the following returned memory based on their relevance, conciseness and readability.
The fewer and more relevant the key messages are, the higher the score will be.
Here is the question and answer:{question}

The score range is from 1 to 10, with 1 being the worst and 10 being the best, ranked in order of importance:

1. Relevance of the returned information and its ability to support generating similar responses.
2. Conciseness— the more concise and noise-free the returned content, the better.
3. Readability for humans, maintaining chronological order to preserve memory continuity.

Please refer to the following three examples for guidance:
A system:{short_memory}, score:{short_score}
B system:{base_dialog}, score:{base_score}
C system:{base_paragraph}, score:{base_p_score}

Next, please score the returned memory for the following systems:

D system:{long_memory}

E system:{long_recall_memory}

Please provide the scores in the following format:
```json
{{
    "D":"1~10",
    "E":"1~10"
}}
```"""

In [2]:
import requests
import json
import re

def llm_create(prompt, model="qwen2.5:32b-instruct-q4_K_M", timeout=None):
    """Send ``prompt`` to the local generation endpoint and return the generated text.

    Args:
        prompt: Full prompt text to submit.
        model: Model tag placed in the request body. Defaults to the judge model
            this notebook was written for.
        timeout: Seconds passed to ``requests.post``. ``None`` (the default) waits
            indefinitely, which matches the previous behaviour.

    Returns:
        The value of the ``'response'`` field of the JSON reply.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # NOTE(review): port 11434 and the /api/generate path look like an Ollama server - confirm.
    url = "http://localhost:11434/api/generate"
    data = {
        "model": model,
        "prompt": prompt,
        # Non-streaming: the whole completion arrives as a single JSON document.
        "stream": False,
    }

    response = requests.post(url, json=data, timeout=timeout)
    # Surface HTTP failures as HTTPError instead of an opaque KeyError: 'response'
    # when the error payload lacks the expected field.
    response.raise_for_status()
    return response.json()['response']

def llm_response_handler(response:str):
    """Extract a JSON value from an LLM reply, tolerating Markdown code fences.

    Parsing is attempted in this order:
      1. the reply as-is,
      2. the contents of the first ```json ... ``` fenced block,
      3. the contents of the first ``` ... ``` fenced block of any kind.

    Args:
        response: Raw text returned by the model.

    Returns:
        The parsed JSON value if any attempt succeeds; otherwise the
        whitespace-stripped reply text unchanged.
    """
    # Only parsing failures are handled. A bare ``except`` would also swallow
    # KeyboardInterrupt/SystemExit, making a long scoring loop impossible to stop.
    try:
        return json.loads(response)
    except (ValueError, TypeError):
        pass

    response = response.strip()
    for fence_pattern in (r"```json(.*?)```", r"```(.*?)```"):
        try:
            fenced = re.search(fence_pattern, response, re.DOTALL)
            if fenced is not None:
                return json.loads(fenced.group(1).strip())
        except (ValueError, TypeError):
            # Fenced content was not valid JSON; try the next, looser pattern.
            continue
    return response
            
def get_res(question, short_searched, long_searched, long_recall_searched, base_dialog_searched, base_paragraph_searched, short_s, base_s, base_p_s):
    """Ask the judge model to score the two candidate memories for one question.

    Args:
        question: The question/answer pair shown to the judge.
        short_searched, base_dialog_searched, base_paragraph_searched: Reference
            memories inserted into the prompt as example systems A, B and C.
        short_s, base_s, base_p_s: The scores shown next to systems A, B and C.
        long_searched: Memory to be scored as system D.
        long_recall_searched: Memory to be scored as system E.

    Returns:
        A ``(D, E)`` tuple with the values the judge gave under the ``"D"`` and
        ``"E"`` keys. An entry is ``None`` when its key is missing, and both are
        ``None`` when the reply could not be parsed into a JSON object.
    """
    prompt = evaluation_prompt.format(
        question=question,
        short_memory=short_searched,
        long_memory=long_searched,
        long_recall_memory=long_recall_searched,
        base_dialog=base_dialog_searched,
        base_paragraph=base_paragraph_searched,
        short_score=short_s,
        base_score=base_s,
        base_p_score=base_p_s,
    )
    res_dict = llm_response_handler(llm_create(prompt))
    if not isinstance(res_dict, dict):
        # The handler returns the raw text when no JSON could be extracted (and
        # JSON itself may be a list or scalar); calling .get on those would raise
        # AttributeError and abort the whole scoring loop.
        return None, None
    return res_dict.get('D'), res_dict.get('E')

gpt-4o-mini

In [4]:
import pandas as pd
# Every input is a JSON Lines file (one record per line, hence lines=True).
#   gpt_df      - supplies 'long_mem_result' / 'long_mem_recall_result' (the memories to score)
#   question_df - supplies 'question' / 'answer'
#   base_df     - supplies 'short_mem_result' / 'base_dialog' / 'base_paragraph' (reference memories)
#   base_score  - supplies the scores shown alongside the reference memories
gpt_df, question_df, base_df, base_score = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        "eval_result_gpt-4o-mini.json",
        'questions_0205.json',
        "eval_result_base.json",
        'eval_result_base_score.json',
    )
)

In [12]:
# Score rows 0-499 of gpt_df with the judge: one get_res() call per row.
for row in range(500):
    print(f'---process {row+1}---')

    judge_inputs = dict(
        # The question/answer pair the retrieved memories are judged against.
        question=f"user:{question_df.loc[row, 'question']}, assistant:{question_df.loc[row, 'answer']}",
        # Candidate memories (systems D and E in the prompt).
        long_searched=gpt_df.loc[row, 'long_mem_result'],
        long_recall_searched=gpt_df.loc[row, 'long_mem_recall_result'],
        # Reference memories (systems A-C in the prompt) ...
        short_searched=base_df.loc[row, 'short_mem_result'],
        base_dialog_searched=base_df.loc[row, 'base_dialog'],
        base_paragraph_searched=base_df.loc[row, 'base_paragraph'],
        # ... and the scores displayed next to them.
        short_s=base_score.loc[row, 'short_score'],
        base_s=base_score.loc[row, 'base_dialog_score'],
        base_p_s=base_score.loc[row, 'base_paragraph_score'],
    )

    long_score, long_recall_score = get_res(**judge_inputs)
    gpt_df.loc[row, 'long_score_by_qwen'] = long_score
    gpt_df.loc[row, 'long_recall_score_by_qwen'] = long_recall_score

---process 77---


In [13]:
# Average judge score per column over the first `number` rows of gpt_df.
number = 500
observe_df = gpt_df[:number]
for score_column in ('long_score_by_qwen', 'long_recall_score_by_qwen'):
    # Scores are cast to int before summing; dividing by `number` (not the row
    # count) means the slice is expected to hold exactly `number` rows.
    print(observe_df[score_column].astype(int).sum() / number)

7.214
7.562


qwen2.5

In [14]:
import pandas as pd
# Every input is a JSON Lines file (one record per line, hence lines=True).
#   qwen_df     - supplies 'long_mem_result' / 'long_mem_recall_result' (the memories to score)
#   question_df - supplies 'question' / 'answer'
#   base_df     - supplies 'short_mem_result' / 'base_dialog' / 'base_paragraph' (reference memories)
#   base_score  - supplies the scores shown alongside the reference memories
qwen_df, question_df, base_df, base_score = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        'eval_result_qwen.json',
        'questions_0205.json',
        "eval_result_base.json",
        'eval_result_base_score.json',
    )
)

In [21]:
# Score rows 0-499 of qwen_df with the judge: one get_res() call per row.
for row in range(500):
    print(f'---process {row+1}---')

    judge_inputs = dict(
        # The question/answer pair the retrieved memories are judged against.
        question=f"user:{question_df.loc[row, 'question']}, assistant:{question_df.loc[row, 'answer']}",
        # Candidate memories (systems D and E in the prompt).
        long_searched=qwen_df.loc[row, 'long_mem_result'],
        long_recall_searched=qwen_df.loc[row, 'long_mem_recall_result'],
        # Reference memories (systems A-C in the prompt) ...
        short_searched=base_df.loc[row, 'short_mem_result'],
        base_dialog_searched=base_df.loc[row, 'base_dialog'],
        base_paragraph_searched=base_df.loc[row, 'base_paragraph'],
        # ... and the scores displayed next to them.
        short_s=base_score.loc[row, 'short_score'],
        base_s=base_score.loc[row, 'base_dialog_score'],
        base_p_s=base_score.loc[row, 'base_paragraph_score'],
    )

    long_score, long_recall_score = get_res(**judge_inputs)
    qwen_df.loc[row, 'long_score_by_qwen'] = long_score
    qwen_df.loc[row, 'long_recall_score_by_qwen'] = long_recall_score

---process 35---
---process 184---
---process 238---


In [22]:
# Average judge score per column over the first `number` rows of qwen_df.
number = 500
observe_df = qwen_df[:number]
for score_column in ('long_score_by_qwen', 'long_recall_score_by_qwen'):
    # Scores are cast to int before summing; dividing by `number` (not the row
    # count) means the slice is expected to hold exactly `number` rows.
    print(observe_df[score_column].astype(int).sum() / number)

7.092
7.75


gemma2

In [None]:
import pandas as pd
# Every input is a JSON Lines file (one record per line, hence lines=True).
#   gemma_df    - supplies 'long_mem_result' / 'long_mem_recall_result' (the memories to score)
#   question_df - supplies 'question' / 'answer'
#   base_df     - supplies 'short_mem_result' / 'base_dialog' / 'base_paragraph' (reference memories)
#   base_score  - supplies the scores shown alongside the reference memories
gemma_df, question_df, base_df, base_score = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        'eval_result_gemma.json',
        'questions_0205.json',
        "eval_result_base.json",
        'eval_result_base_score.json',
    )
)

In [6]:
# Score rows 0-499 of gemma_df with the judge: one get_res() call per row.
#
# This cell used to start at a hard-coded row 307, which only produces a complete
# result when rows 0-306 were already scored earlier in the same kernel session.
# Walking the full range and skipping rows that already hold both scores keeps the
# "resume after an interruption" behaviour while making the cell correct on a
# fresh kernel (Restart & Run All) and safe to re-run.
score_columns = ('long_score_by_qwen', 'long_recall_score_by_qwen')
for row in range(500):
    already_scored = all(
        column in gemma_df.columns and pd.notna(gemma_df.loc[row, column])
        for column in score_columns
    )
    if already_scored:
        continue

    print(f'---process {row+1}---')

    # The question/answer pair the retrieved memories are judged against.
    qa_pair = f"user:{question_df['question'][row]}, assistant:{question_df['answer'][row]}"

    # Candidate memories (systems D and E in the prompt).
    long_searched = gemma_df.loc[row, 'long_mem_result']
    long_recall_searched = gemma_df.loc[row, 'long_mem_recall_result']

    # Reference memories (systems A-C in the prompt) ...
    short_searched = base_df.loc[row, 'short_mem_result']
    base_dialog_searched = base_df.loc[row, 'base_dialog']
    base_paragraph_searched = base_df.loc[row, 'base_paragraph']

    # ... and the scores displayed next to them.
    short_s = base_score.loc[row, 'short_score']
    base_s = base_score.loc[row, 'base_dialog_score']
    base_p_s = base_score.loc[row, 'base_paragraph_score']

    long_score, long_recall_score = get_res(qa_pair, short_searched, long_searched, long_recall_searched, base_dialog_searched, base_paragraph_searched, short_s, base_s, base_p_s)
    gemma_df.loc[row, 'long_score_by_qwen'] = long_score
    gemma_df.loc[row, 'long_recall_score_by_qwen'] = long_recall_score

---process 308---
---process 309---
---process 310---
---process 311---
---process 312---
---process 313---
---process 314---
---process 315---
---process 316---
---process 317---
---process 318---
---process 319---
---process 320---
---process 321---
---process 322---
---process 323---
---process 324---
---process 325---
---process 326---
---process 327---
---process 328---
---process 329---
---process 330---
---process 331---
---process 332---
---process 333---
---process 334---
---process 335---
---process 336---
---process 337---
---process 338---
---process 339---
---process 340---
---process 341---
---process 342---
---process 343---
---process 344---
---process 345---
---process 346---
---process 347---
---process 348---
---process 349---
---process 350---
---process 351---
---process 352---
---process 353---
---process 354---
---process 355---
---process 356---
---process 357---
---process 358---
---process 359---
---process 360---
---process 361---
---process 362---
---process

In [7]:
# Average judge score per column over the first `number` rows of gemma_df.
number = 500
observe_df = gemma_df[:number]
for score_column in ('long_score_by_qwen', 'long_recall_score_by_qwen'):
    # Scores are cast to int before summing; dividing by `number` (not the row
    # count) means the slice is expected to hold exactly `number` rows.
    print(observe_df[score_column].astype(int).sum() / number)

6.888
6.324


llama3.3

In [26]:
import pandas as pd
# Every input is a JSON Lines file (one record per line, hence lines=True).
#   llama_3_3_df - supplies 'long_mem_result' / 'long_mem_recall_result' (the memories to score)
#   question_df  - supplies 'question' / 'answer'
#   base_df      - supplies 'short_mem_result' / 'base_dialog' / 'base_paragraph' (reference memories)
#   base_score   - supplies the scores shown alongside the reference memories
llama_3_3_df, question_df, base_df, base_score = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        'eval_result_llama3_3.json',
        'questions_0205.json',
        "eval_result_base.json",
        'eval_result_base_score.json',
    )
)

In [30]:
# Score rows 0-499 of llama_3_3_df with the judge: one get_res() call per row.
for row in range(500):
    print(f'---process {row+1}---')

    judge_inputs = dict(
        # The question/answer pair the retrieved memories are judged against.
        question=f"user:{question_df.loc[row, 'question']}, assistant:{question_df.loc[row, 'answer']}",
        # Candidate memories (systems D and E in the prompt).
        long_searched=llama_3_3_df.loc[row, 'long_mem_result'],
        long_recall_searched=llama_3_3_df.loc[row, 'long_mem_recall_result'],
        # Reference memories (systems A-C in the prompt) ...
        short_searched=base_df.loc[row, 'short_mem_result'],
        base_dialog_searched=base_df.loc[row, 'base_dialog'],
        base_paragraph_searched=base_df.loc[row, 'base_paragraph'],
        # ... and the scores displayed next to them.
        short_s=base_score.loc[row, 'short_score'],
        base_s=base_score.loc[row, 'base_dialog_score'],
        base_p_s=base_score.loc[row, 'base_paragraph_score'],
    )

    long_score, long_recall_score = get_res(**judge_inputs)
    llama_3_3_df.loc[row, 'long_score_by_qwen'] = long_score
    llama_3_3_df.loc[row, 'long_recall_score_by_qwen'] = long_recall_score

---process 75---


In [31]:
# Average judge score per column over the first `number` rows of llama_3_3_df.
number = 500
observe_df = llama_3_3_df[:number]
for score_column in ('long_score_by_qwen', 'long_recall_score_by_qwen'):
    # Scores are cast to int before summing; dividing by `number` (not the row
    # count) means the slice is expected to hold exactly `number` rows.
    print(observe_df[score_column].astype(int).sum() / number)

6.576
7.382


Deepseek-v3

In [4]:
import pandas as pd
# Every input is a JSON Lines file (one record per line, hence lines=True).
#   deepseek_df - supplies 'long_mem_result' / 'long_mem_recall_result' (the memories to score)
#   question_df - supplies 'question' / 'answer'
#   base_df     - supplies 'short_mem_result' / 'base_dialog' / 'base_paragraph' (reference memories)
#   base_score  - supplies the scores shown alongside the reference memories
deepseek_df, question_df, base_df, base_score = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        'eval_result_deepseek_v3.json',
        'questions_0205.json',
        "eval_result_base.json",
        'eval_result_base_score.json',
    )
)

In [8]:
# Score rows 0-499 of deepseek_df with the judge: one get_res() call per row.
# NOTE(review): the result columns are named '*_by_llama', while the other sections
# of this notebook store their scores under '*_by_qwen' - confirm which judge
# produced these scores before renaming (the summary cell reads the same names).
for row in range(500):
    print(f'---process {row+1}---')

    judge_inputs = dict(
        # The question/answer pair the retrieved memories are judged against.
        question=f"user:{question_df.loc[row, 'question']}, assistant:{question_df.loc[row, 'answer']}",
        # Candidate memories (systems D and E in the prompt).
        long_searched=deepseek_df.loc[row, 'long_mem_result'],
        long_recall_searched=deepseek_df.loc[row, 'long_mem_recall_result'],
        # Reference memories (systems A-C in the prompt) ...
        short_searched=base_df.loc[row, 'short_mem_result'],
        base_dialog_searched=base_df.loc[row, 'base_dialog'],
        base_paragraph_searched=base_df.loc[row, 'base_paragraph'],
        # ... and the scores displayed next to them.
        short_s=base_score.loc[row, 'short_score'],
        base_s=base_score.loc[row, 'base_dialog_score'],
        base_p_s=base_score.loc[row, 'base_paragraph_score'],
    )

    long_score, long_recall_score = get_res(**judge_inputs)
    deepseek_df.loc[row, 'long_score_by_llama'] = long_score
    deepseek_df.loc[row, 'long_recall_score_by_llama'] = long_recall_score

---process 70---
---process 319---


In [10]:
# Average judge score per column over the first `number` rows of deepseek_df.
# NOTE(review): these columns carry a '_by_llama' suffix, unlike the '_by_qwen'
# columns used in the other sections - confirm the naming is intended.
number = 500
observe_df = deepseek_df[:number]
for score_column in ('long_score_by_llama', 'long_recall_score_by_llama'):
    # Scores are cast to int before summing; dividing by `number` (not the row
    # count) means the slice is expected to hold exactly `number` rows.
    print(observe_df[score_column].astype(int).sum() / number)

7.128
8.066
