### Load question

In [2]:
from datetime import datetime, timedelta
import re

def transform_time_intervals(intervals, reference_time=None):
    """Turn relative interval phrases into absolute session timestamps.

    The last session is anchored at ``reference_time``; walking backwards,
    each phrase (``intervals[1:]``) is the gap between a session and the one
    before it. ``intervals[0]`` is ignored because the first session has no
    predecessor.

    Args:
        intervals: Sequence of phrases such as ``"A few days after"`` or
            ``"A couple of weeks after"``.
        reference_time: Optional ``datetime`` used for the last session.
            Defaults to ten minutes before the current wall-clock time; pass
            an explicit value to get reproducible output.

    Returns:
        A list with one ``"%Y-%m-%d %H:%M"`` string per entry in
        ``intervals`` (a single timestamp when ``intervals`` is empty), in
        chronological order.
    """
    def parse_relative_time(text):
        """Return the timedelta described by ``text``, or None if it does not match."""
        match = re.match(r"A (few|couple of) (hours|days|weeks|months|years) after", text)
        if match is None:
            return None
        # "few" is treated as 3 units, "couple of" as 2.
        qty = 3 if match.group(1) == "few" else 2
        unit = match.group(2)
        if unit == "months":
            return timedelta(days=qty * 30)  # Approximate month as 30 days
        if unit == "years":
            return timedelta(days=qty * 365)  # Approximate year as 365 days
        # hours / days / weeks are native timedelta keyword arguments.
        return timedelta(**{unit: qty})

    def generate_timeline(events):
        """Build the chronological list of datetimes, latest session last."""
        if reference_time is None:
            # Pin "now" once so every session is derived from the same instant.
            now = datetime.now() - timedelta(minutes=10)
        else:
            now = reference_time
        timeline = [now]
        for event in reversed(events[1:]):
            delta = parse_relative_time(event)
            # An unrecognised phrase leaves the timestamp unchanged.
            if delta is not None:
                now -= delta
            timeline.append(now)

        return list(reversed(timeline))

    formatted = [dt.strftime("%Y-%m-%d %H:%M") for dt in generate_timeline(intervals)]
    # More than five timestamps is reported (not raised) as a data problem.
    if len(formatted) > 5:
        print(f'error:{intervals}')
    return formatted


In [5]:
import pandas as pd
# Load the question set, derive absolute session timestamps from the relative
# 'time_interval' phrases, and drop the columns this cell does not keep.
question_df = (
    pd.read_json('questions_0205.json', lines=True)
    .assign(time_changed=lambda frame: frame['time_interval'].apply(transform_time_intervals))
    .drop(columns=[
        'check',
        'evidence',
        'explain',
        'answer',
        'question_type',
        'generate_dialogue',
        'relationship',
        'time_interval',
    ])
)

### Evaluation: build and query long-term memory

In [2]:
from long_memory.component import WeaviateLongMemory
# Long-term memory handle used by the processing loop below, created for
# user 'Deepseek_v3' with model "deepseek-chat".
long_mem = WeaviateLongMemory(user='Deepseek_v3', model="deepseek-chat")

Detect existed Deepseek_v3 user group memory space, loading...
Detect existed Deepseek_v3 user child memory space, loading...


In [1]:
import pandas as pd
# Reload previously saved results (JSON Lines) from the same file the processing
# loop writes to -- presumably so a partial run can be resumed; confirm.
question_df = pd.read_json("eval_result_deepseek_v3.json", lines=True)

In [None]:
# Persist the current frame as JSON Lines (one record per line), overwriting the file.
question_df.to_json("eval_result_deepseek_v3.json", orient="records", lines=True)

In [None]:
import math
from datetime import datetime, timedelta

# For each question row: rebuild the long-term memory from the five session
# dialogues, then run retrieval with and without recall and store the
# stringified results on question_df. Results are checkpointed every 50 rows.
MAX_ATTEMPTS = 5  # retries for both the ingestion step and the retrieval step
SESSION_COLUMNS = ['first_session_dialogue', 'second_session_dialogue', 'third_session_dialogue', 'fourth_session_dialogue', 'fifth_session_dialogue']

# The start index 89 is hard-coded; the range covers rows 90..500 (1-based).
for row in range(89,500):
    print(f'---process {row+1}---')

    question = f"user:{question_df['question'][row]}"

    # long mem process
    for _attempt in range(MAX_ATTEMPTS):
        try:
            # Start from an empty memory on every attempt so a half-finished
            # ingestion from a failed attempt is not kept.
            long_mem.del_memory()
            for number, col in enumerate(SESSION_COLUMNS):
                dialogue = question_df[col][row]
                chatlogs = []
                current_time = datetime.strptime(question_df['time_changed'][row][number], "%Y-%m-%d %H:%M")
                # Utterances alternate user/assistant; step through them in pairs.
                for start in range(0, len(dialogue), 2):
                    text = f"user:{dialogue[start]}"
                    # An odd-length dialogue ends with a user turn that has no reply.
                    if start + 1 < len(dialogue):
                        text += f", assistant:{dialogue[start+1]}"
                    chatlogs.append({
                        "text": text,
                        "time": current_time.strftime("%Y-%m-%dT%H:%M:%SZ")
                    })
                    # Space consecutive exchanges one minute apart.
                    current_time = current_time + timedelta(minutes=1)
                long_mem.add_chat_logs(chatlogs)
            break
        except Exception as e:
            print(f"---long memory chatlog classify error:{e}, retry...---")
    else:
        # Retrieval below still runs, as before, but the failure is now visible.
        print(f"---long memory chatlog classify failed after {MAX_ATTEMPTS} attempts (row {row+1})---")

    for _attempt in range(MAX_ATTEMPTS):
        try:
            searched_memory = long_mem.get_memory(question, recall=False, retrieve_number=5)
            question_df.loc[row, 'long_mem_result'] = str(searched_memory)
            # (recall)
            searched_memory = long_mem.get_memory(question, recall=True, retrieve_number=5)
            question_df.loc[row, 'long_mem_recall_result'] = str(searched_memory)
            break
        except Exception as e:
            print(f"---long memory action error:{e}, retry...---")
    else:
        print(f"---long memory action failed after {MAX_ATTEMPTS} attempts (row {row+1})---")

    # Periodic checkpoint so an interrupted run loses at most 50 rows.
    if (row+1)%50==0:
        question_df.to_json("eval_result_deepseek_v3.json", orient="records", lines=True)
question_df.to_json("eval_result_deepseek_v3.json", orient="records", lines=True)

---process 90---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
---process 91---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
---process 92---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs

### Evaluation: LLM-as-judge scoring

In [4]:
import pandas as pd
deepseek_v3_df = pd.read_json('eval_result_deepseek_v3.json', lines=True)
question_df = pd.read_json('questions_0205.json', lines=True)
base_df = pd.read_json("eval_result_base.json", lines=True)
base_score = pd.read_json('eval_result_base_score.json', lines=True)

In [5]:
from dotenv import load_dotenv
from openai import OpenAI
import os

# Load environment variables (python-dotenv) so the API key is not hard-coded here.
load_dotenv()
# OpenAI client used by llm_create; the key is read from OPENAI_API_KEY.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def llm_create(prompt):
    """Send ``prompt`` as a single user message to "gpt-4o" and return the reply text.

    Uses the module-level ``client``; the content of the first returned
    choice is handed back to the caller.
    """
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gpt-4o",
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [6]:
import requests
import json
import re

def llm_response_handler(response:str):
    """Parse an LLM reply as JSON, tolerating Markdown code fences.

    Handles LLM response formatting, especially for the llama family.
    Attempts, in order:
      1. the whole reply as JSON;
      2. the contents of the first ```json ... ``` fence;
      3. the contents of the first ``` ... ``` fence.

    Args:
        response: Raw text returned by the model.

    Returns:
        The decoded JSON value when any attempt succeeds; otherwise the
        whitespace-stripped reply string, so callers must be prepared for
        a ``str``.
    """
    try:
        return json.loads(response)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; fall through to the fenced forms.
        pass

    response = response.strip()
    for pattern in (r"```json(.*?)```", r"```(.*?)```"):
        fenced = re.search(pattern, response, re.DOTALL)
        if fenced is None:
            continue
        try:
            return json.loads(fenced.group(1).strip())
        except ValueError:
            continue
    return response

In [7]:
# Judge prompt template, filled in with str.format. Placeholders: question,
# short_memory/short_score, base_dialog/base_score, base_paragraph/base_p_score
# (reference systems A-C, scores supplied) and long_memory, long_recall_memory
# (systems D and E, to be scored). The doubled braces {{ }} are literal braces
# in the requested JSON reply.
evaluation_prompt = """You will act as a reviewer to score the following returned memory based on their relevance, conciseness and readability.
The fewer and more relevant the key messages are, the higher the score will be.
Here is the question and answer:{question}

The score range is from 1 to 10, with 1 being the worst and 10 being the best, ranked in order of importance:

1. Relevance of the returned information and its ability to support generating similar responses.
2. Conciseness— the more concise and noise-free the returned content, the better.
3. Readability for humans, maintaining chronological order to preserve memory continuity.

Please refer to the following three examples for guidance:
A system:{short_memory}, score:{short_score}
B system:{base_dialog}, score:{base_score}
C system:{base_paragraph}, score:{base_p_score}

Next, please score the returned memory for the following systems:

D system:{long_memory}

E system:{long_recall_memory}

Please provide the scores in the following format:
```json
{{
    "D":"1~10",
    "E":"1~10"
}}
```"""

In [8]:
def get_res(question, short_searched, long_searched, long_recall_searched, base_dialog_searched, base_paragraph_searched, short_s, base_s, base_p_s):
    """Ask the judge LLM to score the long-memory results for one question.

    Fills ``evaluation_prompt`` with the question/answer pair, the three
    reference systems (A-C) with their known scores, and the two systems to
    be scored (D = long memory, E = long memory with recall).

    Returns:
        ``(D, E)`` as found in the judge's JSON reply. Either value is None
        when its key is missing, and both are None when the reply could not
        be parsed into a dict.
    """
    prompt = evaluation_prompt.format(
        question=question,
        short_memory=short_searched,
        long_memory=long_searched,
        long_recall_memory=long_recall_searched,
        base_dialog=base_dialog_searched,
        base_paragraph=base_paragraph_searched,
        short_score=short_s,
        base_score=base_s,
        base_p_score=base_p_s,
    )
    res_dict = llm_response_handler(llm_create(prompt))
    # llm_response_handler returns the raw string when no JSON could be
    # extracted; calling .get on it would abort the whole scoring run.
    if not isinstance(res_dict, dict):
        print(f"---unparseable judge response, scores left empty: {res_dict!r}---")
        return None, None
    return res_dict.get('D'), res_dict.get('E')

# Score every row with the judge LLM and store the D/E scores on deepseek_v3_df.
for row in range(500):
    print(f'---process {row+1}---')

    qa_pair = f"user:{question_df['question'][row]}, assistant:{question_df['answer'][row]}"

    # Systems to be scored (D, E).
    long_searched = deepseek_v3_df.loc[row, 'long_mem_result']
    long_recall_searched = deepseek_v3_df.loc[row, 'long_mem_recall_result']

    # Reference systems (A, B, C) and their previously assigned scores.
    short_searched = base_df.loc[row, 'short_mem_result']
    base_dialog_searched = base_df.loc[row, 'base_dialog']
    base_paragraph_searched = base_df.loc[row, 'base_paragraph']

    short_s = base_score.loc[row, 'short_score']
    base_s = base_score.loc[row, 'base_dialog_score']
    base_p_s = base_score.loc[row, 'base_paragraph_score']

    long_score, long_recall_score= get_res(qa_pair, short_searched, long_searched, long_recall_searched, base_dialog_searched, base_paragraph_searched, short_s, base_s, base_p_s)
    deepseek_v3_df.loc[row, 'long_score'] = long_score
    deepseek_v3_df.loc[row, 'long_recall_score'] = long_recall_score

    # Checkpoint every 50 rows so a failure late in the run does not discard
    # the scores already obtained.
    if (row+1)%50==0:
        deepseek_v3_df.to_json('eval_result_deepseek_v3.json', orient="records", lines=True)
deepseek_v3_df.to_json('eval_result_deepseek_v3.json', orient="records", lines=True)

---process 1---
---process 2---
---process 3---
---process 4---
---process 5---
---process 6---
---process 7---
---process 8---
---process 9---
---process 10---
---process 11---
---process 12---
---process 13---
---process 14---
---process 15---
---process 16---
---process 17---
---process 18---
---process 19---
---process 20---
---process 21---
---process 22---
---process 23---
---process 24---
---process 25---
---process 26---
---process 27---
---process 28---
---process 29---
---process 30---
---process 31---
---process 32---
---process 33---
---process 34---
---process 35---
---process 36---
---process 37---
---process 38---
---process 39---
---process 40---
---process 41---
---process 42---
---process 43---
---process 44---
---process 45---
---process 46---
---process 47---
---process 48---
---process 49---
---process 50---
---process 51---
---process 52---
---process 53---
---process 54---
---process 55---
---process 56---
---process 57---
---process 58---
---process 59---
---pro

In [9]:
# Average judge scores over the first `number` rows. The mean is taken over
# the rows actually present in the slice, so the result stays correct when
# the frame holds fewer than `number` rows.
number=500
observe_df = deepseek_v3_df[:number]
print(observe_df['long_score'].astype(int).mean())
print(observe_df['long_recall_score'].astype(int).mean())

7.188
8.288
