Load datasets and generate results

In [5]:
from datetime import datetime, timedelta
import re

def transform_time_intervals(intervals, now=None):
    """Turn relative gaps between sessions into absolute, formatted timestamps.

    The last session is anchored 10 minutes before ``now``. Walking backwards,
    each earlier session is placed before the following one by the gap that the
    following session's interval text describes ("A few days after", ...).
    The first entry of ``intervals`` is never parsed: it only contributes a
    slot in the output, so the result has ``max(len(intervals), 1)`` items.

    Args:
        intervals: Sequence of interval descriptions, one per session.
        now: Optional reference ``datetime``. Defaults to ``datetime.now()``;
            pass a fixed value to get reproducible timelines.

    Returns:
        List of ``"%Y-%m-%d %H:%M"`` strings, oldest session first.
    """
    # "few" is treated as 3 units and "couple of" as 2 units.
    unit_lengths = {
        "hours": timedelta(hours=1),
        "days": timedelta(days=1),
        "weeks": timedelta(weeks=1),
        "months": timedelta(days=30),  # Approximate month as 30 days
        "years": timedelta(days=365),  # Approximate year as 365 days
    }
    gap_pattern = re.compile(r"A (few|couple of) (hours|days|weeks|months|years) after")

    def parse_relative_time(text):
        """Return the timedelta described by ``text``, or None if it does not match."""
        match = gap_pattern.match(text)
        if match is None:
            return None
        qty = 3 if match.group(1) == "few" else 2
        return qty * unit_lengths[match.group(2)]

    def generate_timeline(events):
        """Build the list of session datetimes, oldest first."""
        # Fix the reference time once so every session is relative to the same instant.
        cursor = (datetime.now() if now is None else now) - timedelta(minutes=10)
        timeline = [cursor]
        for event in reversed(events[1:]):
            delta = parse_relative_time(event)
            if delta is not None:
                cursor -= delta
            # Unparseable gaps still get a slot, sharing the following session's time.
            timeline.append(cursor)
        timeline.reverse()
        return timeline

    formatted = [dt.strftime("%Y-%m-%d %H:%M") for dt in generate_timeline(intervals)]
    # The dataset is expected to have at most five sessions per question.
    if len(formatted) > 5:
        print(f'error:{intervals}')
    return formatted


In [6]:
import pandas as pd
# Load the question set (JSON Lines) and attach absolute session timestamps
# derived from each row's relative `time_interval` descriptions.
question_df = pd.read_json('questions_0205.json', lines=True)
question_df['time_changed'] = question_df['time_interval'].apply(transform_time_intervals)

# Columns that are not needed further in this notebook section.
unused_columns = [
    'check',
    'evidence',
    'explain',
    'answer',
    'question_type',
    'generate_dialogue',
    'relationship',
    'time_interval',
]
question_df = question_df.drop(columns=unused_columns)

In [1]:
import pandas as pd
# Reload the working frame from the file the remind loop below checkpoints to.
# This replaces any `question_df` already in the kernel, so the file must
# already contain the `question`, `time_changed` and `*_session_dialogue`
# columns that the loop reads.
question_df = pd.read_json("eval_result_remind.json", lines=True)

In [2]:
from long_memory.component import WeaviateLongMemory
from short_memory.component import WeaviateShortMemory

# Long- and short-term memory stores used by the remind loop; both are opened
# under the same user name. The loop calls del_memory() on each store for every
# question, so whatever is loaded here is discarded at that point.
long_mem = WeaviateLongMemory(user='GPT_4o_mini')
short_mem = WeaviateShortMemory(user='GPT_4o_mini')

Detect existed GPT_4o_mini user group memory space, loading...
Detect existed GPT_4o_mini user child memory space, loading...
Detect existed GPT_4o_mini user short memory space, loading...


In [3]:
# Template that merges both retrieval results into one memory string.
# Filled with .format(short_memory=..., long_memory=...).
combine_prompt = """The following memories have been retrieved from the system.

Recent memory:{short_memory}

Older memory:{long_memory}"""

In [10]:
import math
from datetime import datetime, timedelta

# Sessions stamped within the last 24 hours go to short-term memory, older ones
# to long-term memory.
# NOTE(review): `time_changed` holds absolute timestamps read from file; if they
# were generated more than a day before this cell runs, every session will land
# in long-term memory — confirm that this is intended.
yesterday = datetime.now() - timedelta(days=1)

SESSION_COLUMNS = [
    'first_session_dialogue',
    'second_session_dialogue',
    'third_session_dialogue',
    'fourth_session_dialogue',
    'fifth_session_dialogue',
]
CHATLOG_TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
CHECKPOINT_PATH = "eval_result_remind.json"
MAX_ATTEMPTS = 5  # upper bound on retries per step, so a persistent error cannot loop forever


def _build_chatlogs(turns, start_time, recent):
    """Pair alternating user/assistant turns into timestamped chat-log records.

    Args:
        turns: Flat list of utterances, user first, alternating with assistant.
        start_time: Timestamp of the first exchange; each following exchange is
            stamped one minute later.
        recent: True builds short-memory records (`user`/`assistant`/`time`),
            False builds long-memory records (`text`/`time`).

    Returns:
        List of record dicts, one per user turn. A trailing user turn without a
        reply gets an empty assistant string (short) or a user-only text (long).
    """
    chatlogs = []
    current_time = start_time
    for i in range(0, len(turns), 2):
        has_reply = i + 1 < len(turns)
        stamp = current_time.strftime(CHATLOG_TIME_FORMAT)
        if recent:
            chatlogs.append({
                "user": turns[i],
                "assistant": turns[i + 1] if has_reply else "",
                "time": stamp,
            })
        else:
            if has_reply:
                text = f"user:{turns[i]}, assistant:{turns[i + 1]}"
            else:
                text = f"user:{turns[i]}"
            chatlogs.append({"text": text, "time": stamp})
        current_time = current_time + timedelta(minutes=1)
    return chatlogs


def _load_sessions(row):
    """Reset both memory stores and load every session of `row` into them."""
    long_mem.del_memory()
    short_mem.del_memory()
    for number, col in enumerate(SESSION_COLUMNS):
        start_time = datetime.strptime(question_df['time_changed'][row][number], "%Y-%m-%d %H:%M")
        turns = question_df[col][row]
        # save to long or short by time
        if start_time >= yesterday:
            short_mem.add_chatlogs(_build_chatlogs(turns, start_time, recent=True))
        else:
            long_mem.add_chat_logs(_build_chatlogs(turns, start_time, recent=False))


def _retrieve(row, question):
    """Query both stores for `question` and store the combined text in `remind_result`."""
    long_memory = long_mem.get_memory(question, recall=True, retrieve_number=5)
    short_memory = short_mem.get_memory(question, k=5)
    searched_memory = combine_prompt.format(short_memory=short_memory, long_memory=long_memory)
    question_df.loc[row, 'remind_result'] = str(searched_memory)


def _with_retries(step, *args):
    """Run `step(*args)`, retrying on failure; re-raise after MAX_ATTEMPTS failures."""
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            step(*args)
            return
        except Exception as e:
            if attempt == MAX_ATTEMPTS:
                print(f"---remind error:{e}, giving up after {MAX_ATTEMPTS} attempts---")
                raise
            print(f"---remind error:{e}, retry...---")


try:
    for row in range(410, 500):
        print(f'---process {row+1}---')

        question = f"user:{question_df['question'][row]}"

        # remind process: rebuild the memory stores, then search them
        _with_retries(_load_sessions, row)
        _with_retries(_retrieve, row, question)

        # periodic checkpoint
        if (row+1)%50==0:
            question_df.to_json(CHECKPOINT_PATH, orient="records", lines=True)
finally:
    # Always persist what has been computed, including when a row fails for good.
    question_df.to_json(CHECKPOINT_PATH, orient="records", lines=True)

---process 411---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
Detect empty short memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
---remind error:Not valid 'uuid' or 'uuid' can not be extracted from value, retry...---
---remind error:Not valid 'uuid' or 'uuid' can not be extracted from value, retry...---
---process 412---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
Detect empty short memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
---remind error:Not valid 'uuid' or 'uuid' can not be extracted from value, retry...---
---process 413---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
Detect empty

Score

In [1]:
from dotenv import load_dotenv
from openai import OpenAI
import json
import re
import os

# Load environment variables from a local .env file, then build the OpenAI
# client from OPENAI_API_KEY. os.getenv returns None when the variable is unset.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def llm_create(prompt):
    """Send `prompt` as a single user message to gpt-4o-mini and return the reply text."""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    first_choice = response.choices[0]
    return first_choice.message.content

def llm_response_handler(response:str):
    """Handle llm response format, especially for llama family.

    Tries, in order: the whole response as JSON, the contents of the first
    ```json fenced block, then the contents of the first fenced block of any
    kind.

    Args:
        response: Raw model output.

    Returns:
        The parsed JSON value when one of the attempts succeeds; otherwise the
        stripped response text. Non-string input that cannot be parsed (for
        example ``None``) is returned unchanged.
    """
    try:
        return json.loads(response)
    except (TypeError, ValueError):
        # Not plain JSON (json.JSONDecodeError is a ValueError); try fenced blocks.
        pass

    if not isinstance(response, str):
        return response

    response = response.strip()
    # Prefer an explicitly tagged ```json block over a generic fence.
    for fence_pattern in (r"```json(.*?)```", r"```(.*?)```"):
        found = re.search(fence_pattern, response, re.DOTALL)
        if found is None:
            continue
        try:
            return json.loads(found.group(1).strip())
        except ValueError:
            continue
    return response

In [2]:
# Prompt for the answering step: the model plays the assistant and answers
# {question} using the retrieved {memory}. Filled with
# .format(question=..., memory=...).
generate_prompt = """You are given a question from user ask assistant, and have a memory retrieve from assistant
Pretend you are assistant and give a response.

Question:{question}

Memory:{memory}"""

# Prompt for the grading step: the model labels {generate_answer} as CORRECT or
# WRONG against the gold {answer} for {question}. Filled with
# .format(question=..., answer=..., generate_answer=...). The score cell counts
# only replies that are exactly 'CORRECT'.
judge_prompt = """Your task is to label an answer to a question as ‘CORRECT’ or ‘WRONG’.

You will be given the following data:

A question (posed by user to assistant).
A ‘gold’ (ground truth) answer.
A generated answer, which you will score as CORRECT or WRONG.
The point of the question is to ask about something assistant should know about the user based on their prior conversations. The gold answer will usually be a concise and short answer that includes the referenced topic.

Example:
Question: Do you remember what I got the last time I went to Hawaii?
Gold answer: A shell necklace

The generated answer might be much longer, but you should be generous with your grading—as long as it touches on the same topic as the gold answer, it should be counted as CORRECT.

Examples of CORRECT answers:
- Oh yeah, that was so fun! I got so much stuff there, including that shell necklace.
- I got a ton of stuff... that surfboard, the mug, the necklace, those coasters too.
- That cute necklace.

Examples of WRONG answers:
- Oh yeah, that was so fun! I got so much stuff there, including that mug.
- I got a ton of stuff... that surfboard, the mug, those coasters too.
- I’m sorry, I don’t remember what you’re talking about.

Now it’s time for the real question:
Question: {question}
Gold answer: {answer}
Generated answer: {generate_answer}

Only answer CORRECT or WRONG, no need to explain"""

In [None]:
import pandas as pd
# Questions with gold answers, plus one results file per retrieval strategy.
question_df = pd.read_json('questions_0205.json', lines=True)
remind_df = pd.read_json("eval_result_remind.json", lines=True)  # provides `remind_result`
stm_df = pd.read_json("eval_result_base.json", lines=True)  # provides `short_mem_result`
ltm_df = pd.read_json("eval_result_gpt-4o-mini.json", lines=True)  # provides `long_mem_result` and `long_mem_recall_result`
# First run, before judge_remind.json exists: start from an empty frame instead.
# judge_df = pd.DataFrame()
judge_df = pd.read_json("judge_remind.json", lines=True)

In [None]:
# One entry per retrieval strategy: (column prefix, frame holding the retrieved memory).
# For each prefix the loop reads `<prefix>_result` from the frame and writes
# `<prefix>_result`, `<prefix>_answer` and `<prefix>_judge` into judge_df.
MEMORY_SOURCES = [
    ('long_mem', ltm_df),
    ('long_mem_recall', ltm_df),
    ('short_mem', stm_df),
    ('remind', remind_df),
]
JUDGE_CHECKPOINT_PATH = "judge_remind.json"

try:
    for row in range(500):
        print(f'---process {row+1}---')

        question = f"{question_df['question'][row]}"
        answer = f"{question_df['answer'][row]}"

        # Read all retrieved memories up front so a missing row fails before any LLM call.
        retrieved = [
            (prefix, source_df.loc[row, f'{prefix}_result'])
            for prefix, source_df in MEMORY_SOURCES
        ]

        for prefix, memory in retrieved:
            judge_df.loc[row, f'{prefix}_result'] = memory
            # Answer the question from the retrieved memory, then grade that answer.
            generated = llm_create(generate_prompt.format(question=question, memory=memory))
            judge_df.loc[row, f'{prefix}_answer'] = generated
            judge_df.loc[row, f'{prefix}_judge'] = llm_create(
                judge_prompt.format(
                    question=question,
                    answer=answer,
                    generate_answer=judge_df.loc[row, f'{prefix}_answer'],
                )
            )

        # periodic checkpoint
        if (row+1)%20==0:
            judge_df.to_json(JUDGE_CHECKPOINT_PATH, orient="records", lines=True)
finally:
    # Persist progress even when an API call raises part-way through the run.
    judge_df.to_json(JUDGE_CHECKPOINT_PATH, orient="records", lines=True)

In [11]:
# Share of rows whose judge label is exactly 'CORRECT', reported per memory strategy.
for label, judge_col in [
    ("short mem", "short_mem_judge"),
    ("long mem", "long_mem_judge"),
    ("long mem recall", "long_mem_recall_judge"),
    ("remind", "remind_judge"),
]:
    correct_rows = judge_df[judge_df[judge_col] == 'CORRECT']
    print(f"{label} score:{len(correct_rows)/len(judge_df)}")

short mem score:0.852
long mem score:0.822
long mem recall score:0.854
remind score:0.916
