Load datasets and generate results

In [15]:
from datetime import datetime, timedelta
import re

def transform_time_intervals(intervals, now=None):
    """Turn relative session gaps into absolute, formatted timestamps.

    ``intervals`` holds one entry per session. The first entry is never
    parsed; each later entry is matched against phrases such as
    ``"A few days after"`` or ``"A couple of hours after"`` and is treated as
    the gap separating that session from the one before it. The last session
    is anchored 10 minutes before ``now`` and earlier sessions are dated by
    walking backwards through the gaps. An entry that does not match the
    expected phrasing contributes a zero gap.

    Args:
        intervals: Sequence of gap descriptions, one per session.
        now: Reference "current" time. Defaults to ``datetime.now()``; pass a
            fixed ``datetime`` to get reproducible output.

    Returns:
        List of ``"%Y-%m-%d %H:%M"`` strings, oldest session first. The list
        always has ``max(len(intervals), 1)`` entries.
    """
    # "few" is read as 3 units and "couple of" as 2 units.
    quantities = {"few": 3, "couple of": 2}
    # Months and years are approximated as 30 and 365 days respectively.
    unit_lengths = {
        "hours": timedelta(hours=1),
        "days": timedelta(days=1),
        "weeks": timedelta(weeks=1),
        "months": timedelta(days=30),
        "years": timedelta(days=365),
    }

    def parse_relative_time(text):
        """Return the timedelta described by ``text``, or None if unrecognised."""
        match = re.match(r"A (few|couple of) (hours|days|weeks|months|years) after", text)
        if match is None:
            return None
        return quantities[match.group(1)] * unit_lengths[match.group(2)]

    def generate_timeline(events):
        """Return one datetime per session, oldest first."""
        # Pin the reference time once so every session is derived from the
        # same instant; the most recent session sits 10 minutes before it.
        reference = datetime.now() if now is None else now
        current = reference - timedelta(minutes=10)
        timeline = [current]
        # Walk from the newest gap to the oldest, stepping back in time.
        for event in reversed(events[1:]):
            delta = parse_relative_time(event)
            if delta is not None:
                current -= delta
            timeline.append(current)
        timeline.reverse()
        return timeline

    formatted = [dt.strftime("%Y-%m-%d %H:%M") for dt in generate_timeline(intervals)]
    # The dataset is expected to have at most five sessions; report anything longer.
    if len(formatted) > 5:
        print(f'error:{intervals}')
    return formatted


In [16]:
import pandas as pd
# Read the question set (JSON Lines), derive absolute session timestamps from
# the relative 'time_interval' descriptions, then discard the columns that the
# memory-replay step does not use.
question_df = (
    pd.read_json('questions_0205.json', lines=True)
    .assign(time_changed=lambda frame: frame['time_interval'].apply(transform_time_intervals))
    .drop(columns=['check', 'evidence', 'explain', 'answer', 'generate_dialogue', 'relationship', 'time_interval'])
)

In [2]:
import pandas as pd
# Reload the saved evaluation frame (JSON Lines). This rebinds question_df,
# replacing whatever an earlier cell assigned to that name.
question_df = pd.read_json("eval_result_remind.json", lines=True)

In [5]:
# Keep only the rows whose question type is 'mix', then show how many remain.
question_df = question_df.loc[question_df['question_type'].eq('mix')]
len(question_df)

165

In [6]:
# Renumber the rows 0..n-1 (the old index is dropped), then preview the frame.
question_df = question_df.reset_index(drop=True)
question_df.head()

Unnamed: 0,dataID,first_session_dialogue,first_session_speakers,second_session_dialogue,second_session_speakers,third_session_dialogue,third_session_speakers,fourth_session_dialogue,fourth_session_speakers,fifth_session_dialogue,fifth_session_speakers,time_changed,question_type,question
0,episode-15776,"[Hey, B. Can I talk to you about something per...","[Co-workers A, Co-workers B, Co-workers A, Co-...","[Hey, I have some exciting news to share with ...","[Co-workers A, Co-workers B, Co-workers A, Co-...","[Hey, did I tell you about my bike ride to the...","[Co-workers A, Co-workers B, Co-workers A, Co-...",[I moved in with my grandparents when I was ju...,"[Co-workers A, Co-workers B, Co-workers A, Co-...",[I just can't believe my grandson is gone. I f...,"[Co-workers A, Co-workers B, Co-workers A, Co-...","[2023-01-06 09:19, 2025-01-05 09:19, 2025-01-0...",mix,Do you remember what personal change I mention...
1,episode-15808,"[Hi there! How are you doing?, Hey! I'm doing ...","[Neighbors A, Neighbors B, Neighbors A, Neighb...",[I'm so proud of myself for waking up early to...,"[Neighbors A, Neighbors B, Neighbors A, Neighb...","[Wow, that book looks really interesting. Can ...","[Neighbors A, Neighbors B, Neighbors A, Neighb...","[Hi there, did you see me riding my bike to th...","[Neighbors A, Neighbors B, Neighbors A, Neighb...",[That reminds me of when I first learned how t...,"[Neighbors A, Neighbors B, Neighbors A, Neighb...","[2024-12-15 09:19, 2024-12-18 09:19, 2025-01-0...",mix,Remember when I mentioned riding my bike to th...
2,episode-15815,"[Boss, I know I messed up big time and I'm sor...","[Employee, Boss, Employee, Boss, Employee, Bos...","[ Boss, I'm excited to tell you that I've been...","[Employee, Boss, Employee, Boss, Employee, Boss]","[Hi Boss, how can I help you today?, Hi Employ...","[Employee, Boss, Employee, Boss, Employee, Bos...","[Hmm, I'm really hoping it doesn't rain today....","[Employee, Boss, Employee, Boss, Employee, Bos...","[Boss, I have some exciting news. I have been ...","[Employee, Boss, Employee, Boss, Employee, Bos...","[2017-04-13 09:19, 2019-04-13 09:19, 2021-04-1...",mix,Do you recall when I was irresponsible and cau...
3,episode-15821,"[Hey, can we talk for a sec?, Yeah, sure. What...","[Classmates A, Classmates B, Classmates A, Cla...",[I can't believe I won the contest! I never ex...,"[Classmates A, Classmates B, Classmates A, Cla...","[Hey, B! I've been assigned to write an import...","[Classmates A, Classmates B, Classmates A, Cla...","[Whew, I'm glad I had enough money to cover th...","[Classmates A, Classmates B, Classmates A, Cla...","[, I finally got the perfect Christmas tree fo...","[Classmates A, Classmates B, Classmates A, Cla...","[2025-03-15 06:19, 2025-03-18 06:19, 2025-03-1...",mix,"Based on my recent financial efforts, would yo..."
4,episode-15854,"[Hi Teacher, I wanted to share some exciting n...","[Student, Teacher, Student, Teacher, Student, ...","[I want to be a tough guy, someone who nobody ...","[Student, Teacher, Student, Teacher, Student, ...",[I am really excited about winning the contest...,"[Student, Teacher, Student, Teacher, Student, ...",[I hope it doesn't rain today. I want to go ou...,"[Student, Teacher, Student, Teacher, Student, ...","[Teacher, do you remember how I used to take p...","[Student, Teacher, Student, Teacher, Student, ...","[2021-03-01 09:19, 2023-03-01 09:19, 2023-03-2...",mix,Remember when you mentioned that one of my pai...


In [None]:
from long_memory.component import WeaviateLongMemory
from short_memory.component import WeaviateShortMemory

# Instantiate the project's long- and short-memory components with the same
# `user` value. NOTE(review): presumably both share one Weaviate backend and
# `user` namespaces the stored data - confirm against the component code.
long_mem = WeaviateLongMemory(user='GPT_4o_mini')
short_mem = WeaviateShortMemory(user='GPT_4o_mini')

In [18]:
# Template that combines the short-memory and long-memory retrieval results
# into one text block; filled with .format(short_memory=..., long_memory=...).
combine_prompt = """The following memories have been retrieved from the system.

Recent memory:{short_memory}

Older memory:{long_memory}"""

In [20]:
import math
from datetime import datetime, timedelta

# Dialogue columns, in session order; position i pairs with time_changed[i].
SESSION_COLUMNS = [
    'first_session_dialogue',
    'second_session_dialogue',
    'third_session_dialogue',
    'fourth_session_dialogue',
    'fifth_session_dialogue',
]
# Timestamp format handed to the memory components.
MEMORY_TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
REMIND_RESULT_PATH = "eval_result_remind.json"
# Persist partial results every N rows so a crash does not lose the whole run.
REMIND_CHECKPOINT_EVERY = 50


def _iter_turns(dialogue, start_time):
    """Yield ``(user_text, assistant_text, timestamp)`` for each turn of a session.

    Utterances are consumed in pairs: turn ``k`` is ``dialogue[2k]`` (user) and
    ``dialogue[2k + 1]`` (assistant). A dialogue with an odd number of
    utterances ends with an unanswered user message, for which
    ``assistant_text`` is ``None``. Turn ``k`` is stamped ``start_time`` plus
    ``k`` minutes, formatted with ``MEMORY_TIME_FORMAT``.
    """
    for turn, idx in enumerate(range(0, len(dialogue), 2)):
        # Explicit bounds check instead of a bare ``except``: only a missing
        # final reply is tolerated, every other error still surfaces.
        assistant_text = dialogue[idx + 1] if idx + 1 < len(dialogue) else None
        stamp = (start_time + timedelta(minutes=turn)).strftime(MEMORY_TIME_FORMAT)
        yield dialogue[idx], assistant_text, stamp


def _store_session(dialogue, session_time, recent_cutoff):
    """Save one session to short memory if it is recent, else to long memory.

    A session counts as recent when ``session_time >= recent_cutoff``. The two
    memory components take differently shaped records, so each branch builds
    its own payload.
    """
    turns = _iter_turns(dialogue, session_time)
    if session_time >= recent_cutoff:
        short_mem.add_chatlogs([
            {
                "user": user_text,
                "assistant": "" if assistant_text is None else assistant_text,
                "time": stamp,
            }
            for user_text, assistant_text, stamp in turns
        ])
    else:
        long_mem.add_chat_logs([
            {
                "text": (
                    f"user:{user_text}"
                    if assistant_text is None
                    else f"user:{user_text}, assistant:{assistant_text}"
                ),
                "time": stamp,
            }
            for user_text, assistant_text, stamp in turns
        ])


# Sessions dated within the last day go to short memory, older ones to long memory.
yesterday = datetime.now() - timedelta(days=1)
for row in range(len(question_df)):
    print(f'---process {row+1}---')

    question = f"user:{question_df.loc[row, 'question']}"

    # remind process: wipe both memories and replay the five sessions of this row.
    # NOTE(review): retries are unbounded, so a persistent failure loops forever.
    while True:
        try:
            long_mem.del_memory()
            short_mem.del_memory()
            for number, col in enumerate(SESSION_COLUMNS):
                session_time = datetime.strptime(
                    question_df.loc[row, 'time_changed'][number], "%Y-%m-%d %H:%M"
                )
                _store_session(question_df.loc[row, col], session_time, yesterday)
            break
        except Exception as e:
            print(f"---remind error:{e}, retry...---")

    # get memory: query each retrieval variant and record the raw results.
    while True:
        try:
            long_memory = long_mem.get_memory(question, recall=True, retrieve_number=5)
            long_group = long_mem.get_memory(question, recall=False, retrieve_number=5)
            short_memory = short_mem.get_memory(question, k=5)
            searched_memory = combine_prompt.format(short_memory=short_memory, long_memory=long_memory)
            question_df.loc[row, 'remind_result'] = str(searched_memory)
            question_df.loc[row, 'stm_result'] = str(short_memory)
            question_df.loc[row, 'ltm_result'] = str(long_memory)
            question_df.loc[row, 'long_group'] = str(long_group)
            break
        except Exception as e:
            print(f"---get memory error:{e}, retry...---")

    if (row + 1) % REMIND_CHECKPOINT_EVERY == 0:
        question_df.to_json(REMIND_RESULT_PATH, orient="records", lines=True)
question_df.to_json(REMIND_RESULT_PATH, orient="records", lines=True)

---process 1---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
Detect empty short memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
---process 2---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
Detect empty short memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
---process 3---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
Detect empty short memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSa



Detect empty short memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
---process 385---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
Detect empty short memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
---process 386---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
Detect empty short memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
---process 387---
Detect empty group memory, create memory space...
Detect

Score

In [10]:
from dotenv import load_dotenv
from openai import OpenAI
import json
import re
import os

# Load variables from a .env file into the process environment, then build
# the OpenAI client from OPENAI_API_KEY so the key never appears in the notebook.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def llm_create(prompt):
    """Send ``prompt`` as a single user message to gpt-4o-mini and return the reply text."""
    reply = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    # Only the first choice is used.
    first_choice = reply.choices[0]
    return first_choice.message.content

def llm_response_handler(response:str):
    """Parse an LLM reply as JSON, tolerating Markdown code fences.

    Handles the response formats produced by different models (notably the
    llama family). Attempts, in order:

    1. the raw text,
    2. the body of the first fenced block tagged ``json``,
    3. the body of the first fenced block of any kind.

    Args:
        response: Text returned by the model.

    Returns:
        The decoded JSON value from the first attempt that succeeds. If none
        succeeds, the whitespace-stripped text is returned. A value that is
        neither valid JSON nor a ``str`` (for example ``None``) is returned
        unchanged.
    """
    try:
        return json.loads(response)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError; TypeError covers non-text input.
        pass

    if not isinstance(response, str):
        return response

    response = response.strip()
    for fence_pattern in (r"```json(.*?)```", r"```(.*?)```"):
        found = re.search(fence_pattern, response, re.DOTALL)
        if found is None:
            continue
        try:
            return json.loads(found.group(1).strip())
        except ValueError:
            continue
    return response

In [11]:
# Prompt used to produce an answer from retrieved memory.
# Placeholders: {question}, {memory}.
generate_prompt = """You are given a question from user ask assistant, and have a memory retrieve from assistant
Pretend you are assistant and give a response.

Question:{question}

Memory:{memory}"""

# Prompt for the LLM judge that labels a generated answer CORRECT or WRONG
# against the gold answer. Placeholders: {question}, {answer}, {generate_answer}.
judge_prompt = """Your task is to label an answer to a question as ‘CORRECT’ or ‘WRONG’.

You will be given the following data:

A question (posed by user to assistant).
A ‘gold’ (ground truth) answer.
A generated answer, which you will score as CORRECT or WRONG.
The point of the question is to ask about something assistant should know about the user based on their prior conversations. The gold answer will usually be a concise and short answer that includes the referenced topic.

Example:
Question: Do you remember what I got the last time I went to Hawaii?
Gold answer: A shell necklace

The generated answer might be much longer, but you should be generous with your grading—as long as it touches on the same topic as the gold answer, it should be counted as CORRECT.

Examples of CORRECT answers:
- Oh yeah, that was so fun! I got so much stuff there, including that shell necklace.
- I got a ton of stuff... that surfboard, the mug, the necklace, those coasters too.
- That cute necklace.

Examples of WRONG answers:
- Oh yeah, that was so fun! I got so much stuff there, including that mug.
- I got a ton of stuff... that surfboard, the mug, those coasters too.
- I’m sorry, I don’t remember what you’re talking about.

Now it’s time for the real question:
Question: {question}
Gold answer: {answer}
Generated answer: {generate_answer}

Only answer CORRECT or WRONG, no need to explain"""

In [21]:
import pandas as pd
# Questions together with their gold answers (JSON Lines).
question_df = pd.read_json('questions_0205.json', lines=True)
# Disabled: restrict scoring to the 'mix' question type.
# question_df = question_df[question_df['question_type']=='mix']
# question_df.reset_index(drop=True, inplace=True)
# Saved retrieval results; the scoring loop pairs them with question_df by row position.
remind_df = pd.read_json("eval_result_remind.json", lines=True)
# Start from an empty frame that the scoring loop fills in.
judge_df = pd.DataFrame()
# Disabled: reload previously saved judgements instead of starting empty.
# judge_df = pd.read_json("judge_remind.json", lines=True)

In [22]:
# Retrieval variants to grade: judge_df column prefix -> remind_df column
# holding the memory that was retrieved for that variant.
JUDGE_MEMORY_SOURCES = {
    'long_mem': 'long_group',
    'long_mem_recall': 'ltm_result',
    'short_mem': 'stm_result',
    'remind': 'remind_result',
}
JUDGE_RESULT_PATH = "judge_remind.json"
# Persist partial results every N rows so a crash does not lose the whole run.
JUDGE_CHECKPOINT_EVERY = 20

# Questions and retrieval results are paired purely by row position, so a
# length mismatch means the two files come from different runs or filters.
# Fail before spending any LLM calls on misaligned rows.
if len(remind_df) != len(question_df):
    raise ValueError(
        f"row count mismatch: question_df has {len(question_df)} rows, "
        f"remind_df has {len(remind_df)}"
    )

for row in range(len(question_df)):
    print(f'---process {row+1}---')

    question = str(question_df.loc[row, 'question'])
    answer = str(question_df.loc[row, 'answer'])

    # For every variant: record the retrieved memory, generate an answer from
    # it, then ask the judge whether that answer matches the gold answer.
    for prefix, source_col in JUDGE_MEMORY_SOURCES.items():
        memory = remind_df.loc[row, source_col]
        judge_df.loc[row, f'{prefix}_result'] = memory
        generated = llm_create(generate_prompt.format(question=question, memory=memory))
        judge_df.loc[row, f'{prefix}_answer'] = generated
        judge_df.loc[row, f'{prefix}_judge'] = llm_create(
            judge_prompt.format(question=question, answer=answer, generate_answer=generated)
        )

    if (row + 1) % JUDGE_CHECKPOINT_EVERY == 0:
        judge_df.to_json(JUDGE_RESULT_PATH, orient="records", lines=True)
judge_df.to_json(JUDGE_RESULT_PATH, orient="records", lines=True)

---process 1---
---process 2---
---process 3---
---process 4---
---process 5---
---process 6---
---process 7---
---process 8---
---process 9---
---process 10---
---process 11---
---process 12---
---process 13---
---process 14---
---process 15---
---process 16---
---process 17---
---process 18---
---process 19---
---process 20---
---process 21---
---process 22---
---process 23---
---process 24---
---process 25---
---process 26---
---process 27---
---process 28---
---process 29---
---process 30---
---process 31---
---process 32---
---process 33---
---process 34---
---process 35---
---process 36---
---process 37---
---process 38---
---process 39---
---process 40---
---process 41---
---process 42---
---process 43---
---process 44---
---process 45---
---process 46---
---process 47---
---process 48---
---process 49---
---process 50---
---process 51---
---process 52---
---process 53---
---process 54---
---process 55---
---process 56---
---process 57---
---process 58---
---process 59---
---pro

In [23]:
def _judge_accuracy(judgements):
    """Return the fraction of judge verdicts that read CORRECT.

    Verdicts are free-form LLM output, so they are normalised before
    comparison: surrounding whitespace, quotes, asterisks, backticks and
    trailing punctuation are removed and case is ignored. Missing verdicts
    count as not correct. An empty series yields NaN instead of raising
    ZeroDivisionError.
    """
    if len(judgements) == 0:
        return float('nan')
    verdicts = (
        judgements.fillna('')
        .astype(str)
        .str.strip()
        .str.strip('.!"\'*`')
        .str.upper()
    )
    return int(verdicts.eq('CORRECT').sum()) / len(judgements)


# One accuracy line per retrieval variant, in the original reporting order.
for label, judge_col in [
    ("short mem", 'short_mem_judge'),
    ("long mem", 'long_mem_judge'),
    ("long mem recall", 'long_mem_recall_judge'),
    ("remind", 'remind_judge'),
]:
    print(f"{label} score:{_judge_accuracy(judge_df[judge_col])}")

short mem score:0.62
long mem score:0.672
long mem recall score:0.698
remind score:0.922
