Load datasets and generate results

In [1]:
from datetime import datetime, timedelta
import re

def transform_time_intervals(intervals, now=None):
    """Convert relative gap descriptions into absolute, formatted timestamps.

    The last entry of ``intervals`` is anchored ten minutes before ``now``.
    Walking backwards, each entry at index ``i >= 1`` that matches
    ``"A (few|couple of) <unit> after"`` sets the gap between timestamp
    ``i - 1`` and timestamp ``i``. Entries that do not match contribute no gap.
    The first entry is never parsed; it only reserves the earliest slot.

    Args:
        intervals: Sequence of strings describing the gap before each session.
        now: Optional reference ``datetime``. Defaults to ``datetime.now()``;
            pass a fixed value to make the output reproducible.

    Returns:
        A list of ``"%Y-%m-%d %H:%M"`` strings, oldest first, with one entry
        per element of ``intervals`` (a single entry when it is empty).
    """
    # "few" is read as 3 and "couple of" as 2 units of the lengths below.
    unit_lengths = {
        "hours": timedelta(hours=1),
        "days": timedelta(days=1),
        "weeks": timedelta(weeks=1),
        "months": timedelta(days=30),  # Approximate month as 30 days
        "years": timedelta(days=365),  # Approximate year as 365 days
    }

    def parse_relative_time(text):
        """Return the timedelta described by ``text``, or None if it does not match."""
        match = re.match(r"A (few|couple of) (hours|days|weeks|months|years) after", text)
        if match is None:
            return None
        qty = 3 if match.group(1) == "few" else 2
        return qty * unit_lengths[match.group(2)]

    def generate_timeline(events):
        """Build the datetimes oldest-first by stepping back from the anchor."""
        # Pin the anchor once so every session is measured from the same instant.
        cursor = (datetime.now() if now is None else now) - timedelta(minutes=10)
        timeline = [cursor]
        for event in reversed(events[1:]):
            delta = parse_relative_time(event)
            if delta is not None:
                cursor -= delta
            timeline.append(cursor)
        timeline.reverse()
        return timeline

    # Format once and reuse the list for both the sanity check and the result.
    formatted = [dt.strftime("%Y-%m-%d %H:%M") for dt in generate_timeline(intervals)]
    if len(formatted) > 5:
        # The dataset is expected to hold at most five sessions per episode.
        print(f'error:{intervals}')
    return formatted


In [2]:
import pandas as pd
# Read the question set (JSON Lines), attach absolute session timestamps derived
# from `time_interval`, then discard the columns that are not kept in this frame.
question_df = (
    pd.read_json('questions_0205.json', lines=True)
    .assign(time_changed=lambda frame: frame['time_interval'].apply(transform_time_intervals))
    .drop(columns=[
        'check',
        'evidence',
        'explain',
        'answer',
        'generate_dialogue',
        'relationship',
        'time_interval',
    ])
)

In [1]:
import pandas as pd
# Read eval_result_remind.json (JSON Lines) and rebind `question_df` to it,
# replacing whatever frame an earlier cell assigned to that name.
question_df = pd.read_json("eval_result_remind.json", lines=True)

In [3]:
# Preview the first rows of the loaded frame.
question_df.head()

Unnamed: 0,dataID,first_session_dialogue,first_session_speakers,second_session_dialogue,second_session_speakers,third_session_dialogue,third_session_speakers,fourth_session_dialogue,fourth_session_speakers,fifth_session_dialogue,fifth_session_speakers,time_changed,question_type,question
0,episode-15761,"[Hi there, how are you doing today?, Hi. I'm d...","[Neighbors A, Neighbors B, Neighbors A, Neighb...","[That looks like an interesting book, do you m...","[Neighbors A, Neighbors B, Neighbors A, Neighb...","[Hey there, did you see me riding my bike in t...","[Neighbors A, Neighbors B, Neighbors A, Neighb...",[It's nice to be back in the neighborhood afte...,"[Neighbors A, Neighbors B, Neighbors A, Neighb...","[Ugh, my boss keeps calling me on my days off....","[Neighbors A, Neighbors B, Neighbors A, Neighb...","[2021-01-11 09:18, 2021-04-11 09:18, 2021-04-1...",long,Remember when I said you could borrow somethin...
1,episode-15776,"[Hey, B. Can I talk to you about something per...","[Co-workers A, Co-workers B, Co-workers A, Co-...","[Hey, I have some exciting news to share with ...","[Co-workers A, Co-workers B, Co-workers A, Co-...","[Hey, did I tell you about my bike ride to the...","[Co-workers A, Co-workers B, Co-workers A, Co-...",[I moved in with my grandparents when I was ju...,"[Co-workers A, Co-workers B, Co-workers A, Co-...",[I just can't believe my grandson is gone. I f...,"[Co-workers A, Co-workers B, Co-workers A, Co-...","[2023-01-05 12:18, 2025-01-04 12:18, 2025-01-0...",mix,Do you remember what personal change I mention...
2,episode-15784,[I'm so sick of having to take out the trash e...,"[Parent, Child, Parent, Child, Parent, Child, ...",[Today's hike was so much fun! I loved explori...,"[Parent, Child, Parent, Child, Parent, Child, ...",[ I'm just so sick of taking out the trash eve...,"[Parent, Child, Parent, Child, Parent, Parent,...","[Today was amazing, I won the contest and I fe...","[Parent, Child, Parent, Child, Parent, Child, ...","[When I was ten, I moved in with my grandparen...","[Parent, Child, Parent, Child, Parent, Child, ...","[2019-04-09 12:18, 2021-04-08 12:18, 2021-04-1...",long,Remember our hike today? What specifically did...
3,episode-15787,"[Hey there, Classmates B! What's up? Why are y...","[Classmates A, Classmates B, Classmates A, Cla...","[Hey, have you ever witnessed a car accident?,...","[Classmates A, Classmates B, Classmates A, Cla...",[I'm almost done with my work for today. Just ...,"[Classmates A, Classmates B, Classmates A, Cla...","[You know, I still get emotional thinking abou...","[Classmates A, Classmates B, Classmates A, Cla...","[Hey, what are you reading there?, Just a book...","[Classmates A, Classmates B, Classmates A, Cla...","[2023-04-02 12:18, 2023-04-05 12:18, 2023-04-0...",short,Remember when you mentioned the book you're re...
4,episode-15790,"[Hey, how's your day going so far?, It's going...","[Husband, Wife, Husband, Wife, Husband, Wife, ...","[Hey., Hey, everything okay?, Yeah, everything...","[Husband, Wife, Husband, Wife, Husband, Wife, ...",[I can't stop thinking about the Grand Canyon....,"[Husband, Wife, Husband, Wife, Husband, Wife, ...",[I don't know where the chips went. I put them...,"[Husband, Wife, Husband, Wife, Husband, Wife, ...","[Hey, have you seen the book I was looking at ...","[Husband, Wife, Husband, Wife, Husband, Wife, ...","[2024-07-11 12:18, 2024-10-09 12:18, 2025-01-0...",long,Remember when you were really stressed about o...


In [5]:
# Keep only the rows labelled as 'short' questions, then show how many remain.
question_df = question_df.loc[question_df['question_type'].eq('short')]
len(question_df)

169

In [10]:
# Renumber rows 0..n-1 so later cells can address them by position-like labels.
# Rebinding (instead of inplace=True) avoids mutating the filtered slice and
# gives later `.loc` assignments a fresh, independent frame.
question_df = question_df.reset_index(drop=True)
question_df.head()

Unnamed: 0,dataID,first_session_dialogue,first_session_speakers,second_session_dialogue,second_session_speakers,third_session_dialogue,third_session_speakers,fourth_session_dialogue,fourth_session_speakers,fifth_session_dialogue,fifth_session_speakers,time_changed,question_type,question
0,episode-15787,"[Hey there, Classmates B! What's up? Why are y...","[Classmates A, Classmates B, Classmates A, Cla...","[Hey, have you ever witnessed a car accident?,...","[Classmates A, Classmates B, Classmates A, Cla...",[I'm almost done with my work for today. Just ...,"[Classmates A, Classmates B, Classmates A, Cla...","[You know, I still get emotional thinking abou...","[Classmates A, Classmates B, Classmates A, Cla...","[Hey, what are you reading there?, Just a book...","[Classmates A, Classmates B, Classmates A, Cla...","[2023-04-02 12:18, 2023-04-05 12:18, 2023-04-0...",short,Remember when you mentioned the book you're re...
1,episode-15880,[I was thinking about doing something special ...,"[Parent, Child, Parent, Child, Parent, Child, ...","[Child, I want to talk to you about something ...","[Parent, Child, Parent, Child, Parent, Child, ...","[Guess what, kiddo? I got a new car!, That's g...","[Parent, Child, Parent, Child, Parent, Child, ...","[You know, jumping on the trampoline made me f...","[Parent, Child, Parent, Child, Parent, Child, ...",[ I can't believe my grandson is gone. It's so...,"[Parent, Child, Parent, Child, Parent, Child, ...","[2025-02-24 09:18, 2025-02-24 12:18, 2025-03-1...",short,Remember when we talked about keeping his memo...
2,episode-15899,"[Hey, have you heard about the new project we'...","[Co-workers A, Co-workers B, Co-workers A, Co-...","[So yeah, I'm really excited about becoming a ...","[Co-workers A, Co-workers B, Co-workers A, Co-...","[, you wouldn't believe what I did over the we...","[Co-workers A, Co-workers B, Co-workers A, Co-...","[Hey, that looks like an interesting book you'...","[Co-workers A, Co-workers B, Co-workers A, Co-...",[That story reminds me of when I learned how t...,"[Co-workers A, Co-workers B, Co-workers A, Co-...","[2023-02-28 09:18, 2023-03-21 09:18, 2025-03-2...",short,Remember when we talked about biking? What out...
3,episode-15958,"[Hey, how's it going?, Not bad, thanks. How ab...","[Co-workers A, Co-workers B, Co-workers A, Co-...","[Excuse me, I was wondering if you could help ...","[Co-workers A, Co-workers B, Co-workers A, Co-...","[I hope it doesn't rain today., Why is that?, ...","[Co-workers A, Co-workers B, Co-workers A, Co-...",[I'm really grateful that I have this job at t...,"[Co-workers A, Co-workers B, Co-workers A, Co-...","[I had a bike with two wheels and a bell, and ...","[Co-workers A, Co-workers B, Co-workers A, Co-...","[2024-10-12 06:18, 2024-10-12 09:18, 2025-01-1...",short,Remember when I mentioned how satisfying it is...
4,episode-15977,"[Hey, Neighbors B! How are you doing today?, I...","[Neighbors A, Neighbors B, Neighbors A, Neighb...",[I finally got around to mowing the lawn yeste...,"[Neighbors A, Neighbors B, Neighbors A, Neighb...",[It's crazy to think that I've been living wit...,"[Neighbors A, Neighbors B, Neighbors A, Neighb...","[Hi there, Neighbors B! How are you doing toda...","[Neighbors A, Neighbors B, Neighbors A, Neighb...","[My grandson passed away last month, and I sti...","[Neighbors A, Neighbors B, Neighbors A, Neighb...","[2021-02-28 12:18, 2021-03-21 12:18, 2023-03-2...",short,Do you remember saying that losing a loved one...


In [6]:
from long_memory.component import WeaviateLongMemory
from short_memory.component import WeaviateShortMemory

# Create the long-term and short-term memory stores for the 'GPT_4o_mini' user.
# NOTE(review): both classes are project-local; judging by their names they are
# presumably backed by a Weaviate instance — confirm before running.
long_mem = WeaviateLongMemory(user='GPT_4o_mini')
short_mem = WeaviateShortMemory(user='GPT_4o_mini')

Detect existed GPT_4o_mini user group memory space, loading...
Detect existed GPT_4o_mini user child memory space, loading...
Detect existed GPT_4o_mini user short memory space, loading...


In [7]:
# Template that merges the short-term and long-term retrieval results into one
# memory block; filled with .format(short_memory=..., long_memory=...).
combine_prompt = """The following memories have been retrieved from the system.

Recent memory:{short_memory}

Older memory:{long_memory}"""

In [11]:
import math
from datetime import datetime, timedelta

# Dialogue columns in chronological order; position i pairs with time_changed[i].
SESSION_COLUMNS = [
    'first_session_dialogue',
    'second_session_dialogue',
    'third_session_dialogue',
    'fourth_session_dialogue',
    'fifth_session_dialogue',
]
CHATLOG_TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
RESULT_PATH = "eval_result_remind_short.json"


def build_session_chatlogs(utterances, start_time, as_short_memory):
    """Pair up a session's utterances into timestamped chat-log records.

    Utterances are consumed two at a time (user turn, assistant turn) and each
    pair is stamped one minute after the previous one, starting at
    ``start_time``. A trailing user turn without a reply is kept: the
    short-memory record gets an empty ``assistant`` field, the long-memory
    record contains only the user text.

    Args:
        utterances: Sequence of utterance strings for one session.
        start_time: ``datetime`` of the first pair.
        as_short_memory: True for ``{"user", "assistant", "time"}`` records,
            False for ``{"text", "time"}`` records.

    Returns:
        A list of record dicts, one per pair.
    """
    chatlogs = []
    turn_time = start_time
    for user_idx in range(0, len(utterances), 2):
        reply_idx = user_idx + 1
        # Explicit bounds check replaces the former bare `except:` fallback.
        has_reply = reply_idx < len(utterances)
        stamp = turn_time.strftime(CHATLOG_TIME_FORMAT)
        if as_short_memory:
            chatlogs.append({
                "user": utterances[user_idx],
                "assistant": utterances[reply_idx] if has_reply else "",
                "time": stamp,
            })
        else:
            text = f"user:{utterances[user_idx]}"
            if has_reply:
                text += f", assistant:{utterances[reply_idx]}"
            chatlogs.append({"text": text, "time": stamp})
        turn_time += timedelta(minutes=1)
    return chatlogs


# Sessions stamped within the last day go to short-term memory, older ones to long-term.
yesterday = datetime.now() - timedelta(days=1)
for row in range(len(question_df)):
    print(f'---process {row+1}---')

    question = f"user:{question_df['question'][row]}"

    # remind process: wipe both stores, then replay the five sessions of this episode.
    # NOTE: retries are unbounded, so a permanent failure keeps looping until interrupted.
    success = False
    while not success:
        try:
            long_mem.del_memory()
            short_mem.del_memory()
            for number, col in enumerate(SESSION_COLUMNS):
                utterances = question_df[col][row]
                session_start = datetime.strptime(question_df['time_changed'][row][number], "%Y-%m-%d %H:%M")
                # save to long or short by time
                if session_start >= yesterday:
                    short_mem.add_chatlogs(
                        build_session_chatlogs(utterances, session_start, as_short_memory=True)
                    )
                else:
                    long_mem.add_chat_logs(
                        build_session_chatlogs(utterances, session_start, as_short_memory=False)
                    )
            success = True
        except Exception as e:
            print(f"---remind error:{e}, retry...---")

    # get memory: query both stores and record every retrieval variant for this row.
    success = False
    while not success:
        try:
            # searched_memory
            long_memory = long_mem.get_memory(question, recall=True, retrieve_number=5)
            long_group = long_mem.get_memory(question, recall=False, retrieve_number=5)
            short_memory = short_mem.get_memory(question, k=5)
            searched_memory = combine_prompt.format(short_memory=short_memory, long_memory=long_memory)
            question_df.loc[row, 'remind_result'] = str(searched_memory)
            question_df.loc[row, 'stm_result'] = str(short_memory)
            question_df.loc[row, 'ltm_result'] = str(long_memory)
            question_df.loc[row, 'long_group'] = str(long_group)
            success = True
        except Exception as e:
            print(f"---remind error:{e}, retry...---")

    # Checkpoint every 50 rows so an interrupted run loses little work.
    if (row+1)%50==0:
        question_df.to_json(RESULT_PATH, orient="records", lines=True)
question_df.to_json(RESULT_PATH, orient="records", lines=True)

---process 1---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
Detect empty short memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
---process 2---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
Detect empty short memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
---process 3---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
Detect empty short memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSa

Score

In [12]:
from dotenv import load_dotenv
from openai import OpenAI
import json
import re
import os

# Load environment variables via python-dotenv, then build the OpenAI client from
# OPENAI_API_KEY so the key is never written into the notebook itself.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def llm_create(prompt, model="gpt-4o-mini"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, sent as the only message of the conversation.
        model: Chat model name. Defaults to ``"gpt-4o-mini"``, the model this
            notebook was run with.

    Returns:
        The ``content`` of the first choice in the completion.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
    )
    return completion.choices[0].message.content

def llm_response_handler(response:str):
    """handle llm response format, especially for llama family

    Tries, in order: parsing the whole response as JSON, parsing the contents
    of the first ```json fenced block, then parsing the contents of the first
    generic ``` fenced block.

    Args:
        response: Raw model output.

    Returns:
        The decoded JSON value when any attempt succeeds; otherwise the
        whitespace-stripped response text. A non-string input that cannot be
        decoded is returned unchanged instead of raising.
    """
    try:
        return json.loads(response)
    except (TypeError, ValueError):
        # Not plain JSON (JSONDecodeError is a ValueError) or not text at all.
        pass

    if not isinstance(response, str):
        # Nothing to strip or search in; hand the value back as-is.
        return response

    text = response.strip()
    # Most specific fence first, so a ```json block wins over a bare ``` block.
    for pattern in (r"```json(.*?)```", r"```(.*?)```"):
        fenced = re.search(pattern, text, re.DOTALL)
        if fenced is None:
            continue
        try:
            return json.loads(fenced.group(1).strip())
        except ValueError:
            continue
    return text

In [13]:
# Template for the answering step: the model plays the assistant and replies to
# the user's question using only the retrieved memory.
# Filled with .format(question=..., memory=...).
generate_prompt = """You are given a question from user ask assistant, and have a memory retrieve from assistant
Pretend you are assistant and give a response.

Question:{question}

Memory:{memory}"""

# Template for the grading step: the model labels a generated answer as CORRECT
# or WRONG against the gold answer, grading generously on topic overlap.
# Filled with .format(question=..., answer=..., generate_answer=...).
judge_prompt = """Your task is to label an answer to a question as ‘CORRECT’ or ‘WRONG’.

You will be given the following data:

A question (posed by user to assistant).
A ‘gold’ (ground truth) answer.
A generated answer, which you will score as CORRECT or WRONG.
The point of the question is to ask about something assistant should know about the user based on their prior conversations. The gold answer will usually be a concise and short answer that includes the referenced topic.

Example:
Question: Do you remember what I got the last time I went to Hawaii?
Gold answer: A shell necklace

The generated answer might be much longer, but you should be generous with your grading—as long as it touches on the same topic as the gold answer, it should be counted as CORRECT.

Examples of CORRECT answers:
- Oh yeah, that was so fun! I got so much stuff there, including that shell necklace.
- I got a ton of stuff... that surfboard, the mug, the necklace, those coasters too.
- That cute necklace.

Examples of WRONG answers:
- Oh yeah, that was so fun! I got so much stuff there, including that mug.
- I got a ton of stuff... that surfboard, the mug, those coasters too.
- I’m sorry, I don’t remember what you’re talking about.

Now it’s time for the real question:
Question: {question}
Gold answer: {answer}
Generated answer: {generate_answer}

Only answer CORRECT or WRONG, no need to explain"""

In [17]:
import pandas as pd
# Rebuild the 'short' question subset (this file still carries the gold `answer`
# column) with rows renumbered from 0, so it can be addressed row-by-row
# alongside the saved retrieval results.
question_df = (
    pd.read_json('questions_0205.json', lines=True)
    .loc[lambda frame: frame['question_type'] == 'short']
    .reset_index(drop=True)
)
remind_df = pd.read_json("eval_result_remind_short.json", lines=True)
# Start from an empty frame; the judging loop fills it one row at a time.
judge_df = pd.DataFrame()
# To reload previously saved judgments instead:
# judge_df = pd.read_json("judge_remind.json", lines=True)

In [18]:
# (column prefix in judge_df, source column in remind_df), listed in the order
# the output columns are created.
memory_variants = (
    ('long_mem', 'long_group'),
    ('long_mem_recall', 'ltm_result'),
    ('short_mem', 'stm_result'),
    ('remind', 'remind_result'),
)

for row in range(len(question_df)):
    print(f'---process {row+1}---')

    question = str(question_df['question'][row])
    answer = str(question_df['answer'][row])

    # Fetch every retrieval result up front, before any LLM call is made.
    retrieved = {prefix: remind_df.loc[row, source] for prefix, source in memory_variants}

    # For each variant: store the memory, generate an answer from it, then grade it.
    for prefix, memory in retrieved.items():
        answer_col = f'{prefix}_answer'
        judge_df.loc[row, f'{prefix}_result'] = memory
        judge_df.loc[row, answer_col] = llm_create(
            generate_prompt.format(question=question, memory=memory)
        )
        judge_df.loc[row, f'{prefix}_judge'] = llm_create(
            judge_prompt.format(
                question=question,
                answer=answer,
                generate_answer=judge_df.loc[row, answer_col],
            )
        )

    # Checkpoint every 20 rows; a final write below captures the remainder.
    if (row+1)%20==0:
        judge_df.to_json("judge_remind_short.json", orient="records", lines=True)
judge_df.to_json("judge_remind_short.json", orient="records", lines=True)

---process 1---
---process 2---
---process 3---
---process 4---
---process 5---
---process 6---
---process 7---
---process 8---
---process 9---
---process 10---
---process 11---
---process 12---
---process 13---
---process 14---
---process 15---
---process 16---
---process 17---
---process 18---
---process 19---
---process 20---
---process 21---
---process 22---
---process 23---
---process 24---
---process 25---
---process 26---
---process 27---
---process 28---
---process 29---
---process 30---
---process 31---
---process 32---
---process 33---
---process 34---
---process 35---
---process 36---
---process 37---
---process 38---
---process 39---
---process 40---
---process 41---
---process 42---
---process 43---
---process 44---
---process 45---
---process 46---
---process 47---
---process 48---
---process 49---
---process 50---
---process 51---
---process 52---
---process 53---
---process 54---
---process 55---
---process 56---
---process 57---
---process 58---
---process 59---
---pro

In [19]:
# Accuracy per retrieval variant: the share of rows the judge labelled CORRECT.
# Labels are whitespace-stripped first, because the raw LLM reply may carry a
# trailing newline or space that would otherwise be counted as a miss.
judge_columns = {
    "short mem": "short_mem_judge",
    "long mem": "long_mem_judge",
    "long mem recall": "long_mem_recall_judge",
    "remind": "remind_judge",
}
total_rows = len(judge_df)
for label, column in judge_columns.items():
    verdicts = judge_df[column].astype(str).str.strip()
    correct_count = int((verdicts == 'CORRECT').sum())
    # An empty frame yields NaN rather than a ZeroDivisionError.
    score = correct_count / total_rows if total_rows else float('nan')
    print(f"{label} score:{score}")

short mem score:0.9822485207100592
long mem score:0.46745562130177515
long mem recall score:0.46153846153846156
remind score:0.9881656804733728
