In [14]:
from datetime import datetime, timedelta
import re

def transform_time_intervals(intervals, now=None):
    """Turn relative session gaps into absolute ``"%Y-%m-%d %H:%M"`` timestamps.

    The last session is anchored 10 minutes before ``now``. Walking backwards,
    each entry ``intervals[i]`` (for ``i >= 1``) is read as the gap between
    session ``i - 1`` and session ``i``; ``intervals[0]`` is never parsed.
    Entries that do not match the supported phrasing
    (``"A few|couple of <hours|days|weeks|months|years> after"``) contribute a
    zero gap, so the neighbouring sessions share a timestamp.

    Args:
        intervals: Sliceable sequence of strings such as ``"A few days after"``.
        now: Optional reference ``datetime``. Defaults to ``datetime.now()``;
            pass a fixed value to get reproducible output.

    Returns:
        Formatted timestamps in chronological order, one per entry in
        ``intervals`` (a single timestamp when ``intervals`` is empty).

    Side effects:
        Prints ``error:<intervals>`` when more than five timestamps result.
    """
    # "few" is taken to mean 3 and "couple of" to mean 2.
    quantities = {"few": 3, "couple of": 2}
    unit_lengths = {
        "hours": timedelta(hours=1),
        "days": timedelta(days=1),
        "weeks": timedelta(weeks=1),
        "months": timedelta(days=30),   # Approximate month as 30 days
        "years": timedelta(days=365),   # Approximate year as 365 days
    }

    def parse_relative_time(text):
        """Return the timedelta described by ``text``, or None if it does not match."""
        match = re.match(r"A (few|couple of) (hours|days|weeks|months|years) after", text)
        if match is None:
            return None
        return quantities[match.group(1)] * unit_lengths[match.group(2)]

    # Fix the reference time once so every session is offset from the same instant.
    reference = datetime.now() if now is None else now
    current = reference - timedelta(minutes=10)

    # Build the timeline newest-first, then flip it at the end.
    timeline = [current]
    for event in reversed(intervals[1:]):
        delta = parse_relative_time(event)
        if delta is not None:
            current -= delta
        timeline.append(current)

    formatted = [dt.strftime("%Y-%m-%d %H:%M") for dt in reversed(timeline)]
    if len(formatted) > 5:
        print(f'error:{intervals}')
    return formatted


In [15]:
import pandas as pd
# Load the question set, derive absolute session timestamps from the relative
# 'time_interval' descriptions, then drop the columns not needed by the run.
question_df = (
    pd.read_json('questions_0205.json', lines=True)
    .assign(time_changed=lambda df: df['time_interval'].apply(transform_time_intervals))
    .drop(columns=['check', 'evidence', 'explain', 'answer', 'generate_dialogue', 'relationship', 'time_interval'])
)

In [16]:
# Keep only the 'mix' questions and renumber rows from 0 so that positional
# and label-based row access agree in the cells that follow.
question_df = question_df.loc[question_df['question_type'] == 'mix'].reset_index(drop=True)
len(question_df)

165

In [17]:
from letta_client import CreateBlock, Letta, MessageCreate

# Letta client pointed at the endpoint on localhost:8283.
client = Letta(base_url="http://localhost:8283")

In [19]:
import math
from datetime import datetime, timedelta

yesterday = datetime.now() - timedelta(days=1)  # not referenced elsewhere in this cell

# Dialogue columns in chronological order; position i is paired with
# question_df['time_changed'][row][i].
SESSION_COLUMNS = (
    'first_session_dialogue',
    'second_session_dialogue',
    'third_session_dialogue',
    'fourth_session_dialogue',
    'fifth_session_dialogue',
)


def _build_chatlogs(turns, start_time):
    """Pair alternating user/assistant turns into timestamped chat-log dicts.

    ``turns`` is read as [user, assistant, user, assistant, ...]; a trailing
    user turn with no reply gets an empty assistant string. Each pair is
    stamped one minute after the previous one, starting at ``start_time``.
    """
    chatlogs = []
    current_time = start_time
    for i in range(0, len(turns), 2):
        chatlogs.append({
            "user": turns[i],
            "assistant": turns[i + 1] if i + 1 < len(turns) else "",
            "time": current_time.strftime("%Y-%m-%dT%H:%M:%SZ"),
        })
        current_time += timedelta(minutes=1)
    return chatlogs


for row in range(len(question_df)):
    print(f'---process {row+1}---')

    question = f"user:{question_df['question'][row]}"

    # Create a new agent so every question starts from empty memory.
    agent = client.agents.create(
        model="openai/gpt-4o-mini",
        embedding="openai/text-embedding-3-small",
    )

    try:
        # MemGPT process: feed the sessions in order. `next_session` only
        # advances after a successful send, so a retry resumes at the failed
        # session instead of re-sending sessions the agent already ingested.
        session_times = question_df['time_changed'][row]
        next_session = 0
        while next_session < len(SESSION_COLUMNS):
            col = SESSION_COLUMNS[next_session]
            # Data preparation is outside the retry: a malformed row raises
            # immediately rather than being retried forever.
            start_time = datetime.strptime(session_times[next_session], "%Y-%m-%d %H:%M")
            chatlogs = _build_chatlogs(question_df[col][row], start_time)
            try:
                client.agents.messages.create(
                    agent_id=agent.id,
                    messages=[
                        MessageCreate(
                            role="user",
                            content=str(chatlogs),
                        ),
                    ],
                )
            except Exception as e:
                print(f"Error: {e}, Retrying...")
                continue
            next_session += 1

        # get memory: ask the evaluation question. Reading the reply stays
        # inside the retry so an unusable response triggers another attempt.
        answered = False
        while not answered:
            try:
                response = client.agents.messages.create(
                    agent_id=agent.id,
                    messages=[
                        MessageCreate(
                            role="user",
                            content=question,
                        ),
                    ],
                )
                question_df.loc[row, 'memgpt_response'] = str(response.messages[-1].content)
                answered = True
            except Exception as e:
                print(f"Error: {e}, Retrying...")
    finally:
        # delete agent, even when the cell is interrupted or a row is malformed
        client.agents.delete(agent_id=agent.id)

    # Checkpoint every 50 rows so a crash does not lose the whole run.
    if (row+1)%50==0:
        question_df.to_json("eval_result_memgpt_mix.json", orient="records", lines=True)
question_df.to_json("eval_result_memgpt_mix.json", orient="records", lines=True)

---process 1---
---process 2---
---process 3---
---process 4---
---process 5---
---process 6---
---process 7---
---process 8---
---process 9---
---process 10---
---process 11---
---process 12---
---process 13---
---process 14---
---process 15---
---process 16---
---process 17---
---process 18---
---process 19---
---process 20---
---process 21---
---process 22---
---process 23---
---process 24---
---process 25---
---process 26---
---process 27---
---process 28---
---process 29---
---process 30---
---process 31---
---process 32---
---process 33---
---process 34---
---process 35---
---process 36---
---process 37---
---process 38---
---process 39---
---process 40---
---process 41---
---process 42---
---process 43---
---process 44---
---process 45---
---process 46---
---process 47---
---process 48---
---process 49---
---process 50---
---process 51---
---process 52---
---process 53---
---process 54---
---process 55---
---process 56---
---process 57---
---process 58---
---process 59---
---pro

In [9]:
from dotenv import load_dotenv
from openai import OpenAI
import json
import re
import os

# load_dotenv() runs first so that OPENAI_API_KEY can come from a local .env
# file (presumably; see python-dotenv) before the key is read from the environment.
load_dotenv()
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def llm_create(prompt):
    """Send ``prompt`` as a single user message to gpt-4o-mini and return the reply text.

    Uses the module-level ``openai_client``; the content of the first returned
    choice is handed back unchanged.
    """
    response = openai_client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gpt-4o-mini",
    )
    first_choice = response.choices[0]
    return first_choice.message.content

def llm_response_handler(response:str):
    """Best-effort conversion of an LLM reply into parsed JSON.

    handle llm response format, especially for llama family

    Tries, in order:
      1. the whole reply as JSON,
      2. the contents of the first ```json ... ``` fence,
      3. the contents of the first ``` ... ``` fence.

    Args:
        response: Raw reply text. Non-string values that cannot be parsed
            (e.g. ``None``) are returned unchanged.

    Returns:
        The parsed JSON value, or the whitespace-stripped reply when nothing
        parses.
    """
    try:
        return json.loads(response)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
        pass

    if not isinstance(response, str):
        return response

    response = response.strip()
    for pattern in (r"```json(.*?)```", r"```(.*?)```"):
        fenced = re.search(pattern, response, re.DOTALL)
        if fenced is None:
            continue
        try:
            return json.loads(fenced.group(1).strip())
        except ValueError:
            continue
    return response

In [10]:
# Prompt template for the LLM judge. Fill it with str.format(); the
# placeholders are {question}, {answer} (the gold answer) and
# {generate_answer} (the answer being graded). The judge is instructed to
# reply with only CORRECT or WRONG.
judge_prompt = """Your task is to label an answer to a question as ‘CORRECT’ or ‘WRONG’.

You will be given the following data:

A question (posed by user to assistant).
A ‘gold’ (ground truth) answer.
A generated answer, which you will score as CORRECT or WRONG.
The point of the question is to ask about something assistant should know about the user based on their prior conversations. The gold answer will usually be a concise and short answer that includes the referenced topic.

Example:
Question: Do you remember what I got the last time I went to Hawaii?
Gold answer: A shell necklace

The generated answer might be much longer, but you should be generous with your grading—as long as it touches on the same topic as the gold answer, it should be counted as CORRECT.

Examples of CORRECT answers:
- Oh yeah, that was so fun! I got so much stuff there, including that shell necklace.
- I got a ton of stuff... that surfboard, the mug, the necklace, those coasters too.
- That cute necklace.

Examples of WRONG answers:
- Oh yeah, that was so fun! I got so much stuff there, including that mug.
- I got a ton of stuff... that surfboard, the mug, those coasters too.
- I’m sorry, I don’t remember what you’re talking about.

Now it’s time for the real question:
Question: {question}
Gold answer: {answer}
Generated answer: {generate_answer}

Only answer CORRECT or WRONG, no need to explain"""

In [20]:
import pandas as pd
# Reload the untouched question file (this copy still has the gold 'answer'
# column), restrict it to 'mix' questions and renumber the rows from 0.
question_df = (
    pd.read_json('questions_0205.json', lines=True)
    .loc[lambda df: df['question_type'] == 'mix']
    .reset_index(drop=True)
)
# Stored MemGPT replies written by the evaluation loop.
memgpt_df = pd.read_json("eval_result_memgpt_mix.json", lines=True)
# Empty frame that the judging loop fills row by row.
judge_df = pd.DataFrame()

In [21]:
# Grade every stored MemGPT reply against its gold answer with the LLM judge.
for row in range(len(question_df)):
    step = row + 1
    print(f'---process {step}---')

    question = f"{question_df['question'][row]}"
    answer = f"{question_df['answer'][row]}"

    # Copy the stored reply into judge_df first, then read it back for the prompt.
    judge_df.loc[row, 'memgpt_answer'] = memgpt_df.loc[row, 'memgpt_response']
    filled_prompt = judge_prompt.format(
        question=question,
        answer=answer,
        generate_answer=judge_df.loc[row, 'memgpt_answer'],
    )
    judge_df.loc[row, 'memgpt_judge'] = llm_create(filled_prompt)

    # Checkpoint every 20 rows.
    if step % 20 == 0:
        judge_df.to_json("judge_memgpt_mix.json", orient="records", lines=True)

# Final write covers the rows after the last checkpoint.
judge_df.to_json("judge_memgpt_mix.json", orient="records", lines=True)

---process 1---
---process 2---
---process 3---
---process 4---
---process 5---
---process 6---
---process 7---
---process 8---
---process 9---
---process 10---
---process 11---


---process 12---
---process 13---
---process 14---
---process 15---
---process 16---
---process 17---
---process 18---
---process 19---
---process 20---
---process 21---
---process 22---
---process 23---
---process 24---
---process 25---
---process 26---
---process 27---
---process 28---
---process 29---
---process 30---
---process 31---
---process 32---
---process 33---
---process 34---
---process 35---
---process 36---
---process 37---
---process 38---
---process 39---
---process 40---
---process 41---
---process 42---
---process 43---
---process 44---
---process 45---
---process 46---
---process 47---
---process 48---
---process 49---
---process 50---
---process 51---
---process 52---
---process 53---
---process 54---
---process 55---
---process 56---
---process 57---
---process 58---
---process 59---
---process 60---
---process 61---
---process 62---
---process 63---
---process 64---
---process 65---
---process 66---
---process 67---
---process 68---
---process 69---
---process 70-

In [None]:
# long -- fraction of rows whose judge verdict is exactly 'CORRECT'
n_correct = int((judge_df['memgpt_judge'] == 'CORRECT').sum())
print(f"memgpt score:{n_correct/len(judge_df)}")

memgpt score:0.9096385542168675


In [13]:
# Fraction of rows whose judge verdict is exactly 'CORRECT'.
n_correct = int((judge_df['memgpt_judge'] == 'CORRECT').sum())
print(f"memgpt short score:{n_correct/len(judge_df)}")

memgpt short score:0.9467455621301775


In [23]:
# Fraction of rows whose judge verdict is exactly 'CORRECT'.
n_correct = int((judge_df['memgpt_judge'] == 'CORRECT').sum())
print(f"memgpt mix score:{n_correct/len(judge_df)}")

memgpt mix score:0.7636363636363637
