Merged
Changes from all commits
28 commits
87c37fb
fix: #(268) https://github.com/MemTensor/MemOS/issues/286
kakack Sep 11, 2025
a6a5558
Add pymysql dependency for MySQL user management
kakack Sep 11, 2025
996e4a4
Merge branch 'dev' into dev
fridayL Sep 11, 2025
4bb4b5c
add: change default pre_load (#338)
fridayL Sep 24, 2025
98dbf8a
feat: reorganize prompt with reference in user content
kakack Sep 25, 2025
3734b26
Feat: update load cubes (#350)
fridayL Sep 26, 2025
7c16d36
feat: reorganize prompt with reference in user content
kakack Sep 26, 2025
d47cc90
Merge branch 'test' into dev
kakack Sep 26, 2025
7aafbd0
ruff format
kakack Sep 26, 2025
04bc4fb
feat: reorganize prompt with reference in user content
kakack Sep 26, 2025
6bd1135
Merge branch 'dev' of github.com:kakack/MemOS into dev
kakack Sep 26, 2025
4bd373e
feat: reorganize prompt with reference in user content (#351)
kakack Sep 26, 2025
4cca56a
fix bugs to support eval answer hit with chat history only
tangg555 Sep 26, 2025
b6834d3
change the consume interval from 3 to 0.5 seconds, and refactor the c…
tangg555 Sep 26, 2025
ccef651
add new feat of thread race, and add a new test case for scheduler di…
tangg555 Sep 28, 2025
d01c8cf
hotfix: neo4j community data format (#353)
fridayL Sep 29, 2025
2da62c8
milvus implement (#354)
Wang-Daoji Sep 30, 2025
15cdbac
fix: code ruff format (#355)
fridayL Sep 30, 2025
a2715f5
feat: add server api prd (#362)
fridayL Oct 15, 2025
675eeca
add new feat of time eval for temporal locomo benchmark, but this is …
tangg555 Oct 16, 2025
e8346fc
Feat: add neo4j db for user_name (#365)
fridayL Oct 16, 2025
ec3d657
feat & refactor: enable mem scheduler to load auth config from enviro…
tangg555 Oct 16, 2025
de2b5c6
refactor: sort out config files in examples.
tangg555 Oct 16, 2025
5481f56
Feat: add chat complete for new server_api (#366)
fridayL Oct 17, 2025
3e721da
feat(mem_scheduler): add messages logging for stuck tasks monitoring
tangg555 Oct 17, 2025
7bb5bd6
feat(mem_scheduler): add configurable thread/process startup mode
tangg555 Oct 20, 2025
e1de4ad
Feat/merge dev (#374)
fridayL Oct 20, 2025
d64c6ba
Merge branch 'dev_new_update' into feat/test
fridayL Oct 20, 2025
21 changes: 21 additions & 0 deletions evaluation/.env-example
@@ -9,3 +9,24 @@ ZEP_API_KEY="z_***REDACTED***"
CHAT_MODEL="gpt-4o-mini"
CHAT_MODEL_BASE_URL="http://***.***.***.***:3000/v1"
CHAT_MODEL_API_KEY="sk-***REDACTED***"

# Configuration Only For Scheduler
# RabbitMQ Configuration
MEMSCHEDULER_RABBITMQ_HOST_NAME=rabbitmq-cn-***.cn-***.amqp-32.net.mq.amqp.aliyuncs.com
MEMSCHEDULER_RABBITMQ_USER_NAME=***
MEMSCHEDULER_RABBITMQ_PASSWORD=***
MEMSCHEDULER_RABBITMQ_VIRTUAL_HOST=memos
MEMSCHEDULER_RABBITMQ_ERASE_ON_CONNECT=true
MEMSCHEDULER_RABBITMQ_PORT=5672

# OpenAI Configuration
MEMSCHEDULER_OPENAI_API_KEY=sk-***
MEMSCHEDULER_OPENAI_BASE_URL=http://***.***.***.***:3000/v1
MEMSCHEDULER_OPENAI_DEFAULT_MODEL=gpt-4o-mini

# Graph DB Configuration
MEMSCHEDULER_GRAPHDBAUTH_URI=bolt://localhost:7687
MEMSCHEDULER_GRAPHDBAUTH_USER=neo4j
MEMSCHEDULER_GRAPHDBAUTH_PASSWORD=***
MEMSCHEDULER_GRAPHDBAUTH_DB_NAME=neo4j
MEMSCHEDULER_GRAPHDBAUTH_AUTO_CREATE=true
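
For orientation only (not part of this diff): a minimal sketch of how the MEMSCHEDULER_* variables above could be read at runtime with python-dotenv and plain os.getenv. The actual config loader introduced in the "load auth config from environment" commit is not shown in this hunk, so the dict field names below are illustrative assumptions, not the scheduler's real schema.

import os

from dotenv import load_dotenv  # python-dotenv, already used by the eval scripts

load_dotenv()  # reads evaluation/.env (a copy of this .env-example)

# Field names simply mirror the variable names above; they are assumptions.
rabbitmq_cfg = {
    "host_name": os.getenv("MEMSCHEDULER_RABBITMQ_HOST_NAME"),
    "user_name": os.getenv("MEMSCHEDULER_RABBITMQ_USER_NAME"),
    "password": os.getenv("MEMSCHEDULER_RABBITMQ_PASSWORD"),
    "virtual_host": os.getenv("MEMSCHEDULER_RABBITMQ_VIRTUAL_HOST", "memos"),
    "port": int(os.getenv("MEMSCHEDULER_RABBITMQ_PORT", "5672")),
    "erase_on_connect": os.getenv("MEMSCHEDULER_RABBITMQ_ERASE_ON_CONNECT", "true").lower() == "true",
}

graph_db_cfg = {
    "uri": os.getenv("MEMSCHEDULER_GRAPHDBAUTH_URI", "bolt://localhost:7687"),
    "user": os.getenv("MEMSCHEDULER_GRAPHDBAUTH_USER", "neo4j"),
    "password": os.getenv("MEMSCHEDULER_GRAPHDBAUTH_PASSWORD"),
    "db_name": os.getenv("MEMSCHEDULER_GRAPHDBAUTH_DB_NAME", "neo4j"),
    "auto_create": os.getenv("MEMSCHEDULER_GRAPHDBAUTH_AUTO_CREATE", "true").lower() == "true",
}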
Empty file added evaluation/__init__.py
Empty file.
Empty file added evaluation/scripts/__init__.py
Empty file.
Empty file.
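
A note on the empty files above (an inference, not stated in the diff): adding evaluation/__init__.py and evaluation/scripts/__init__.py makes those directories regular Python packages, which is what allows the scripts below to replace imports such as "from modules.locomo_eval_module import ..." with absolute paths rooted at the repository. With the repository root on sys.path / PYTHONPATH, the import used in the hunks below resolves as written:

from evaluation.scripts.temporal_locomo.modules.locomo_eval_module import (
    LocomoEvalModelModules,  # path and name copied verbatim from the hunks below
)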
@@ -9,7 +9,6 @@

from bert_score import score as bert_score
from dotenv import load_dotenv
from modules.locomo_eval_module import LocomoEvalModelModules
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from nltk.translate.meteor_score import meteor_score
from openai import AsyncOpenAI
@@ -19,6 +18,7 @@
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from evaluation.scripts.temporal_locomo.modules.locomo_eval_module import LocomoEvalModelModules
from memos.log import get_logger


@@ -281,33 +281,64 @@ def __init__(self, args):
api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_BASE_URL")
)

async def run(self):
print(
f"\n=== Starting LoCoMo evaluation for {self.frame} (version: {self.version}) with {self.num_runs} run(s) per question ==="
)
print(f"Using {self.max_workers} concurrent workers for processing groups")
def _load_response_data(self):
"""
Load response data from the response path file.

Returns:
dict: The loaded response data
"""
with open(self.response_path) as file:
locomo_responses = json.load(file)
return json.load(file)

num_users = 10
def _load_existing_evaluation_results(self):
"""
Attempt to load existing evaluation results from the judged path.
If the file doesn't exist or there's an error loading it, return an empty dict.

Returns:
dict: Existing evaluation results or empty dict if none available
"""
all_grades = {}
try:
if os.path.exists(self.judged_path):
with open(self.judged_path) as f:
all_grades = json.load(f)
print(f"Loaded existing evaluation results from {self.judged_path}")
except Exception as e:
print(f"Error loading existing evaluation results: {e}")

total_responses_count = sum(
len(locomo_responses.get(f"locomo_exp_user_{i}", [])) for i in range(num_users)
)
print(f"Found {total_responses_count} total responses across {num_users} users to evaluate")
return all_grades

def _create_evaluation_tasks(self, locomo_responses, all_grades, num_users):
"""
Create evaluation tasks for groups that haven't been evaluated yet.

Args:
locomo_responses (dict): The loaded response data
all_grades (dict): Existing evaluation results
num_users (int): Number of user groups to process

# Create tasks for processing each group
Returns:
tuple: (tasks list, active users count)
"""
tasks = []
active_users = 0

for group_idx in range(num_users):
group_id = f"locomo_exp_user_{group_idx}"
group_responses = locomo_responses.get(group_id, [])

if not group_responses:
print(f"No responses found for group {group_id}")
continue

# Skip groups that already have evaluation results
if all_grades.get(group_id):
print(f"Skipping group {group_id} as it already has evaluation results")
active_users += 1
continue

active_users += 1
tasks.append(
process_single_group(
@@ -319,29 +350,50 @@ async def run(self):
)
)

print(f"Starting evaluation of {active_users} user groups with responses")
return tasks, active_users

async def _process_tasks(self, tasks):
"""
Process evaluation tasks with concurrency control.

Args:
tasks (list): List of tasks to process

Returns:
list: Results from processing all tasks
"""
if not tasks:
return []

semaphore = asyncio.Semaphore(self.max_workers)

async def limited_task(task):
"""Helper function to limit concurrent task execution"""
async with semaphore:
return await task

limited_tasks = [limited_task(task) for task in tasks]
group_results = await asyncio.gather(*limited_tasks)
return await asyncio.gather(*limited_tasks)

for group_id, graded_responses in group_results:
all_grades[group_id] = graded_responses
def _calculate_scores(self, all_grades):
"""
Calculate evaluation scores based on all grades.

print("\n=== Evaluation Complete: Calculating final scores ===")
Args:
all_grades (dict): The complete evaluation results

Returns:
tuple: (run_scores, evaluated_count)
"""
run_scores = []
evaluated_count = 0

if self.num_runs > 0:
for i in range(1, self.num_runs + 1):
judgment_key = f"judgment_{i}"
current_run_correct_count = 0
current_run_total_count = 0

for group in all_grades.values():
for response in group:
if judgment_key in response["llm_judgments"]:
@@ -355,6 +407,16 @@

evaluated_count = current_run_total_count

return run_scores, evaluated_count

def _report_scores(self, run_scores, evaluated_count):
"""
Report evaluation scores to the console.

Args:
run_scores (list): List of accuracy scores for each run
evaluated_count (int): Number of evaluated responses
"""
if evaluated_count > 0:
mean_of_scores = np.mean(run_scores)
std_of_scores = np.std(run_scores)
@@ -368,11 +430,63 @@ async def limited_task(task):
print("No responses were evaluated")
print("LLM-as-a-Judge score: N/A (0/0)")

def _save_results(self, all_grades):
"""
Save evaluation results to the judged path file.

Args:
all_grades (dict): The complete evaluation results to save
"""
all_grades = convert_numpy_types(all_grades)
with open(self.judged_path, "w") as f:
json.dump(all_grades, f, indent=2)
print(f"Saved detailed evaluation results to {self.judged_path}")

async def run(self):
"""
Main execution method for the LoCoMo evaluation process.
This method orchestrates the entire evaluation workflow:
1. Loads existing evaluation results if available
2. Processes only groups that haven't been evaluated yet
3. Calculates and reports final evaluation scores
"""
print(
f"\n=== Starting LoCoMo evaluation for {self.frame} (version: {self.version}) with {self.num_runs} run(s) per question ==="
)
print(f"Using {self.max_workers} concurrent workers for processing groups")

# Load response data and existing evaluation results
locomo_responses = self._load_response_data()
all_grades = self._load_existing_evaluation_results()

# Count total responses for reporting
num_users = 10
total_responses_count = sum(
len(locomo_responses.get(f"locomo_exp_user_{i}", [])) for i in range(num_users)
)
print(f"Found {total_responses_count} total responses across {num_users} users to evaluate")

# Create tasks only for groups that haven't been evaluated yet
tasks, active_users = self._create_evaluation_tasks(locomo_responses, all_grades, num_users)
print(
f"Starting evaluation of {len(tasks)} user groups with responses (out of {active_users} active users)"
)

# Process tasks and update all_grades with results
if tasks:
group_results = await self._process_tasks(tasks)
for group_id, graded_responses in group_results:
all_grades[group_id] = graded_responses

print("\n=== Evaluation Complete: Calculating final scores ===")

# Calculate and report scores
run_scores, evaluated_count = self._calculate_scores(all_grades)
self._report_scores(run_scores, evaluated_count)

# Save results
self._save_results(all_grades)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
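
Aside (not from the repository): the _process_tasks / limited_task refactor above caps concurrency with a semaphore wrapped around asyncio.gather, so at most self.max_workers group evaluations run at once. A self-contained sketch of that pattern, with illustrative names only:

import asyncio


async def gather_with_limit(coros, max_workers):
    """Run coroutines concurrently, at most max_workers at a time."""
    semaphore = asyncio.Semaphore(max_workers)

    async def limited(coro):
        async with semaphore:
            return await coro

    return await asyncio.gather(*(limited(c) for c in coros))


async def main():
    async def work(i):
        await asyncio.sleep(0.1)  # stand-in for one group's LLM-judge calls
        return i * i

    results = await gather_with_limit([work(i) for i in range(10)], max_workers=3)
    print(results)  # [0, 1, 4, ..., 81]


if __name__ == "__main__":
    asyncio.run(main())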
@@ -6,16 +6,16 @@
from datetime import datetime, timezone
from pathlib import Path

from modules.constants import (
from tqdm import tqdm

from evaluation.scripts.temporal_locomo.modules.constants import (
MEM0_GRAPH_MODEL,
MEM0_MODEL,
MEMOS_MODEL,
MEMOS_SCHEDULER_MODEL,
ZEP_MODEL,
)
from modules.locomo_eval_module import LocomoEvalModelModules
from tqdm import tqdm

from evaluation.scripts.temporal_locomo.modules.locomo_eval_module import LocomoEvalModelModules
from memos.log import get_logger


@@ -4,7 +4,7 @@
import numpy as np
import pandas as pd

from modules.locomo_eval_module import LocomoEvalModelModules
from evaluation.scripts.temporal_locomo.modules.locomo_eval_module import LocomoEvalModelModules


# Category mapping as per your request