diff --git a/README.md b/README.md index 6873ba2b1..50621b584 100644 --- a/README.md +++ b/README.md @@ -54,22 +54,20 @@ ## 📈 Performance Benchmark -MemOS demonstrates significant improvements over baseline memory solutions in multiple reasoning tasks. - -| Model | Avg. Score | Multi-Hop | Open Domain | Single-Hop | Temporal Reasoning | -|-------------|------------|-----------|-------------|------------|---------------------| -| **OpenAI** | 0.5275 | 0.6028 | 0.3299 | 0.6183 | 0.2825 | -| **MemOS** | **0.7331** | **0.6430** | **0.5521** | **0.7844** | **0.7321** | -| **Improvement** | **+38.98%** | **+6.67%** | **+67.35%** | **+26.86%** | **+159.15%** | - -> 💡 **Temporal reasoning accuracy improved by 159% compared to the OpenAI baseline.** - -### Details of End-to-End Evaluation on LOCOMO - -> [!NOTE] -> Comparison of LLM Judge Scores across five major tasks in the LOCOMO benchmark. Each bar shows the mean evaluation score judged by LLMs for a given method-task pair, with standard deviation as error bars. MemOS-0630 consistently outperforms baseline methods (LangMem, Zep, OpenAI, Mem0) across all task types, especially in multi-hop and temporal reasoning scenarios. - -END2END SCORE +MemOS demonstrates significant improvements over baseline memory solutions in multiple memory tasks, +showcasing its capabilities in **information extraction**, **temporal and cross-session reasoning**, and **personalized preference responses**. + +| Model | LOCOMO | LongMemEval | PrefEval-10 | PersonaMem | +|-----------------|-------------|-------------|-------------|-------------| +| **GPT-4o-mini** | 52.75 | 55.4 | 2.8 | 43.46 | +| **MemOS** | **75.80** | **77.80** | **71.90** | **61.17** | +| **Improvement** | **+43.70%** | **+40.43%** | **+2568%** | **+40.75%** | + +### Detailed Evaluation Results +- We use gpt-4o-mini as the processing and judging LLM and bge-m3 as the embedding model in the MemOS evaluation. 
+- The evaluation was conducted with settings aligned as closely as possible across all methods. Reproduce the results with our scripts at [`evaluation`](./evaluation). +- Check the full search and response details on Hugging Face: https://huggingface.co/datasets/MemTensor/MemOS_eval_result. +> 💡 **MemOS outperforms all other methods (Mem0, Zep, Memobase, SuperMemory, etc.) across all benchmarks!** ## ✨ Key Features diff --git a/evaluation/scripts/locomo/locomo_ingestion.py b/evaluation/scripts/locomo/locomo_ingestion.py index 518d90c4c..a9e4d5f02 100644 --- a/evaluation/scripts/locomo/locomo_ingestion.py +++ b/evaluation/scripts/locomo/locomo_ingestion.py @@ -88,7 +88,7 @@ def ingest_session(client, session, frame, version, metadata): return elapsed_time -def process_user(conv_idx, frame, locomo_df, version): +def process_user(conv_idx, frame, locomo_df, version, success_records, f): conversation = locomo_df["conversation"].iloc[conv_idx] max_session_count = 35 start_time = time.time() @@ -149,11 +149,15 @@ def process_user(conv_idx, frame, locomo_df, version): print(f"Processing {valid_sessions} sessions for user {conv_idx}") - for session, metadata in sessions_to_process: - session_time = ingest_session(client, session, frame, version, metadata) - total_session_time += session_time - print(f"User {conv_idx}, {metadata['session_key']} processed in {session_time} seconds") - + for session_idx, (session, metadata) in enumerate(sessions_to_process): + if f"{conv_idx}_{session_idx}" not in success_records: + session_time = ingest_session(client, session, frame, version, metadata) + total_session_time += session_time + print(f"User {conv_idx}, {metadata['session_key']} processed in {session_time} seconds") + f.write(f"{conv_idx}_{session_idx}\n") + f.flush() + else: + print(f"Session {conv_idx}_{session_idx} already ingested") end_time = time.time() elapsed_time = round(end_time - start_time, 2) print(f"User {conv_idx} processed successfully in {elapsed_time} 
seconds") @@ -170,9 +174,20 @@ def main(frame, version="default", num_workers=4): print( f"Starting processing for {num_users} users in serial mode, each user using {num_workers} workers for sessions..." ) - with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: + os.makedirs(f"results/locomo/{frame}-{version}/", exist_ok=True) + success_records = [] + record_file = f"results/locomo/{frame}-{version}/success_records.txt" + if os.path.exists(record_file): + with open(record_file) as f: + for i in f.readlines(): + success_records.append(i.strip()) + + with ( + concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor, + open(record_file, "a+") as f, + ): futures = [ - executor.submit(process_user, user_id, frame, locomo_df, version) + executor.submit(process_user, user_id, frame, locomo_df, version, success_records, f) for user_id in range(num_users) ] for future in concurrent.futures.as_completed(futures): @@ -216,7 +231,7 @@ def main(frame, version="default", num_workers=4): help="Version identifier for saving results (e.g., 1010)", ) parser.add_argument( - "--workers", type=int, default=3, help="Number of parallel workers to process users" + "--workers", type=int, default=10, help="Number of parallel workers to process users" ) args = parser.parse_args() lib = args.lib