Skip to content

feat: skill triger bug#1770

Merged
fridayL merged 3 commits into
MemTensor:dev-20260518-v2.0.17from
Wang-Daoji:feat/skill_triger_bug
May 20, 2026
Merged

feat: skill triger bug#1770
fridayL merged 3 commits into
MemTensor:dev-20260518-v2.0.17from
Wang-Daoji:feat/skill_triger_bug

Conversation

@Wang-Daoji
Copy link
Copy Markdown
Collaborator

Description

Please include a summary of the change, the problem it solves, the implementation approach, and relevant context. List any dependencies required for this change.

Related Issue (Required): Fixes #1769

Type of change

Please delete options that are not relevant.

  • Bug fix (non-breaking change which fixes an issue)

How Has This Been Tested?

import time
import json
from memos.api.routers.server_router import add_memories, search_memories
from memos.api.product_models import APIADDRequest, APISearchRequest


messages = TOOL_CALL_SKILL_EXAMPLES_FIVE_ROUNDS

if __name__ == "__main__":
    user_id = f"test_user_{time.time()}"
    # user_id = "test_user_456765aax43156789gg921336533nn08khb21122"
    # user_id = 'test_user_1770630514.518709'
    readable_cube_ids = [user_id]
    add_req = APIADDRequest(
        user_id=user_id,
        writable_cube_ids=readable_cube_ids,
        messages=[
    {
        "role": "system",
        "content": "你是一个生产故障排查助手。需要使用工具获取日志、指标、链路、配置和历史工单后,再输出结构化排查结论。",
    },
    {
        "role": "user",
        "content": "推荐服务 recommendation-api 今天 10 点左右延迟突然升高,帮我排查根因,并总结以后同类问题应该怎么查。",
    },
    {
        "role": "assistant",
        "tool_calls": [
            {
                "id": "call_metrics_001",
                "type": "function",
                "function": {
                    "name": "query_metrics",
                    "arguments": '{"service":"recommendation-api","time_window":"2026-04-18 09:40-10:20 CST","metrics":["p95_latency","p99_latency","error_rate","request_count"]}',
                },
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_metrics_001",
        "content": '{"service":"recommendation-api","time_window":"2026-04-18 09:40-10:20 CST","p95_latency":"320ms -> 2.8s","p99_latency":"610ms -> 6.4s","error_rate":"0.2% -> 8.7%","request_count":"stable at 4200 rpm","first_anomaly_at":"2026-04-18 10:03 CST"}',
    },
    {
        "role": "assistant",
        "tool_calls": [
            {
                "id": "call_logs_002",
                "type": "function",
                "function": {
                    "name": "search_logs",
                    "arguments": '{"service":"recommendation-api","time_window":"2026-04-18 10:00-10:12 CST","levels":["ERROR","WARN"],"keywords":["timeout","redis","feature_flag"]}',
                },
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_logs_002",
        "content": '{"matches":[{"time":"2026-04-18 10:03:11 CST","level":"WARN","message":"RedisTimeout: feature_cache get timeout after 800ms"},{"time":"2026-04-18 10:03:13 CST","level":"ERROR","message":"fallback model scoring failed because feature vector is incomplete"},{"time":"2026-04-18 10:04:01 CST","level":"WARN","message":"circuit breaker open for feature_cache"}],"summary":"错误集中在 feature_cache 读取超时和降级链路失败。"}',
    },
    {
        "role": "assistant",
        "tool_calls": [
            {
                "id": "call_trace_003",
                "type": "function",
                "function": {
                    "name": "query_traces",
                    "arguments": '{"service":"recommendation-api","time_window":"2026-04-18 10:00-10:12 CST","sample_type":"slow_and_error","limit":20}',
                },
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_trace_003",
        "content": '{"slow_spans":[{"span":"GET /recommend","p95":"2.9s","child_span":"feature-service.batchGet","child_p95":"2.4s"},{"span":"feature-service.batchGet","dependency":"redis://feature-cache","p95":"2.1s"}],"error_traces":"18/20 traces fail after feature_cache timeout, then fallback scoring receives empty feature vector."}',
    },
    {
        "role": "assistant",
        "tool_calls": [
            {
                "id": "call_deploy_004",
                "type": "function",
                "function": {
                    "name": "get_deploy_diff",
                    "arguments": '{"service":"recommendation-api","around_time":"2026-04-18 10:03 CST","include":["deployments","config_changes","feature_flags"]}',
                },
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_deploy_004",
        "content": '{"deployments":[],"config_changes":[{"time":"2026-04-18 09:58 CST","owner":"search-platform","key":"feature_cache.timeout_ms","old_value":"300","new_value":"800"},{"time":"2026-04-18 09:59 CST","owner":"ml-platform","key":"fallback_scoring.required_features","old_value":"allow_missing","new_value":"strict"}],"feature_flags":[{"time":"2026-04-18 10:00 CST","flag":"new_feature_cache_key_v2","status":"enabled","scope":"100% traffic"}]}',
    },
    # {
    #     "role": "assistant",
    #     "tool_calls": [
    #         {
    #             "id": "call_incidents_005",
    #             "type": "function",
    #             "function": {
    #                 "name": "search_incident_history",
    #                 "arguments": '{"keywords":["feature_cache","new_feature_cache_key_v2","fallback_scoring","redis timeout"],"lookback_days":90}',
    #             },
    #         }
    #     ],
    # },
    # {
    #     "role": "tool",
    #     "tool_call_id": "call_incidents_005",
    #     "content": '{"related_incidents":[{"date":"2026-03-22","service":"ranking-api","cause":"new cache key format caused 35% miss rate; fallback tolerated missing features so impact was hidden","fix":"rolled back key format flag, warmed cache, added miss-rate alert"},{"date":"2026-02-09","service":"recommendation-api","cause":"fallback scoring strict mode exposed incomplete feature vector","fix":"changed fallback policy to allow missing non-critical features"}],"suggested_checks":["compare cache hit rate by key version","rollback new_feature_cache_key_v2","temporarily set fallback_scoring.required_features=allow_missing"]}',
    # },
    {
        "role": "assistant",
        "content": "# recommendation-api 延迟升高排查结论\n\n## 现象\n- 2026-04-18 10:03 CST 开始,p95 从 320ms 升至 2.8s,p99 升至 6.4s,错误率升至 8.7%。\n- 请求量稳定,说明不是流量突增导致。\n\n## 根因判断\n1. 直接瓶颈在 `feature_cache`:日志和链路都显示 `feature-service.batchGet` 访问 Redis feature cache 超时。\n2. 触发因素是 10:00 全量打开 `new_feature_cache_key_v2`,很可能造成缓存 key 变更后的命中率下降。\n3. 影响被放大是因为 09:59 同时把 `fallback_scoring.required_features` 从 `allow_missing` 改成 `strict`,导致缓存超时后降级模型无法接受缺失特征,错误率随延迟一起上升。\n\n## 建议动作\n1. 立即回滚 `new_feature_cache_key_v2` 到关闭状态,观察 p95、错误率、feature cache 命中率是否恢复。\n2. 临时把 `fallback_scoring.required_features` 改回 `allow_missing`,先恢复服务可用性。\n3. 回滚后补充验证:按 key 版本对比 cache hit rate、Redis timeout、fallback scoring error 三个指标。\n4. 复盘时补一条发布护栏:缓存 key 变更必须先灰度并预热缓存,且不能和降级策略收紧同时发布。\n\n## 可复用排障流程\n以后排查类似线上问题,先锁定异常时间窗和用户影响,再用指标判断是流量、依赖还是错误率问题;随后用日志确认错误类型,用 trace 定位慢 span 和依赖,再查部署、配置、feature flag 的时间线,最后用历史工单验证是否有相同模式。输出结论时要分清直接瓶颈、触发因素、放大因素,并给出回滚、验证和长期防护动作。",
    },
    {
        "role": "user",
        "content": "以后这类生产故障分析,请固定输出“现象、根因判断、立即止血、验证指标、长期防护”五个模块,并且要明确区分触发因素和放大因素。",
    },
    {
        "role": "assistant",
        "content": "收到。以后处理生产故障排查类任务,我会先通过指标、日志、链路、变更记录和历史工单形成证据链,再固定按“现象、根因判断、立即止血、验证指标、长期防护”五个模块输出;在根因判断里会明确拆分直接瓶颈、触发因素和放大因素,避免把相关现象混成单一原因。",
    },
]
,
        async_mode="sync",
        mode="fine",
        info={"sbbbbb": "asdsdadsad", "bhjjjjjjjjjjj": "uuueueueueueueue"},
        session_id="1234567890",
        manager_user_id="super_manager_xxx",
        project_id="super_project_xxx",
    )

    add_rsp = add_memories(add_req)
    print("add_memories rsp: \n\n", add_rsp)

    time.sleep(5)

    search_req = APISearchRequest(
        user_id=user_id,
        readable_cube_ids=readable_cube_ids,
        query="线上服务延迟升高和错误率上升时如何排查根因",
        include_preference=True,
        search_tool_memory=True,
        include_skill_memory=True,
    )
    start = time.time()
    search_rsp = search_memories(search_req).data["skill_mem"]
    end = time.time()
    print(f"search_memories cost time: {end - start}")

    print("\n\nsearch_rsp: \n\n", json.dumps(search_rsp, indent=2, ensure_ascii=False))

Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration

  • Unit Test
  • Test Script Or Test Steps (please provide)
  • Pipeline Automated API Test (please provide)

Checklist

  • I have performed a self-review of my own code | 我已自行检查了自己的代码
  • I have commented my code in hard-to-understand areas | 我已在难以理解的地方对代码进行了注释
  • I have added tests that prove my fix is effective or that my feature works | 我已添加测试以证明我的修复有效或功能正常
  • I have created related documentation issue/PR in MemOS-Docs (if applicable) | 我已在 MemOS-Docs 中创建了相关的文档 issue/PR(如果适用)
  • I have linked the issue to this PR (if applicable) | 我已将 issue 链接到此 PR(如果适用)
  • I have mentioned the person who will review this PR | 我已提及将审查此 PR 的人

Reviewer Checklist

  • closes #xxxx (Replace xxxx with the GitHub issue number)
  • Made sure Checks passed
  • Tests have been provided

@fridayL fridayL merged commit 16eeaa4 into MemTensor:dev-20260518-v2.0.17 May 20, 2026
16 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants