### Load question

In [1]:
from datetime import datetime, timedelta
import re

def transform_time_intervals(intervals):
    def parse_relative_time(text):
        match = re.match(r"A (few|couple of) (hours|days|weeks|months|years) after", text)
        if match:
            qty = 3 if match.group(1) == "few" else 2
            unit = match.group(2)
            delta = {
                "hours": timedelta(hours=qty),
                "days": timedelta(days=qty),
                "weeks": timedelta(weeks=qty),
                "months": timedelta(days=qty * 30),  # Approximate month as 30 days
                "years": timedelta(days=qty * 365)  # Approximate year as 365 days
            }
            return delta[unit]
        return None

    def generate_timeline(events):
        now = datetime.now() - timedelta(minutes=10)  # 固定當前時間
        timeline = [now]
        for event in reversed(events[1:]):
            delta = parse_relative_time(event)
            if delta:
                now -= delta
            timeline.append(now)

        return list(reversed(timeline))
    
    result = generate_timeline(intervals)
    if len([dt.strftime("%Y-%m-%d %H:%M") for dt in result]) > 5:
        print(f'error:{intervals}')
    return [dt.strftime("%Y-%m-%d %H:%M") for dt in result]


In [2]:
import pandas as pd
question_df = pd.read_json('questions_0205.json', lines=True)
question_df['time_changed'] = question_df['time_interval'].apply(transform_time_intervals)
question_df = question_df.drop(['check', 'evidence', 'explain', 'answer', 'question_type', 'generate_dialogue', 'relationship', 'time_interval'], axis=1)

### Evaluation 

In [None]:
from long_memory.component import WeaviateLongMemory
long_mem = WeaviateLongMemory(user='GPT_4o_mini')

Detect existed GPT_4o_mini user short memory space, loading...
Detect existed GPT_4o_mini user group memory space, loading...
Detect existed GPT_4o_mini user child memory space, loading...


In [4]:
index = ['long_mem_result', 'long_mem_recall_result']
for i in index:
    question_df[i]=None

In [None]:
import math
from datetime import datetime, timedelta

for row in [1]:
    print(f'---process {row+1}---')
    
    question = f"user:{question_df['question'][row]}"
    
    # long mem process
    success = False
    while not success:
        try:
            long_mem.del_memory()
            for number, col in enumerate(['first_session_dialogue', 'second_session_dialogue', 'third_session_dialogue', 'fourth_session_dialogue', 'fifth_session_dialogue']):
                chatlogs = []
                current_time = datetime.strptime(question_df['time_changed'][row][number], "%Y-%m-%d %H:%M")
                for i in range(math.ceil(len(question_df[col][row])/2)):
                    try:
                        chatlogs.append({
                            "text":f"user:{question_df[col][row][i*2]}, assistant:{question_df[col][row][i*2+1]}",
                            "time":current_time.strftime("%Y-%m-%dT%H:%M:%SZ")
                        })
                    except:
                        chatlogs.append({
                            "text":f"user:{question_df[col][row][i*2]}",
                            "time":current_time.strftime("%Y-%m-%dT%H:%M:%SZ")
                        })
                    current_time = current_time + timedelta(minutes=1)
                long_mem.add_chat_logs(chatlogs)
            searched_memory = long_mem.get_memory(question, recall=False, retrieve_number=5)
            question_df.loc[row, 'long_mem_result'] = str(searched_memory)
            # (recall)
            searched_memory = long_mem.get_memory(question, recall=True, retrieve_number=5)
            question_df.loc[row, 'long_mem_recall_result'] = str(searched_memory)
            success = True
        except Exception as e:
            print(f"---long memory error:{e}, retry...---")
    
    """if (row+1)%50==0:
        question_df.to_json("eval_result_gpt-4o-mini.json", orient="records", lines=True)"""

---process 2---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m


In [None]:
question_df.to_json("eval_result_gpt-4o-mini.json", orient="records", lines=True)

Record result from Baseline

In [5]:
import pandas as pd
question_df = pd.read_json('questions_0205.json', lines=True)
question_df['time_changed'] = question_df['time_interval'].apply(transform_time_intervals)
question_df = question_df.drop(['check', 'evidence', 'explain', 'answer', 'question_type', 'generate_dialogue', 'relationship', 'time_interval'], axis=1)

In [6]:
question_df['short_memory'] = None # STM
question_df['base_dialog'] = None # each sentence
question_df['base_paragraph'] = None # a paragraph with 5 sentence

In [8]:
from short_memory.component import WeaviateShortMemory
short_mem = WeaviateShortMemory()

Detect existed deafult user short memory space, loading...


In [10]:
BASE_DIALOG_SCHEMA = {
    "class": "base_dialog",
    "description": "chat log",
    "vectorIndexType": "hnsw",
    "properties": [
        {
            "name": "text",
            "dataType": ["text"],
        },
    ],
    "moduleConfig": {
        "text2vec-openai": {
            "model": "text-embedding-3-small"
        }
    }
}

BASE_PARAGRAPH_SCHEMA = {
    "class": "base_paragraph",
    "description": "chat log",
    "vectorIndexType": "hnsw",
    "properties": [
        {
            "name": "text",
            "dataType": ["text"],
        },
    ],
    "moduleConfig": {
        "text2vec-openai": {
            "model": "text-embedding-3-small"
        }
    }
}

In [None]:
import weaviate
from weaviate.classes.query import MetadataQuery
import math
from datetime import datetime, timedelta

weaviate_client = weaviate.connect_to_local('127.0.0.1', 8080)

for row in range(1):
    print(f'---process {row+1}---')
    
    question = f"user:{question_df['question'][row]}"
    
    """# STM
    success = False
    while not success:
        try:
            short_mem.del_memory()
            for number, col in enumerate(['first_session_dialogue', 'second_session_dialogue', 'third_session_dialogue', 'fourth_session_dialogue', 'fifth_session_dialogue']):
                chatlogs = []
                current_time = datetime.strptime(question_df['time_changed'][row][number], "%Y-%m-%d %H:%M")
                for i in range(math.ceil(len(question_df[col][row])/2)):
                    try:
                        chatlogs.append({
                            "user":question_df[col][row][i*2],
                            "assistant":question_df[col][row][i*2+1],
                            "time":current_time.strftime("%Y-%m-%dT%H:%M:%SZ")
                        })
                    except:
                        chatlogs.append({
                            "user":question_df[col][row][i*2],
                            "assistant":"",
                            "time":current_time.strftime("%Y-%m-%dT%H:%M:%SZ")
                        })
                    current_time = current_time + timedelta(minutes=1)
                short_mem.add_chatlogs(chatlogs)
            searched_memory = short_mem.get_memory(question, k=5)
            question_df.loc[row, 'short_mem_result'] = str(searched_memory)
            success = True
        except Exception as e:
            print(f"---short memory error:{e}, retry...---")"""
    
    # base dialog
    success = False
    while not success:
        try:
            weaviate_client.collections.delete('base_dialog')
            weaviate_client.collections.create_from_dict(BASE_DIALOG_SCHEMA)
            base_dialog = weaviate_client.collections.get('base_dialog')
            for number, col in enumerate(['first_session_dialogue', 'second_session_dialogue', 'third_session_dialogue', 'fourth_session_dialogue', 'fifth_session_dialogue']):
                chatlogs = []
                current_time = datetime.strptime(question_df['time_changed'][row][number], "%Y-%m-%d %H:%M")
                for i in range(math.ceil(len(question_df[col][row])/2)):
                    data = {
                        "text":f"user:{question_df[col][row][i*2]}, assistant:{question_df[col][row][i*2+1]}",
                    }
                    base_dialog.data.insert(
                        properties=data,
                    )
            """response = base_dialog.query.near_text(
                query=question,
                limit=5,
                return_properties=["text"],
                return_metadata=MetadataQuery(distance=True)
            )
            retrieve_memory = []
            for data in response.objects:
                retrieve_memory.append({"text":data.properties['text']})
            res = {
                "retrieve_memory":retrieve_memory
            }
            question_df.loc[row, 'base_dialog'] = str(res)"""
            success = True
        except Exception as e:
            print(f"---base dialog error:{e}, retry...---")
            
    # base paragraph
    success = False
    while not success:
        try:
            weaviate_client.collections.delete('base_paragraph')
            weaviate_client.collections.create_from_dict(BASE_PARAGRAPH_SCHEMA)
            base_paragraph = weaviate_client.collections.get('base_paragraph')
            conv_list = []
            for number, col in enumerate(['first_session_dialogue', 'second_session_dialogue', 'third_session_dialogue', 'fourth_session_dialogue', 'fifth_session_dialogue']):
                chatlogs = []
                current_time = datetime.strptime(question_df['time_changed'][row][number], "%Y-%m-%d %H:%M")
                for i in range(math.ceil(len(question_df[col][row])/2)):
                    conv_list.append(f"user:{question_df[col][row][i*2]}, assistant:{question_df[col][row][i*2+1]}")
            jump = 5
            for i in range(0, len(conv_list), jump):
                base_paragraph.data.insert(
                    properties={
                            "text":str(conv_list[i:i+jump])
                        },
                )
            response = base_paragraph.query.near_text(
                query=question,
                limit=1,
                return_properties=["text"],
                return_metadata=MetadataQuery(distance=True)
            )
            retrieve_memory = []
            for data in response.objects:
                retrieve_memory.append({"text":data.properties['text']})
            res = {
                "retrieve_memory":retrieve_memory
            }
            question_df.loc[row, 'base_paragraph'] = str(res)
            success = True
        except Exception as e:
            print(f"---base paragraph error:{e}, retry...---")
    
    if (row+1)%50==0:
        question_df.to_json("eval_result_base.json", orient="records", lines=True)

---process 1---
Detect empty short memory, create memory space...
---base dialog error:list index out of range, retry...---


KeyboardInterrupt: 

In [None]:
question_df.to_json("eval_result_base.json", orient="records", lines=True)

Evaluation

In [1]:
import pandas as pd
question_df = pd.read_json('eval_result_qwen.json', lines=True)

In [9]:
from dotenv import load_dotenv
from openai import OpenAI
import os

load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def llm_create(prompt):
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
    )
    return completion.choices[0].message.content

In [6]:
import requests
import json
import re

def llm_response_handler(response:str):
    """handle llm response format, especially for llama family"""
    try:
        return json.loads(response)
    except:
        response = response.strip()
        try:
            return json.loads(re.search(r"```json(.*?)```", response, re.DOTALL).group(1).strip())
        except:
            try:
                return json.loads(re.search(r"```(.*?)```", response, re.DOTALL).group(1).strip())
            except:
                return response

def llama_create(prompt):
    url = "http://localhost:11434/api/generate"
    data = {
        "model": "qwen2.5:32b",
        "prompt": prompt,
        "stream":False
    }

    response = requests.post(url, json=data)
    return response.json()['response']

In [4]:
question_df['short_score'] = None
question_df['long_score'] = None
question_df['long_recall_score'] = None
question_df['base_dialog_score'] = None
question_df['base_paragraph_score'] = None

In [7]:
evaluation_prompt = """You are a reviewer, and you are reviewing a memory retrieval system,
and you are given a transcript of the conversation, and questions that arise and answer from the transcript.

Next, you will get the memory found by five system based on this question.
You have to score the memory searched by these system.

Origin conversation:{origin_conversation}

Question:{question}

Short memory:{short_memory}

Long memory:{long_memory}

Long recall memory:{long_recall_memory}

Base dialog:{base_dialog}

Base_paragraph:{base_paragraph}

The output format is as follows, rate each memory from 1 to 10, 1 being the worst and 10 being the best.
```json
{{
    "short_memory":"1~10",
    "long_memory":"1~10",
    "long_recall_memory":"1~10",
    "base_dialog":"1~10",
    "base_paragraph":"1~10"
}}
```"""

In [None]:
def get_res(origin_conversation, question, short_searched, long_searched, long_recall_searched, base_dialog_searched, base_paragraph_searched):
    res = llm_create(evaluation_prompt.format(origin_conversation=origin_conversation, question=question, short_memory=short_searched, long_memory=long_searched, long_recall_memory=long_recall_searched, base_dialog=base_dialog_searched, base_paragraph=base_paragraph_searched))
    res_dict = llm_response_handler(res)
    return res_dict.get('short_memory'), res_dict.get('long_memory'), res_dict.get('long_recall_memory'), res_dict.get('base_dialog'), res_dict.get('base_paragraph')

for row in range(500):
    print(f'---process {row+1}---')
    
    origin_conversation = question_df['first_session_dialogue'][row]
    qa_pair = question_df['question'][row]
    
    short_searched = question_df.loc[row, 'short_mem_result']
    long_searched = question_df.loc[row, 'long_mem_result']
    long_recall_searched = question_df.loc[row, 'long_mem_recall_result']
    base_dialog_searched = question_df.loc[row, 'base_dialog']
    base_paragraph_searched = question_df.loc[row, 'base_paragraph']
    
    short_score, long_score, long_recall_score, base_dialog_score, base_paragraph_score = get_res(origin_conversation, qa_pair, short_searched, long_searched, long_recall_searched, base_dialog_searched, base_paragraph_searched)
    question_df.loc[row, 'short_score'] = short_score
    question_df.loc[row, 'long_score'] = long_score
    question_df.loc[row, 'long_recall_score'] = long_recall_score
    question_df.loc[row, 'base_dialog_score'] = base_dialog_score
    question_df.loc[row, 'base_paragraph_score'] = base_paragraph_score
    if row%50==0:
        question_df.to_json("eval_result_qwen.json", orient="records", lines=True)

In [None]:
number=500
observe_df = question_df.drop(['first_session_dialogue', 'second_session_dialogue', 'third_session_dialogue', 'fourth_session_dialogue', 'fifth_session_dialogue'], axis=1)[:number]
print(observe_df['short_score'].astype(int).sum()/number)
print(observe_df['long_score'].astype(int).sum()/number)
print(observe_df['long_recall_score'].astype(int).sum()/number)
print(observe_df['base_dialog_score'].astype(int).sum()/number)
print(observe_df['base_paragraph_score'].astype(int).sum()/number)

In [28]:
question_df.to_json("eval_result_gpt_4o.json", orient="records", lines=True)

Ablation Study
- short memory

In [50]:
import pandas as pd
question_df = pd.read_json('question_ready2.json', lines=True)

In [51]:
question_df['short_similarity'] = None
question_df['short_bm25'] = None
question_df['short_keyword'] = None
question_df['short_hyde'] = None

In [None]:
for row in range(176, 300):
    print(f'---process {row+1}---')
    
    question = f"user:{eval(question_df['question'][row])['user']}"
    
    # short mem process
    success = False
    while not success:
        try:
            long_mem.short_memory.del_memory()
            for col in ['first_session_dialogue', 'second_session_dialogue', 'third_session_dialogue', 'fourth_session_dialogue', 'fifth_session_dialogue']:
                conv = eval(question_df[col][row])
                long_mem.short_memory.add_chatlogs(conv)
            searched_similarity = long_mem.short_memory.get_memory(question,method="similarity", k=5)
            searched_bm25 = long_mem.short_memory.get_memory(question,method="BM25", k=5)
            searched_keyword = long_mem.short_memory.get_memory(question,method="keyword", k=5)
            searched_hyde = long_mem.short_memory.get_memory(question, method="HyDE", k=5)
            question_df.loc[row, 'short_similarity'] = str(searched_similarity)
            question_df.loc[row, 'short_bm25'] = str(searched_bm25)
            question_df.loc[row, 'short_keyword'] = str(searched_keyword)
            question_df.loc[row, 'short_hyde'] = str(searched_hyde)
            success = True
        except Exception as e:
            print(f"---short memory error:{e}, retry...---")
    
    if (row+1)%50==0:
        question_df.to_json("ablation_short.json", orient="records", lines=True)

---process 177---
Detect empty short memory, create memory space...
Search by keywords:['dog food', "Kibble n' Bits", 'benefits']
---process 178---
Detect empty short memory, create memory space...
Search by keywords:['visual learning', 'quantum mechanics', 'suggestion']
---process 179---
Detect empty short memory, create memory space...
Search by keywords:['weekend', 'spending', 'memory']
---process 180---
Detect empty short memory, create memory space...
Search by keywords:['dog food', 'brand', 'reminder']
---process 181---
Detect empty short memory, create memory space...
Search by keywords:['reminder', 'apartment', 'help']
---process 182---
Detect empty short memory, create memory space...
Search by keywords:['token of appreciation', 'memory', 'recognition']
---process 183---
Detect empty short memory, create memory space...
Search by keywords:['sunscreen', 'SPF', 'recommendation']
---process 184---
Detect empty short memory, create memory space...
Search by keywords:['dishes', 'co

In [6]:
question_df.to_json("ablation_short.json", orient="records", lines=True)

In [4]:
import pandas as pd
question_df = pd.read_json("ablation_short.json", lines=True)

In [7]:
evaluation_prompt = """You are a reviewer, and you are reviewing a memory retrieval system,
and you are given a transcript of the conversation, and questions that arise and answer from the transcript.

Next, you will get the memory found by five system based on this question.
You have to rate these memory systems seek on a scale of 1 to 10.

Origin conversation:{origin_conversation}

Question:{question}

A memory:{similarity_memory}

B memory:{bm25_memory}

C memory:{keyword_memory}

D memory:{hyde_memory}

The output format is as follows
```json
{{
    "A_memory":"1~10",
    "B_memory":"1~10",
    "C_memory":"1~10",
    "D_memory":"1~10"
}}
```"""

In [None]:
question_df['similarity_score']=None
question_df['bm25_score']=None
question_df['keyword_score']=None
question_df['hyde_score']=None

In [8]:
import json
import re


def get_res(origin_conversation, question, similarity_memory, bm25_memory, keyword_memory, hyde_memory):
    res = llm_create(evaluation_prompt.format(origin_conversation=origin_conversation, question=question, similarity_memory=similarity_memory, bm25_memory=bm25_memory, keyword_memory=keyword_memory, hyde_memory=hyde_memory))
    res_dict = json.loads(re.search(r"```json(.*?)```", res, re.DOTALL).group(1).strip())
    return res_dict.get('A_memory'), res_dict.get('B_memory'), res_dict.get('C_memory'), res_dict.get('D_memory')

for row in range(100, 300):
    print(f'---process {row+1}---')
    
    origin_conversation = question_df['second_session_dialogue'][row]
    qa_pair = question_df['question'][row]
    
    similarity_memory = question_df.loc[row, 'short_similarity']
    bm25_memory = question_df.loc[row, 'short_bm25']
    keyword_memory = question_df.loc[row, 'short_keyword']
    hyde_memory = question_df.loc[row, 'short_hyde']
    
    # var 修正
    short_score, bm25_score, keyword_score, hyde_score = get_res(origin_conversation, qa_pair, similarity_memory, bm25_memory, keyword_memory, hyde_memory)
    question_df.loc[row, 'similarity_score'] = short_score
    question_df.loc[row, 'bm25_score'] = bm25_score
    question_df.loc[row, 'keyword_score'] = keyword_score
    question_df.loc[row, 'hyde_score'] = hyde_score

---process 101---


  question_df.loc[row, 'similarity_score'] = short_score
  question_df.loc[row, 'bm25_score'] = bm25_score
  question_df.loc[row, 'keyword_score'] = keyword_score
  question_df.loc[row, 'hyde_score'] = hyde_score


---process 102---
---process 103---
---process 104---
---process 105---
---process 106---
---process 107---
---process 108---
---process 109---
---process 110---
---process 111---
---process 112---
---process 113---
---process 114---
---process 115---
---process 116---
---process 117---
---process 118---
---process 119---
---process 120---
---process 121---
---process 122---
---process 123---
---process 124---
---process 125---
---process 126---
---process 127---
---process 128---
---process 129---
---process 130---
---process 131---
---process 132---
---process 133---
---process 134---
---process 135---
---process 136---
---process 137---
---process 138---
---process 139---
---process 140---
---process 141---
---process 142---
---process 143---
---process 144---
---process 145---
---process 146---
---process 147---
---process 148---
---process 149---
---process 150---
---process 151---
---process 152---
---process 153---
---process 154---
---process 155---
---process 156---
---process

In [10]:
question_df.to_json("ablation_short.json", orient="records", lines=True)

In [None]:
observe_df = question_df.drop(['first_session_dialogue', 'second_session_dialogue', 'third_session_dialogue', 'fourth_session_dialogue', 'fifth_session_dialogue'], axis=1)[:300]
print(observe_df['similarity_score'].astype(int).sum()/300)
print(observe_df['bm25_score'].astype(int).sum()/300)
print(observe_df['keyword_score'].astype(int).sum()/300)
print(observe_df['hyde_score'].astype(int).sum()/300)

8.666666666666666
7.213333333333333
6.78
8.596666666666666


- long memory

In [None]:
from component import MemoRA

long_mem = MemoRA()

Detect existed deafult user short memory space, loading...
Detect existed deafult user group memory space, loading...
Detect existed deafult user child memory space, loading...


In [2]:
import pandas as pd
question_df = pd.read_json('question_ready2.json', lines=True)

In [3]:
question_df['long_similarity'] = None
question_df['long_bm25'] = None
question_df['long_keyword'] = None
question_df['long_HyDE'] = None

In [None]:
for row in range(100, 300):
    print(f'---process {row+1}---')
    
    question = f"user:{eval(question_df['question'][row])['user']}"
    
    # long mem process
    success = False
    while not success:
        try:
            long_mem.long_memory.del_memory()
            for col in ['first_session_dialogue', 'second_session_dialogue', 'third_session_dialogue', 'fourth_session_dialogue', 'fifth_session_dialogue']:
                conv = eval(question_df[col][row])
                chat_logs = []
                for log in conv:
                    chat_logs.append({
                        "text":f"user:{log['user']}, assistant:{log['assistant']}",
                        "time":log['time']
                    })
                long_mem.long_memory.add_chat_logs(chat_logs)
            searched_similarity = long_mem.long_memory.get_relevant_memory(query=question, k=5, method="similarity")
            searched_bm25 = long_mem.long_memory.get_relevant_memory(query=question, k=5, method="BM25")
            searched_keyword = long_mem.long_memory.get_relevant_memory(query=question, k=5, method="keyword")
            searched_hyde = long_mem.long_memory.get_relevant_memory(query=question, k=5, method="HyDE")
            question_df.loc[row, 'long_similarity'] = str(searched_similarity)
            question_df.loc[row, 'long_bm25'] = str(searched_bm25)
            question_df.loc[row, 'long_keyword'] = str(searched_keyword)
            question_df.loc[row, 'long_HyDE'] = str(searched_hyde)
            success = True
        except Exception as e:
            print(f"---long memory error:{e}, retry...---")
    
    if (row+1)%50==0:
        question_df.to_json("ablation_long.json", orient="records", lines=True)

---process 101---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
Search by keywords:['cute dress', 'waiting in line', 'store']
---process 102---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory done.[0m
Search by keywords:['travel', 'planning', 'advice']
---process 103---
Detect empty group memory, create memory space...
Detect empty child memory, create memory space...
[34mSave chat logs to long memory done.[0m
[34mSave chat logs to long memory do

In [4]:
question_df.to_json("ablation_long.json", orient="records", lines=True)

In [5]:
evaluation_prompt = """You are a reviewer, and you are reviewing a memory retrieval system,
and you are given a transcript of the conversation, and questions that arise and answer from the transcript.

Next, you will get the memory found by five system based on this question.
You have to rate these memory systems seek on a scale of 1 to 10.

Origin conversation:{origin_conversation}

Question:{question}

A memory:{similarity_memory}

B memory:{bm25_memory}

C memory:{keyword_memory}

D memory:{hyde_memory}

The output format is as follows
```json
{{
    "A_memory":"1~10",
    "B_memory":"1~10",
    "C_memory":"1~10",
    "D_memory":"1~10"
}}
```"""

In [15]:
question_df['similarity_score']=None
question_df['bm25_score']=None
question_df['keyword_score']=None
question_df['hyde_score']=None

In [9]:
import json
import re


def get_res(origin_conversation, question, similarity_memory, bm25_memory, keyword_memory, hyde_memory):
    res = llm_create(evaluation_prompt.format(origin_conversation=origin_conversation, question=question, similarity_memory=similarity_memory, bm25_memory=bm25_memory, keyword_memory=keyword_memory, hyde_memory=hyde_memory))
    res_dict = json.loads(re.search(r"```json(.*?)```", res, re.DOTALL).group(1).strip())
    return res_dict.get('A_memory'), res_dict.get('B_memory'), res_dict.get('C_memory'), res_dict.get('D_memory')

for row in range(100, 300):
    print(f'---process {row+1}---')
    
    origin_conversation = question_df['second_session_dialogue'][row]
    qa_pair = question_df['question'][row]
    
    similarity_memory = question_df.loc[row, 'long_similarity']
    bm25_memory = question_df.loc[row, 'long_bm25']
    keyword_memory = question_df.loc[row, 'long_keyword']
    hyde_memory = question_df.loc[row, 'long_HyDE']
    
    similarity_score, bm25_score, keyword_score, hyde_score = get_res(origin_conversation, qa_pair, similarity_memory, bm25_memory, keyword_memory, hyde_memory)
    question_df.loc[row, 'similarity_score'] = similarity_score
    question_df.loc[row, 'bm25_score'] = bm25_score
    question_df.loc[row, 'keyword_score'] = keyword_score
    question_df.loc[row, 'hyde_score'] = hyde_score

---process 101---


  question_df.loc[row, 'similarity_score'] = similarity_score
  question_df.loc[row, 'bm25_score'] = bm25_score
  question_df.loc[row, 'keyword_score'] = keyword_score
  question_df.loc[row, 'hyde_score'] = hyde_score


---process 102---
---process 103---
---process 104---
---process 105---
---process 106---
---process 107---
---process 108---
---process 109---
---process 110---
---process 111---
---process 112---
---process 113---
---process 114---
---process 115---
---process 116---
---process 117---
---process 118---
---process 119---
---process 120---
---process 121---
---process 122---
---process 123---
---process 124---
---process 125---
---process 126---
---process 127---
---process 128---
---process 129---
---process 130---
---process 131---
---process 132---
---process 133---
---process 134---
---process 135---
---process 136---
---process 137---
---process 138---
---process 139---
---process 140---
---process 141---
---process 142---
---process 143---
---process 144---
---process 145---
---process 146---
---process 147---
---process 148---
---process 149---
---process 150---
---process 151---
---process 152---
---process 153---
---process 154---
---process 155---
---process 156---
---process

In [11]:
question_df.to_json("ablation_long.json", orient="records", lines=True)

In [None]:
observe_df = question_df.drop(['first_session_dialogue', 'second_session_dialogue', 'third_session_dialogue', 'fourth_session_dialogue', 'fifth_session_dialogue'], axis=1)[:300]
print(observe_df['similarity_score'].astype(int).sum()/300)
print(observe_df['bm25_score'].astype(int).sum()/300)
print(observe_df['keyword_score'].astype(int).sum()/300)
print(observe_df['hyde_score'].astype(int).sum()/300)

8.9
8.203333333333333
7.136666666666667
8.63


Test of Grouping task

In [4]:
from dotenv import load_dotenv
from openai import OpenAI
import os

load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def llm_create(prompt):
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
    )
    return completion.choices[0].message.content

In [83]:
import requests

def llama_create(prompt):
    url = "http://localhost:11434/api/generate"
    data = {
        "model": "mistral",
        "prompt": prompt,
        "stream":False
    }

    response = requests.post(url, json=data)
    return response.json()['response']

In [54]:
chatlog_classify_prompt = """Watch the following chat logs, you need to write the memory for youself,
Group chat records according to topics and summarize each group with json format.
Each summary can't over {summary_limit} and need as detail like date, where or do what as you can.
Example:
Chat logs:[
    {{'id':1, 'text':'assistant:Hi, how are you today?, user:Good. I walked in the park today'}},
    {{'id':2, 'text':'user:I saw dogs and a parrot, it can speak chinese!, assistant:That's really gread!'}}
]
```json
{{
    "groups": [
        {{
            "summary": "user walked in the park today and saw some dogs and a parrot that can speak chinese",
            "chat_logs": [1, 2]
        }}
    ]
}}
```
Chat logs:{chat_logs}

Only json output, don't say something else.
"""

In [102]:
import pandas as pd
question_df = pd.read_json('question_ready2.json', lines=True)

In [16]:
group_df = pd.read_json('group_task_model.json', lines=True)

In [None]:
group_df = pd.DataFrame()
group_df['gpt-4o-mini'] = None
group_df['gpt-4o-mini_try'] = None
group_df['llama-3_3-70b'] = None
group_df['llama-3_3-70b_try'] = None
group_df['Gemma-2-27b'] = None
group_df['Gemma-2-27b_try'] = None
group_df['chat_logs_number'] = None
group_df['qwen-2.5-32b'] = None
group_df['qwen-2.5-32b_try'] = None
group_df['llama-3_1-8b'] = None
group_df['llama-3_1-8b_try'] = None

In [None]:
import json
import re

def group_task(chat_logs):
    chat_logs_list = [chat_logs[i:i + 15] for i in range(0, len(chat_logs), 15)]
    for _, chat_logs in enumerate(chat_logs_list):
        for i, log in enumerate(chat_logs):
            log['id'] = i
        log_set = set(range(0, len(chat_logs)))
        
        try_count = 0
        success = False
        while try_count<5:
            try_count+=1
            json_res = llama_create(chatlog_classify_prompt.format(summary_limit=50,chat_logs=chat_logs))
            if json_res[0]=='`':
                groups = json.loads(re.search(r"```json(.*?)```", json_res, re.DOTALL).group(1).strip())
            else:
                groups = eval(json_res)
            classify_set = set()
            for group in groups['groups']:
                classify_set.update(group.get('chat_logs'))
            if log_set != classify_set:
                print(f"Chat logs not correct, missing id:{log_set.difference(classify_set)}, unknown id:{classify_set.difference(log_set)}, retry..")
            else:
                success = True
                break
        if success:
            return groups, try_count
        return groups, None

for row in range(300):
    print(f'---process {row+1}---')
    
    question = f"user:{eval(question_df['question'][row])['user']}"
    chat_logs = []
    for col in ['first_session_dialogue', 'second_session_dialogue', 'third_session_dialogue', 'fourth_session_dialogue', 'fifth_session_dialogue']:
        conv = eval(question_df[col][row])
        for log in conv:
            chat_logs.append({
                "text":f"user:{log['user']}, assistant:{log['assistant']}",
                "time":log['time']
            })
    groups, try_count = group_task(chat_logs)
    group_df.loc[row, 'llama-3_2-3b_try'] = try_count
    group_df.loc[row, 'llama-3_2-3b'] = str(groups)
    
    if (row+1)%20==0:
        group_df.to_json("group_task_model.json", orient="records", lines=True)

In [99]:
group_df.head()

Unnamed: 0,gpt-4o-mini,gpt-40-mini_try,llama-3_3-70b,llama-3_3-70b_try,Gemma-2-27b,Gemma-2-27b_try,chat_logs_number,qwen-2.5-32b,qwen-2.5-32b_try,llama-3_1-8b,llama-3_1-8b_try
0,{'groups': [{'summary': 'User gifted a house t...,1.0,{'groups': [{'summary': 'user gifted assistant...,1.0,{'groups': [{'summary': 'user gives assistant ...,1.0,23,{'groups': [{'summary': 'Neighbor discusses ne...,1,{'groups': [{'summary': 'user gave assistant a...,1.0
1,{'groups': [{'summary': 'User bought assistant...,1.0,{'groups': [{'summary': 'user bought assistant...,1.0,{'groups': [{'summary': 'User bought a new hou...,1.0,23,"{'groups': [{'summary': ""On 2024-11-20, user s...",1,{'groups': [{'summary': 'user bought assistant...,2.0
2,{'groups': [{'summary': 'User discussing their...,1.0,{'groups': [{'summary': 'user unsure about fut...,1.0,"{'groups': [{'summary': ""User and mentor discu...",1.0,23,{'groups': [{'summary': 'Mentee discusses care...,1,{'groups': [{'summary': 'Mentee talked about t...,1.0
3,{'groups': [{'summary': 'User had an amazing v...,1.0,{'groups': [{'summary': 'user vacationed in Fl...,,{'groups': [{'summary': 'user describes their ...,,23,{'groups': [{'summary': 'User returned from va...,2,{'groups': [{'summary': 'user went on vacation...,3.0
4,{'groups': [{'summary': 'User cleaned their tr...,1.0,{'groups': [{'summary': 'user cleaned tree hou...,1.0,{'groups': [{'summary': 'User and assistant di...,1.0,23,"{'groups': [{'summary': 'On 2024-11-20, user c...",1,{'groups': [{'summary': 'User cleaned their tr...,2.0


In [100]:
group_df.to_json("group_task_model.json", orient="records", lines=True)