Generate datasets from Conversation Chronicles

In [1]:
import ast
import re
from datetime import datetime, timedelta

def transform_time_intervals(intervals, now=None):
    """Turn a list of relative gap labels into absolute timestamp strings.

    The last entry is placed 10 minutes before the reference time; every
    earlier entry is placed further back by the gap described in the label
    that follows it ("A few hours after", "A couple of years after", ...).
    The first label is never parsed: it only marks the earliest session.

    Args:
        intervals: Sequence of gap labels, one per session.
        now: Optional reference ``datetime``. Defaults to ``datetime.now()``,
            which is what the function always used before this parameter
            existed; pass a fixed value for reproducible output.

    Returns:
        A list of "%Y-%m-%d %H:%M" strings in chronological order, one per
        label (a single timestamp is returned for an empty input).
    """
    reference = datetime.now() if now is None else now

    def parse_relative_time(text):
        """Return the timedelta described by ``text``, or None if unrecognised."""
        match = re.match(r"A (few|couple of) (hours|days|weeks|months|years) after", text)
        if match is None:
            return None
        # "few" is read as 3, "couple of" as 2.
        qty = 3 if match.group(1) == "few" else 2
        unit = match.group(2)
        delta = {
            "hours": timedelta(hours=qty),
            "days": timedelta(days=qty),
            "weeks": timedelta(weeks=qty),
            "months": timedelta(days=qty * 30),  # Approximate month as 30 days
            "years": timedelta(days=qty * 365),  # Approximate year as 365 days
        }
        return delta[unit]

    def generate_timeline(events):
        """Walk the labels backwards from the reference time."""
        # Anchor the most recent session slightly before the reference time.
        moment = reference - timedelta(minutes=10)
        timeline = [moment]
        for event in reversed(events[1:]):
            delta = parse_relative_time(event)
            if delta is not None:
                moment -= delta
            timeline.append(moment)
        return list(reversed(timeline))

    # Format once and reuse the result for both the sanity check and the return.
    formatted = [dt.strftime("%Y-%m-%d %H:%M") for dt in generate_timeline(intervals)]
    if len(formatted) > 5:
        print(f'error:{intervals}')
    return formatted


In [3]:
import pandas as pd
# Read the JSON Lines file and keep only its first 500 rows.
df = pd.read_json('test.jsonl', lines=True).iloc[:500]

In [8]:
# Gap labels whose total number of occurrences across all rows is reported below.
target_strings = [
    'A couple of years after',
    'A few months after',
    'A few weeks after',
    'A few days after',
    'A few hours after',
]

# For each label, count its occurrences in every row's list and add them up.
counts = {}
for label in target_strings:
    per_row_hits = df['time_interval'].apply(lambda labels: labels.count(label))
    counts[label] = per_row_hits.sum()

for target, count in counts.items():
    print(f"{target}: {count}")

A couple of years after: 381
A few months after: 412
A few weeks after: 408
A few days after: 381
A few hours after: 395


In [204]:
import pandas as pd

# test.jsonl holds the Conversation Chronicles dataset.
df = pd.read_json('test.jsonl', lines=True)

# Drop the summary column and keep the first 500 rows *before* computing the
# timestamps, so the transform is not run (and does not print diagnostics)
# for rows that are discarded anyway.
question_df = df.drop(columns=['summary']).iloc[:500].copy()
question_df['time_changed'] = question_df['time_interval'].apply(transform_time_intervals)

# Placeholder columns that the generation cells fill in row by row.
for empty_column in ('generate_dialogue', 'question_type', 'question', 'answer', 'explain', 'evidence'):
    question_df[empty_column] = None

In [114]:
# Write the question table as JSON Lines (one record per line).
question_df.to_json(
    "questions_0205.json",
    orient="records",
    lines=True,
)

Generate 500 samples

In [2]:
import pandas as pd
# Reload the saved table and recompute the timestamps relative to the current time.
question_df = pd.read_json('questions_0205.json', lines=True).assign(
    time_changed=lambda frame: frame['time_interval'].apply(transform_time_intervals)
)

In [7]:
from dotenv import load_dotenv
from openai import OpenAI
import os

# Load settings via python-dotenv, then build the API client from OPENAI_API_KEY.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def llm_create(prompt, model="o3-mini"):
    """Send one user message to the chat completions API and return the reply text.

    Args:
        prompt: Text sent as the only message, with role "user".
        model: Name of the chat model to query. Defaults to "o3-mini", the
            model this helper always used before the parameter existed.

    Returns:
        The ``content`` of the first choice's message.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content

In [4]:
# Session position ("0".."4") -> name of the column holding that session's utterances.
dialog_dict = dict(zip(
    ("0", "1", "2", "3", "4"),
    (
        "first_session_dialogue",
        "second_session_dialogue",
        "third_session_dialogue",
        "fourth_session_dialogue",
        "fifth_session_dialogue",
    ),
))

# Session position ("0".."4") -> name of the column holding that session's speakers.
speakers_dict = dict(zip(
    ("0", "1", "2", "3", "4"),
    (
        "first_session_speakers",
        "second_session_speakers",
        "third_session_speakers",
        "fourth_session_speakers",
        "fifth_session_speakers",
    ),
))

In [9]:
# Row labels of question_df, grouped by the kind of question generated for them:
# rows in mix_groups get question_type 'mix', rows in short_groups get 'short',
# and rows in long_groups get 'long' (see the generation loops further down).
# NOTE(review): the recorded sizes are 165 / 169 / 166; how the rows were
# assigned to each group is not shown in this notebook.
mix_groups = [1, 5, 6, 7, 8, 9, 11, 14, 15, 16, 24, 26, 27, 30, 31, 32, 39, 41, 43, 45, 46, 49, 50, 53, 54, 57, 58, 72, 73, 79, 83, 93, 97, 99, 101, 103, 109, 110, 112, 113, 118, 119, 123, 125, 126, 127, 133, 137, 138, 140, 142, 143, 144, 145, 150, 151, 157, 165, 166, 171, 174, 177, 178, 179, 182, 192, 197, 199, 204, 205, 207, 213, 216, 217, 218, 219, 221, 225, 232, 234, 241, 243, 249, 254, 255, 258, 260, 261, 269, 270, 272, 280, 284, 288, 299, 300, 307, 310, 313, 314, 317, 321, 325, 326, 327, 328, 331, 339, 344, 345, 346, 347, 349, 350, 357, 362, 365, 367, 370, 379, 380, 381, 382, 383, 384, 388, 392, 393, 394, 395, 400, 401, 402, 408, 413, 416, 418, 419, 421, 425, 427, 428, 429, 435, 441, 442, 443, 445, 446, 448, 449, 450, 451, 453, 458, 461, 463, 467, 476, 481, 482, 485, 492, 495, 498]
short_groups = [3, 12, 13, 17, 18, 19, 21, 22, 23, 28, 35, 36, 37, 38, 51, 52, 56, 63, 64, 66, 69, 70, 71, 75, 76, 77, 78, 80, 85, 86, 89, 90, 91, 95, 96, 98, 100, 104, 106, 107, 114, 115, 120, 124, 129, 130, 134, 139, 147, 148, 149, 152, 156, 158, 159, 164, 167, 168, 173, 175, 186, 188, 191, 195, 200, 202, 203, 206, 208, 210, 214, 215, 220, 223, 224, 228, 229, 231, 233, 235, 237, 240, 247, 251, 253, 256, 259, 263, 266, 268, 277, 278, 281, 282, 285, 289, 290, 292, 293, 297, 301, 302, 304, 308, 309, 311, 312, 319, 320, 322, 324, 333, 334, 335, 336, 351, 352, 354, 355, 356, 358, 359, 363, 364, 366, 369, 372, 375, 386, 387, 390, 396, 397, 404, 405, 406, 410, 411, 414, 417, 420, 422, 423, 426, 432, 436, 438, 444, 447, 454, 455, 456, 462, 470, 471, 473, 474, 475, 477, 480, 483, 484, 488, 489, 490, 491, 493, 494, 499]
long_groups = [0, 2, 4, 10, 20, 25, 29, 33, 34, 40, 42, 44, 47, 48, 55, 59, 60, 61, 62, 65, 67, 68, 74, 81, 82, 84, 87, 88, 92, 94, 102, 105, 108, 111, 116, 117, 121, 122, 128, 131, 132, 135, 136, 141, 146, 153, 154, 155, 160, 161, 162, 163, 169, 170, 172, 176, 180, 181, 183, 184, 185, 187, 189, 190, 193, 194, 196, 198, 201, 209, 211, 212, 222, 226, 227, 230, 236, 238, 239, 242, 244, 245, 246, 248, 250, 252, 257, 262, 264, 265, 267, 271, 273, 274, 275, 276, 279, 283, 286, 287, 291, 294, 295, 296, 298, 303, 305, 306, 315, 316, 318, 323, 329, 330, 332, 337, 338, 340, 341, 342, 343, 348, 353, 360, 361, 368, 371, 373, 374, 376, 377, 378, 385, 389, 391, 398, 399, 403, 407, 409, 412, 415, 424, 430, 431, 433, 434, 437, 439, 440, 452, 457, 459, 460, 464, 465, 466, 468, 469, 472, 478, 479, 486, 487, 496, 497]

In [10]:
# Size of each group, in the order mix / short / long.
tuple(map(len, (mix_groups, short_groups, long_groups)))

(165, 169, 166)

mix memory

In [14]:
# Prompt template for "mix" questions, i.e. questions that need two of the five
# session memories to answer. Filled with str.format using current_time and
# memory_1 .. memory_5; the doubled braces keep the JSON skeleton literal.
# The "not appropriate" instruction asks for the *string* "False" so that the
# reply stays valid JSON and matches the "True/False" string schema below.
mix_memory_question_prompt = """You are a strict question designer, and the questions you design are appropriate and uncontroversial.
I want to generate a memory reasoning question like this:

Memory 1
User: I like drinking coffee, especially lattes.
Assistant: Lattes are great! How do you usually drink them?

Memory 2
User: I'm allergic to milk, so I usually drink oat milk or soy milk.
Assistant: Oat milk and soy milk are great alternatives!

Example Question
User: Do you know what I usually add to my coffee?

Explanation
This question requires combining Memory 1 (the user likes lattes) and Memory 2 (the user is allergic to milk and chooses oat milk or soy milk) to correctly answer: "You usually add oat milk or soy milk to your coffee to make a latte." If only one of these memories is known, the question cannot be fully answered.

WARNING:The question must require multiple dialogue memories to determine the answer. 
It should refer to a specific fact with a clear correct answer, not be too vague, because the memories are not limited to these, there are other memories as well. And the answer must be derived solely from the given memories.
This question is asked by the first person in the conversation to the second person.

Next, I will provide five different time memory entries, if any two memories can be combined to form a question, That's exactly what I need.
It is possible that they refer to the same event, so questions involving counting occurrences may be inaccurate.
Time is not needed in the question, if you need to use time, use relative time, avoid using absolute time.
Current time:{current_time}

Memory 1
{memory_1}

Memory 2
{memory_2}

Memory 3
{memory_3}

Memory 4
{memory_4}

Memory 5
{memory_5}

Output as json format, if the memory is not appropriate, don't answer the question, just return "appropriate":"False"
```json
{{
  "appropriate":"True/False",
  "question":"asked by the first person in the conversation to the second person, no need add the person at begin",
  "answer":"The concise answer by the second person in the conversation to the first person, no need add the person at begin",
  "memory":"List of memory number used to generate the question, like [1,2]",
  "explain":"A concise description of which memories need to be combined to answer the question",
  "evidence":"List of the origin dialogue used to answer the question like ['User: I like drinking coffee, especially lattes.']"
}}
```
"""

In [None]:
import json

# Generate one "mix" question per row in mix_groups.
# Each row is attempted at most mix_max_attempts times, so a response that can
# never be parsed (or any other persistent error) cannot spin forever.
not_appropriate = []      # rows the model judged unsuitable for a mix question
mix_failed_rows = []      # rows that produced no usable response in any attempt
mix_max_attempts = 5

for number, row in enumerate(mix_groups):
    print(f'==={number}===')
    for attempt in range(mix_max_attempts):
        try:
            # Render each of the five sessions as "speaker:utterance" lines
            # followed by that session's timestamp.
            memories = []
            for position in range(5):
                speakers = question_df[speakers_dict[str(position)]][row]
                utterances = question_df[dialog_dict[str(position)]][row]
                rendered = [f"{speaker}:{utterance}" for speaker, utterance in zip(speakers, utterances)]
                rendered.append(question_df['time_changed'][row][position])
                memories.append(rendered)

            current_time = datetime.now().strftime("%Y-%m-%d %H:%M")
            res = llm_create(mix_memory_question_prompt.format(
                memory_1=memories[0],
                memory_2=memories[1],
                memory_3=memories[2],
                memory_4=memories[3],
                memory_5=memories[4],
                current_time=current_time,
            ))

            json_block = re.search(r"```json(.*?)```", res, re.DOTALL)
            if json_block is None:
                raise ValueError('response contains no ```json block')
            res_dict = json.loads(json_block.group(1).strip())

            # Accept both the string "True" and a JSON boolean true.
            if str(res_dict.get('appropriate')).strip().lower() == 'true':
                question_df.loc[row, 'generate_dialogue'] = str(res_dict['memory'])
                question_df.loc[row, 'question_type'] = 'mix'
                question_df.loc[row, 'question'] = res_dict['question']
                question_df.loc[row, 'answer'] = res_dict['answer']
                question_df.loc[row, 'explain'] = res_dict['explain']
                question_df.loc[row, 'evidence'] = str(res_dict['evidence'])
            else:
                print(f'not appropriate')
                not_appropriate.append(row)
            break
        except Exception as e:
            print(f'error:{e}')
    else:
        # Every attempt raised: record the row instead of retrying forever.
        print(f'giving up on row {row} after {mix_max_attempts} attempts')
        mix_failed_rows.append(row)
question_df.to_json("questions_0205.json", orient="records", lines=True)

short memory

In [90]:
# Prompt template for questions answerable from a single session memory.
# Filled with str.format using `memory` and `current_time`; both the short and
# the long generation loops use it. The doubled braces keep the JSON skeleton
# literal when the template is formatted.
one_memory_question_prompt = """You are a strict question designer, and the questions you design are appropriate and uncontroversial.
I want to generate a memory question like this:

Memory
Classmates A: Are you into surfing? I'm super into surfing myself
Classmates B: Actually I'm looking to learn. Maybe you could give me a basic lesson some time!
Classmates A: Yeah for sure! We could go to Pacifica, the waves there are pretty light and easy
Classmates B: That sounds awesome
Classmates A: There's even a cool Taco Bell right by the beach, could grab a bite after
Classmates B: What about this Sunday around noon?
Classmates A: Yeah let's do it!

Example Question
"Classmates A":"Remember that one time we went surfing about month age? What was that one place we went to for lunch called?",
"Classmates B":"Taco Bell"

It should refer to a specific fact with a clear correct answer, not be too vague, because the memories are not limited to these, there are other memories as well. And the answer must be derived solely from the given memories.
This question is asked by the first person in the conversation to the second person.
Time is not needed in the question, if you need to use time, use relative time, avoid using absolute time.
Current time:{current_time}

Memory
{memory}

Output as json format
```json
{{
  "question":"asked by the first person in the conversation to the second person, no need add the person at begin",
  "answer":"The concise answer by the second person in the conversation to the first person, no need add the person at begin",
  "evidence":"List of the origin dialogue used to answer the question like ['User: I like drinking coffee, especially lattes.']"
}}
```
"""

In [None]:
from datetime import datetime, timedelta
import random
import json

# Generate one "short" question per row in short_groups, based on a session
# that took place within the last day.
# Each row is attempted at most short_max_attempts times so that a persistent
# failure cannot loop forever.
short_failed_rows = []    # rows that produced no usable response in any attempt
short_max_attempts = 5

for number, row in enumerate(short_groups):
    print(f'==={number}===')
    for attempt in range(short_max_attempts):
        try:
            time_list = question_df['time_changed'][row]
            current_time = datetime.now()
            one_day_ago = current_time - timedelta(days=1)
            indices = [i for i, date_str in enumerate(time_list) if one_day_ago <= datetime.strptime(date_str, '%Y-%m-%d %H:%M')]
            if indices:
                session = random.choice(indices)
            else:
                # No session falls inside the last day (e.g. the timestamps were
                # computed a while ago): fall back to the most recent session
                # rather than letting random.choice fail on an empty list.
                print(f'Latest time:{time_list[-1]}')
                session = len(time_list) - 1

            # Render the chosen session as "speaker:utterance" lines plus its timestamp.
            speakers = question_df[speakers_dict[str(session)]][row]
            utterances = question_df[dialog_dict[str(session)]][row]
            dialog = [f"{speaker}:{utterance}" for speaker, utterance in zip(speakers, utterances)]
            dialog.append(time_list[session])

            res = llm_create(one_memory_question_prompt.format(memory=dialog, current_time=current_time.strftime("%Y-%m-%d %H:%M")))
            json_block = re.search(r"```json(.*?)```", res, re.DOTALL)
            if json_block is None:
                raise ValueError('response contains no ```json block')
            res_dict = json.loads(json_block.group(1).strip())

            # Sessions are stored 1-based in generate_dialogue.
            question_df.loc[row, 'generate_dialogue'] = str([session+1])
            question_df.loc[row, 'question_type'] = 'short'
            question_df.loc[row, 'question'] = res_dict['question']
            question_df.loc[row, 'answer'] = res_dict['answer']
            question_df.loc[row, 'evidence'] = str(res_dict['evidence'])
            break
        except Exception as e:
            print(f'error:{e}')
    else:
        # Every attempt raised: record the row instead of retrying forever.
        print(f'giving up on row {row} after {short_max_attempts} attempts')
        short_failed_rows.append(row)
question_df.to_json("questions_0205.json", orient="records", lines=True)

long memory

In [26]:
from datetime import datetime, timedelta
import random
import json

# Generate one "long" question per row in long_groups, based on a session that
# took place at least 30 days ago.
# Each row is attempted at most long_max_attempts times so that a persistent
# failure cannot loop forever.
long_failed_rows = []     # rows that produced no usable response in any attempt
long_max_attempts = 5

for number, row in enumerate(long_groups):
    print(f'==={number}===')
    for attempt in range(long_max_attempts):
        try:
            time_list = question_df['time_changed'][row]
            current_time = datetime.now()
            month_ago = current_time - timedelta(days=30)
            indices = [i for i, date_str in enumerate(time_list) if month_ago >= datetime.strptime(date_str, '%Y-%m-%d %H:%M')]
            if indices:
                session = random.choice(indices)
            else:
                # Nothing is old enough: fall back to the earliest session.
                print(f'Earliest time:{time_list[0]}')
                session = 0

            # Render the chosen session as "speaker:utterance" lines plus its timestamp.
            speakers = question_df[speakers_dict[str(session)]][row]
            utterances = question_df[dialog_dict[str(session)]][row]
            dialog = [f"{speaker}:{utterance}" for speaker, utterance in zip(speakers, utterances)]
            dialog.append(time_list[session])

            res = llm_create(one_memory_question_prompt.format(memory=dialog, current_time=current_time.strftime("%Y-%m-%d %H:%M")))
            json_block = re.search(r"```json(.*?)```", res, re.DOTALL)
            if json_block is None:
                raise ValueError('response contains no ```json block')
            res_dict = json.loads(json_block.group(1).strip())

            # Sessions are stored 1-based in generate_dialogue.
            question_df.loc[row, 'generate_dialogue'] = str([session+1])
            question_df.loc[row, 'question_type'] = 'long'
            question_df.loc[row, 'question'] = res_dict['question']
            question_df.loc[row, 'answer'] = res_dict['answer']
            question_df.loc[row, 'evidence'] = str(res_dict['evidence'])
            break
        except Exception as e:
            print(f'error:{e}')
    else:
        # Every attempt raised: record the row instead of retrying forever.
        print(f'giving up on row {row} after {long_max_attempts} attempts')
        long_failed_rows.append(row)
question_df.to_json("questions_0205.json", orient="records", lines=True)

===0===


In [27]:
# Write the question table as JSON Lines (one record per line).
question_df.to_json(
    "questions_0205.json",
    orient="records",
    lines=True,
)

Double-check the question sets

In [11]:
# question_df['check'] = None

mix type check

In [11]:
# Prompt template used to verify a "mix" question against the two memories it
# was built from. Filled with str.format using current_time, memory_1,
# memory_2 and question; the model is asked to reply with True or False only.
check_mix_prompt = """You are a strict question designer, and the questions you design are appropriate and uncontroversial.
I will give you two memory entries and a memory reasoning question. You must determine whether the question can only be answered by combining these two memories. 
If a single memory is not enough to answer it and the answer is not too vague, respond with True; otherwise, respond with False. No explanation is needed.
Current time:{current_time}

Memory 1
{memory_1}

Memory 2
{memory_2}

Question and Answer
{question}
"""

In [None]:
# Verify every generated "mix" question against the two memories it cites.
# Rows without a generated question are skipped, and each row is attempted at
# most mix_check_max_attempts times so a persistent failure cannot loop forever.
mix_check_failed_rows = []   # rows whose check could not be completed
mix_check_max_attempts = 5

for number, row in enumerate(mix_groups):
    print(f'==={number}===')

    stored_sessions = question_df.loc[row, 'generate_dialogue']
    if not isinstance(stored_sessions, str):
        # Nothing was generated for this row (e.g. it was judged not appropriate).
        print(f'skip, no question:{row}')
        continue

    try:
        # generate_dialogue holds the repr of a list of 1-based session numbers;
        # literal_eval parses it without executing arbitrary code.
        session = ast.literal_eval(stored_sessions)
        memory_1 = session[0]
        memory_2 = session[1]  # the second cited memory, not the first one again
    except (ValueError, SyntaxError, IndexError, TypeError) as e:
        print(f'error:{e}')
        mix_check_failed_rows.append(row)
        continue

    for attempt in range(mix_check_max_attempts):
        try:
            # Render both cited sessions as "speaker:utterance" lines plus timestamp.
            rendered = []
            for memory in (memory_1, memory_2):
                speakers = question_df[speakers_dict[str(memory-1)]][row]
                utterances = question_df[dialog_dict[str(memory-1)]][row]
                lines = [f"{speaker}:{utterance}" for speaker, utterance in zip(speakers, utterances)]
                lines.append(question_df['time_changed'][row][memory-1])
                rendered.append(lines)
            dialog_1, dialog_2 = rendered

            # The first speaker asks, the second speaker answers.
            asker_and_answerer = question_df[speakers_dict[str(memory_1-1)]][row]
            question = f"{asker_and_answerer[0]}:{question_df.loc[row, 'question']}, {asker_and_answerer[1]}:{question_df.loc[row, 'answer']}"

            current_time = datetime.now().strftime("%Y-%m-%d %H:%M")
            res = llm_create(check_mix_prompt.format(memory_1=dialog_1, memory_2=dialog_2, current_time=current_time, question=question))

            question_df.loc[row, 'check'] = res
            # Tolerate surrounding whitespace in the model's one-word verdict.
            if res.strip()=="False":
                print(f'Not appropriate:{row}')
            break
        except Exception as e:
            print(f'error:{e}')
    else:
        # Every attempt raised: record the row instead of retrying forever.
        print(f'giving up on row {row} after {mix_check_max_attempts} attempts')
        mix_check_failed_rows.append(row)
question_df.to_json("questions_0205.json", orient="records", lines=True)

===0===


short check

In [8]:
# Prompt template used to verify a single-memory question. Filled with
# str.format using current_time, memory and question; both the short and the
# long check loops use it, and the model is asked to reply with True or False only.
check_question_prompt = """You are a strict question designer, and the questions you design are appropriate and uncontroversial.
I will give you a memory and a memory question. You must determine whether the question can only be answered by the memory. 
If answer is not too vague and good question, respond with True; otherwise, respond with False. No explanation is needed.
Current time:{current_time}

Memory
{memory}

Question and Answer
{question}
"""

In [9]:
# Verify every generated "short" question against the session it was built from.
# Rows without a generated question are skipped, and each row is attempted at
# most short_check_max_attempts times so a persistent failure cannot loop forever.
short_check_failed_rows = []   # rows whose check could not be completed
short_check_max_attempts = 5

for number, row in enumerate(short_groups):
    print(f'==={number}===')

    stored_sessions = question_df.loc[row, 'generate_dialogue']
    if not isinstance(stored_sessions, str):
        # Nothing was generated for this row.
        print(f'skip, no question:{row}')
        continue

    try:
        # generate_dialogue holds the repr of a list of 1-based session numbers;
        # literal_eval parses it without executing arbitrary code.
        memory = ast.literal_eval(stored_sessions)[0]
    except (ValueError, SyntaxError, IndexError, TypeError) as e:
        print(f'error:{e}')
        short_check_failed_rows.append(row)
        continue

    for attempt in range(short_check_max_attempts):
        try:
            # Render the session as "speaker:utterance" lines plus its timestamp.
            speakers = question_df[speakers_dict[str(memory-1)]][row]
            utterances = question_df[dialog_dict[str(memory-1)]][row]
            dialog = [f"{speaker}:{utterance}" for speaker, utterance in zip(speakers, utterances)]
            dialog.append(question_df['time_changed'][row][memory-1])

            # The first speaker asks, the second speaker answers.
            question = f"{speakers[0]}:{question_df.loc[row, 'question']}, {speakers[1]}:{question_df.loc[row, 'answer']}"

            current_time = datetime.now().strftime("%Y-%m-%d %H:%M")
            res = llm_create(check_question_prompt.format(memory=dialog, current_time=current_time, question=question))

            question_df.loc[row, 'check'] = res
            # Tolerate surrounding whitespace in the model's one-word verdict.
            if res.strip()=="False":
                print(f'Not appropriate:{row}')
            break
        except Exception as e:
            print(f'error:{e}')
    else:
        # Every attempt raised: record the row instead of retrying forever.
        print(f'giving up on row {row} after {short_check_max_attempts} attempts')
        short_check_failed_rows.append(row)
question_df.to_json("questions_0205.json", orient="records", lines=True)

===0===
===1===
===2===
===3===
===4===
===5===
===6===
===7===
===8===
===9===
===10===
===11===
===12===
===13===
===14===
===15===
===16===
===17===
===18===
===19===
===20===
===21===
===22===
===23===
===24===
===25===
===26===
===27===
===28===
===29===
===30===
===31===
===32===
===33===
===34===
===35===
===36===
===37===
===38===
===39===
===40===
===41===
===42===
===43===
===44===
===45===
===46===
===47===
===48===
===49===
===50===
===51===
===52===
===53===
===54===
===55===
===56===
===57===
===58===
===59===
===60===
===61===
===62===
===63===
===64===
===65===
===66===
===67===
===68===
===69===
===70===
===71===
===72===
===73===
===74===
===75===
===76===
===77===
===78===
===79===
===80===
===81===
===82===
===83===
===84===
===85===
===86===
===87===
===88===
===89===
===90===
===91===
===92===
===93===
===94===
===95===
===96===
===97===
===98===
===99===
===100===
===101===
===102===
===103===
===104===
===105===
===106===
===107===
===108===
===109===
===110===


long check

In [None]:
# Verify every generated "long" question against the session it was built from.
# Rows without a generated question are skipped, and each row is attempted at
# most long_check_max_attempts times so a persistent failure cannot loop forever.
long_check_failed_rows = []   # rows whose check could not be completed
long_check_max_attempts = 5

for number, row in enumerate(long_groups):
    print(f'==={number}===')

    stored_sessions = question_df.loc[row, 'generate_dialogue']
    if not isinstance(stored_sessions, str):
        # Nothing was generated for this row.
        print(f'skip, no question:{row}')
        continue

    try:
        # generate_dialogue holds the repr of a list of 1-based session numbers;
        # literal_eval parses it without executing arbitrary code.
        memory = ast.literal_eval(stored_sessions)[0]
    except (ValueError, SyntaxError, IndexError, TypeError) as e:
        print(f'error:{e}')
        long_check_failed_rows.append(row)
        continue

    for attempt in range(long_check_max_attempts):
        try:
            # Render the session as "speaker:utterance" lines plus its timestamp.
            speakers = question_df[speakers_dict[str(memory-1)]][row]
            utterances = question_df[dialog_dict[str(memory-1)]][row]
            dialog = [f"{speaker}:{utterance}" for speaker, utterance in zip(speakers, utterances)]
            dialog.append(question_df['time_changed'][row][memory-1])

            # The first speaker asks, the second speaker answers.
            question = f"{speakers[0]}:{question_df.loc[row, 'question']}, {speakers[1]}:{question_df.loc[row, 'answer']}"

            current_time = datetime.now().strftime("%Y-%m-%d %H:%M")
            res = llm_create(check_question_prompt.format(memory=dialog, current_time=current_time, question=question))

            question_df.loc[row, 'check'] = res
            # Tolerate surrounding whitespace in the model's one-word verdict.
            if res.strip()=="False":
                print(f'Not appropriate:{row}')
            break
        except Exception as e:
            print(f'error:{e}')
    else:
        # Every attempt raised: record the row instead of retrying forever.
        print(f'giving up on row {row} after {long_check_max_attempts} attempts')
        long_check_failed_rows.append(row)
question_df.to_json("questions_0205.json", orient="records", lines=True)