Generate datasets from Conversation Chronicles

In [2]:
from dotenv import load_dotenv
from openai import OpenAI
import os

load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def llm_create(prompt, model="gpt-4o-mini"):
    """Send a single-turn prompt to the chat completions API and return the reply.

    Args:
        prompt: Text sent as the content of one ``user`` message.
        model: Name of the chat model to query. Defaults to ``"gpt-4o-mini"``,
            the model this notebook originally hard-coded.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
    )
    return completion.choices[0].message.content
    
# Prompt template for producing one memory question/answer pair from a dialogue.
# Filled with str.format(chat_logs=..., conv_time=...); the doubled braces
# ({{ and }}) are escapes so the JSON example survives formatting.
generate_question_prompt = """Your task is to write a question and an answer for a conversation between two characters
Your goal is to write a question from user to assistant that tests assistant's memory
Questions and answers should be generated and known based on this conversation.
The conversation may be old, so the question asked now must consider time and can't be vague

For example,
assistant: Are you into surfing? I'm super into surfing myself
user: Actually I'm looking to learn. Maybe you could give me a basic lesson some time!
assistant: Yeah for sure! We could go to Pacifica, the waves there are pretty light and easy
user: That sounds awesome
assistant: There's even a cool Taco Bell right by the beach, could grab a bite after
user: What about this Sunday around noon?
assistant: Yeah let's do it!

```json
{{
  "user":"Remember that one time we went surfing about a month ago? What was that one place we went to for lunch called?",
  "assistant":"Taco Bell"
}}
```

Conversation:{chat_logs}
Conversation time:{conv_time}
"""

In [3]:
import pandas as pd

def transform_time_intervals(intervals):
    """Rewrite a list of session time labels so they read as looking back in time.

    Args:
        intervals: Iterable of time-label strings.

    Returns:
        A new list with the same length and order as ``intervals``:
        ``'Start'`` becomes ``'Now'``, any label containing ``'after'`` has
        every ``'after'`` replaced with ``'before'``, and any other label is
        kept unchanged.
    """
    result = []
    for interval in intervals:
        if interval == 'Start':
            result.append('Now')
        elif 'after' in interval:
            result.append(interval.replace('after', 'before'))
        else:
            # Keep unrecognised labels instead of dropping them: callers index
            # the result by session position, so the length must not change.
            result.append(interval)
    return result

# test.jsonl is the Conversation Chronicles dataset (one JSON record per line).
df = pd.read_json('test.jsonl', lines=True)

# Columns that are not needed for question generation.
unused_columns = [
    'summary',
    'first_session_speakers',
    'second_session_speakers',
    'third_session_speakers',
    'fourth_session_speakers',
    'fifth_session_speakers',
]

# Work on a derived frame so `df` keeps the data exactly as loaded:
# rewrite the time labels and add two empty columns to be filled in later.
question_df = (
    df.drop(columns=unused_columns)
    .assign(
        time_interval=lambda frame: frame['time_interval'].apply(transform_time_intervals),
        generate_dialogue=None,
        question=None,
    )
)
question_df.head()

Unnamed: 0,dataID,relationship,time_interval,first_session_dialogue,second_session_dialogue,third_session_dialogue,fourth_session_dialogue,fifth_session_dialogue,generate_dialogue,question
0,episode-15761,Neighbors,"[Now, A few months before, A few hours before,...","[Hi there, how are you doing today?, Hi. I'm d...","[That looks like an interesting book, do you m...","[Hey there, did you see me riding my bike in t...",[It's nice to be back in the neighborhood afte...,"[Ugh, my boss keeps calling me on my days off....",,
1,episode-15776,Co-workers,"[Now, A couple of years before, A few days bef...","[Hey, B. Can I talk to you about something per...","[Hey, I have some exciting news to share with ...","[Hey, did I tell you about my bike ride to the...",[I moved in with my grandparents when I was ju...,[I just can't believe my grandson is gone. I f...,,
2,episode-15784,Parent and Child,"[Now, A couple of years before, A few days bef...",[I'm so sick of having to take out the trash e...,[Today's hike was so much fun! I loved explori...,[ I'm just so sick of taking out the trash eve...,"[Today was amazing, I won the contest and I fe...","[When I was ten, I moved in with my grandparen...",,
3,episode-15787,Classmates,"[Now, A few days before, A few days before, A ...","[Hey there, Classmates B! What's up? Why are y...","[Hey, have you ever witnessed a car accident?,...",[I'm almost done with my work for today. Just ...,"[You know, I still get emotional thinking abou...","[Hey, what are you reading there?, Just a book...",,
4,episode-15790,Husband and Wife,"[Now, A few months before, A few months before...","[Hey, how's your day going so far?, It's going...","[Hey., Hey, everything okay?, Yeah, everything...",[I can't stop thinking about the Grand Canyon....,[I don't know where the chips went. I put them...,"[Hey, have you seen the book I was looking at ...",,


In [None]:
# Number of rows in question_df.
len(question_df)

Generate 500 data samples

In [8]:
import json
import re

# For each selected row, label the second-session turns with alternating
# speakers, ask the model for a memory question/answer pair, and store both the
# labelled dialogue and the parsed pair (as str) back into question_df.
# Rows whose reply cannot be parsed are reported and skipped, so one malformed
# response does not abort the whole batch; their 'question' stays unset.
charact = ['user', 'assistant']
for row in range(100, 500):
    # Even turns are attributed to the user, odd turns to the assistant.
    dialog = [
        f"{charact[i % 2]}:{log}"
        for i, log in enumerate(question_df['second_session_dialogue'][row])
    ]
    question_df.loc[row, 'generate_dialogue'] = str(dialog)
    # Element 1 of the time labels is used as the time of this conversation.
    conv_time = question_df['time_interval'][row][1]
    p = generate_question_prompt.format(chat_logs=dialog, conv_time=conv_time)
    res = llm_create(p)
    # The prompt asks for a ```json fenced block; re.search returns None when
    # the reply does not contain one (`or ""` also covers an empty reply).
    match = re.search(r"```json(.*?)```", res or "", re.DOTALL)
    if match is None:
        print(f"row {row}: no fenced JSON block in the model reply, skipped")
        continue
    try:
        res_dict = json.loads(match.group(1).strip())
    except json.JSONDecodeError as err:
        print(f"row {row}: model reply is not valid JSON ({err}), skipped")
        continue
    question_df.loc[row, 'question'] = str(res_dict)

In [10]:
# Preview the first rows of question_df.
question_df.head()

Unnamed: 0,dataID,relationship,time_interval,first_session_dialogue,second_session_dialogue,third_session_dialogue,fourth_session_dialogue,fifth_session_dialogue,generate_dialogue,question
100,episode-16807,Neighbors,"[Now, A few months before, A few weeks before,...","[Hey neighbor, how do you like your new house?...",[Hey there! What are you doing here in the par...,"[Hey, can we chat on the phone sometime soon? ...","[Whew, I feel much safer up here on this chair...","[Hey, have you heard about my friends who were...",['user:Hey there! What are you doing here in t...,"{'user': 'Hey, do you recall that time we volu..."
101,episode-16808,Neighbors,"[Now, A few months before, A few days before, ...","[Hey there, guess what? I bought you a new hou...",[This is a beautiful day. The beach looks so p...,[I'm glad we could go to the market together t...,"[Hey, have you noticed how I've been working o...","[Phew, I'm glad we got out of the way. That wa...",['user:This is a beautiful day. The beach look...,{'user': 'Do you remember when we were talking...
102,episode-16809,Mentee and Mentor,"[Now, A few days before, A couple of years bef...","[Mentor, I've been thinking a lot about my fut...","[I really love playing music with you, Mentor....","[Hi Mentor, how are you doing today?, I'm doin...","[Hey mentor, I'm going to a dinner tonight and...","[Mentor, I'm so happy! I finally mustered up t...","[""user:I really love playing music with you, M...",{'user': 'Do you remember what I said I wanted...
103,episode-16816,Classmates,"[Now, A few days before, A few weeks before, A...","[Hey, B! I just got back from my vacation in F...","[Hey, how are you doing?, I'm doing well, than...",[I was doing a math problem earlier and made a...,[I'm really sorry about what I said earlier. I...,[Today I helped my friends with their math hom...,"['user:Hey, how are you doing?', ""assistant:I'...","{'user': ""It's been a few days since we talked..."
104,episode-16822,Neighbors,"[Now, A few days before, A few months before, ...","[Hi, Neighbor B! Look at my tree house, I just...",[Thank you so much for helping me carry all th...,"[Hey, thank you so much for helping me carry t...","[Ugh, I can't believe I made such a silly mist...",[I'm sorry you had to hear me swear like that....,['user:Thank you so much for helping me carry ...,"{'user': 'Hey, do you remember when I helped y..."


In [59]:
# Row label to inspect.
index = 2
# Show that row's stored question/answer pair (saved as the str() of a dict).
question_df['question'][index]

"{'user': 'Do you remember our hike in the woods a couple of years ago? What was the favorite activity we did during that trip?', 'assistant': 'Climbing the trees and finding all the berries was definitely a highlight!'}"

In [60]:
# Element 1 of the row's time_interval list, i.e. the value the generation
# loop passes to the prompt as conv_time. Uses `index` set in an earlier cell.
question_df['time_interval'][index][1]

'A couple of years before'

In [48]:
# NOTE(review): `dialog` is not defined in this cell; it is whatever an earlier
# loop cell last assigned, so this output depends on execution order.
dialog

['user:Hey, I have some exciting news to share with you.',
 'assistant:Oh, what is it?',
 "user:I got accepted into the police academy! I'm going to become a police officer.",
 "assistant:That's amazing! Congratulations. What made you decide to pursue this career?",
 "user:Well, I've always had a desire to help people and make a difference in my community. Plus, I want a challenging and fulfilling career.",
 'assistant:It sounds like it will be a perfect fit for you. Do you think your past struggles with addiction will affect your career?',
 "user:No, I've been sober for a while now and I'm determined to stay that way. If anything, my past experiences will help me better understand and connect with those who are struggling with similar issues.",
 "assistant:That's a great mindset to have. I have faith that you will make a great police officer.",
 "user:Thanks, I really appreciate your support. It's been a long journey to get to this point, but I'm excited for this new chapter in my lif

In [44]:
import ast

# generate_dialogue stores the str() of a list of "speaker:utterance" strings.
# ast.literal_eval parses that literal back into a list without executing
# arbitrary code, which eval() would do on whatever text is in the column.
for log in ast.literal_eval(question_df['generate_dialogue'][0]):
    print(log)

user:That looks like an interesting book, do you mind if I take a look?
assistant:Sure, go ahead! I just finished that chapter anyways.
user:Thanks, have you read it before?
assistant:No, actually I picked it up at the library on a whim. I didn't really know what to expect.
user:Yeah, I know what you mean. I love discovering new books like that.
assistant:Me too. It's always fun to find a hidden gem.
user:Speaking of gems, remember when I helped you carry your groceries a few months ago?
assistant:Oh yes, I was so grateful for your help. I had so many bags!
user:It was truly my pleasure. I'm always happy to help out my neighbors.
assistant:I really appreciated it. It's so nice to have such kind people living nearby.


In [12]:
# Write question_df to question_sets2.json as JSON Lines (one record per line).
question_df.to_json("question_sets2.json", orient="records", lines=True)

Double check question sets

In [14]:
import pandas as pd
# Reload the saved question sets and add two empty columns that will hold the
# inspector's verdict and, when given, its explanation.
df = pd.read_json('question_sets2.json', lines=True)
question_df = df.assign(double_check=None, reason=None)
question_df.head()

Unnamed: 0,dataID,relationship,time_interval,first_session_dialogue,second_session_dialogue,third_session_dialogue,fourth_session_dialogue,fifth_session_dialogue,generate_dialogue,question,double_check,reason
0,episode-16807,Neighbors,"[Now, A few months before, A few weeks before,...","[Hey neighbor, how do you like your new house?...",[Hey there! What are you doing here in the par...,"[Hey, can we chat on the phone sometime soon? ...","[Whew, I feel much safer up here on this chair...","[Hey, have you heard about my friends who were...",['user:Hey there! What are you doing here in t...,"{'user': 'Hey, do you recall that time we volu...",,
1,episode-16808,Neighbors,"[Now, A few months before, A few days before, ...","[Hey there, guess what? I bought you a new hou...",[This is a beautiful day. The beach looks so p...,[I'm glad we could go to the market together t...,"[Hey, have you noticed how I've been working o...","[Phew, I'm glad we got out of the way. That wa...",['user:This is a beautiful day. The beach look...,{'user': 'Do you remember when we were talking...,,
2,episode-16809,Mentee and Mentor,"[Now, A few days before, A couple of years bef...","[Mentor, I've been thinking a lot about my fut...","[I really love playing music with you, Mentor....","[Hi Mentor, how are you doing today?, I'm doin...","[Hey mentor, I'm going to a dinner tonight and...","[Mentor, I'm so happy! I finally mustered up t...","[""user:I really love playing music with you, M...",{'user': 'Do you remember what I said I wanted...,,
3,episode-16816,Classmates,"[Now, A few days before, A few weeks before, A...","[Hey, B! I just got back from my vacation in F...","[Hey, how are you doing?, I'm doing well, than...",[I was doing a math problem earlier and made a...,[I'm really sorry about what I said earlier. I...,[Today I helped my friends with their math hom...,"['user:Hey, how are you doing?', ""assistant:I'...","{'user': ""It's been a few days since we talked...",,
4,episode-16822,Neighbors,"[Now, A few days before, A few months before, ...","[Hi, Neighbor B! Look at my tree house, I just...",[Thank you so much for helping me carry all th...,"[Hey, thank you so much for helping me carry t...","[Ugh, I can't believe I made such a silly mist...",[I'm sorry you had to hear me swear like that....,['user:Thank you so much for helping me carry ...,"{'user': 'Hey, do you remember when I helped y...",,


In [15]:
# Prompt template asking the model to judge a generated question/answer pair
# against its conversation. Filled with
# str.format(conversation_record=..., question=...); the doubled braces
# ({{ and }}) are escapes so the JSON example survives formatting.
# The reply is expected to carry a "proper" field and a "reason" field.
double_check_prompt = """You are an inspector
There is a conversation record here:{conversation_record}
There is a memory question and answer based on this conversation record.
question:{question}
You must confirm whether the question can only be known from the conversation log, and the answer can only be found from the conversation log.
The output format is as follows, if proper field is true then dont need reason
```json
{{
    "proper":"true",
    "reason":""
}}
```"""

In [16]:
import json
import re

# Ask the model to verify every generated question/answer pair and record its
# verdict ('proper') and explanation ('reason') on the same row.
# Iterating over the index labels keeps the .loc lookups valid even if the
# frame's index is not 0..n-1.
for row in question_df.index:
    conversation_record = str(question_df.loc[row, 'generate_dialogue'])
    question = str(question_df.loc[row, 'question'])
    p = double_check_prompt.format(conversation_record=conversation_record, question=question)
    res = llm_create(p)
    # re.search returns None when the reply has no ```json fenced block
    # (`or ""` also covers an empty reply); report and move on instead of
    # aborting the whole pass.
    match = re.search(r"```json(.*?)```", res or "", re.DOTALL)
    if match is None:
        print(f"row {row}: no fenced JSON block in the model reply, skipped")
        continue
    try:
        res_dict = json.loads(match.group(1).strip())
    except json.JSONDecodeError as err:
        print(f"row {row}: model reply is not valid JSON ({err}), skipped")
        continue
    question_df.loc[row, 'double_check'] = res_dict.get('proper')
    # Always overwrite the reason so a re-run cannot leave an outdated
    # explanation next to a new verdict; an empty reason is stored as None.
    question_df.loc[row, 'reason'] = res_dict.get('reason') or None

In [66]:
# Preview the first rows of question_df, including double_check and reason.
question_df.head()

Unnamed: 0,dataID,relationship,time_interval,first_session_dialogue,second_session_dialogue,third_session_dialogue,fourth_session_dialogue,fifth_session_dialogue,generate_dialogue,question,double_check,reason
0,episode-15761,Neighbors,"[Now, A few months before, A few hours before,...","[Hi there, how are you doing today?, Hi. I'm d...","[That looks like an interesting book, do you m...","[Hey there, did you see me riding my bike in t...",[It's nice to be back in the neighborhood afte...,"[Ugh, my boss keeps calling me on my days off....","['user:That looks like an interesting book, do...",{'user': 'What was it that you needed help wit...,True,
1,episode-15776,Co-workers,"[Now, A couple of years before, A few days bef...","[Hey, B. Can I talk to you about something per...","[Hey, I have some exciting news to share with ...","[Hey, did I tell you about my bike ride to the...",[I moved in with my grandparents when I was ju...,[I just can't believe my grandson is gone. I f...,"['user:Hey, I have some exciting news to share...",{'user': 'Do you remember when I shared my big...,True,
2,episode-15784,Parent and Child,"[Now, A couple of years before, A few days bef...",[I'm so sick of having to take out the trash e...,[Today's hike was so much fun! I loved explori...,[ I'm just so sick of taking out the trash eve...,"[Today was amazing, I won the contest and I fe...","[When I was ten, I moved in with my grandparen...","[""user:Today's hike was so much fun! I loved e...","{'user': ""It's been a while since our last fam...",True,
3,episode-15787,Classmates,"[Now, A few days before, A few days before, A ...","[Hey there, Classmates B! What's up? Why are y...","[Hey, have you ever witnessed a car accident?,...",[I'm almost done with my work for today. Just ...,"[You know, I still get emotional thinking abou...","[Hey, what are you reading there?, Just a book...","['user:Hey, have you ever witnessed a car acci...","{'user': ""A few days ago, you mentioned that I...",True,
4,episode-15790,Husband and Wife,"[Now, A few months before, A few months before...","[Hey, how's your day going so far?, It's going...","[Hey., Hey, everything okay?, Yeah, everything...",[I can't stop thinking about the Grand Canyon....,[I don't know where the chips went. I put them...,"[Hey, have you seen the book I was looking at ...","['user:Hey.', 'assistant:Hey, everything okay?...",{'user': 'Do you remember when we talked about...,True,


In [None]:
# Overwrite question_sets2.json with the current question_df (JSON Lines).
question_df.to_json("question_sets2.json", orient="records", lines=True)

In [17]:
# Rows the inspector rejected. The verdict may be stored as the string 'false'
# or, if the model emitted an unquoted JSON false, as the boolean False;
# comparing the lower-cased text form matches both. Missing verdicts become
# 'none' and are not selected.
false_question = question_df[question_df['double_check'].astype(str).str.lower() == 'false']
false_question

Unnamed: 0,dataID,relationship,time_interval,first_session_dialogue,second_session_dialogue,third_session_dialogue,fourth_session_dialogue,fifth_session_dialogue,generate_dialogue,question,double_check,reason
5,episode-16823,Co-workers,"[Now, A few days before, A few days before, A ...","[Hey, I cleaned the tree house today!, Oh real...",[Thank you so much for helping me carry the gr...,"[I finished writing up your results, Co-worker...",[I've been noticing that your work has not bee...,"[Whew, I'm glad we found a safe spot to hide!,...",['user:Thank you so much for helping me carry ...,"{'user': 'Hey, can you remind me what we were ...",False,The question refers to plans for hosting anoth...
14,episode-16931,Classmates,"[Now, A few hours before, A few hours before, ...",[I can't believe I got caught shoplifting. I r...,"[I have the best news ever, B! , Wow, you look...","[I'm so excited to wear my new suit tonight!, ...",[ This is great! I feel so much safer up here ...,[I have a feeling that my friends are planning...,"['user:I have the best news ever, B! ', 'assis...","{'user': 'Hey B, remember earlier when I share...",False,The specific name of the college is not mentio...
21,episode-16986,Classmates,"[Now, A couple of years before, A few months b...",[I can't believe our school got robbed over th...,[ I've been struggling in school lately. I rea...,"[Hey, I got you something!, Oh, really? What i...","[, I'm so glad you could come over for dinner ...","[Oh no, I'm so sorry! Are you okay?, *cries* O...","[""user: I've been struggling in school lately....","{'user': ""It's been a while since we talked ab...",False,The answer to the question about time manageme...
25,episode-17024,Husband and Wife,"[Now, A few weeks before, A few months before,...","[Hey, have you called yet? I've been waiting f...","[Hey, how was your day today?, It was okay, ju...",[Mom bought me the toy I wanted! I'm so happy!...,[Today was a great day at work. My students we...,"[Hey, I did something pretty crazy today!, Oh ...","['user:Hey, how was your day today?', 'assista...",{'user': 'Do you remember that nice resort you...,False,"The specific details about the resort, such as..."
36,episode-17180,Neighbors,"[Now, A couple of years before, A couple of ye...",[I'm so proud of my progress in chess. I studi...,"[I've been looking everywhere for you, Neighbo...","[I am so thrilled, Neighbors B! I finally did ...",[I won't eat beans because they make me gassy....,[I'm so glad you could come over for dinner to...,"[""user:I've been looking everywhere for you, N...","{'user': 'Hey, Neighbors B, do you remember th...",False,The conversation log states that they planned ...
41,episode-17216,Neighbors,"[Now, A few months before, A few hours before,...","[Look, I just wanted to apologize again for wh...","[Hey, guess what I found today on my way home ...",[I truly believe that forgiveness is such an i...,"[Hi there, Neighbors B. How's your day going?,...","[I got in trouble at school today., What happe...","[""user:Hey, guess what I found today on my way...","{'user': 'Hey, do you remember the advice you ...",False,The specific timing of the previous conversati...
115,episode-18117,Neighbors,"[Now, A few days before, A couple of years bef...",[I still can't get over how talented you are a...,"[Hey, do you know any good way to get to JFK a...","[Wow, this party is really lively. There's so ...",[I feel like I'm too much of a pushover someti...,[I wish I had gone to college. I feel like I'm...,"['user:Hey, do you know any good way to get to...","{'user': 'In our last conversation, you mentio...",False,The specific name of the art exhibit was not m...
146,episode-18473,Parent and Child,"[Now, A few weeks before, A few days before, A...","[Hey, Child! How was your day?, It was good, t...","[I miss my mother so much. Sometimes, I just w...","[Child, I saw a rat in the house yesterday. Wh...","[You know, I've been thinking about getting a ...","[Hey, I wanted to talk to you about something....","['user:I miss my mother so much. Sometimes, I ...","{'user': ""It's been a couple of weeks since I ...",False,The question implies a specific timeframe ('a ...
153,episode-18573,Classmates,"[Now, A few months before, A few hours before,...",[I'm really happy that I agreed to be a speake...,"[Hey, what's that on your forehead?, Oh, it's ...","[Hey, can I talk to you for a moment? I notice...","[Hey, B! Guess what? I finally got that notebo...","[Ugh, my mom is being so annoying right now. I...","[""user:Hey, what's that on your forehead?"", ""a...",{'user': 'Do you remember that tattoo of a lio...,False,The answer about what the tattoo represents (s...
170,episode-18824,Neighbors,"[Now, A couple of years before, A few days bef...",[Have you ever snuck into a building just to t...,"[Hey, Neighbors B! How was your trip?, Oh, it ...","[Hi there, Neighbors B! I hope you're doing we...",[Hi Neighbor! I got these flowers from the gro...,"[Hey Neighbor B, how are you doing?, I'm good,...","['user:Hey, Neighbors B! How was your trip?', ...",{'user': 'Do you remember that one time you th...,False,The conversation log does not specify the exac...


Regenerate questions that are not correct (several times)

In [30]:
# Generate a replacement question/answer pair for every rejected row.
# A row whose reply cannot be parsed keeps its previous question and is
# reported, so one malformed response does not abort the pass.
for row in false_question.index:
    # generate_dialogue already holds the labelled dialogue as text.
    dialog = question_df.loc[row, 'generate_dialogue']
    conv_time = question_df['time_interval'][row][1]
    p = generate_question_prompt.format(chat_logs=dialog, conv_time=conv_time)
    res = llm_create(p)
    # re.search returns None when the reply has no ```json fenced block
    # (`or ""` also covers an empty reply).
    match = re.search(r"```json(.*?)```", res or "", re.DOTALL)
    if match is None:
        print(f"row {row}: no fenced JSON block in the model reply, skipped")
        continue
    try:
        res_dict = json.loads(match.group(1).strip())
    except json.JSONDecodeError as err:
        print(f"row {row}: model reply is not valid JSON ({err}), skipped")
        continue
    question_df.loc[row, 'question'] = str(res_dict)

In [31]:
# Re-run the inspector on the rows that were just regenerated and update
# their verdict and explanation in place.
for row in false_question.index:
    conversation_record = str(question_df.loc[row, 'generate_dialogue'])
    question = str(question_df.loc[row, 'question'])
    p = double_check_prompt.format(conversation_record=conversation_record, question=question)
    res = llm_create(p)
    # re.search returns None when the reply has no ```json fenced block
    # (`or ""` also covers an empty reply); report and move on instead of
    # aborting the whole pass.
    match = re.search(r"```json(.*?)```", res or "", re.DOTALL)
    if match is None:
        print(f"row {row}: no fenced JSON block in the model reply, skipped")
        continue
    try:
        res_dict = json.loads(match.group(1).strip())
    except json.JSONDecodeError as err:
        print(f"row {row}: model reply is not valid JSON ({err}), skipped")
        continue
    question_df.loc[row, 'double_check'] = res_dict.get('proper')
    # Always overwrite the reason: these rows carry the explanation from their
    # earlier rejection, which must not remain once the new question passes.
    question_df.loc[row, 'reason'] = res_dict.get('reason') or None

In [32]:
# Rows still rejected after regeneration. The verdict may be stored as the
# string 'false' or as the boolean False (unquoted JSON false); comparing the
# lower-cased text form matches both. An empty result means every row passed.
false_question = question_df[question_df['double_check'].astype(str).str.lower() == 'false']
false_question

Unnamed: 0,dataID,relationship,time_interval,first_session_dialogue,second_session_dialogue,third_session_dialogue,fourth_session_dialogue,fifth_session_dialogue,generate_dialogue,question,double_check,reason


In [33]:
# Overwrite question_sets2.json with the current question_df (JSON Lines).
question_df.to_json("question_sets2.json", orient="records", lines=True)