Generate datasets from Conversation Chronicles

In [4]:
from dotenv import load_dotenv
from openai import OpenAI
import os

# Read a local .env file (if any) into the process environment, then build the
# API client from the OPENAI_API_KEY variable found there.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def llm_create(prompt, model="gpt-4o-mini"):
    """Send one user message to the chat completions API and return the reply text.

    Args:
        prompt: Text sent as the only ``user`` message of the request.
        model: Name of the chat model to query. Defaults to the model this
            notebook originally hard-coded, so existing calls are unaffected.

    Returns:
        The ``content`` of the first choice in the completion.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
    )
    return completion.choices[0].message.content
    
# Prompt template asking the model for one memory-probing question/answer pair
# about a chat log. It is filled in with str.format: {chat_logs} is the only
# placeholder, and the doubled braces around the JSON example are escapes that
# render as single literal braces. The example reply is wrapped in a ```json
# fence, which is the shape the generation loop's regex extracts from the reply.
generate_question_prompt = """Your task is to write a question and a answer for a chat logs between two character
Your goal is to write a question from user to assistant that test assistant's memory
The question and answer should been generated accroding to this chat logs

For example,
assistant: Are you into surfing? I'm super into surfing myself
user: Actually I'm looking to learn. Maybe you could give me a basic lesson some time!
assistant: Yeah for sure! We could go to Pacifica, the waves there are pretty light and easy
user: That sounds awesome
assistant: There's even a cool Taco Bell right by the beach, could grab a bite after
user: What about this Sunday around noon?
assistant: Yeah let's do it!

```json
{{
  "user":"Remember that one time we went surfing? What was that one place we went to for lunch called?",
  "assistant":"Taco Bell"
}}
```

Chat logs:{chat_logs}
"""

In [2]:
import pandas as pd

# test.jsonl holds the Conversation Chronicles data, one JSON record per line.
df = pd.read_json('test.jsonl', lines=True)

# Only the first session's dialogue is used here, so drop the metadata and the
# later sessions. drop() returns a new frame, leaving `df` untouched.
question_df = df.drop(
    columns=[
        'time_interval',
        'summary',
        'first_session_speakers',
        'second_session_dialogue',
        'second_session_speakers',
        'third_session_dialogue',
        'third_session_speakers',
        'fourth_session_dialogue',
        'fourth_session_speakers',
        'fifth_session_dialogue',
        'fifth_session_speakers',
    ]
)

# Empty placeholders that the generation loop fills in row by row.
for new_column in ('generate_dialogue', 'question'):
    question_df[new_column] = None
question_df.head()

Unnamed: 0,dataID,relationship,first_session_dialogue,generate_dialogue,question
0,episode-15761,Neighbors,"[Hi there, how are you doing today?, Hi. I'm d...",,
1,episode-15776,Co-workers,"[Hey, B. Can I talk to you about something per...",,
2,episode-15784,Parent and Child,[I'm so sick of having to take out the trash e...,,
3,episode-15787,Classmates,"[Hey there, Classmates B! What's up? Why are y...",,
4,episode-15790,Husband and Wife,"[Hey, how's your day going so far?, It's going...",,


In [3]:
# Size of the loaded frame; the generation loop below only walks the first 500 rows.
len(question_df)

20000

Generate questions for the first 500 conversations

In [4]:
import json
import re

# Build one memory question per conversation, one LLM call per row.
# Turns alternate between the two speakers: even turns are labelled "user",
# odd turns "assistant".
speaker_roles = ['user', 'assistant']
json_fence = re.compile(r"```json(.*?)```", re.DOTALL)
max_rows = 500
failed_rows = []

# Bounded by the frame length so a shorter input file does not raise a KeyError.
for row in range(min(max_rows, len(question_df))):
    turns = question_df.loc[row, 'first_session_dialogue']
    dialog = [f"{speaker_roles[i % 2]}:{log}" for i, log in enumerate(turns)]
    question_df.loc[row, 'generate_dialogue'] = str(dialog)

    res = llm_create(generate_question_prompt.format(chat_logs=dialog))

    # The prompt asks for a fenced ```json block, but nothing forces the model
    # to comply. Skip an unusable reply (leaving `question` empty for that row)
    # instead of aborting the whole run part-way through.
    match = json_fence.search(res or "")
    if match is None:
        failed_rows.append(row)
        continue
    try:
        res_dict = json.loads(match.group(1).strip())
    except json.JSONDecodeError:
        failed_rows.append(row)
        continue

    # Stored as the str() of the parsed object, matching the column's existing format.
    question_df.loc[row, 'question'] = str(res_dict)

if failed_rows:
    print(f"No parsable question was returned for rows: {failed_rows}")

In [5]:
# Spot-check that generate_dialogue and question are now populated for the first rows.
question_df.head()

Unnamed: 0,dataID,relationship,first_session_dialogue,generate_dialogue,question
0,episode-15761,Neighbors,"[Hi there, how are you doing today?, Hi. I'm d...","['user:Hi there, how are you doing today?', ""a...",{'user': 'Can you remind me what you said yest...
1,episode-15776,Co-workers,"[Hey, B. Can I talk to you about something per...","['user:Hey, B. Can I talk to you about somethi...",{'user': 'What did I say I wanted to focus on ...
2,episode-15784,Parent and Child,[I'm so sick of having to take out the trash e...,"[""user:I'm so sick of having to take out the t...",{'user': 'What did I say about feeling sick of...
3,episode-15787,Classmates,"[Hey there, Classmates B! What's up? Why are y...","[""user:Hey there, Classmates B! What's up? Why...",{'user': 'What did I say that made you feel un...
4,episode-15790,Husband and Wife,"[Hey, how's your day going so far?, It's going...","[""user:Hey, how's your day going so far?"", ""as...",{'user': 'What did I promise to take care of w...


In [6]:
# Inspect one generated pair; the cell holds the str() of the parsed dict, so it displays as a quoted string.
question_df['question'][0]

"{'user': 'Can you remind me what you said yesterday about the time my car got stuck in the snow?', 'assistant': 'I mentioned that I was really grateful you were there to help me dig it out.'}"

In [7]:
import ast

# generate_dialogue holds the str() of a list of "role:utterance" strings.
# ast.literal_eval parses that literal back into a list without executing
# arbitrary code, which eval() would do if the stored text were ever tampered with.
for log in ast.literal_eval(question_df['generate_dialogue'][0]):
    print(log)

user:Hi there, how are you doing today?
assistant:Hi. I'm doing good, thanks for asking. How about you?
user:I'm feeling great actually, especially after helping you carry your groceries to your car yesterday.
assistant:Oh yes, thank you so much for helping me out! It was so kind of you.
user:It was my pleasure. It felt good to be able to help you. We are neighbors after all.
assistant:Absolutely! I really appreciate your kindness. If you ever need any help with anything, don't hesitate to ask.
user:I appreciate that. I'm always here to lend a helping hand if you need it too.
assistant:That's great to hear. You know, it reminds me of the time when my car got stuck in the snow and you helped me dig it out.
user:Oh yeah, I remember that. It was quite the winter storm that day.
assistant:It definitely was. But I was so grateful that you were there to help. You're a great neighbor.
user:Thank you, I try my best. It's nice to know we can count on each other in times of need.
assistant:Absol

In [8]:
# Persist the frame as JSON Lines (one record per line) so the double-check step can reload it.
question_df.to_json("question_sets.json", orient="records", lines=True)

Double-check the generated question sets

In [7]:
import pandas as pd
# Reload the saved question sets (JSON Lines) and work on a copy, adding two
# empty columns for the verdict and its explanation.
df = pd.read_json('question_sets.json', lines=True)
question_df = df.copy().assign(double_check=None, reason=None)
question_df.head()

Unnamed: 0,dataID,relationship,first_session_dialogue,generate_dialogue,question,double_check,reason
0,episode-15761,Neighbors,"[Hi there, how are you doing today?, Hi. I'm d...","['user:Hi there, how are you doing today?', ""a...",{'user': 'Can you remind me what you said yest...,,
1,episode-15776,Co-workers,"[Hey, B. Can I talk to you about something per...","['user:Hey, B. Can I talk to you about somethi...",{'user': 'What did I say I wanted to focus on ...,,
2,episode-15784,Parent and Child,[I'm so sick of having to take out the trash e...,"[""user:I'm so sick of having to take out the t...",{'user': 'What did I say about feeling sick of...,,
3,episode-15787,Classmates,"[Hey there, Classmates B! What's up? Why are y...","[""user:Hey there, Classmates B! What's up? Why...",{'user': 'What did I say that made you feel un...,,
4,episode-15790,Husband and Wife,"[Hey, how's your day going so far?, It's going...","[""user:Hey, how's your day going so far?"", ""as...",{'user': 'What did I promise to take care of w...,,


In [8]:
# Prompt template asking the model to judge whether a generated question/answer
# pair is answerable only from the given conversation. It is filled in with
# str.format: {conversation_record} and {question} are the placeholders, and the
# doubled braces around the JSON example are escapes for literal braces.
# Note that the example verdict is the string "true", not a JSON boolean.
double_check_prompt = """You are an inspector
There is a conversation record here:{conversation_record}
There is a memory question and answer based on this conversation record.
question:{question}
You must check whether this question and answer can only be obtained from this conversation record. 
The output format is as follows, if proper field is true then dont need reason
```json
{{
    "proper":"true",
    "reason":""
}}
```"""

In [11]:
import json
import re

# Ask the model to verify each generated pair and record its verdict.
json_fence = re.compile(r"```json(.*?)```", re.DOTALL)
max_rows = 500
unchecked_rows = []

# Bounded by the frame length so a shorter file does not raise a KeyError.
for row in range(min(max_rows, len(question_df))):
    stored_question = question_df.loc[row, 'question']
    # A row without a generated question has nothing to verify, so do not
    # spend an API call on it.
    if pd.isna(stored_question):
        unchecked_rows.append(row)
        continue

    prompt = double_check_prompt.format(
        conversation_record=str(question_df.loc[row, 'generate_dialogue']),
        question=str(stored_question),
    )
    res = llm_create(prompt)

    # The reply is expected to contain a fenced ```json object, but the model
    # may not comply. Leave such rows unverified rather than aborting the run.
    match = json_fence.search(res or "")
    if match is None:
        unchecked_rows.append(row)
        continue
    try:
        res_dict = json.loads(match.group(1).strip())
    except json.JSONDecodeError:
        unchecked_rows.append(row)
        continue
    if not isinstance(res_dict, dict):
        unchecked_rows.append(row)
        continue

    question_df.loc[row, 'double_check'] = res_dict.get('proper')
    # Only store a reason when the model actually gave one.
    if res_dict.get('reason'):
        question_df.loc[row, 'reason'] = res_dict.get('reason')

if unchecked_rows:
    print(f"No usable verdict was obtained for rows: {unchecked_rows}")

In [12]:
# Spot-check the double_check and reason columns filled in by the loop above.
question_df.head()

Unnamed: 0,dataID,relationship,first_session_dialogue,generate_dialogue,question,double_check,reason
0,episode-15761,Neighbors,"[Hi there, how are you doing today?, Hi. I'm d...","['user:Hi there, how are you doing today?', ""a...",{'user': 'Can you remind me what you said yest...,True,
1,episode-15776,Co-workers,"[Hey, B. Can I talk to you about something per...","['user:Hey, B. Can I talk to you about somethi...",{'user': 'What did I say I wanted to focus on ...,True,
2,episode-15784,Parent and Child,[I'm so sick of having to take out the trash e...,"[""user:I'm so sick of having to take out the t...",{'user': 'What did I say about feeling sick of...,True,
3,episode-15787,Classmates,"[Hey there, Classmates B! What's up? Why are y...","[""user:Hey there, Classmates B! What's up? Why...",{'user': 'What did I say that made you feel un...,True,
4,episode-15790,Husband and Wife,"[Hey, how's your day going so far?, It's going...","[""user:Hey, how's your day going so far?"", ""as...",{'user': 'What did I promise to take care of w...,True,


In [13]:
# Write the frame, now including the verdict columns, back to the same file it was loaded from.
question_df.to_json("question_sets.json", orient="records", lines=True)

In [15]:
# Select the pairs the inspector rejected. The verdict is stored in
# `double_check`; `reason` only holds the free-text explanation, so comparing
# it with 'false' can never match a rejected row.
# The verdict may be a JSON boolean or the string "true"/"false" depending on
# how the model formatted its reply, so normalise to lowercase text first.
rejected_mask = question_df['double_check'].astype(str).str.lower() == 'false'
false_question = question_df[rejected_mask]
false_question

Unnamed: 0,dataID,relationship,first_session_dialogue,generate_dialogue,question,double_check,reason
