In [15]:
import pickle
import json
from typing import List, Dict, Any
from collections import Counter
import string
import regex
import sys
from nltk.stem import PorterStemmer


QA_PROMPT = """You are <<<Chatbot>>>, a long-term conversation agent capable of interacting with multiple users. 
Based on the [Dialog History] provided, please answer the given [Question]. 
Note the following points: 
1. Your responses should solely rely on the retrieved dialog history. If the information in the dialog history is insufficient to answer the question, you must admit that you don't know the answer.
2. This question is being asked in the context of <<<Date>>>.

[Question] <<<Question>>>
[Answer]
"""

In [19]:
script_names = [ 'friends', 'bigbang', 'theoffice']
for script_name in script_names:
    raw_data = []
    
    print(f"Processing script: {script_name}")
    if script_name == 'friends':
        chatbot = 'Ross'
    elif script_name == 'bigbang':
        chatbot = 'Sheldon'
    elif script_name == 'theoffice':
        chatbot = 'Michael'
    else:
        assert 0
    

    with open(f"./dialsim_v1.1/{script_name}_dialsim.pickle", 'rb') as f:
        data = pickle.load(f)

    
    corpus = ""
    cur_conv_num = 0
    for epi_id, epi_data in data.items():
        for session_id, session_data in epi_data.items():
            date = session_data.get('date', 'Unknown Date')
            script = session_data.get('script', '')
            cur_conv_num += 1
            processed_history = f"[Date: {date}, Session #{cur_conv_num}]\n{script}\n"
            
            corpus += processed_history
            # if cur_conv_num == 100:
            #     break
            
    # Get size of corpus
    corpus = corpus.strip()
    size = sys.getsizeof(corpus)
    print(f"Corpus for {script_name} has been created with {cur_conv_num} conversations.")
    print(len(corpus))
    # Print size in a human-readable format
    if size < 1024:
        print(f"Corpus for {script_name} has been created with {cur_conv_num} conversations.")
        print(f"Corpus length: {size} bytes.")
    elif size < 1024**2:
        print(f"Corpus for {script_name} has been created with {cur_conv_num} conversations.")
        print(f"Corpus length: {size / 1024:.2f} KB.")
    else:
        print(f"Corpus for {script_name} has been created with {cur_conv_num} conversations.")
        print(f"Corpus length: {size / (1024**2):.2f} MB.")
            
        # break
    corpus = corpus.strip()
    print(f"Corpus for {script_name} has been created with {cur_conv_num} conversations.")
    print(f"Corpus length: {len(corpus)} characters.")
    cur_conv_num = 0
    
    for epi_id, epi_data in data.items():
        for session_id, session_data in epi_data.items():
            cur_conv_num += 1  
            question_found = 0
            question_text = ""
            golden_answer = ""
            question_type = "N/A"
            pool = session_data.get('easy_q', {})
            if not pool:
                continue
                
            questions = pool["ans_w_time"]
            # for q_type, questions in pool.items():
            if questions:
                for key in questions:
                    # if question_found == 2:
                    #     break
                    question_obj = questions[key]
                    # 获取问题文本 (优先使用 'default' 或第一个可用角色)
                    q_dict = question_obj.get('questions', {})
                    if 'default' in q_dict:
                        question_text = q_dict['default']
                    elif q_dict:
                        question_text = next(iter(q_dict.values()))
                    else:
                        continue
                    
                    # 获取答案
                    # print(json.dumps(question_obj, indent=4, ensure_ascii=False))
                    golden_answer = question_obj.get('answer', '')
                    if not golden_answer:
                        continue
                    
                    if question_text:
                        # question_found += 1
                        q_prompt = QA_PROMPT.replace("<<<Chatbot>>>", chatbot).replace("<<<Question>>>", question_text).replace("<<<Date>>>", session_data.get('date', 'Unknown Date'))
                        # 4. 组装成最终的数据格式
                        # 查重
                        for item in raw_data:
                            if item['input_prompt'] == q_prompt:
                                print("Duplicate question found, skipping...")
                                question_found -= 1
                                # assert答案相同
                                if item['info']['golden_answer'] != golden_answer:
                                    print("Warning: Duplicate question with different answers found!")
                                    print(json.dumps(item, indent=4, ensure_ascii=False))
                                    print(golden_answer)
                                    raise ValueError("Duplicate question with different answers found!")
                                break
                        else:
                            raw_data.append({
                                "test_idx": len(raw_data),
                                "input_prompt": q_prompt,
                                "dataset_name": "DialSim-" + script_name,
                                "lang": "en",
                                "info": {
                                    'golden_answer': golden_answer,
                                    'episode': epi_id,
                                    'session_num': session_id,
                                    'conversation_num': cur_conv_num,
                                }
                            })

    # print(f"Total conversations processed: {len(raw_data)}")
    print(f"Total questions processed: {len(raw_data)}")
    with open(f"dialsim_qa_{script_name}_data.json", 'w', encoding='utf-8') as f:
        json.dump(raw_data, f, indent=4, ensure_ascii=False)
    with open(f"dialsim_corpus_{script_name}.txt", 'w', encoding='utf-8') as f:
        f.write(corpus)

Processing script: friends
Corpus for friends has been created with 788 conversations.
1146693
Corpus for friends has been created with 788 conversations.
Corpus length: 2.19 MB.
Corpus for friends has been created with 788 conversations.
Corpus length: 1146693 characters.
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Duplicate question found, skipping...
Dupl