In [4]:
import requests
import json

# Single-question interview result files for Beethoven. The four URLs differ
# only in the variant name embedded in the directory, so they are generated
# from one template instead of being pasted four times. The order of the
# variants is the order in which the files are downloaded and merged.
urls = [
    "https://raw.githubusercontent.com/choosewhatulike/trainable-agents/main/"
    "data/gen_results/interview_single/"
    f"Beethoven_{variant}_result/2023-00-00-00-00-00.json"
    for variant in ("chatgpt", "vicuna-7b", "sft", "alpaca-7b")
]

# Accumulator for the question/answer records extracted from the files above
parallel_dataset = []

# Function to process and append data
def process_and_append_data(data, dataset=None):
    """Extract one question/answer record per item and append it to a dataset.

    Args:
        data: Iterable of dict items. Each item may carry a 'question' value
            and a 'reply' that is either a dict with a 'content' key or a
            list whose first element is such a dict.
        dataset: List that receives the records. When omitted, the
            module-level ``parallel_dataset`` list is used, which is what
            callers passing a single argument rely on.

    Returns:
        None. One {"question": ..., "answer": ...} dict per item is appended
        to ``dataset`` in input order.

    An item with no usable reply still produces a record, with the
    placeholder answer "No answer found"; a missing question becomes
    "No question found".
    """
    if dataset is None:
        # Looked up at call time so the notebook's global accumulator is used
        dataset = parallel_dataset

    for item in data:
        reply = item.get('reply')

        # A list reply is represented by its first element only
        if isinstance(reply, list):
            reply = reply[0] if reply else None

        # Only a dict can hold 'content'. Without the type check a string
        # element passes the `'content' in ...` test as a substring match and
        # then fails on indexing, and a None/int element raises on `in`.
        if isinstance(reply, dict) and 'content' in reply:
            answer = reply['content']
        else:
            answer = "No answer found"

        dataset.append({
            "question": item.get('question', "No question found"),
            "answer": answer
        })

# Upper bound on how long one download may stall. Without a timeout
# requests.get waits indefinitely on an unresponsive connection.
REQUEST_TIMEOUT_SECONDS = 30

# Download every result file and fold its records into parallel_dataset.
# A file that cannot be retrieved is reported and skipped so the remaining
# files are still processed; malformed JSON is left to raise.
for url in urls:
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Connection errors and timeouts get the same skip-and-report
        # treatment as a non-200 response below
        print("Failed to retrieve JSON file from", url, ". Error:", exc)
        continue

    if response.status_code != 200:
        print("Failed to retrieve JSON file from", url, ". Status code:", response.status_code)
        continue

    # Parse the body and append its question/answer records
    data = json.loads(response.text)
    process_and_append_data(data)

# Save the combined parallel dataset to a JSON file
file_path = 'beethoven_qa_combined_dataset.json'
with open(file_path, 'w') as f:
    json.dump(parallel_dataset, f)

print(f"Combined dataset saved to {file_path}. Total entries: {len(parallel_dataset)}")

Combined dataset saved to beethoven_qa_combined_dataset.json. Total entries: 328


In [5]:
import requests
import json

# Multi-turn interview result files for Beethoven, in JSON Lines format. The
# four URLs differ only in the variant name embedded in the directory, so they
# are generated from one template instead of being pasted four times. The
# order of the variants is the order in which the files are downloaded.
urls = [
    "https://raw.githubusercontent.com/choosewhatulike/trainable-agents/main/"
    "data/gen_results/interview_turns/"
    f"multiturn_Beethoven_{variant}_result/2023-00-00-00-00-00.jsonl"
    for variant in ("alpaca-7b", "chatgpt", "sft", "vicuna-7b")
]

# Accumulator for the question/answer records extracted from the files above
parallel_dataset = []

# Upper bound on how long one download may stall. Without a timeout
# requests.get waits indefinitely on an unresponsive connection.
REQUEST_TIMEOUT_SECONDS = 30


def _turn_texts(turn, role):
    """Return the 'content' values of `turn` if its 'turn_role' is `role`, else []."""
    if turn.get('turn_role') != role:
        return []
    pieces = turn.get('turn_content') or []
    return [piece['content'] for piece in pieces if 'content' in piece]


def _extract_qa_pairs(interview):
    """Pair each 'character' turn with the 'interviewer' turn right before it.

    Args:
        interview: Dict parsed from one JSON line; its 'content' value is the
            ordered list of turns.

    Returns:
        List of {"question": ..., "answer": ...} dicts. Every answer piece of
        a character turn is combined with every question piece of the
        preceding turn, provided that turn is an interviewer turn.
    """
    turns = interview.get('content', [])
    pairs = []
    # Start at 1: a character turn in first position has no preceding question.
    # The position comes from the loop rather than from list.index(), which
    # returns the first turn that compares equal and so pairs a repeated
    # answer with the question that preceded its first occurrence.
    for position in range(1, len(turns)):
        answers = _turn_texts(turns[position], 'character')
        if not answers:
            continue
        questions = _turn_texts(turns[position - 1], 'interviewer')
        for answer in answers:
            for question in questions:
                pairs.append({"question": question, "answer": answer})
    return pairs


# Download every JSON Lines file and collect its question/answer pairs.
# A file that cannot be retrieved is reported and skipped so the remaining
# files are still processed; a malformed line is left to raise.
for url in urls:
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Connection errors and timeouts get the same skip-and-report
        # treatment as a non-200 response below
        print("Failed to retrieve JSON Lines file from", url, ". Error:", exc)
        continue

    if response.status_code != 200:
        print("Failed to retrieve JSON Lines file from", url, ". Status code:", response.status_code)
        continue

    # Each line is a separate JSON document. Split on '\n' only: a JSON
    # string may legally contain other line-separator characters unescaped.
    for line in response.text.split('\n'):
        if not line.strip():
            # Blank lines (or an empty body) carry no interview
            continue
        parallel_dataset.extend(_extract_qa_pairs(json.loads(line)))

# Save the combined parallel dataset to a JSON file
file_path = 'beethoven_interview_turns_combined_dataset.json'
with open(file_path, 'w') as f:
    json.dump(parallel_dataset, f)

print(f"Combined dataset saved to {file_path}. Total entries: {len(parallel_dataset)}")

Combined dataset saved to beethoven_interview_turns_combined_dataset.json. Total entries: 1023


In [6]:
import json

# Inputs: the two JSON files written by the earlier cells. Output: their merge.
existing_dataset_path = 'beethoven_qa_combined_dataset.json'
new_dataset_path = 'beethoven_interview_turns_combined_dataset.json'
merged_dataset_path = 'beethoven_qa_merged_dataset.json'

# Read the records of the first file
with open(existing_dataset_path, 'r') as existing_file:
    existing_dataset = json.load(existing_file)

# Read the records of the second file
with open(new_dataset_path, 'r') as new_file:
    new_dataset = json.load(new_file)

# Concatenate the two, keeping the first file's records in front
merged_dataset = existing_dataset + new_dataset

# Write the concatenation back to disk as a single JSON document
with open(merged_dataset_path, 'w') as merged_file:
    json.dump(merged_dataset, merged_file)

print(f"Merged dataset saved to {merged_dataset_path}. Total entries: {len(merged_dataset)}")

Merged dataset saved to beethoven_qa_merged_dataset.json. Total entries: 1351


In [7]:
import pandas as pd

# Tabulate the merged records, fixing the column order to question, answer
df = pd.DataFrame(merged_dataset, columns=['question', 'answer'])

# A bare trailing expression lets the notebook render the frame with its rich
# repr instead of the plain-text dump that print() produces
df

                                               question  \
0                     What do you think of your father?   
1                     What do you think of your mother?   
2                                What is your interest?   
3                              Where are you come from?   
4                                 Who do you live with?   
...                                                 ...   
1346  I've always been fascinated by your music, esp...   
1347  That's fascinating! Can you tell me more about...   
1348  That's really interesting. Did your deafness a...   
1349  That's incredible! It's amazing how you were a...   
1350  That's truly remarkable! It's incredible how y...   

                                                 answer  
0     My father was a strict man, but he was also my...  
1     My mother was a great influence on my life and...  
2     My interest lies in music, my dear sir. It is ...  
3     I hail from the city of Bonn in the Electorate...  
4

In [8]:
# Export the question/answer table as CSV; index=False leaves out the row labels
df.to_csv(path_or_buf='beethoven_qa_dataset.csv', index=False)