In [8]:
import json

def extract_data(input_file):
    """Read a dataset JSON file and split it into query and document records.

    Args:
        input_file: Path to a UTF-8 encoded JSON file. Its top-level
            ``dataset`` key must hold a list of entries, each providing
            ``query_id``, ``query``, ``narrative`` and a ``documents`` list
            whose items carry ``title`` and ``content``.

    Returns:
        A ``(queries, documents)`` tuple:

        * ``queries`` -- one dict per entry with the keys ``query_id``,
          ``query`` and ``narrative``.
        * ``documents`` -- one dict per document with the keys ``doc_id``,
          ``title`` and ``content``.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        payload = json.load(source)

    entries = payload['dataset']

    # One query record per dataset entry, in file order.
    queries = [
        {field: entry[field] for field in ("query_id", "query", "narrative")}
        for entry in entries
    ]

    # Documents are numbered 1..N across the whole file in encounter order.
    # The originating query_id is deliberately not stored on the record.
    every_doc = (doc for entry in entries for doc in entry['documents'])
    documents = [
        {"doc_id": number, "title": doc['title'], "content": doc['content']}
        for number, doc in enumerate(every_doc, start=1)
    ]

    return queries, documents

def save_to_json(data, output_file):
    """Write ``data`` to ``output_file`` as pretty-printed JSON.

    Args:
        data: Any JSON-serializable object.
        output_file: Destination path; an existing file is overwritten.

    The file is UTF-8 encoded, indented by two spaces, and non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(output_file, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, ensure_ascii=False, indent=2)

In [9]:
# Input files, one per summarization variant named in the path
# (longformer / bart / t5).
input_file1 = "data/longformer_summarized.json"
input_file2 = "data/bart_summarized.json"
input_file3 = "data/t5_summarized.json"


# Extract queries and documents from the first input file.
queries1, documents1 = extract_data(input_file1)

# Queries are written only once, from the first file.
# NOTE(review): presumably all three inputs share the same queries -- confirm.
save_to_json({"queries": queries1}, "data/queries.json")
save_to_json({"documents": documents1}, "data/documentsLongformer.json")


# For the remaining inputs only the documents are saved; queries2 and
# queries3 are not used anywhere in this cell.
queries2, documents2 = extract_data(input_file2)
save_to_json({"documents": documents2}, "data/documentsBart.json")

queries3, documents3 = extract_data(input_file3)
save_to_json({"documents": documents3}, "data/documentsT5.json")

print("Saved queries and documents.")

Saved queries and documents.


In [11]:
def _append_dataset_documents(dataset_entries, documents, next_doc_id):
    """Append one record per document found in ``dataset_entries``.

    Records are appended to ``documents`` in place. Returns the next unused
    document ID so numbering can continue across several calls.
    """
    for entry in dataset_entries:
        for doc in entry.get('documents', []):
            documents.append({
                "doc_id": next_doc_id,
                "title": doc.get('title', ''),
                "content": doc.get('content', ''),
            })
            next_doc_id += 1
    return next_doc_id


def extract_documents_from_json(input_file):
    """Load a JSON file and flatten its documents into sequentially numbered records.

    Two input layouts are recognised:

    * Wrapped: the top-level ``documents`` list contains dicts with a
      ``document_content`` key whose value is a JSON *string*. That string is
      parsed and its ``dataset`` entries are processed.
    * Plain: the top-level ``dataset`` list is processed directly.

    Args:
        input_file: Path to a UTF-8 encoded JSON file.

    Returns:
        A list of dicts with the keys ``doc_id`` (starting at 1 and increasing
        in encounter order), ``title`` and ``content``. A missing ``title`` or
        ``content`` becomes an empty string.

    A ``document_content`` value that is not valid JSON is reported on stdout
    and skipped; the remaining wrappers are still processed.
    """
    # Load the JSON data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    documents = []
    doc_id = 1  # Start document IDs from 1

    # Detect the wrapped layout structurally. Searching str(data) for the key
    # name would also match a plain file whose text merely mentions
    # "document_content", and such a file would then yield no documents.
    wrappers = [
        document for document in data.get('documents', [])
        if isinstance(document, dict) and 'document_content' in document
    ]

    if wrappers:
        for document in wrappers:
            try:
                # document_content holds an embedded JSON document as a string
                content_data = json.loads(document['document_content'])
            except json.JSONDecodeError:
                print("Could not parse document_content as JSON")
                continue
            doc_id = _append_dataset_documents(
                content_data.get('dataset', []), documents, doc_id
            )
    else:
        # Process regular dataset structure
        _append_dataset_documents(data.get('dataset', []), documents, doc_id)

    return documents

# NOTE(review): this repeats the save_to_json already defined in an earlier
# cell of this notebook; the definition below shadows that one.
def save_to_json(data, output_file):
    """Write ``data`` to ``output_file`` as pretty-printed JSON.

    Args:
        data: Any JSON-serializable object.
        output_file: Destination path; an existing file is overwritten.

    The file is UTF-8 encoded, indented by two spaces, and non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(output_file, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, ensure_ascii=False, indent=2)

In [13]:
# Extract the documents from input_file4 and store them under a "documents"
# key, the same wrapper used for the other document exports in this notebook.
input_file4 = "data/WikipediaRelevantDocs.json"
output_file4 = "data/documentsOriginal.json"
documents4 = extract_documents_from_json(input_file4)
save_to_json({"documents": documents4}, output_file4)
# Build the message from the real output path so it always names the file
# that was actually written.
print(f"Extracted documents and saved to '{output_file4}'.")

Extracted documents and saved to 'documents.json'.
