In [1]:
import json
import os
import datetime

In [2]:

# Root directory that contains the `artifacts/` tree.
# The default below is an absolute, machine-specific path; set the
# ELEARNING_ROOT_DIR environment variable before starting the kernel to point
# the notebook at a different checkout without editing this cell.
root_dir = os.environ.get(
    'ELEARNING_ROOT_DIR',
    '/home/moraa/Documents/GenAI/Prompt_Engineering_for_E-Learning_Content_Creation',
)

# Paths to the extracted data JSON files (one PPTX extract, two PDF extracts)
pptx_json_path = os.path.join(root_dir, 'artifacts/data_ingestion_pptx/extracted_data.json')
pdf_json_paths = [
    os.path.join(root_dir, 'artifacts/ingest_pdf/extracted_data_1umzTCsbBmuFx4xz9DSMI82oq21tHhbKL.json'),
    os.path.join(root_dir, 'artifacts/ingest_pdf/extracted_data_13oqVt9LYdESPS8XNYFLhSLZMkZk52JXG.json')
]

# Chunking parameters
CHUNK_SIZE = 200  # Number of characters per chunk

# Function to load JSON data
def load_json_data(json_path):
    """Read the file at ``json_path`` and return its parsed JSON content.

    The file is opened explicitly as UTF-8 rather than with the platform's
    locale default, so non-ASCII text is decoded the same way on every
    machine.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(json_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

# Function to create chunks
def create_chunks(text, chunk_size):
    """Split ``text`` into consecutive, non-overlapping slices.

    Every slice has length ``chunk_size`` except possibly the last one, which
    holds whatever remains. Slicing is purely positional, so a boundary can
    fall in the middle of a word.

    Args:
        text: The sequence (normally a string) to split.
        chunk_size: Maximum length of each slice; must be positive.

    Returns:
        A list of slices in original order; an empty list when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. Without this check a
            negative size would silently yield an empty list and drop all text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# Function to create chunk metadata
def create_chunk_metadata(source_file, chunked_data):
    """Build a summary record for a list of chunks.

    Args:
        source_file: Label stored under ``"source_file"`` to identify the origin.
        chunked_data: Sized, iterable collection of chunks; each chunk must
            support ``len()``.

    Returns:
        A dict with the keys ``"source_file"``, ``"chunking_date"`` (the
        current local ``datetime`` rendered with ``str()``),
        ``"number_of_chunks"`` and ``"details"`` (one
        ``{"chunk_index", "chunk_length"}`` entry per chunk, in order).
    """
    # Capture the timestamp and the count up front, before walking the chunks.
    timestamp = str(datetime.datetime.now())
    chunk_count = len(chunked_data)

    per_chunk = [
        {"chunk_index": position, "chunk_length": len(piece)}
        for position, piece in enumerate(chunked_data)
    ]

    return {
        "source_file": source_file,
        "chunking_date": timestamp,
        "number_of_chunks": chunk_count,
        "details": per_chunk,
    }

# Specify the folder for chunked data
chunked_folder = os.path.join(root_dir, 'artifacts/chunked')
os.makedirs(chunked_folder, exist_ok=True)


def _write_json(payload, destination):
    """Serialize ``payload`` to the file ``destination`` as 4-space-indented JSON."""
    # The encoding is pinned so the write no longer depends on the locale default.
    with open(destination, 'w', encoding='utf-8') as out_file:
        json.dump(payload, out_file, indent=4)


def _joined_text(records):
    """Join the 'text' value of every record with single spaces ('' when the key is absent)."""
    return " ".join([record.get('text', '') for record in records])


# Process PPTX data
pptx_data = load_json_data(pptx_json_path)
pptx_text = _joined_text(pptx_data)
pptx_chunks = create_chunks(pptx_text, CHUNK_SIZE)

# Save chunked PPTX data and metadata
pptx_chunked_path = os.path.join(chunked_folder, 'chunked_pptx.json')
_write_json(pptx_chunks, pptx_chunked_path)

pptx_metadata = create_chunk_metadata('downloaded_presentation.pptx', pptx_chunks)
pptx_metadata_path = os.path.join(chunked_folder, 'metadata_chunked_pptx.json')
_write_json(pptx_metadata, pptx_metadata_path)

print(f"Chunked PPTX data saved to {pptx_chunked_path}")
print(f"PPTX chunk metadata saved to {pptx_metadata_path}")

# Process PDF data: one chunk file and one metadata file per input JSON
for pdf_json_path in pdf_json_paths:
    pdf_data = load_json_data(pdf_json_path)
    pdf_text = _joined_text(pdf_data)
    pdf_chunks = create_chunks(pdf_text, CHUNK_SIZE)

    # Save chunked PDF data and metadata.
    # The output name reuses the input file name with the 'extracted_data_'
    # prefix and the '.json' suffix stripped.
    pdf_filename = os.path.basename(pdf_json_path).replace('extracted_data_', '').replace('.json', '')
    pdf_chunked_path = os.path.join(chunked_folder, f'chunked_pdf_{pdf_filename}.json')
    _write_json(pdf_chunks, pdf_chunked_path)

    pdf_metadata = create_chunk_metadata(pdf_filename, pdf_chunks)
    pdf_metadata_path = os.path.join(chunked_folder, f'metadata_chunked_pdf_{pdf_filename}.json')
    _write_json(pdf_metadata, pdf_metadata_path)

    print(f"Chunked PDF data saved to {pdf_chunked_path}")
    print(f"PDF chunk metadata saved to {pdf_metadata_path}")


Chunked PPTX data saved to /home/moraa/Documents/GenAI/Prompt_Engineering_for_E-Learning_Content_Creation/artifacts/chunked/chunked_pptx.json
PPTX chunk metadata saved to /home/moraa/Documents/GenAI/Prompt_Engineering_for_E-Learning_Content_Creation/artifacts/chunked/metadata_chunked_pptx.json
Chunked PDF data saved to /home/moraa/Documents/GenAI/Prompt_Engineering_for_E-Learning_Content_Creation/artifacts/chunked/chunked_pdf_1umzTCsbBmuFx4xz9DSMI82oq21tHhbKL.json
PDF chunk metadata saved to /home/moraa/Documents/GenAI/Prompt_Engineering_for_E-Learning_Content_Creation/artifacts/chunked/metadata_chunked_pdf_1umzTCsbBmuFx4xz9DSMI82oq21tHhbKL.json
Chunked PDF data saved to /home/moraa/Documents/GenAI/Prompt_Engineering_for_E-Learning_Content_Creation/artifacts/chunked/chunked_pdf_13oqVt9LYdESPS8XNYFLhSLZMkZk52JXG.json
PDF chunk metadata saved to /home/moraa/Documents/GenAI/Prompt_Engineering_for_E-Learning_Content_Creation/artifacts/chunked/metadata_chunked_pdf_13oqVt9LYdESPS8XNYFLhSLZMk