In [1]:
import json
from transformers import BertTokenizerFast
import os
from tqdm import tqdm

In [6]:
# Shared cased-BERT tokenizer. The "Fast" variant is used because the
# preprocessing below asks the tokenizer for character offset mappings.
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-cased',
)

def _char_span_to_token_indices(offsets, start_char, end_char):
    """Return indices of tokens whose character span lies inside [start_char, end_char].

    Zero-width offsets are skipped. Hugging Face fast tokenizers report
    (0, 0) for special and padding tokens, which would otherwise satisfy the
    containment test for every span that starts at character 0 and stretch
    the resulting token range across the whole padded sequence.
    """
    return [
        idx for idx, (tok_start, tok_end) in enumerate(offsets)
        if tok_end > tok_start and tok_start >= start_char and tok_end <= end_char
    ]


def _write_json(records, path):
    """Serialise `records` to `path` as indented, non-ASCII-escaped UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)


def preprocess_dataset(input_path, output_dir, is_train=False, chunk_size=2048,
                       max_length=512, max_speaker_span=20):
    """Tokenize a JSON dataset and convert character spans to token spans.

    Each input entry is read through the keys 'context' (text passed to the
    tokenizer), 'target_dialogue' (a pair unpacked as start/end character
    positions) and 'speaker_position' (an iterable of such pairs).
    NOTE(review): the spans are matched by full containment against the
    tokenizer's offsets, so they are presumably character offsets into
    'context' -- confirm against the dataset builder.

    Entries for which no speaker span survives alignment are skipped and
    reported with the message "empty valid_speaker_indices".

    Args:
        input_path (str): Path of the JSON file to read.
        output_dir (str): Directory that receives the output file(s). It is
            created if it does not exist.
        is_train (bool): When True, results are written as
            ``chunk_<n>.json`` files of ``chunk_size`` entries each (the last
            chunk may be shorter). When False, a single file is written whose
            name is the input basename with 'processed_' removed.
        chunk_size (int): Number of entries per chunk when ``is_train`` is True.
        max_length (int): Sequence length used for truncation and padding.
        max_speaker_span (int): A speaker span is kept only if
            ``last_token - first_token`` is strictly below this value.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Create the destination up front so a missing directory is not discovered
    # only when the first chunk is about to be written.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    processed_data = []
    chunk_count = 0  # Index of the next chunk file to write

    for entry in tqdm(data, desc="Processing data"):
        context = entry['context']
        target_start_char, target_end_char = entry['target_dialogue']
        speaker_positions = entry['speaker_position']

        # Tokenize and get the character offsets of every token
        encoding = tokenizer(
            context,
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_offsets_mapping=True
        )
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        offsets = encoding['offset_mapping']

        # Align the target dialogue with token indices. When no token falls
        # inside the span (e.g. it was truncated away) the span falls back to
        # [0, 0], as before.
        target_token_indices = _char_span_to_token_indices(
            offsets, target_start_char, target_end_char
        )
        if target_token_indices:
            target_indices = [target_token_indices[0], target_token_indices[-1]]
        else:
            target_indices = [0, 0]

        # Align speaker mentions; drop those that vanish or are too wide.
        valid_speaker_indices = []
        for sp_start, sp_end in speaker_positions:
            span = _char_span_to_token_indices(offsets, sp_start, sp_end)
            if span and (span[-1] - span[0]) < max_speaker_span:
                valid_speaker_indices.append([span[0], span[-1]])

        if not valid_speaker_indices:
            # tqdm.write keeps the progress bar intact, unlike a bare print().
            tqdm.write("empty valid_speaker_indices")
            continue

        # Token strings are only needed for entries that are actually kept.
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        # Stored spans are inclusive [first, last], hence the +1 when slicing.
        target_tokens = tokens[target_indices[0]:target_indices[1] + 1]
        speaker_tokens = [
            tokens[start:end + 1] for (start, end) in valid_speaker_indices
        ]

        processed_data.append({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'target_indices': target_indices,
            'target_tokens': target_tokens,
            'speaker_indices': valid_speaker_indices,
            'speaker_tokens': speaker_tokens
        })

        # Training data is flushed to disk every `chunk_size` kept entries.
        if is_train and len(processed_data) >= chunk_size:
            chunk_path = os.path.join(output_dir, f"chunk_{chunk_count}.json")
            _write_json(processed_data, chunk_path)
            print(f"Saved chunk {chunk_count} with {len(processed_data)} entries to {chunk_path}")
            processed_data = []  # Reset for the next chunk
            chunk_count += 1

    # Write whatever is left: the final partial chunk, or the whole split.
    if processed_data:
        if is_train:
            chunk_path = os.path.join(output_dir, f"chunk_{chunk_count}.json")
            _write_json(processed_data, chunk_path)
            print(f"Saved final chunk {chunk_count} to {chunk_path}")
        else:
            output_path = os.path.join(
                output_dir, os.path.basename(input_path).replace('processed_', '')
            )
            _write_json(processed_data, output_path)
            print(f"Saved processed data to {output_path}")

In [9]:

# Training split: tokenize the raw file and store the result as chunk_<n>.json files.
input_path = r"C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\LiteraryTextsDataset\dataset\processed_train.json"
output_path = r"C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\train"
preprocess_dataset(
    input_path=input_path,
    output_dir=output_path,
    is_train=True,
)

Processing data:   2%|▏         | 445/21226 [00:00<00:31, 663.23it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:   7%|▋         | 1444/21226 [00:02<00:27, 732.24it/s]

empty valid_speaker_indices


Processing data:   8%|▊         | 1676/21226 [00:02<00:27, 702.69it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:   9%|▉         | 1892/21226 [00:02<00:28, 684.00it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  10%|█         | 2163/21226 [00:04<01:52, 169.60it/s]

Saved chunk 0 with 2048 entries to C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\train\chunk_0.json


Processing data:  12%|█▏        | 2515/21226 [00:05<00:40, 458.09it/s]

empty valid_speaker_indices


Processing data:  15%|█▌        | 3199/21226 [00:06<00:26, 674.15it/s]

empty valid_speaker_indices


Processing data:  15%|█▌        | 3269/21226 [00:06<00:26, 681.27it/s]

empty valid_speaker_indices


Processing data:  17%|█▋        | 3513/21226 [00:06<00:34, 517.87it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  19%|█▉        | 4010/21226 [00:07<00:23, 718.27it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  20%|█▉        | 4221/21226 [00:09<01:35, 177.90it/s]

Saved chunk 1 with 2048 entries to C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\train\chunk_1.json


Processing data:  22%|██▏       | 4674/21226 [00:09<00:29, 552.83it/s]

empty valid_speaker_indices


Processing data:  23%|██▎       | 4833/21226 [00:10<00:24, 659.96it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  25%|██▍       | 5217/21226 [00:10<00:22, 720.08it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  26%|██▌       | 5441/21226 [00:10<00:21, 727.43it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  27%|██▋       | 5819/21226 [00:11<00:21, 731.21it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  28%|██▊       | 5893/21226 [00:11<00:21, 698.84it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  30%|██▉       | 6323/21226 [00:13<01:22, 180.31it/s]

Saved chunk 2 with 2048 entries to C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\train\chunk_2.json


Processing data:  33%|███▎      | 7010/21226 [00:14<00:23, 603.80it/s]

empty valid_speaker_indices


Processing data:  36%|███▌      | 7689/21226 [00:15<00:18, 730.99it/s]

empty valid_speaker_indices


Processing data:  38%|███▊      | 7987/21226 [00:15<00:18, 708.55it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  39%|███▊      | 8224/21226 [00:16<00:17, 754.95it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  39%|███▉      | 8384/21226 [00:17<01:03, 200.95it/s]

Saved chunk 3 with 2048 entries to C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\train\chunk_3.json


Processing data:  40%|███▉      | 8448/21226 [00:17<00:52, 243.62it/s]

empty valid_speaker_indices


Processing data:  41%|████▏     | 8762/21226 [00:18<00:24, 511.02it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  43%|████▎     | 9053/21226 [00:18<00:19, 618.47it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  45%|████▌     | 9624/21226 [00:19<00:17, 667.85it/s]

empty valid_speaker_indices


Processing data:  46%|████▋     | 9847/21226 [00:19<00:15, 713.37it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  49%|████▉     | 10357/21226 [00:22<01:21, 133.64it/s]

Saved chunk 4 with 2048 entries to C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\train\chunk_4.json
empty valid_speaker_indices


Processing data:  50%|█████     | 10712/21226 [00:22<00:25, 415.73it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  52%|█████▏    | 11030/21226 [00:23<00:15, 662.45it/s]

empty valid_speaker_indices


Processing data:  53%|█████▎    | 11187/21226 [00:23<00:14, 706.56it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  59%|█████▊    | 12468/21226 [00:26<00:54, 161.48it/s]

Saved chunk 5 with 2048 entries to C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\train\chunk_5.json


Processing data:  61%|██████    | 12910/21226 [00:27<00:16, 513.70it/s]

empty valid_speaker_indices


Processing data:  64%|██████▍   | 13582/21226 [00:28<00:10, 701.64it/s]

empty valid_speaker_indices


Processing data:  67%|██████▋   | 14173/21226 [00:29<00:10, 669.74it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  67%|██████▋   | 14311/21226 [00:29<00:10, 670.70it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  68%|██████▊   | 14520/21226 [00:31<00:37, 177.11it/s]

Saved chunk 6 with 2048 entries to C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\train\chunk_6.json


Processing data:  72%|███████▏  | 15372/21226 [00:32<00:08, 704.87it/s]

empty valid_speaker_indices


Processing data:  74%|███████▍  | 15731/21226 [00:32<00:08, 655.26it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  75%|███████▍  | 15885/21226 [00:33<00:07, 706.19it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  76%|███████▌  | 16030/21226 [00:33<00:07, 688.20it/s]

empty valid_speaker_indices


Processing data:  77%|███████▋  | 16317/21226 [00:33<00:07, 695.04it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  78%|███████▊  | 16584/21226 [00:35<00:30, 153.77it/s]

Saved chunk 7 with 2048 entries to C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\train\chunk_7.json
empty valid_speaker_indices


Processing data:  80%|████████  | 17064/21226 [00:36<00:07, 552.80it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  81%|████████▏ | 17289/21226 [00:36<00:06, 647.45it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  82%|████████▏ | 17496/21226 [00:37<00:05, 643.73it/s]

empty valid_speaker_indices


Processing data:  83%|████████▎ | 17714/21226 [00:37<00:05, 696.59it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  86%|████████▌ | 18256/21226 [00:38<00:03, 767.90it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  88%|████████▊ | 18632/21226 [00:40<00:14, 182.08it/s]

Saved chunk 8 with 2048 entries to C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\train\chunk_8.json


Processing data:  91%|█████████ | 19290/21226 [00:40<00:03, 630.86it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  92%|█████████▏| 19584/21226 [00:41<00:02, 677.63it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  94%|█████████▍| 19917/21226 [00:41<00:02, 627.79it/s]

empty valid_speaker_indices


Processing data:  96%|█████████▋| 20471/21226 [00:42<00:01, 670.20it/s]

empty valid_speaker_indices


Processing data:  97%|█████████▋| 20673/21226 [00:44<00:03, 164.12it/s]

Saved chunk 9 with 2048 entries to C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\train\chunk_9.json
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  99%|█████████▊| 20948/21226 [00:44<00:00, 394.11it/s]

empty valid_speaker_indices


Processing data: 100%|█████████▉| 21154/21226 [00:45<00:00, 534.08it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data: 100%|██████████| 21226/21226 [00:45<00:00, 467.83it/s]


empty valid_speaker_indices
Saved final chunk 10 to C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\train\chunk_10.json


In [10]:
# Validation split: written as a single JSON file inside the dataset directory.
input_path = r"C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\LiteraryTextsDataset\dataset\processed_val.json"
output_path = r"C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001"
preprocess_dataset(
    input_path=input_path,
    output_dir=output_path,
)

Processing data:  13%|█▎        | 228/1793 [00:00<00:02, 766.45it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  22%|██▏       | 388/1793 [00:00<00:01, 782.78it/s]

empty valid_speaker_indices


Processing data:  31%|███       | 555/1793 [00:00<00:01, 805.60it/s]

empty valid_speaker_indices


Processing data:  49%|████▉     | 884/1793 [00:01<00:01, 802.95it/s]

empty valid_speaker_indices


Processing data:  63%|██████▎   | 1125/1793 [00:01<00:00, 776.96it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  72%|███████▏  | 1282/1793 [00:01<00:00, 773.08it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  80%|████████  | 1437/1793 [00:01<00:00, 763.27it/s]

empty valid_speaker_indices


Processing data: 100%|██████████| 1793/1793 [00:02<00:00, 779.46it/s]


Saved processed data to C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\val.json


In [11]:
# Test split: written as a single JSON file inside the dataset directory.
input_path = r"C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\LiteraryTextsDataset\dataset\processed_test.json"
output_path = r"C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001"
preprocess_dataset(
    input_path=input_path,
    output_dir=output_path,
)

Processing data:  17%|█▋        | 614/3669 [00:00<00:03, 785.42it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  28%|██▊       | 1014/3669 [00:01<00:03, 782.94it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  36%|███▋      | 1331/3669 [00:01<00:03, 740.06it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  42%|████▏     | 1555/3669 [00:02<00:02, 721.08it/s]

empty valid_speaker_indices


Processing data:  53%|█████▎    | 1945/3669 [00:02<00:02, 757.12it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  57%|█████▋    | 2099/3669 [00:02<00:02, 759.13it/s]

empty valid_speaker_indices


Processing data:  63%|██████▎   | 2318/3669 [00:03<00:02, 633.25it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  71%|███████▏  | 2619/3669 [00:03<00:01, 728.92it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  78%|███████▊  | 2857/3669 [00:03<00:01, 760.35it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  82%|████████▏ | 3018/3669 [00:04<00:00, 780.97it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  91%|█████████ | 3340/3669 [00:04<00:00, 786.22it/s]

empty valid_speaker_indices
empty valid_speaker_indices
empty valid_speaker_indices


Processing data:  95%|█████████▌| 3503/3669 [00:04<00:00, 777.64it/s]

empty valid_speaker_indices
empty valid_speaker_indices


Processing data: 100%|██████████| 3669/3669 [00:04<00:00, 741.64it/s]

empty valid_speaker_indices
empty valid_speaker_indices





Saved processed data to C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\test.json


In [3]:
import os
import json

def count_entries_in_json(file_path):
    """
    Report how many top-level entries a JSON file holds.

    Args:
        file_path (str): Location of the JSON file to inspect.

    Returns:
        int: ``len()`` of the object decoded from the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    entry_total = len(parsed)
    return entry_total

def generate_dataset_report(train_dir, test_file, val_file):
    """
    Print a short summary of how many entries each dataset split contains.

    The training count is the sum over every ``.json`` file found directly
    in ``train_dir``; the test and validation counts come from one file each.

    Args:
        train_dir (str): Directory holding the training JSON files.
        test_file (str): Path to the test JSON file.
        val_file (str): Path to the validation JSON file.
    """
    # Collect the training shards first, then add up their sizes.
    shard_names = []
    for name in os.listdir(train_dir):
        if name.endswith('.json'):
            shard_names.append(name)
    total_train_entries = sum(
        count_entries_in_json(os.path.join(train_dir, name))
        for name in shard_names
    )

    # Single-file splits
    test_entries = count_entries_in_json(test_file)
    val_entries = count_entries_in_json(val_file)

    # Emit the report line by line
    report_lines = (
        "===== Dataset Report =====",
        f"Total Training Entries: {total_train_entries}",
        f"Total Validation Entries: {val_entries}",
        f"Total Test Entries: {test_entries}",
        "==========================",
    )
    for line in report_lines:
        print(line)

# Summarise the preprocessed splits produced by the cells above
train_dir = r"C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\train"
val_file = r"C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\val.json"
test_file = r"C:\Users\Lenovo\OneDrive\NUS\CS-24fall\project\AudiobookGeneration_cs5647\Dataset_SID_001\test.json"

generate_dataset_report(
    train_dir=train_dir,
    test_file=test_file,
    val_file=val_file,
)


===== Dataset Report =====
Total Training Entries: 21111
Total Validation Entries: 1779
Total Test Entries: 3641
