In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset and wiki dump paths configured below resolve.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import glob
import json
import math
import gc
from tqdm import tqdm
from datetime import datetime

In [None]:
# Root of the mounted Google Drive, and how many wiki dump files are loaded per pass.
base_drive_path = '/content/drive/MyDrive'
CHUNK_SIZE = 18

# Input locations, all resolved relative to the Drive root.
wiki_dir, train_path, dev_path, dev_public_path = (
    os.path.join(base_drive_path, entry)
    for entry in (
        'wiki-pages',
        'train.jsonl',
        'shared_task_dev.jsonl',
        'shared_task_dev_public.jsonl',
    )
)

# Destination folder for the processed JSONL files; created up front if missing.
output_dir = os.path.join(base_drive_path, 'newones/processed_data')
os.makedirs(output_dir, exist_ok=True)
# ===== HELPERS =====
def log(message):
    """Print ``message`` to stdout, prefixed with the current local timestamp.

    The prefix has the form ``[YYYY-MM-DD HH:MM:SS]``.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("[" + stamp + "] " + f"{message}")

In [None]:
# ===== LOAD WIKI CHUNK =====
def _split_wiki_lines(lines_str):
    """Turn a page's raw ``lines`` string into a list indexed by sentence id.

    Each newline-separated entry is split once on the first tab. The leading
    field is parsed as the sentence id and the remainder is stored at that
    index, so ``result[sent_id]`` is correct even when some entries are
    malformed. Entries without a tab, or whose leading field is not a
    non-negative integer, are skipped instead of shifting later sentences.
    Gaps are filled with empty strings.
    """
    sentences = []
    for ln in lines_str.strip().split('\n'):
        parts = ln.split('\t', 1)
        if len(parts) != 2:
            continue
        try:
            sent_id = int(parts[0])
        except ValueError:
            # Not a numbered entry (e.g. a stray fragment); ignore it.
            continue
        if sent_id < 0:
            continue
        if sent_id >= len(sentences):
            sentences.extend([''] * (sent_id + 1 - len(sentences)))
        # NOTE(review): parts[1] keeps everything after the first tab, including
        # any further tab-separated fields on the line - confirm that is intended.
        sentences[sent_id] = parts[1]
    return sentences


def load_wiki_dumps_chunk(wiki_files_chunk):
    """Load a group of wiki dump files into a ``{page_id: [sentence, ...]}`` dict.

    Every file is read as UTF-8 JSON Lines. For each JSON object with a
    non-empty ``lines`` string, the sentences are parsed and stored under
    ``str(page['id'])``. Problems are logged and skipped rather than raised:
    missing files, undecodable lines, non-object JSON values, and any error
    while reading a file (which abandons the rest of that file only).

    Args:
        wiki_files_chunk: Iterable of wiki dump file paths to read.

    Returns:
        dict mapping page id (as ``str``) to a list of sentence strings whose
        index is the sentence id. Pages with an empty or missing ``lines``
        field are not included.
    """
    wiki_dict = {}
    total_loaded_pages = 0

    for wiki_file in tqdm(wiki_files_chunk, desc="Loading wiki chunk"):
        if not os.path.exists(wiki_file):
            log(f"Warning: File not found at {wiki_file}. Skipping.")
            continue

        try:
            with open(wiki_file, 'r', encoding='utf-8') as f:
                for line in f:
                    try:
                        page = json.loads(line)
                    except json.JSONDecodeError:
                        log(f"Error decoding JSON in {wiki_file}, skipping line.")
                        continue

                    # A valid JSON value that is not an object would otherwise
                    # raise on .get() and abort the remainder of this file.
                    if not isinstance(page, dict):
                        log(f"Unexpected non-object JSON in {wiki_file}, skipping line.")
                        continue

                    lines_str = page.get('lines', '')
                    if not isinstance(lines_str, str) or not lines_str:
                        continue

                    wiki_dict[str(page.get('id', ''))] = _split_wiki_lines(lines_str)
                    total_loaded_pages += 1
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not stop the chunk.
            log(f"Error reading file {wiki_file}: {e}")

    log(f"Loaded {total_loaded_pages} wiki pages from this chunk.")
    return wiki_dict

In [None]:
# ===== PROCESS DATASET =====
def process_and_append_dataset(dataset, remaining_indices, wiki_dict, output_path):
    """Resolve evidence sentences for pending examples and append the resolved ones.

    For every index in ``remaining_indices`` (skipping examples labelled
    ``"NOT ENOUGH INFO"``), each evidence entry is read as
    ``[_, _, page_title, sent_id]`` and looked up in ``wiki_dict``. If at least
    one sentence is found, the example gets an ``evidence_text`` key (one list
    of sentence strings per evidence set) and is appended to ``output_path`` as
    a JSON line. The ``dataset`` entries are mutated in place.

    NOTE(review): an example is marked complete as soon as *any* sentence is
    found in the supplied ``wiki_dict``. Evidence entries whose page is not in
    this ``wiki_dict`` are simply left out of ``evidence_text`` - confirm that
    partial evidence is acceptable for downstream use.

    Args:
        dataset: List of example dicts (indexable by the given indices).
        remaining_indices: Iterable of indices into ``dataset`` still pending.
            It is not modified here.
        wiki_dict: Mapping of page id string to list of sentences.
        output_path: JSONL file to append newly completed examples to.

    Returns:
        set of indices that were completed and written during this call.
    """
    newly_completed = set()

    for idx in list(remaining_indices):
        example = dataset[idx]

        if example.get("label") == "NOT ENOUGH INFO":
            continue

        evidences_text = []
        found_any = False

        # `or []` also covers an explicit `"evidence": null` in the input.
        for evidence_set in example.get('evidence') or []:
            evidence_sentences = []
            for ev in evidence_set or []:
                # Skip malformed entries instead of raising IndexError/TypeError
                # and aborting the whole run.
                if not isinstance(ev, (list, tuple)) or len(ev) < 4:
                    continue

                page_title, sent_id = ev[2], ev[3]
                if not page_title or sent_id is None:
                    continue

                page_title_str = str(page_title)
                if page_title_str not in wiki_dict:
                    continue

                sentences = wiki_dict[page_title_str]
                if isinstance(sent_id, int) and 0 <= sent_id < len(sentences):
                    evidence_sentences.append(sentences[sent_id])
                    found_any = True
            evidences_text.append(evidence_sentences)

        # If we found any evidence text, mark as completed
        if found_any:
            example['evidence_text'] = evidences_text
            newly_completed.add(idx)

    # Write only newly completed examples, in index order so that repeated
    # runs produce the same file.
    if newly_completed:
        with open(output_path, 'a', encoding='utf-8') as f_out:
            for idx in sorted(newly_completed):
                f_out.write(json.dumps(dataset[idx]) + "\n")
            f_out.flush()
            os.fsync(f_out.fileno())  # Force write to Drive

    return newly_completed


In [None]:
# ===== MAIN PROCESS (Handles both train & dev) =====
def main(dataset_path, output_dir, wiki_dir, chunk_size=CHUNK_SIZE):
    """Attach evidence sentences to one dataset file, loading wiki dumps in chunks.

    The processed file is written to ``output_dir`` under the same base name
    as ``dataset_path``; any previous file at that location is removed first.
    Examples labelled ``"NOT ENOUGH INFO"`` are written immediately with
    ``evidence_text = [[]]``. The remaining examples are written as soon as a
    wiki chunk yields evidence for them, so the output order differs from the
    input order. Examples for which no chunk yields any evidence are not
    written; their count is logged at the end.

    Args:
        dataset_path: Path to the input JSONL dataset.
        output_dir: Directory that receives the processed JSONL file.
        wiki_dir: Directory containing ``wiki-*.jsonl`` dump files.
        chunk_size: Number of wiki files loaded into memory per pass.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1, or if the output file
            would be the same file as ``dataset_path``.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    output_path = os.path.join(output_dir, os.path.basename(dataset_path))

    # The "clear old output" step below would otherwise delete the input itself.
    if os.path.abspath(output_path) == os.path.abspath(dataset_path):
        raise ValueError(
            f"Output path {output_path} is the same file as the input dataset; "
            "choose a different output_dir."
        )

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Clear old output if exists
    if os.path.exists(output_path):
        os.remove(output_path)
        log(f"Cleared previous output file: {output_path}")

    # Blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f if line.strip()]

    # Pre-handle "NOT ENOUGH INFO" cases
    remaining_indices = set()
    with open(output_path, 'a', encoding='utf-8') as f_out:
        for idx, example in enumerate(dataset):
            if example.get("label") == "NOT ENOUGH INFO":
                example["evidence_text"] = [[]]
                f_out.write(json.dumps(example) + "\n")
            else:
                remaining_indices.add(idx)
        f_out.flush()
        os.fsync(f_out.fileno())

    # Prepare wiki files list
    all_wiki_files = sorted(glob.glob(os.path.join(wiki_dir, 'wiki-*.jsonl')))
    total_files = len(all_wiki_files)
    num_chunks = math.ceil(total_files / chunk_size)

    log(f"Total wiki files found: {total_files}")
    if not all_wiki_files:
        log(f"Warning: no wiki-*.jsonl files found in {wiki_dir}; no evidence can be resolved.")
    log(f"Processing {os.path.basename(dataset_path)} in {num_chunks} chunks (size {chunk_size})...")

    # Iterate over wiki chunks
    for i in range(num_chunks):
        if not remaining_indices:
            log("✅ All examples processed early, stopping.")
            break

        start_index = i * chunk_size
        # Clamp so the final, shorter chunk is reported with its real end index.
        end_index = min(start_index + chunk_size, total_files)
        current_wiki_chunk_files = all_wiki_files[start_index:end_index]

        log(f"--- Chunk {i+1}/{num_chunks} (files {start_index+1} to {end_index}) ---")

        # Load current chunk
        wiki_data_chunk = load_wiki_dumps_chunk(current_wiki_chunk_files)

        # Process only unprocessed examples
        newly_completed = process_and_append_dataset(dataset, remaining_indices, wiki_data_chunk, output_path)

        # Remove completed examples from the set
        remaining_indices -= newly_completed

        log(f"Completed {len(newly_completed)} new examples this chunk. Remaining: {len(remaining_indices)}")

        # Free memory before the next chunk is loaded
        del wiki_data_chunk
        gc.collect()

    if remaining_indices:
        log(
            f"Warning: {len(remaining_indices)} examples had no evidence found "
            "in any wiki chunk and were not written to the output."
        )

    log(f"✅ Processing complete! Output saved to: {output_path}")


# ===== RUN =====
if __name__ == "__main__":
    # Run the pipeline on the train split first, then on the dev split.
    for split_path in (train_path, dev_path):
        main(split_path, output_dir, wiki_dir)


[2025-08-10 14:10:59] Total wiki files found: 109
[2025-08-10 14:10:59] Processing train.jsonl in 7 chunks (size 18)...
[2025-08-10 14:10:59] --- Chunk 1/7 (files 1 to 18) ---


Loading wiki chunk: 100%|██████████| 18/18 [00:26<00:00,  1.49s/it]


[2025-08-10 14:11:26] Loaded 892924 wiki pages from this chunk.
[2025-08-10 14:11:27] Completed 19397 new examples this chunk. Remaining: 90413
[2025-08-10 14:11:29] --- Chunk 2/7 (files 19 to 36) ---


Loading wiki chunk: 100%|██████████| 18/18 [00:24<00:00,  1.33s/it]


[2025-08-10 14:11:53] Loaded 898349 wiki pages from this chunk.
[2025-08-10 14:11:54] Completed 18828 new examples this chunk. Remaining: 71585
[2025-08-10 14:11:56] --- Chunk 3/7 (files 37 to 54) ---


Loading wiki chunk: 100%|██████████| 18/18 [00:21<00:00,  1.21s/it]


[2025-08-10 14:12:17] Loaded 898511 wiki pages from this chunk.
[2025-08-10 14:12:18] Completed 20372 new examples this chunk. Remaining: 51213
[2025-08-10 14:12:20] --- Chunk 4/7 (files 55 to 72) ---


Loading wiki chunk: 100%|██████████| 18/18 [00:23<00:00,  1.31s/it]


[2025-08-10 14:12:43] Loaded 892736 wiki pages from this chunk.
[2025-08-10 14:12:44] Completed 16175 new examples this chunk. Remaining: 35038
[2025-08-10 14:12:46] --- Chunk 5/7 (files 73 to 90) ---


Loading wiki chunk: 100%|██████████| 18/18 [00:22<00:00,  1.27s/it]


[2025-08-10 14:13:09] Loaded 898553 wiki pages from this chunk.
[2025-08-10 14:13:09] Completed 15120 new examples this chunk. Remaining: 19918
[2025-08-10 14:13:11] --- Chunk 6/7 (files 91 to 108) ---


Loading wiki chunk: 100%|██████████| 18/18 [00:22<00:00,  1.26s/it]


[2025-08-10 14:13:33] Loaded 898510 wiki pages from this chunk.
[2025-08-10 14:13:34] Completed 19179 new examples this chunk. Remaining: 739
[2025-08-10 14:13:36] --- Chunk 7/7 (files 109 to 126) ---


Loading wiki chunk: 100%|██████████| 1/1 [00:00<00:00,  2.88it/s]


[2025-08-10 14:13:36] Loaded 16523 wiki pages from this chunk.
[2025-08-10 14:13:36] Completed 64 new examples this chunk. Remaining: 675
[2025-08-10 14:13:38] ✅ Processing complete! Output saved to: /content/drive/MyDrive/newones/processed_data/train.jsonl
[2025-08-10 14:13:38] Total wiki files found: 109
[2025-08-10 14:13:38] Processing shared_task_dev.jsonl in 7 chunks (size 18)...
[2025-08-10 14:13:38] --- Chunk 1/7 (files 1 to 18) ---


Loading wiki chunk: 100%|██████████| 18/18 [00:23<00:00,  1.31s/it]


[2025-08-10 14:14:02] Loaded 892924 wiki pages from this chunk.
[2025-08-10 14:14:02] Completed 2319 new examples this chunk. Remaining: 11013
[2025-08-10 14:14:04] --- Chunk 2/7 (files 19 to 36) ---


Loading wiki chunk: 100%|██████████| 18/18 [00:20<00:00,  1.12s/it]


[2025-08-10 14:14:24] Loaded 898349 wiki pages from this chunk.
[2025-08-10 14:14:24] Completed 2328 new examples this chunk. Remaining: 8685
[2025-08-10 14:14:25] --- Chunk 3/7 (files 37 to 54) ---


Loading wiki chunk: 100%|██████████| 18/18 [00:22<00:00,  1.22s/it]


[2025-08-10 14:14:47] Loaded 898511 wiki pages from this chunk.
[2025-08-10 14:14:47] Completed 2204 new examples this chunk. Remaining: 6481
[2025-08-10 14:14:49] --- Chunk 4/7 (files 55 to 72) ---


Loading wiki chunk: 100%|██████████| 18/18 [00:20<00:00,  1.15s/it]


[2025-08-10 14:15:09] Loaded 892736 wiki pages from this chunk.
[2025-08-10 14:15:10] Completed 1741 new examples this chunk. Remaining: 4740
[2025-08-10 14:15:11] --- Chunk 5/7 (files 73 to 90) ---


Loading wiki chunk: 100%|██████████| 18/18 [00:21<00:00,  1.17s/it]


[2025-08-10 14:15:32] Loaded 898553 wiki pages from this chunk.
[2025-08-10 14:15:32] Completed 2038 new examples this chunk. Remaining: 2702
[2025-08-10 14:15:34] --- Chunk 6/7 (files 91 to 108) ---


Loading wiki chunk: 100%|██████████| 18/18 [00:21<00:00,  1.18s/it]


[2025-08-10 14:15:55] Loaded 898510 wiki pages from this chunk.
[2025-08-10 14:15:55] Completed 2593 new examples this chunk. Remaining: 109
[2025-08-10 14:15:57] --- Chunk 7/7 (files 109 to 126) ---


Loading wiki chunk: 100%|██████████| 1/1 [00:00<00:00,  3.36it/s]


[2025-08-10 14:15:57] Loaded 16523 wiki pages from this chunk.
[2025-08-10 14:15:57] Completed 6 new examples this chunk. Remaining: 103
[2025-08-10 14:15:58] ✅ Processing complete! Output saved to: /content/drive/MyDrive/newones/processed_data/shared_task_dev.jsonl
