In [12]:
import json
import os
from pathlib import Path
from google.colab import drive

In [13]:
# Mount Google Drive inside the Colab runtime (force_remount=True is passed on every run).
drive.mount(mountpoint='/content/drive', force_remount=True)

# Source and destination folders. Both are relative paths, so they only point
# inside the mount when the working directory is /content
# (presumably the Colab default - verify if a cell changes directory).
pathToExport = "drive/MyDrive/chunking-dataset"
pathToDataset = "drive/MyDrive/pubmed-dataset"

Mounted at /content/drive


In [15]:
def save_articles(articles, output_folder, dataset_type, file_count):
    """Write one chunk of articles to ``<dataset_type>_part-<file_count>.json``.

    Args:
        articles: Sized, JSON-serialisable collection of articles to dump.
        output_folder: Directory (str or path-like) receiving the file; it is
            created, including parents, when it does not exist yet.
        dataset_type: Prefix used in the file name (e.g. ``"test"``).
        file_count: Chunk index used as the part number in the file name.

    Raises:
        TypeError: If ``articles`` contains objects ``json.dump`` cannot serialise.
        OSError: If the folder or the file cannot be created or written.
    """
    # Make the helper usable on its own, not only after split_file() has
    # already created the destination folder.
    os.makedirs(output_folder, exist_ok=True)

    file_name = f'{dataset_type}_part-{file_count}.json'
    output_path = os.path.join(output_folder, file_name)
    # Explicit encoding so the result does not depend on the platform locale.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(articles, outfile)
    print(f'File {output_path} created with {len(articles)} articles')

def split_file(input_file, output_base_folder, dataset_type, articles_per_file, max_files=10):
    """Split a file with one JSON document per line into numbered JSON chunks.

    Each non-blank line of ``input_file`` is parsed with ``json.loads``. Parsed
    articles are buffered and handed to ``save_articles`` every
    ``articles_per_file`` articles. A final, smaller chunk is written for the
    remainder as long as the ``max_files`` limit has not been reached.

    Args:
        input_file: Path (str or path-like) of the line-delimited JSON file.
        output_base_folder: Base output directory; chunks are written to the
            ``dataset_type`` sub-folder, which is created if needed.
        dataset_type: Name of the sub-folder and prefix of the chunk files.
        articles_per_file: Number of articles stored in each full chunk.
        max_files: Maximum number of chunk files to write; reading stops once
            this many chunks have been saved.

    Returns:
        None. Progress and decoding problems are reported with ``print``.
    """
    output_folder = Path(output_base_folder) / dataset_type
    output_folder.mkdir(parents=True, exist_ok=True)

    # Upper bound on how much of a malformed line is echoed, so one broken
    # (and possibly very long) article cannot flood the cell output.
    preview_chars = 200

    articles = []
    file_count = 0

    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(input_file, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            if file_count >= max_files:
                break

            # Blank lines (e.g. a trailing newline at end of file) hold no
            # article; skip them instead of reporting a decode error.
            if not line.strip():
                continue

            # Only the parsing step is guarded; saving errors must surface.
            try:
                article = json.loads(line)
            except json.JSONDecodeError as e:
                preview = line.strip()
                if len(preview) > preview_chars:
                    preview = preview[:preview_chars] + "..."
                print(f"Error decoding JSON in file {input_file}, line {line_number}: {preview}")
                print(f"Error message: {e}")
                continue

            articles.append(article)

            if len(articles) == articles_per_file:
                save_articles(articles, output_folder, dataset_type, file_count)
                articles = []
                file_count += 1

    # Flush the last, partially filled chunk if the file limit allows it.
    if articles and file_count < max_files:
        save_articles(articles, output_folder, dataset_type, file_count)

# Run the split: every dataset file is cut into parts of 1000 articles,
# with a per-split cap on the number of part files.
base_path = Path(pathToDataset)
output_base = Path(pathToExport)

for split_name, part_limit in (("test", 7), ("train", 500), ("val", 7)):
    split_file(base_path / f"{split_name}.txt", output_base, split_name, 1000, part_limit)

File drive/MyDrive/chunking-dataset/test/test_part-0.json created with 1000 articles
File drive/MyDrive/chunking-dataset/test/test_part-1.json created with 1000 articles
File drive/MyDrive/chunking-dataset/test/test_part-2.json created with 1000 articles
File drive/MyDrive/chunking-dataset/test/test_part-3.json created with 1000 articles
File drive/MyDrive/chunking-dataset/test/test_part-4.json created with 1000 articles
File drive/MyDrive/chunking-dataset/test/test_part-5.json created with 1000 articles
File drive/MyDrive/chunking-dataset/test/test_part-6.json created with 658 articles
File drive/MyDrive/chunking-dataset/train/train_part-0.json created with 1000 articles
File drive/MyDrive/chunking-dataset/train/train_part-1.json created with 1000 articles
File drive/MyDrive/chunking-dataset/train/train_part-2.json created with 1000 articles
File drive/MyDrive/chunking-dataset/train/train_part-3.json created with 1000 articles
File drive/MyDrive/chunking-dataset/train/train_part-4.jso