In [1]:
import json
import math
import os
import random

In [3]:
def _write_jsonl(path, records):
    """Write each item of `records` to `path` as one JSON document per line."""
    with open(path, 'w', encoding='utf-8') as f:
        for item in records:
            f.write(json.dumps(item) + '\n')


def split_jsonl(input_file, train_proportion=0.8, val_proportion=0.1, test_proportion=0.1, seed=None):
    """Shuffle a JSONL file and split it into train / valid / test files.

    The three output files are written into the same directory as
    `input_file` and are named ``train``, ``valid`` and ``test`` followed by
    the input file's extension (``.jsonl`` when the input has none). Existing
    files with those names are overwritten.

    Args:
        input_file: Path to a file containing one JSON document per line.
            Blank lines are skipped; lines that fail to parse are reported
            with ``print`` and dropped.
        train_proportion: Fraction of the records written to the train file.
        val_proportion: Fraction of the records written to the valid file.
        test_proportion: Fraction nominally assigned to the test file. It is
            only used for validation: the test file receives every record
            left over after the train and valid slices are taken.
        seed: Optional seed for the shuffle. When given, the same input
            produces the same split on every call. When ``None`` (default),
            the module-level ``random`` state is used, as before.

    Returns:
        A ``(train_file, val_file, test_file)`` tuple of the written paths.

    Raises:
        ValueError: If any proportion is negative or the proportions do not
            sum to 1 (within floating-point tolerance).
    """
    proportions = (train_proportion, val_proportion, test_proportion)
    if any(p < 0 for p in proportions):
        raise ValueError("Train, validation, and test proportions must be non-negative.")
    # Exact float equality rejects valid inputs such as 0.7 + 0.2 + 0.1,
    # which evaluates to 0.9999999999999999.
    if not math.isclose(sum(proportions), 1.0):
        raise ValueError("Train, validation, and test proportions must sum to 1.")

    data = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank or trailing empty lines are not records and not errors.
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON line: {line} - {e}")

    if seed is None:
        # Keep honouring any random.seed() the caller set globally.
        random.shuffle(data)
    else:
        # A private generator gives a reproducible split without touching
        # the global random state.
        random.Random(seed).shuffle(data)

    total_samples = len(data)
    # int() truncates, so rounding remainders end up in the test split.
    train_split = int(train_proportion * total_samples)
    val_split_end = train_split + int(val_proportion * total_samples)

    train_data = data[:train_split]
    val_data = data[train_split:val_split_end]
    test_data = data[val_split_end:]

    # Use the input's real parent directory, not just its first path
    # component, so nested, absolute and bare-filename inputs all work.
    output_dir = os.path.dirname(input_file)
    ext = os.path.splitext(input_file)[1] or '.jsonl'
    train_file = os.path.join(output_dir, f"train{ext}")
    val_file = os.path.join(output_dir, f"valid{ext}")
    test_file = os.path.join(output_dir, f"test{ext}")

    _write_jsonl(train_file, train_data)
    _write_jsonl(val_file, val_data)
    _write_jsonl(test_file, test_data)

    return train_file, val_file, test_file

In [4]:
# Split the Phase-1 dataset with the default 80/10/10 proportions. The cell
# output is the (train, valid, test) tuple of file paths that split_jsonl returns.
# NOTE(review): no seed is passed, so re-running this cell may produce a
# different split unless the global `random` state was seeded beforehand.
split_jsonl("Phase-1-Data/Data_for_MLX_fine_tuning.jsonl")

('Phase-1-Data/train.jsonl',
 'Phase-1-Data/valid.jsonl',
 'Phase-1-Data/test.jsonl')

In [5]:
# Split the Phase-2 dataset with the default 80/10/10 proportions. The cell
# output is the (train, valid, test) tuple of file paths that split_jsonl returns.
# NOTE(review): no seed is passed, so re-running this cell may produce a
# different split unless the global `random` state was seeded beforehand.
split_jsonl("Phase-2-Data/Data_for_MLX_fine_tuning.jsonl")

('Phase-2-Data/train.jsonl',
 'Phase-2-Data/valid.jsonl',
 'Phase-2-Data/test.jsonl')

In [6]:
# Split the Phase-3 dataset with the default 80/10/10 proportions. The cell
# output is the (train, valid, test) tuple of file paths that split_jsonl returns.
# NOTE(review): no seed is passed, so re-running this cell may produce a
# different split unless the global `random` state was seeded beforehand.
split_jsonl("Phase-3-Data/Data_for_MLX_fine_tuning.jsonl")

('Phase-3-Data/train.jsonl',
 'Phase-3-Data/valid.jsonl',
 'Phase-3-Data/test.jsonl')