In [None]:
# Path of the JSON dataset, resolved relative to the current working directory.
data_path = 'Final Data/data.json'

# Fraction of every label's records assigned to each split.
# NOTE: the split code only reads train_ratio and val_ratio; the test split
# receives whatever is left over, so test_ratio is validated here to keep
# the three numbers honest.
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Fail fast on an inconsistent configuration instead of silently producing
# splits whose sizes do not match the documented ratios.
if min(train_ratio, val_ratio, test_ratio) < 0:
    raise ValueError("train_ratio, val_ratio and test_ratio must be non-negative")
# Tolerance instead of == because the ratios are binary floats.
if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError(
        "train_ratio + val_ratio + test_ratio must equal 1.0, got "
        f"{train_ratio + val_ratio + test_ratio}"
    )

In [None]:
import json
import random
from collections import defaultdict
from pathlib import Path
import matplotlib.pyplot as plt

In [None]:
# === 1. Load dataset ===
# Pin the encoding: without it open() falls back to the platform default
# (e.g. cp1252 on Windows), which can mis-decode non-ASCII text in the JSON.
# Later cells call data.items(), so the top-level JSON value is expected to
# be an object.
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# === 2. Group UUIDs by label ===
# Build label -> [ids] in the iteration order of `data`. Records whose
# 'label' is missing or falsy are left out of every group.
label_to_uuids = defaultdict(list)
for uid, profile in data.items():
    label = profile.get('label')
    if not label:
        continue
    label_to_uuids[label].append(uid)



: 

In [None]:
# === 3. Stratified split ===
# Each label's ids are split separately so that every split keeps roughly
# the same label proportions as the full dataset.
train_set, val_set, test_set = {}, {}, {}
random.seed(42)  # fixed seed so the shuffles below are repeatable

for label, uuids in label_to_uuids.items():
    # Shuffle a copy. Shuffling `uuids` in place would reorder the lists
    # stored in label_to_uuids, so re-running this cell would start from an
    # already-shuffled order and yield a different split despite the seed.
    shuffled = list(uuids)
    random.shuffle(shuffled)

    n = len(shuffled)
    # int() truncates, so train and val are rounded down and the test slice
    # absorbs the remainder (a label with very few records may therefore
    # end up only in the test split).
    train_end = int(train_ratio * n)
    val_end = train_end + int(val_ratio * n)

    train_uuids = shuffled[:train_end]
    val_uuids = shuffled[train_end:val_end]
    test_uuids = shuffled[val_end:]

    train_set.update({uid: data[uid] for uid in train_uuids})
    val_set.update({uid: data[uid] for uid in val_uuids})
    test_set.update({uid: data[uid] for uid in test_uuids})

print(f"Train size: {len(train_set)}")
print(f"Validation size: {len(val_set)}")
print(f"Test size: {len(test_set)}")


# === 4. Plot label distributions ===
def plot_label_distribution(dataset, title):
    """Show a horizontal bar chart of how many records carry each label.

    Args:
        dataset: Mapping whose values provide ``.get('label')``. Values with
            a missing or falsy label are not counted.
        title: Text used as the chart title.
    """
    tally = {}
    for record in dataset.values():
        tag = record.get('label')
        if not tag:
            continue
        tally[tag] = tally.get(tag, 0) + 1

    # Bars are listed in the order each label is first encountered.
    names = list(tally)
    totals = list(tally.values())

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(names, totals, color='skyblue')
    ax.set_xlabel("Count")
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

# One chart per split, in train / validation / test order.
for _split, _heading in (
    (train_set, "Train Set Label Distribution"),
    (val_set, "Validation Set Label Distribution"),
    (test_set, "Test Set Label Distribution"),
):
    plot_label_distribution(_split, _heading)
