# VidProM Prompt Scorer Quality Labeling
Stratified sampling from 100k clustered prompts for manual quality annotation.

In [1]:
import json
import random

import numpy as np
import pandas as pd

# --- Reproducibility -------------------------------------------------------
# A single seed is shared by Python's `random`, NumPy's legacy global RNG and
# the pandas sampling call further down (which receives it as `random_state`).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# --- Sampling --------------------------------------------------------------
# Number of prompts drawn from each cluster for manual annotation.
SAMPLE_PER_CLUSTER = 25

# --- Files -----------------------------------------------------------------
# Input: clustered prompt table. Output: the subset handed to the annotator.
DATA_PATH = "../data/sample_clustered.csv"
OUTPUT_PATH = "../data/to_label.csv"

## Load Data

In [2]:
# Read the clustered prompt table from the path defined in the config cell.
df = pd.read_csv(DATA_PATH)

# One-line size summary: total rows and number of distinct cluster ids.
print(f"Dataset: {df.shape[0]:,} prompts, {df['cluster'].nunique()} clusters")

# Preview the first three rows; left as the last expression so it is rendered.
df.iloc[:3]

Dataset: 100,000 prompts, 10 clusters


Unnamed: 0,uuid,prompt,time,toxicity,obscene,identity_attack,insult,threat,sexual_explicit,cluster
0,82dd39cd-6603-5bb7-8445-a6ad33558888,a majestic oak tree swaying in a stormy winds,Wed Nov 29 23:58:48 2023,0.00208,0.0005,0.00013,0.00122,6e-05,6e-05,9
1,1752894f-9954-580e-9516-f9a991edb6d6,Two people is glad in the beautiful beach -- a...,Wed Dec 13 01:54:52 2023,0.00045,0.00025,0.00015,0.0002,0.00011,7e-05,9
2,d6876be6-8350-5c17-9187-0ec77af7b4e9,"In the heart of the Enchanted Forest,little gi...",Sat Nov 4 04:26:28 2023,0.00056,0.00016,0.0001,0.00023,4e-05,7e-05,9


## Stratified Sample
25 prompts per cluster → 250 total for manual labeling

In [3]:
# Draw the same number of prompts from every cluster so that each cluster is
# equally represented in the labeling set. `random_state=SEED` makes the draw
# repeatable from run to run.
# NOTE: sampling is without replacement, so pandas raises ValueError here if
# any cluster holds fewer than SAMPLE_PER_CLUSTER rows.
sample_df = (
    df.groupby("cluster")
    .sample(n=SAMPLE_PER_CLUSTER, random_state=SEED)
    .reset_index(drop=True)
)

# Only the identifier, the prompt text and the cluster id are exported for
# annotation; the remaining columns stay in `sample_df` for the later merge.
sample_df[['uuid', 'prompt', 'cluster']].to_csv(OUTPUT_PATH, index=False)

# The arrow in this message was mis-encoded ("‚Üí"); restored to "→".
print(f"Saved {len(sample_df)} prompts → {OUTPUT_PATH}")

Saved 250 prompts ‚Üí ../data/to_label.csv


## Export Batches for Manual Labeling

In [4]:
# Re-read the exported CSV so the batches show exactly what was written to disk.
to_label = pd.read_csv(OUTPUT_PATH)
BATCH_SIZE = 25

# Walk the table in consecutive windows of BATCH_SIZE rows; `i` is the
# positional offset of the first row of each window.
for i in range(0, len(to_label), BATCH_SIZE):
    print(f"\n-- Batch {i//BATCH_SIZE + 1} ---")

    # Turn the window into a list of plain dicts, one per prompt, and print
    # each as "<uuid>, <prompt>" for copy-pasting into the labeling tool.
    window = to_label.iloc[i:i + BATCH_SIZE]
    for row in window.to_dict("records"):
        print(f"{row['uuid']}, {row['prompt']}")


-- Batch 1 ---
0d73088c-5152-53fb-b971-7383cfdd748b, video theme  Message: Pigeon00b (Font: RETRO)  
cc3eb2bc-12fc-5201-ae35-546d7b6400e7, girl fight with eval dragon  Message: 1 Attachment  
1979db12-bf44-5a7b-a9cc-952f0d1c71ec, a man smiling  Message: 1 Attachment  
bd6800be-46e0-59e7-9c05-565dfce6191e, make me a logo with the name patrao  
7dbe13d4-f6d3-5214-918b-feee429ef7cd, viking runic carved motifs, ancient art, museum display  Message: The Stone (Font: MODERN)  
354f6641-014f-5f98-8c1d-39f0bc1ea3a7, Attached is the text of the AI Technology Aler  Message: 1 Attachment  
f90259a2-42f2-5547-ba3e-3c408d5629cd, sleep girl  Message: 1 Attachment  
12564813-b7df-58e8-9a26-021a6fb95ac5, BLUE SKY WITH WHITE CLOUDS  Message: CDATA (Font: COMICS)  
bec628dd-79db-5741-a7c4-54716a45d16f, image:retouch_2024021914011823.jpg  
05fb9ae5-668d-5419-9d94-5c54748b94d9, Starry Night by Van Gogh, painting, large moon, /animate, -w0.7  Message: SEAN BOON (Font: MODERN)  
58caf766-a129-58ff-8714-d25

## Load & Validate Labels

In [5]:
# Score columns summarised in the distribution table below.
score_cols = ['specificity', 'clarity', 'visual_richness', 'overall']

# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# read does not depend on the platform's default locale encoding.
with open("../data/labels_raw.json", encoding="utf-8") as f:
    labels = json.load(f)

labels_df = pd.DataFrame(labels)

print(f"Labels loaded: {len(labels_df)} entries")
print("\nScore distributions:")
print(labels_df[score_cols].describe().round(2))

# Cross-check the label ids against the prompts that were exported for
# labeling. The denominator is taken from the exported file itself rather than
# hard-coded, so the report stays correct if the sample size changes.
to_label = pd.read_csv(OUTPUT_PATH)
matched = labels_df['uuid'].isin(to_label['uuid']).sum()
print(f"\nMatched UUIDs: {matched}/{len(to_label)}")

Labels loaded: 250 entries

Score distributions:
       specificity  clarity  visual_richness  overall
count       250.00   250.00           250.00   250.00
mean          3.19     3.73             1.73     2.75
std           1.16     0.91             0.92     0.94
min           1.00     1.00             1.00     1.00
25%           2.00     3.00             1.00     2.00
50%           3.00     4.00             2.00     3.00
75%           4.00     4.00             2.00     3.00
max           5.00     5.00             5.00     5.00

Matched UUIDs: 250/250


## Merge & Save Labeled Dataset

In [6]:
# Attach the manual scores to the sampled prompts by uuid.
# `validate="one_to_one"` makes pandas raise MergeError if a uuid is repeated
# on either side, instead of silently multiplying rows in the result.
labeled_df = sample_df.merge(
    labels_df,
    on='uuid',
    how='inner',
    validate='one_to_one',
)

# An inner join drops sampled prompts that have no label; surface that
# explicitly rather than letting the row count shrink unnoticed.
n_unlabeled = len(sample_df) - len(labeled_df)
if n_unlabeled:
    print(f"WARNING: {n_unlabeled} sampled prompts have no label and were dropped")

print(f"Final dataset: {len(labeled_df)} rows, {len(labeled_df.columns)} columns")
print(f"Columns: {labeled_df.columns.tolist()}")

# Persist the labeled set as parquet (index omitted).
labeled_df.to_parquet("../data/labeled_prompts.parquet", index=False)
print("\nSaved -> ../data/labeled_prompts.parquet")

Final dataset: 250 rows, 14 columns
Columns: ['uuid', 'prompt', 'time', 'toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'sexual_explicit', 'cluster', 'specificity', 'clarity', 'visual_richness', 'overall']

Saved -> ../data/labeled_prompts.parquet
