# VidProM Prompt Feature Engineering
Extract numerical features from raw prompts for ML training.

In [1]:
import pandas as pd
import numpy as np
import re

SEED = 42
DATA_PATH = "../data/labeled_prompts.parquet"
OUTPUT_PATH = "../data/features.parquet"

## Load Data

In [2]:
df = pd.read_parquet(DATA_PATH)
print(f"Dataset: {df.shape[0]} rows, {df.shape[1]} columns")
df.head(3)

Dataset: 250 rows, 14 columns


Unnamed: 0,uuid,prompt,time,toxicity,obscene,identity_attack,insult,threat,sexual_explicit,cluster,specificity,clarity,visual_richness,overall
0,0d73088c-5152-53fb-b971-7383cfdd748b,video theme Message: Pigeon00b (Font: RETRO),Mon Oct 23 05:09:39 2023,0.00061,0.04596,0.00756,0.04166,0.00218,0.0016,0,1,2,1,1
1,cc3eb2bc-12fc-5201-ae35-546d7b6400e7,girl fight with eval dragon Message: 1 Attach...,Mon Nov 6 22:07:33 2023,0.00385,0.01953,0.00244,0.0079,0.00135,0.00275,0,2,3,1,2
2,1979db12-bf44-5a7b-a9cc-952f0d1c71ec,a man smiling Message: 1 Attachment,Thu Oct 5 19:29:41 2023,0.00318,0.00073,0.00092,0.00132,0.00055,0.00035,0,2,2,1,2


## Basic Text Features

In [3]:
def extract_basic_features(text):
    text = str(text)
    words = text.split()
    return {
        'word_count':       len(words),
        'char_count':       len(text),
        'avg_word_length':  np.mean([len(w) for w in words]) if words else 0,
        'comma_count':      text.count(','),
        'has_numbers':      int(bool(re.search(r'\d', text))),
    }

basic = df['prompt'].apply(extract_basic_features).apply(pd.Series)
print(basic.describe().round(2))

       word_count  char_count  avg_word_length  comma_count  has_numbers
count      250.00      250.00           250.00       250.00       250.00
mean        17.80      122.40             5.85         2.13         0.41
std         22.05      204.29             7.36         7.07         0.49
min          1.00        9.00             2.67         0.00         0.00
25%          7.00       43.25             4.36         0.00         0.00
50%         12.00       69.00             5.00         0.00         0.00
75%         21.00      127.00             5.60         2.00         1.00
max        208.00     2425.00           113.00        92.00         1.00


## Domain Features
Video generation specific keywords: style, camera, lighting, color.

In [4]:
STYLE_KEYWORDS    = ['cinematic', 'anime', 'realistic', 'photorealistic',
                     '8k', '4k', 'hd', 'uhd', 'ultra', 'hyperrealistic']
CAMERA_KEYWORDS   = ['close-up', 'closeup', 'aerial', 'slow motion',
                     'slowmo', 'pan', 'zoom', 'tracking shot', 'wide shot',
                     'bird', 'eye level', 'handheld']
LIGHTING_KEYWORDS = ['golden hour', 'dramatic lighting', 'neon', 'sunlight',
                     'moonlight', 'shadow', 'backlit', 'rim light', 'fog', 'haze']
COLOR_KEYWORDS    = ['red', 'blue', 'green', 'yellow', 'purple', 'orange',
                     'black', 'white', 'dark', 'bright', 'vibrant', 'colorful',
                     'monochrome', 'pastel']

def extract_domain_features(text):
    t = str(text).lower()
    return {
        'has_style':    int(any(k in t for k in STYLE_KEYWORDS)),
        'has_camera':   int(any(k in t for k in CAMERA_KEYWORDS)),
        'has_lighting': int(any(k in t for k in LIGHTING_KEYWORDS)),
        'has_color':    int(any(k in t for k in COLOR_KEYWORDS)),
        'domain_score': sum([
            any(k in t for k in STYLE_KEYWORDS),
            any(k in t for k in CAMERA_KEYWORDS),
            any(k in t for k in LIGHTING_KEYWORDS),
            any(k in t for k in COLOR_KEYWORDS),
        ])
    }

domain =df['prompt'].apply(extract_domain_features).apply(pd.Series)
print(domain.describe().round(2))
print(f"\nPrompts with no domain keywords: {(domain['domain_score'] == 0).sum()}")


       has_style  has_camera  has_lighting  has_color  domain_score
count     250.00      250.00        250.00     250.00        250.00
mean        0.18        0.11          0.05       0.22          0.55
std         0.38        0.32          0.21       0.41          0.79
min         0.00        0.00          0.00       0.00          0.00
25%         0.00        0.00          0.00       0.00          0.00
50%         0.00        0.00          0.00       0.00          0.00
75%         0.00        0.00          0.00       0.00          1.00
max         1.00        1.00          1.00       1.00          4.00

Prompts with no domain keywords: 148


## NSFW Features
Pre-computed toxicity scores from VidProM dataset.

In [5]:
NSFW_COLS = ['toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'sexual_explicit']

nsfw = df[NSFW_COLS].copy()
nsfw['nsfw_max'] = nsfw.max(axis=1)
nsfw['nsfw_sum'] = nsfw.sum(axis=1)

print(nsfw.describe().round(4))

       toxicity   obscene  identity_attack    insult    threat  \
count  250.0000  250.0000         250.0000  250.0000  250.0000   
mean     0.0494    0.0057           0.0063    0.0199    0.0056   
std      0.1335    0.0282           0.0369    0.0753    0.0337   
min      0.0002    0.0001           0.0001    0.0002    0.0000   
25%      0.0008    0.0004           0.0002    0.0008    0.0001   
50%      0.0031    0.0012           0.0005    0.0023    0.0002   
75%      0.0138    0.0039           0.0016    0.0070    0.0007   
max      0.9597    0.4346           0.5336    0.8134    0.3678   

       sexual_explicit  nsfw_max  nsfw_sum  
count         250.0000  250.0000  250.0000  
mean            0.0059    0.0514    0.1442  
std             0.0591    0.1336    0.3923  
min             0.0000    0.0003    0.0012  
25%             0.0001    0.0016    0.0052  
50%             0.0002    0.0054    0.0156  
75%             0.0005    0.0201    0.0550  
max             0.8787    0.9597    3.3792  


## Sentence Embeddings
  Pre-computed 50-dim PCA embeddings (all-MiniLM-L6-v2 -> PCA, 68% variance explained).

In [7]:
from sklearn.decomposition import PCA

raw = np.load("../data/embeddings_250.npy")
emb_df = pd.DataFrame(raw, columns=[f'emb_{i}' for i in range(50)])

print(f"Embeddings shape: {emb_df.shape}")
emb_df.head(3)

Embeddings shape: (250, 50)


Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_40,emb_41,emb_42,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49
0,-0.360554,-0.080609,-0.072599,0.090866,-0.13717,-0.052394,-0.097058,0.217143,-0.152698,-0.049818,...,0.024791,-0.008551,-0.092096,-0.045415,0.024524,-0.079473,0.020756,-0.091413,-0.003983,-0.076077
1,-0.19386,-0.340147,0.174173,-0.27357,-0.03428,-0.126836,-0.047637,0.030635,-0.019172,-0.176503,...,-0.096169,0.05565,-0.044716,0.096466,0.019902,-0.061606,0.052919,0.004664,0.03206,-0.023467
2,-0.353302,-0.359803,0.069037,0.090783,-0.101325,-0.274569,-0.264642,0.032737,0.197793,-0.058032,...,-0.057126,0.022444,-0.00278,0.071672,-0.022335,-0.084474,-0.026324,0.052852,0.003263,0.02358


## Combine & Save Features

In [8]:
features_df = pd.concat([basic, domain, nsfw, emb_df], axis=1)
features_df.index = df.index

print(f"Feature matrix: {features_df.shape}")
print(f"Feature columns: {features_df.columns.tolist()[:10]} ... ({len(features_df.columns)} total)")

features_df.to_parquet(OUTPUT_PATH, index=False)
print(f"\nSaved features to: {OUTPUT_PATH}")

Feature matrix: (250, 68)
Feature columns: ['word_count', 'char_count', 'avg_word_length', 'comma_count', 'has_numbers', 'has_style', 'has_camera', 'has_lighting', 'has_color', 'domain_score'] ... (68 total)

Saved features to: ../data/features.parquet
