In [1]:
from src.data_loader import repo_root_from_cwd, load_catalog
from src.feature_engineering import (
    CoreFeatureConfig, CastFeatureConfig,
    build_core_feature_matrix, build_cast_feature_matrix,
    save_feature_bundle
)

# Resolve the repository root once; the output directory and the catalog
# load below both start from it.
REPO_ROOT = repo_root_from_cwd()

# Directory handed to save_feature_bundle by the feature-building cells.
OUT_DIR = REPO_ROOT.joinpath("data", "processed")

# Load the catalog and show its (rows, columns) shape as the cell output.
catalog = load_catalog(REPO_ROOT)
catalog.shape

(19925, 16)

In [2]:
# Core feature matrix.
# The config caps are passed straight to CoreFeatureConfig; presumably they
# limit how many distinct genres / countries get their own column — confirm
# in src.feature_engineering.
core_cfg = CoreFeatureConfig(top_genres=30, top_countries=25)
X_core, core_meta = build_core_feature_matrix(catalog, core_cfg)

# Validate before persisting: anything that consumes the saved bundle needs one
# feature row per catalog row, so refuse to write a misaligned matrix to disk.
assert X_core.shape[0] == catalog.shape[0], (
    f"X_core has {X_core.shape[0]} rows but catalog has {catalog.shape[0]}"
)
save_feature_bundle(X_core, core_meta, OUT_DIR, name="X_core")

# Cell output: matrix shape plus the first ten column labels as a quick sanity check.
X_core.shape, X_core.columns[:10]

((19925, 83),
 Index(['Genre:Drama', 'Genre:International Movies', 'Genre:Comedy',
        'Genre:Dramas', 'Genre:Comedies', 'Genre:Action', 'Genre:Suspense',
        'Genre:International TV Shows', 'Genre:Kids', 'Genre:Documentary'],
       dtype='str'))

*The cast feature matrix is intentionally excluded from primary clustering because it is sparse and unstable.
It is persisted separately for secondary analysis or future similarity work.*

In [3]:
# Cast feature matrix (persisted separately from the core matrix).
# top_actors is passed straight to CastFeatureConfig; presumably it caps how
# many actors get their own column — confirm in src.feature_engineering.
# NOTE(review): the recorded output has 206 columns for top_actors=200, so the
# builder appears to add a few extra columns — verify which ones.
cast_cfg = CastFeatureConfig(top_actors=200)
X_cast, cast_meta = build_cast_feature_matrix(catalog, cast_cfg)

# Validate before persisting: the saved bundle must keep one row per catalog
# row. .shape[0] is used (not len) because the cell already relies on .shape.
assert X_cast.shape[0] == catalog.shape[0], (
    f"X_cast has {X_cast.shape[0]} rows but catalog has {catalog.shape[0]}"
)
save_feature_bundle(X_cast, cast_meta, OUT_DIR, name="X_cast")

# Cell output: (rows, columns) of the cast matrix.
X_cast.shape

(19925, 206)