# Settings

In [3]:
# If True, the cleanup step reads task_dataset.csv and keeps every JSON file
# whose (genre, title, scene index) matches a row listed there; all other
# samples are deleted. If False, nothing is protected from the cleanup.
dont_remove_task_dataset = True

# Generate dataset

In [3]:
import json

import os
import sys


# Allow loading dialogue middleware packages: append the parent of the current
# working directory to the module search path.
# NOTE(review): presumably this is what makes the `benchmark` import below
# resolve, which ties the notebook to the directory it is started from - confirm.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(parent_dir)


from benchmark import dataset_utils
import datasets as ds
import pandas as pd
from tqdm import tqdm


# Load the review table and reduce it to one row per (genre, title_idx, title).
workspace_root = dataset_utils._find_workspace_root()
reviews_df = pd.read_csv(f"{workspace_root}/benchmark/experimentation/movie_reviews.csv")
to_process = reviews_df[['genre', 'title_idx', 'title']].drop_duplicates()

# Script corpus: rows of the "train" split are addressed by title_idx and the
# script text is read from the column named after the (original-case) genre.
d_genre = ds.load_dataset("aneeshas/imsdb-genre-movie-scripts")
train_dataset = d_genre["train"]

progress = tqdm(to_process.iterrows(), total=len(to_process))
for _row_label, (genre, title_idx, title) in progress:
    # Look the script up with the untouched genre before it is lower-cased.
    script = train_dataset[title_idx][genre]

    title = dataset_utils.normalize_title(title)
    genre = genre.lower()

    # Nothing to export for this (genre, title) pair.
    if not script:
        continue

    # NOTE(review): this is a relative path, so the files land under the
    # current working directory - confirm that matches the directory the
    # cleanup cell scans. The directory is created even if parsing fails below.
    title_path = f"{genre}/{title}"
    os.makedirs(title_path, exist_ok=True)

    try:
        cleaned_script = dataset_utils.normalize_script(script)
        script_sections = dataset_utils.split_into_sections(cleaned_script)

        # Every section produces a scene and a state object; each one is
        # serialized to its own JSON file, numbered by the section index.
        for section_idx, section in enumerate(script_sections):
            scene, state = dataset_utils.section_to_scene(
                dataset_utils.parse_section(section)
            )

            with open(f"{title_path}/{genre}_{title}_scene_{section_idx}.json", "w") as scene_out:
                json.dump(scene.serialize(), scene_out)
            with open(f"{title_path}/{genre}_{title}_state_{section_idx}.json", "w") as state_out:
                json.dump(state.serialize(), state_out)

    except Exception as err:
        # The first failing script stops the whole run, not just this title.
        print(f"Error in script {title_idx}: {err}")
        break


100%|██████████| 674/674 [00:56<00:00, 11.83it/s]


# Cleanup

In [19]:
import os
import pathlib
import sys


# Allow loading dialogue middleware packages: append the parent of the current
# working directory to the module search path.
# NOTE(review): presumably this is what makes the `benchmark` import below
# resolve, which ties the notebook to the directory it is started from - confirm.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(parent_dir)


from benchmark import dataset_utils
import pandas as pd


# Delete generated JSON files below <workspace>/benchmark/dataset, optionally
# keeping the samples that are listed in task_dataset.csv.
dataset_root = pathlib.Path(f"{dataset_utils._find_workspace_root()}/benchmark/dataset")

# (genre, title, scene_idx) keys of the files that must survive the cleanup.
# Genre and title are normalized the same way the loop below sees them on
# disk: lower-cased genre directory, normalize_title()'d title directory.
to_leave = set()
if dont_remove_task_dataset:
    task_dataset_df = pd.read_csv(dataset_root / "task_dataset.csv")
    to_leave_df = task_dataset_df[['genre', 'title', 'scene_idx']].drop_duplicates()

    # Plain tuples instead of `row.title`-style attribute access, which can be
    # shadowed by attributes of the row object itself.
    for genre, title, scene_idx in to_leave_df.itertuples(index=False, name=None):
        to_leave.add((genre.lower(), dataset_utils.normalize_title(title), scene_idx))

# rglob() is already recursive, so no leading "**/" is needed. The matches are
# collected (and ordered) up front so that no file is deleted while the
# directory tree is still being walked.
for path in sorted(dataset_root.rglob("*.json")):
    # Use path components rather than splitting on "/" so this also works with
    # platform-specific separators.
    relative_parts = path.relative_to(dataset_root).parts

    # Only <genre>/<title>/<file>.json is a generated sample. Anything else is
    # left untouched instead of aborting the cleanup half-way through.
    if len(relative_parts) != 3:
        print(f"IGNORING: unexpected location {path}")
        continue
    genre, title, scene_file = relative_parts

    # File names end in "_<scene_idx>.json".
    try:
        scene_idx = int(scene_file.split("_")[-1].split(".json")[0])
    except ValueError:
        print(f"IGNORING: no scene index in file name {path}")
        continue

    if (genre, title, scene_idx) in to_leave:
        print(f"SKIPPING: genre = {genre} title = {title} scene_idx = {scene_idx}")
    else:
        path.unlink()



{('comedy', 'coco', 7), ('comedy', 'adaptation', 57), ('comedy', 'entrapment', 4), ('drama', 'amadeus', 124), ('drama', '127_hours', 86), ('drama', 'bound', 2), ('sci-fi', 'slither', 53), ('sci-fi', '2012', 4), ('action', 'black_rain', 18), ('sci-fi', 'next', 4), ('sci-fi', 'dune', 6), ('action', 'bad_country', 232), ('horror', 'mimic', 151), ('horror', 'jurassic_park', 41), ('comedy', 'fargo', 43), ('action', '2012', 116), ('horror', 'stillness_in_the_water', 102), ('horror', 'it', 1), ('action', 'alien_iii', 123), ('drama', 'adaptation', 19), ('sci-fi', 'the_abyss_-_by_james_cameron', 67), ('comedy', 'cedar_rapids', 116), ('sci-fi', 'arac_attack', 157), ('action', 'the_crow', 874), ('sci-fi', 'interstellar', 33)}
SKIPPING: genre = action title = black_rain scene_idx = 18
SKIPPING: genre = action title = black_rain scene_idx = 18
SKIPPING: genre = action title = bad_country scene_idx = 232
SKIPPING: genre = action title = bad_country scene_idx = 232
SKIPPING: genre = action title = th