In [None]:
import polars as pl
import dotenv
import os
from libraries.client_stashapp import get_stashapp_client, StashAppClient
from libraries.StashDbClient import StashDbClient

# Load settings from a .env file into the environment before the clients are constructed.
dotenv.load_dotenv()

# Two client handles are used throughout the notebook: the project wrapper class and the
# object returned by get_stashapp_client().
# NOTE(review): presumably both read their connection settings from the environment — confirm.
stash_client = StashAppClient()
stash_raw_client = get_stashapp_client()


# Merging scenes with same StashDB ID

In [2]:
# Introduce functions
def compare_and_merge_scenes(scenes):
    """Compare several scene dicts and build one merged metadata dict.

    A human-readable comparison report is printed while merging.

    Merge rules:
      * Counter-like fields (``o_counter``, ``play_count``, ``play_duration``,
        ``rating100``) take the maximum across scenes, counting ``None`` as 0;
        the result is ``None`` only when every scene has ``None``.
      * Other scalar fields take the first non-``None`` value in ``scenes`` order.
      * Tags and performers are unioned by ``id`` and returned as ``tag_ids`` /
        ``performer_ids``.
      * ``stash_ids`` are unioned by ``(endpoint, stash_id)``; ``scene_markers``
        are concatenated and sorted by ``seconds``; ``o_history``,
        ``play_history`` and ``urls`` are de-duplicated and sorted.
      * Every other key of the first scene is copied unchanged, except
        ``files``, ``paths``, ``sceneStreams``, ``tags`` and ``performers``.

    List-valued keys that are missing or explicitly ``None`` are treated as empty.

    Args:
        scenes: Non-empty iterable of scene dicts.

    Returns:
        dict: The merged scene values.

    Raises:
        ValueError: If ``scenes`` is empty.
    """
    scenes = list(scenes)
    if not scenes:
        raise ValueError("compare_and_merge_scenes requires at least one scene")

    # Helper function to format value for display
    def format_value(val):
        if isinstance(val, list):
            return f"[{len(val)} items]" if len(val) > 3 else str(val)
        return str(val) if val is not None else "None"

    # Helper function to get tag names for display
    def get_tag_names(tags):
        return sorted([t['name'] for t in tags])

    # Helper function to get performer names for display
    def get_performer_names(performers):
        return sorted([p.get('name', f"ID: {p['id']}") for p in performers])

    # Helper function to read a list field; a key that is present but None counts as empty
    def get_items(scene, field):
        return scene.get(field) or []

    # Fields to compare (excluding technical details and paths); each field appears once
    fields_to_compare = [
        'title', 'code', 'details', 'director', 'date',
        'rating100', 'organized', 'o_counter', 'studio_id',
        'gallery_ids', 'play_duration', 'play_count'
    ]
    # Fields merged by taking the largest value rather than the first one
    numeric_fields = ('o_counter', 'play_count', 'play_duration', 'rating100')

    print("=== Scene Comparison ===\n")

    # Compare basic fields
    merged = {}
    for field in fields_to_compare:
        values = [scene.get(field) for scene in scenes]

        if field in numeric_fields:
            # None only survives when no scene carries a value at all
            if all(v is None for v in values):
                merged[field] = None
            else:
                merged[field] = max(v or 0 for v in values)
        else:
            # Choose the first non-None value
            merged[field] = next((v for v in values if v is not None), None)

        # Only show detailed comparison if values differ
        if len(set(str(v) for v in values)) > 1:
            print(f"{field}:")
            for i, val in enumerate(values, 1):
                print(f"  Scene {i}: {format_value(val)}")
            print(f"  Merged: {format_value(merged[field])}\n")
        else:
            print(f"{field}: {format_value(values[0])}")

    print("\n=== Special Fields ===\n")

    # Handle tags
    all_tags = []
    print("tags:")
    for i, scene in enumerate(scenes, 1):
        tags = get_items(scene, 'tags')
        all_tags.extend(tags)
        print(f"  Scene {i}: {get_tag_names(tags)}")

    # Keying by id removes duplicates while keeping first-seen order
    merged_tags = list({t['id']: t for t in all_tags}.values())
    print(f"  Merged: {get_tag_names(merged_tags)}\n")
    merged['tag_ids'] = [t['id'] for t in merged_tags]

    # Handle performers
    all_performers = []
    print("performers:")
    for i, scene in enumerate(scenes, 1):
        performers = get_items(scene, 'performers')
        all_performers.extend(performers)
        print(f"  Scene {i}: {get_performer_names(performers)}")

    merged_performers = list({p['id']: p for p in all_performers}.values())
    print(f"  Merged: {get_performer_names(merged_performers)}\n")
    merged['performer_ids'] = [p['id'] for p in merged_performers]

    # Handle other special fields
    special_fields = {
        'stash_ids': lambda scenes: list({(s['endpoint'], s['stash_id']): s
                                        for scene in scenes
                                        for s in get_items(scene, 'stash_ids')}.values()),
        'scene_markers': lambda scenes: sorted([m for scene in scenes
                                              for m in get_items(scene, 'scene_markers')],
                                             key=lambda m: m['seconds']),
        'o_history': lambda scenes: sorted(set(h for scene in scenes
                                               for h in get_items(scene, 'o_history'))),
        'play_history': lambda scenes: sorted(set(h for scene in scenes
                                                  for h in get_items(scene, 'play_history'))),
        'urls': lambda scenes: sorted(set(u for scene in scenes
                                          for u in get_items(scene, 'urls')))
    }

    for field, merge_func in special_fields.items():
        values = [get_items(scene, field) for scene in scenes]
        merged_items = merge_func(scenes)

        # Show the per-scene breakdown only when the item counts disagree
        if any(len(v) != len(values[0]) for v in values) or len(merged_items) != len(values[0]):
            print(f"{field}:")
            for i, items in enumerate(values, 1):
                print(f"  Scene {i}: {format_value(items) if len(items) <= 3 else f'{len(items)} items'}")
            print(f"  Merged: {format_value(merged_items) if len(merged_items) <= 3 else f'{len(merged_items)} items'}\n")
        else:
            print(f"{field}: {len(values[0])} items")

        merged[field] = merged_items

    # Keep other fields from first scene that we haven't explicitly handled
    for key in scenes[0]:
        if key not in merged and key not in ['files', 'paths', 'sceneStreams', 'tags', 'performers']:
            merged[key] = scenes[0][key]

    return merged

In [16]:
# Get basic info for all scenes (only the fields needed to spot shared StashDB IDs)
all_scenes_basic_info = stash_raw_client.find_scenes({}, fragment="id title studio { id name } stash_ids { endpoint stash_id }")

# Map the data with explicit type handling
all_scenes_basic_info_mapped = [
    {
        "stashapp_id": str(scene['id']),  # Ensure ID is string
        "stashapp_title": str(scene['title']) if scene['title'] else "",  # Handle None titles
        "stashapp_studio_id": str(scene['studio']['id']) if scene['studio'] else None,  # Ensure studio ID is string
        "stashapp_studio_name": str(scene['studio']['name']) if scene['studio'] else None,
        # First stash_id registered against the StashDB endpoint, or None when the scene has none
        "stashapp_stashdb_id": next((str(stash_id['stash_id']) for stash_id in scene['stash_ids']
                           if stash_id['endpoint'] == 'https://stashdb.org/graphql'), None)
    }
    for scene in all_scenes_basic_info
]

# Every mapped value is a string or None, so the schema is stated outright instead of being
# inferred from a fixed number of leading rows (which could all be None for a column).
all_scenes_basic_info_mapped_df = pl.DataFrame(
    all_scenes_basic_info_mapped,
    schema={
        "stashapp_id": pl.Utf8,
        "stashapp_title": pl.Utf8,
        "stashapp_studio_id": pl.Utf8,
        "stashapp_studio_name": pl.Utf8,
        "stashapp_stashdb_id": pl.Utf8,
    },
)

# Scenes whose StashDB ID is shared with at least one other scene.
# Scenes without a StashDB ID are excluded explicitly: a missing ID is not a duplicate of another missing ID.
scenes_with_dupe_stashdb_ids_basic_info = (
    all_scenes_basic_info_mapped_df
    .filter(
        pl.col('stashapp_stashdb_id').is_not_null()
        & pl.col('stashapp_stashdb_id').is_duplicated()
    )
    .sort('stashapp_stashdb_id')
)

In [None]:
# Studios owning at least one scene that shares a StashDB ID with another scene.
# Scenes without a studio are skipped, and first-seen order is kept so the pick below is repeatable.
studio_ids_with_dupes = (
    scenes_with_dupe_stashdb_ids_basic_info
    .get_column('stashapp_studio_id')
    .drop_nulls()
    .unique(maintain_order=True)
    .to_list()
)
print(f"Studios with duplicate scenes: {studio_ids_with_dupes}")

if not studio_ids_with_dupes:
    raise ValueError("No studio has scenes with duplicate StashDB IDs")

# Only the first studio is processed per run of this cell.
studio_id = studio_ids_with_dupes[0]
print(f"Studio ID: {studio_id}")

all_scenes = stash_client.find_scenes_by_studio(studio_id)
print(f"Number of scenes: {len(all_scenes)}")

# Same duplicate rule as for the basic info: the StashDB ID must be present and occur more than once.
scenes_with_dupe_stashdb_ids = (
    all_scenes
    .filter(
        pl.col('stashapp_stashdb_id').is_not_null()
        & pl.col('stashapp_stashdb_id').is_duplicated()
    )
    .sort('stashapp_stashdb_id')
)
print(f"Number of scenes with duplicate StashDB IDs: {len(scenes_with_dupe_stashdb_ids)}")

In [None]:
# One row per StashDB ID, carrying the IDs and titles of every scene that shares it.
# maintain_order keeps the groups in the (sorted) order of the input frame.
grouped_scenes = scenes_with_dupe_stashdb_ids.group_by('stashapp_stashdb_id', maintain_order=True).agg([
    pl.col('stashapp_id').alias('scene_ids'),
    pl.col('stashapp_title').alias('titles')
])

grouped_scenes

In [None]:
# Merge scenes with same StashDB ID

# The mutation text is identical for every group, so it is defined once outside the loop.
scene_merge_query = """
mutation SceneMerge($merge_input: SceneMergeInput!) {
    sceneMerge(input: $merge_input) {
        id
    }
}
"""

for group in grouped_scenes.iter_rows(named=True):
    stashdb_id = group['stashapp_stashdb_id']
    scene_ids = group['scene_ids']
    titles = group['titles']

    print(f"\nProcessing group with StashDB ID: {stashdb_id}")
    print(f"Scene titles: {titles[0]}")  # All titles should be the same
    print(f"Scene IDs: {scene_ids}")

    # Get full scene data for each ID in the group.
    # NOTE(review): assumes find_scene returns a falsy value for a scene that no longer exists — confirm.
    fetched_scenes = [stash_raw_client.find_scene(str(scene_id)) for scene_id in scene_ids]
    scenes = [scene for scene in fetched_scenes if scene]

    if not scenes:
        raise ValueError(f"No scenes found for IDs: {scene_ids}")

    if len(scenes) < 2:
        raise ValueError(f"Only one scene found for IDs: {scene_ids}")

    # Sort scenes by ID to determine source and destination: the lowest ID survives the merge
    sorted_scene_ids = [str(scene_id) for scene_id in sorted(int(scene['id']) for scene in scenes)]
    destination_scene_id = sorted_scene_ids[0]
    source_scene_ids = sorted_scene_ids[1:]

    print(f"Merging scenes {source_scene_ids} into {destination_scene_id}")

    # Compare and merge the scenes
    merged_scene = compare_and_merge_scenes(scenes)

    # Prepare the merge input
    scene_merge_input = {
        "source": source_scene_ids,
        "destination": destination_scene_id,
        "values": {
            "id": destination_scene_id,
            "title": merged_scene['title'],
            "code": merged_scene['code'],
            "details": merged_scene['details'],
            "director": merged_scene['director'],
            "urls": merged_scene['urls'],
            "date": merged_scene['date'],
            "rating100": merged_scene['rating100'],
            "o_counter": merged_scene['o_counter'],
            "organized": merged_scene['organized'],
            # A missing or None gallery list is sent as an empty list
            "gallery_ids": merged_scene.get('gallery_ids') or [],
            "performer_ids": merged_scene['performer_ids'],
            "tag_ids": merged_scene['tag_ids'],
            "stash_ids": merged_scene['stash_ids'],
            "play_duration": merged_scene['play_duration'],
            "play_count": merged_scene['play_count'],
        },
        "play_history": True,
        "o_history": True,
    }

    # Execute the merge; a failure is reported and the loop moves on to the next group
    try:
        result = stash_raw_client.call_GQL(scene_merge_query, {"merge_input": scene_merge_input})
        print(f"Successfully merged scenes: {result}")
    except Exception as e:
        print(f"Error merging scenes: {e}")

    print("-" * 80)

# Finding scenes with multiple files


In [25]:
# Look up, by name, the tags used by the duplicate-handling cells.
# Later cells index the results with ['id'] / ['name'].
# NOTE(review): presumably find_tag returns None when a tag does not exist, which would make
# those lookups fail — confirm the tags exist before running the cells below.
oshash_and_duration_match_tag = stash_raw_client.find_tag({ "name": "Duplicate: OSHASH And Duration Match" })
duration_match_tag = stash_raw_client.find_tag({ "name": "Duplicate: Duration Match" })
scenes_with_multiple_versions = stash_raw_client.find_tag({ "name": "Scene: Multiple Versions" })
duration_mismatch_tag = stash_raw_client.find_tag({ "name": "Duplicate: Duration Mismatch" })

In [26]:
# Scenes that have more than one file attached, skipping scenes tagged "Scene: Multiple Versions".
# The fragment requests the per-file duration, path, resolution, size and fingerprints.
scenes_with_dupes = stash_raw_client.find_scenes({ 
  "file_count": {
    "modifier": "GREATER_THAN",
    "value": 1
  },
  "tags": {
    "value": [],
    "modifier": "INCLUDES",
    "excludes": [scenes_with_multiple_versions['id']]
  }
}, fragment="id title date studio { name } files { id duration path width height size fingerprints { type value } }")

In [None]:
# Flatten scenes_with_dupes into one record per (scene, file) pair
file_records = []

for scene in scenes_with_dupes:
    # Scenes without a studio get a None studio name
    if scene['studio']:
        studio_name = scene['studio']['name']
    else:
        studio_name = None

    for file_index, file in enumerate(scene['files']):
        fingerprints = file['fingerprints']
        file_records.append(dict(
            scene_id=scene['id'],
            title=scene['title'],
            date=scene['date'],
            studio_name=studio_name,
            file_id=file['id'],
            file_path=file['path'],
            resolution_width=file['width'],
            resolution_height=file['height'],
            size=file['size'],
            duration=file['duration'],
            # First fingerprint of each type, or None when the file has none of that type
            oshash=next((fp['value'] for fp in fingerprints if fp['type'] == 'oshash'), None),
            phash=next((fp['value'] for fp in fingerprints if fp['type'] == 'phash'), None),
            # The first entry of a scene's files list is treated as its primary file
            is_primary=(file_index == 0),
        ))

# Build the Polars frame and show it as the cell output
scenes_with_multiple_files_df = pl.DataFrame(file_records)
scenes_with_multiple_files_df

# Deleting files where OSHASH is identical to another file in the scene


In [None]:
import os

# Keep only the files whose oshash is shared with at least one other file.
# Files without an oshash are excluded explicitly: two missing fingerprints are not evidence
# of identical content, and this frame feeds the deletion plan in the next cell.
duplicate_files_df = (
    scenes_with_multiple_files_df
    .filter(pl.col('oshash').is_not_null() & pl.col('oshash').is_duplicated())
    .sort(['oshash', 'file_id'])
    # Record whether each path is reachable from the machine running the notebook
    .with_columns(
        pl.col('file_path')
        .map_elements(os.path.exists, return_dtype=pl.Boolean)
        .alias('file_path_exists')
    )
)

print("\nFiles with duplicate oshash values:")
print(duplicate_files_df)

In [None]:
# Group by scene_id and oshash to find duplicates within each scene
grouped_files = duplicate_files_df.group_by(['scene_id', 'oshash'], maintain_order=True).agg([
    pl.col('file_id').min().alias('keep_file_id'),  # Informational only; min() is lexicographic on string IDs
    pl.col('file_id').alias('all_file_ids'),        # All file IDs
    pl.col('file_path').alias('all_file_paths'),    # All file paths
    pl.col('size').alias('all_file_sizes'),         # File sizes
    pl.col('file_path_exists').alias('all_file_exists'),  # Path existence check
    pl.col('is_primary').alias('all_is_primary')    # Primary file flags
])

# Lists to store kept and deleted file information
kept_files = []
files_to_delete = []

# Print summary and collect file information
print("\nFiles to be deleted:")
for row in grouped_files.iter_rows(named=True):
    file_ids = row['all_file_ids']
    file_paths = row['all_file_paths']
    file_sizes = row['all_file_sizes']
    file_exists = row['all_file_exists']
    is_primary = row['all_is_primary']

    # Find primary file index if it exists
    primary_indices = [i for i, p in enumerate(is_primary) if p]
    if primary_indices:
        # Keep the primary file
        keep_index = primary_indices[0]
    else:
        # If no primary file, keep the one with the numerically lowest file_id.
        # IDs are compared as integers so that "9" sorts before "10".
        # NOTE(review): assumes file IDs are numeric strings — confirm.
        keep_index = min(range(len(file_ids)), key=lambda i: int(file_ids[i]))
    keep_file_id = file_ids[keep_index]

    kept_files.append({
        'scene_id': row['scene_id'],
        'file_id': keep_file_id,
        'file_path': file_paths[keep_index],
        'size': file_sizes[keep_index],
        'is_primary': is_primary[keep_index]
    })

    # Get indices of files to delete (all except kept file, never delete primary files)
    delete_indices = [i for i, (file_id, p) in enumerate(zip(file_ids, is_primary))
                     if file_id != keep_file_id and not p]

    if delete_indices:  # Only show if there are files to delete
        primary_status = " (Primary)" if is_primary[keep_index] else ""
        # Existence flags come from the file_path_exists column computed with duplicate_files_df
        print(f"\nScene {row['scene_id']} - Keeping{primary_status}: {file_paths[keep_index]} (ID: {keep_file_id}, Size: {file_sizes[keep_index]:,} bytes) Exists: {file_exists[keep_index]}")
        for idx in delete_indices:
            print(f"  Will delete: {file_paths[idx]} (ID: {file_ids[idx]}, Size: {file_sizes[idx]:,} bytes) Exists: {file_exists[idx]}")
            files_to_delete.append({
                'scene_id': row['scene_id'],
                'file_id': file_ids[idx],
                'file_path': file_paths[idx],
                'size': file_sizes[idx],
                'is_primary': is_primary[idx]
            })

# Calculate total space that would be freed
total_space = sum(file['size'] for file in files_to_delete)
print(f"\nTotal space that would be freed: {total_space:,} bytes ({total_space/1024/1024/1024:.2f} GB)")
print(f"Number of files to delete: {len(files_to_delete)}")

# Create DataFrames for kept and deleted files
kept_files_df = pl.DataFrame(kept_files)
delete_files_df = pl.DataFrame(files_to_delete)

In [30]:
# Ask Stash to destroy every file listed in delete_files_df, one call per file.
# NOTE(review): presumably this removes the files permanently — review delete_files_df before running.
# NOTE(review): a bare ID is passed; confirm destroy_files accepts a single ID rather than a list.
for row in delete_files_df.iter_rows(named=True):
    stash_raw_client.destroy_files(row['file_id'])

In [None]:
# Scenes that still carry the duration-mismatch tag although only one file is attached,
# so the tag can no longer describe a mismatch between files.
scenes_with_duration_mismatch_tag_but_only_one_file = stash_raw_client.find_scenes({
    "tags": { "value": [duration_mismatch_tag["id"]], "modifier": "INCLUDES" },
    "file_count": { "modifier": "EQUALS", "value": 1 }
}, fragment="id title tags { id name }")
# IDs only, for the tag update in the next cell
scenes_with_duration_mismatch_tag_but_only_one_file_ids = [scene['id'] for scene in scenes_with_duration_mismatch_tag_but_only_one_file]
scenes_with_duration_mismatch_tag_but_only_one_file

In [None]:
# Update tags on the single-file scenes found above, passing the duration-mismatch tag name
# in the third argument.
# NOTE(review): presumably the second argument lists tag names to add and the third lists tag
# names to remove — confirm against update_tags_for_scenes.
stash_client.update_tags_for_scenes(
    scenes_with_duration_mismatch_tag_but_only_one_file_ids,
    [],
    [duration_mismatch_tag["name"]]
)

# Finding scenes where durations do not match

In [None]:
# Create a list to store all file records (one per scene/file pair, durations rounded to whole seconds)
file_records = []

for scene in scenes_with_dupes:
    for i, file in enumerate(scene['files']):
        # Extract fingerprints
        oshash = next((fp['value'] for fp in file['fingerprints'] if fp['type'] == 'oshash'), None)
        phash = next((fp['value'] for fp in file['fingerprints'] if fp['type'] == 'phash'), None)

        # Create a record for each file
        record = {
            'scene_id': scene['id'],
            'title': scene['title'],
            'date': scene['date'],
            'studio_name': scene['studio']['name'] if scene['studio'] else None,
            'file_id': file['id'],
            'file_path': file['path'],
            'resolution_width': file['width'],
            'resolution_height': file['height'],
            'size': file['size'],
            # Round duration to nearest second; a missing duration stays None instead of crashing round()
            'duration': round(file['duration']) if file['duration'] is not None else None,
            'oshash': oshash,
            'phash': phash,
            'is_primary': i == 0  # True if this is the first file in the scene's files list
        }
        file_records.append(record)

# Create Polars DataFrame
scenes_with_multiple_files_df = pl.DataFrame(file_records)

# Group by scene_id and find scenes where durations don't match
duration_mismatches = scenes_with_multiple_files_df.group_by('scene_id').agg([
    pl.col('title').first().alias('title'),
    pl.col('duration').n_unique().alias('unique_durations'),
    pl.col('duration').alias('all_durations'),
    pl.col('file_id').alias('all_file_ids'),
    pl.col('file_path').alias('all_file_paths'),
    pl.col('size').alias('all_file_sizes'),
    pl.col('is_primary').alias('all_is_primary')
]).filter(
    pl.col('unique_durations') > 1  # Only keep scenes with different durations
).sort('scene_id')

# Print summary of mismatched files
print("\nScenes with duration mismatches:")
for row in duration_mismatches.iter_rows(named=True):
    print(f"\nScene {row['scene_id']} - {row['title']}")

    # The aggregated lists are parallel: position k in each list describes the same file
    for duration, file_id, file_path, size, is_primary in zip(
        row['all_durations'],
        row['all_file_ids'],
        row['all_file_paths'],
        row['all_file_sizes'],
        row['all_is_primary']
    ):
        primary_status = " (Primary)" if is_primary else ""
        print(f"  File{primary_status}: {file_path}")
        print(f"    Duration: {duration}s, ID: {file_id}, Size: {size:,} bytes")

# Print summary statistics
print(f"\nTotal scenes with duration mismatches: {len(duration_mismatches)}")

In [None]:
# Add the duration-mismatch tag to every mismatching scene that does not carry it yet
for row in duration_mismatches.iter_rows(named=True):
    # Re-read the scene so the tag list sent back reflects its current tags
    refreshed_scene = stash_raw_client.find_scene(row['scene_id'])
    print(f"Scene {row['scene_id']} - {row['title']}")

    existing_tag_ids = [tag['id'] for tag in refreshed_scene['tags']]
    if duration_mismatch_tag['id'] in existing_tag_ids:
        continue  # Already tagged; nothing to update

    new_tag_ids = existing_tag_ids + [duration_mismatch_tag['id']]
    stash_raw_client.update_scene({ "id": row['scene_id'], "tag_ids": new_tag_ids })
    print(f"Added duration mismatch tag to scene {row['scene_id']}")


In [None]:
# Show every scene that currently carries the duration-mismatch tag (the cell output is the check).
stash_raw_client.find_scenes({ "tags": { "value": [duration_mismatch_tag["id"]], "modifier": "INCLUDES" }}, fragment="id title tags { id name }")

# Finding PHASH mismatches

In [55]:
def hamming_distance_hex(hash1: str, hash2: str) -> int:
    """Calculate the Hamming distance, in bits, between two hex strings.

    Both strings are parsed as base-16 integers and the set bits of their XOR are
    counted, so hashes of any width are compared bit-for-bit from the least
    significant end (leading zeros are irrelevant).

    Args:
        hash1: First hash as a hexadecimal string.
        hash2: Second hash as a hexadecimal string.

    Returns:
        Number of differing bits. Returns 0 when either value cannot be parsed as
        hex (for example ``None`` or a non-hex string), so unparseable input is
        reported as "no difference".
    """
    try:
        differing_bits = int(hash1, 16) ^ int(hash2, 16)
    except (ValueError, TypeError):
        return 0
    # Each 1 bit in the XOR marks a position where the two hashes disagree
    return bin(differing_bits).count('1')

In [60]:
# Re-run the multi-file scene query (same filter and fragment as the earlier scenes_with_dupes
# cell), replacing the previous contents of scenes_with_dupes.
scenes_with_dupes = stash_raw_client.find_scenes({ 
  "file_count": {
    "modifier": "GREATER_THAN",
    "value": 1
  },
  "tags": {
    "value": [],
    "modifier": "INCLUDES",
    "excludes": [scenes_with_multiple_versions['id']]
  }
}, fragment="id title date studio { name } files { id duration path width height size fingerprints { type value } }")
# Create Polars DataFrame with strict=False to handle mixed numeric types
scenes_with_dupes_df = pl.DataFrame(scenes_with_dupes, strict=False)

In [None]:
# Create a list to store all file records (numeric fields coerced to float for a uniform schema)
file_records = []

for scene in scenes_with_dupes:
    for i, file in enumerate(scene['files']):
        # Extract fingerprints
        oshash = next((fp['value'] for fp in file['fingerprints'] if fp['type'] == 'oshash'), None)
        phash = next((fp['value'] for fp in file['fingerprints'] if fp['type'] == 'phash'), None)

        # Create a record for each file
        record = {
            'scene_id': scene['id'],
            'title': scene['title'],
            'date': scene['date'],
            'studio_name': scene['studio']['name'] if scene['studio'] else None,
            'file_id': file['id'],
            'file_path': file['path'],
            'resolution_width': float(file['width']),
            'resolution_height': float(file['height']),
            'size': float(file['size']),
            'duration': float(file['duration']),
            'oshash': oshash,
            'phash': phash,
            'is_primary': i == 0
        }
        file_records.append(record)

# Create DataFrame from the flattened records
files_df = pl.DataFrame(file_records, strict=False)

# Find scenes with matching durations but differing phashes.
# Each entry of mismatched_scene_groups is the list of file dicts of one such scene.
mismatched_scene_groups = []
# An empty record list yields a frame without columns, so grouping is skipped in that case
scene_groups = files_df.group_by('scene_id', maintain_order=True) if file_records else []
for _, group in scene_groups:
    # Get all files for this scene
    scene_files = group.to_dicts()

    # Skip if durations don't match
    if len({file['duration'] for file in scene_files}) > 1:
        continue

    # Compare every unordered pair of files; one pair more than 8 bits apart flags the scene
    has_mismatch = any(
        hamming_distance_hex(file1['phash'], file2['phash']) > 8
        for index, file1 in enumerate(scene_files)
        for file2 in scene_files[index + 1:]
    )
    if has_mismatch:
        mismatched_scene_groups.append(scene_files)

# Report and store scenes in scene_id order
mismatched_scene_groups.sort(key=lambda scene_files: scene_files[0]['scene_id'])
phash_mismatches = [file for scene_files in mismatched_scene_groups for file in scene_files]

# Create results DataFrame; with no mismatches, fall back to an empty frame with files_df's columns
if phash_mismatches:
    results_df = pl.DataFrame(phash_mismatches, strict=False)
else:
    results_df = files_df.clear()

# Print results
print("\nScenes with matching durations but differing phash values (>8 bits different):")
for scene_files in mismatched_scene_groups:
    # The scene id is read from the records themselves, not from a group key
    print(f"\nScene {scene_files[0]['scene_id']} - {scene_files[0]['title']}")

    # Looked up once per scene and reused for every non-primary file
    primary_file = next((f for f in scene_files if f['is_primary']), None)

    for file in scene_files:
        primary_status = " (Primary)" if file['is_primary'] else ""
        print(f"  File{primary_status}: {file['file_path']}")
        print(f"    Duration: {file['duration']}s")
        print(f"    pHash: {file['phash']}")

        if not file['is_primary'] and primary_file is not None:
            hamming_dist = hamming_distance_hex(primary_file['phash'], file['phash'])
            print(f"    Hamming distance from primary: {hamming_dist} bits")

print(f"\nTotal scenes with phash mismatches: {len(mismatched_scene_groups)}")

# Making higher quality versions the primary file

In [None]:
# Multi-file scene query again, this time also requesting each file's container format and
# video/audio codec; replaces the previous contents of scenes_with_dupes and scenes_with_dupes_df.
scenes_with_dupes = stash_raw_client.find_scenes({ 
  "file_count": {
    "modifier": "GREATER_THAN",
    "value": 1
  },
  "tags": {
    "value": [],
    "modifier": "INCLUDES",
    "excludes": [scenes_with_multiple_versions['id']]
  }
}, fragment="id title date studio { name } files { id duration path width height size fingerprints { type value } format video_codec audio_codec }")
# Create Polars DataFrame with strict=False to handle mixed numeric types
scenes_with_dupes_df = pl.DataFrame(scenes_with_dupes, strict=False)

In [None]:
import polars as pl

# Explode the files array to get one row per file
# (this rebinds files_df, which an earlier cell used for the flattened per-file records)
files_df = scenes_with_dupes_df.explode('files')

# Group by scene ID and check if there are different codecs within each group
codec_analysis = (files_df
    .select([
        'id',
        'title',
        # Pull the codec names out of the per-file struct column
        pl.col('files').struct.field('video_codec').alias('video_codec'),
        pl.col('files').struct.field('audio_codec').alias('audio_codec')
    ])
    .group_by('id')
    .agg([
        pl.col('title').first(),
        pl.col('video_codec').alias('video_codecs'),
        pl.col('audio_codec').alias('audio_codecs'),
        pl.col('video_codec').n_unique().alias('unique_video_codecs'),
        pl.col('audio_codec').n_unique().alias('unique_audio_codecs')
    ])
    # Keep scenes whose files disagree on the video codec, the audio codec, or both
    .filter(
        (pl.col('unique_video_codecs') > 1) |
        (pl.col('unique_audio_codecs') > 1)
    )
)
codec_analysis

In [None]:
# Update tags on every scene found by codec_analysis, passing the codec-mismatch tag name
# in the second argument.
# NOTE(review): presumably the second argument lists tag names to add and the third lists tag
# names to remove — confirm against update_tags_for_scenes.
stash_client.update_tags_for_scenes(
    codec_analysis.select("id").to_series().to_list(),
    ["Duplicate: Video or Audio Formats Differ"],
    []
)


In [81]:
# Look up the codec-mismatch tag by name; the cells below use its id and name.
codec_mismatch_tag = stash_raw_client.find_tag({ "name": "Duplicate: Video or Audio Formats Differ" })

In [None]:
# Scenes that still carry the codec-mismatch tag although only one file is attached,
# so there are no longer two files whose codecs could differ.
scenes_with_codec_mismatch_tag_but_only_one_file = stash_raw_client.find_scenes({
    "tags": { "value": [codec_mismatch_tag["id"]], "modifier": "INCLUDES" },
    "file_count": { "modifier": "EQUALS", "value": 1 }
}, fragment="id title tags { id name }")
# IDs only, for the tag update in the next cell
scenes_with_codec_mismatch_tag_but_only_one_file_ids = [scene['id'] for scene in scenes_with_codec_mismatch_tag_but_only_one_file]
scenes_with_codec_mismatch_tag_but_only_one_file

In [None]:
# Update tags on the single-file scenes found above, passing the codec-mismatch tag name
# in the third argument.
# NOTE(review): presumably the third argument lists tag names to remove — confirm against
# update_tags_for_scenes.
stash_client.update_tags_for_scenes(
    scenes_with_codec_mismatch_tag_but_only_one_file_ids,
    [],
    [codec_mismatch_tag["name"]]
)