In [33]:
import logging
from typing import List, Dict
import json
import pandas as pd
from collections import Counter
import random

logging.basicConfig(level=logging.WARNING, format="%(levelname)s: %(message)s")

def pk_score(predicted: List[int], reference: List[int], k: int) -> float:
    
    def binary_to_incremental(binary: List[int]) -> List[int]:
        segmentation = []
        current_segment = 0
        for marker in binary:
            segmentation.append(current_segment)
            if marker == 1:
                current_segment += 1
        return segmentation

    predicted_seg = binary_to_incremental(predicted)
    reference_seg = binary_to_incremental(reference)
    print(len(reference_seg))

    N = len(predicted_seg)
    if N != len(reference_seg):
        raise ValueError("The predicted and reference segmentations must have the same length.")
    if k <= 0 or k >= N:
        raise ValueError("Window size k must be greater than 0 and less than the sequence length.")

    mismatches = sum(
        (predicted_seg[i] == predicted_seg[i + k]) != (reference_seg[i] == reference_seg[i + k])
        for i in range(N - k)
    )

    return mismatches / (N - k) if (N - k) > 0 else 0.0

def compare_annotations(data_1: Dict, data_2: Dict, use_random: bool = False) -> float:
 
    original_pages = data_1.get("comic_data", {}).get("pages", [])
    person_pages = data_2.get("comic_data", {}).get("pages", [])

    if not original_pages or not person_pages:
        raise ValueError("One or both datasets are missing 'comic_data' or 'pages'.")

    orig_annotations_all = []
    person_annotations_all = []

    for orig_page, person_page in zip(original_pages, person_pages):
        if orig_page.get("page_number") != person_page.get("page_number"):
            logging.warning(f"Mismatch in page numbers - {orig_page.get('page_number')} vs {person_page.get('page_number')}")
            continue

        orig_annotations = [ann.get("starting_tag", None) for ann in orig_page.get("annotations", [])]
        person_annotations = [ann.get("starting_tag", None) for ann in person_page.get("annotations", [])]

        orig_annotations_all.extend(orig_annotations)
        person_annotations_all.extend(person_annotations)
        


    if use_random:
        person_annotations_all = [random.choice([0, 1]) for _ in range(len(person_annotations_all))]

    if not orig_annotations_all or not person_annotations_all:
        raise ValueError("No annotations found in one or both datasets.")

    k = 3

    return pk_score(person_annotations_all, orig_annotations_all, k)


In [34]:
import os
names = ["Alley_Oop", "Champ", "Treasure_Comics", "Western_Love"]

all_results = []

for name in names:
    files = [rf"Data\{name}\annotated_scenes\{name}_annotated_scenes_{i}.json" for i in range(1, 11)]
    
    metadata_path = f"Data/{name}/updated_comic_metadata.json"
    if not os.path.exists(metadata_path):
        print(f"Warning: Metadata file for {name} not found.")
        continue

    with open(metadata_path, "r", encoding="utf-8") as f:
        updated_comic_data = json.load(f)

    for i, file in enumerate(files):
        if not os.path.exists(file):
            print(f"Warning: Scene file {file} not found.")
            continue

        with open(file, "r", encoding="utf-8") as f:
            scene_data = json.load(f)

        score = compare_annotations(updated_comic_data, scene_data)

        all_results.append({
            'comic_name': name,
            'scene': f"{name}_annotated_scenes_{i + 1}.json",
            'compared_to': "updated_comic_metadata.json",
            'pk_score': score,
        })

pk_scores_df = pd.DataFrame(all_results)
pk_scores_df

191
191
191
191
191
191
191
191
191
191
263
263
263
263
263
263
263
263
263
263
221
221
221
221
221
221
221
221
221
221
279
279
279
279
279
279
279
279
279
279


Unnamed: 0,comic_name,scene,compared_to,pk_score
0,Alley_Oop,Alley_Oop_annotated_scenes_1.json,updated_comic_metadata.json,0.441489
1,Alley_Oop,Alley_Oop_annotated_scenes_2.json,updated_comic_metadata.json,0.441489
2,Alley_Oop,Alley_Oop_annotated_scenes_3.json,updated_comic_metadata.json,0.425532
3,Alley_Oop,Alley_Oop_annotated_scenes_4.json,updated_comic_metadata.json,0.388298
4,Alley_Oop,Alley_Oop_annotated_scenes_5.json,updated_comic_metadata.json,0.430851
5,Alley_Oop,Alley_Oop_annotated_scenes_6.json,updated_comic_metadata.json,0.473404
6,Alley_Oop,Alley_Oop_annotated_scenes_7.json,updated_comic_metadata.json,0.446809
7,Alley_Oop,Alley_Oop_annotated_scenes_8.json,updated_comic_metadata.json,0.37766
8,Alley_Oop,Alley_Oop_annotated_scenes_9.json,updated_comic_metadata.json,0.462766
9,Alley_Oop,Alley_Oop_annotated_scenes_10.json,updated_comic_metadata.json,0.457447


In [35]:
comic_stats_first = pk_scores_df.groupby("comic_name")["pk_score"].describe()
comic_stats_first

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
comic_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alley_Oop,10.0,0.434574,0.030817,0.37766,0.426862,0.441489,0.454787,0.473404
Champ,10.0,0.404231,0.019226,0.376923,0.389423,0.401923,0.418269,0.434615
Treasure_Comics,10.0,0.433945,0.014698,0.40367,0.427752,0.433486,0.443807,0.454128
Western_Love,10.0,0.431522,0.015509,0.394928,0.427536,0.434783,0.443841,0.445652


In [36]:
import os
names = ["Alley_Oop", "Champ", "Treasure_Comics", "Western_Love"]

all_results = []

for name in names:
    files = [rf"Data\{name}\annotated_scenes\{name}_annotated_scenes_{i}.json" for i in range(1, 11)]
    
    metadata_path = f"Data/{name}/updated_comic_metadata.json"
    if not os.path.exists(metadata_path):
        print(f"Warning: Metadata file for {name} not found.")
        continue

    with open(metadata_path, "r", encoding="utf-8") as f:
        updated_comic_data = json.load(f)

    for i, file in enumerate(files):
        if not os.path.exists(file):
            print(f"Warning: Scene file {file} not found.")
            continue

        with open(file, "r", encoding="utf-8") as f:
            scene_data = json.load(f)

        score = compare_annotations(updated_comic_data, scene_data,True)

        all_results.append({
            'comic_name': name,
            'scene': f"{name}_random_{i + 1}.json",
            'compared_to': "updated_comic_metadata.json",
            'pk_score': score,
        })

pk_scores_df = pd.DataFrame(all_results)
pk_scores_df

191
191
191
191
191
191
191
191
191
191
263
263
263
263
263
263
263
263
263
263
221
221
221
221
221
221
221
221
221
221
279
279
279
279
279
279
279
279
279
279


Unnamed: 0,comic_name,scene,compared_to,pk_score
0,Alley_Oop,Alley_Oop_random_1.json,updated_comic_metadata.json,0.494681
1,Alley_Oop,Alley_Oop_random_2.json,updated_comic_metadata.json,0.457447
2,Alley_Oop,Alley_Oop_random_3.json,updated_comic_metadata.json,0.468085
3,Alley_Oop,Alley_Oop_random_4.json,updated_comic_metadata.json,0.430851
4,Alley_Oop,Alley_Oop_random_5.json,updated_comic_metadata.json,0.409574
5,Alley_Oop,Alley_Oop_random_6.json,updated_comic_metadata.json,0.457447
6,Alley_Oop,Alley_Oop_random_7.json,updated_comic_metadata.json,0.494681
7,Alley_Oop,Alley_Oop_random_8.json,updated_comic_metadata.json,0.484043
8,Alley_Oop,Alley_Oop_random_9.json,updated_comic_metadata.json,0.489362
9,Alley_Oop,Alley_Oop_random_10.json,updated_comic_metadata.json,0.430851


In [37]:
comic_stats_random = pk_scores_df.groupby("comic_name")["pk_score"].describe()
comic_stats_random

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
comic_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alley_Oop,10.0,0.461702,0.030069,0.409574,0.4375,0.462766,0.488032,0.494681
Champ,10.0,0.445385,0.033373,0.4,0.416346,0.453846,0.465385,0.488462
Treasure_Comics,10.0,0.462844,0.026961,0.417431,0.450688,0.470183,0.475917,0.509174
Western_Love,10.0,0.463768,0.023543,0.434783,0.442935,0.461957,0.476449,0.507246


In [38]:
import pandas as pd
import json
import os
from itertools import combinations

names = ["Alley_Oop", "Champ", "Treasure_Comics", "Western_Love"]

all_results = []

for name in names:
    files = [rf"Data\{name}\annotated_scenes\{name}_annotated_scenes_{i}.json" for i in range(1, 11)]

    data = []
    for file in files:
        if not os.path.exists(file):
            print(f"Warning: Scene file {file} not found.")
            continue
        
        with open(file, "r", encoding="utf-8") as f:
            data.append(json.load(f))

    for (i, scene1), (j, scene2) in combinations(enumerate(data, 1), 2):
        score = compare_annotations(scene1, scene2)

        all_results.append({
            'comic_name': name,
            'scene_1': f"{name}_annotated_scenes_{i}.json",
            'scene_2': f"{name}_annotated_scenes_{j}.json",
            'pk_score': score,
        })

pk_scores_df = pd.DataFrame(all_results)
pk_scores_df

191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279
279


Unnamed: 0,comic_name,scene_1,scene_2,pk_score
0,Alley_Oop,Alley_Oop_annotated_scenes_1.json,Alley_Oop_annotated_scenes_2.json,0.095745
1,Alley_Oop,Alley_Oop_annotated_scenes_1.json,Alley_Oop_annotated_scenes_3.json,0.111702
2,Alley_Oop,Alley_Oop_annotated_scenes_1.json,Alley_Oop_annotated_scenes_4.json,0.159574
3,Alley_Oop,Alley_Oop_annotated_scenes_1.json,Alley_Oop_annotated_scenes_5.json,0.117021
4,Alley_Oop,Alley_Oop_annotated_scenes_1.json,Alley_Oop_annotated_scenes_6.json,0.095745
...,...,...,...,...
175,Western_Love,Western_Love_annotated_scenes_7.json,Western_Love_annotated_scenes_9.json,0.097826
176,Western_Love,Western_Love_annotated_scenes_7.json,Western_Love_annotated_scenes_10.json,0.086957
177,Western_Love,Western_Love_annotated_scenes_8.json,Western_Love_annotated_scenes_9.json,0.090580
178,Western_Love,Western_Love_annotated_scenes_8.json,Western_Love_annotated_scenes_10.json,0.057971


In [39]:
comic_stats = pk_scores_df.groupby("comic_name")["pk_score"].describe()
comic_stats

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
comic_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alley_Oop,45.0,0.100118,0.03186,0.031915,0.079787,0.095745,0.117021,0.159574
Champ,45.0,0.085726,0.021921,0.046154,0.073077,0.084615,0.1,0.134615
Treasure_Comics,45.0,0.045668,0.016069,0.018349,0.03211,0.045872,0.055046,0.091743
Western_Love,45.0,0.071739,0.017574,0.036232,0.061594,0.076087,0.083333,0.108696


In [40]:
import os
import json
import pandas as pd

names = ["Alley_Oop", "Champ", "Treasure_Comics", "Western_Love"]

all_results = []

for name in names:
    metadata_path = f"Data/{name}/updated_comic_metadata.json"
    
    if not os.path.exists(metadata_path):
        print(f"Warning: Metadata file for {name} not found.")
        continue

    with open(metadata_path, "r", encoding="utf-8") as f:
        updated_comic_data = json.load(f)

    for iter in range(1, 11):
        for i in range(1, 11):
            file_path = rf"Data\{name}\refined_scenes\{iter}\{name}_{iter}_refined_scenes_{i}.json"

            if not os.path.exists(file_path):
                print(f"Warning: Scene file {file_path} not found.")
                continue

            with open(file_path, "r", encoding="utf-8") as f:
                scene_data = json.load(f)

            score = compare_annotations(updated_comic_data, scene_data)

            all_results.append({
                'comic_name': name,
                'iteration': iter,
                'scene': f"{name}_{iter}_refined_scenes_{i}.json",
                'compared_to': "updated_comic_metadata.json",
                'pk_score': score,
            })

pk_scores_df = pd.DataFrame(all_results)

pk_scores_df

191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
263
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221
221


Unnamed: 0,comic_name,iteration,scene,compared_to,pk_score
0,Alley_Oop,1,Alley_Oop_1_refined_scenes_1.json,updated_comic_metadata.json,0.388298
1,Alley_Oop,1,Alley_Oop_1_refined_scenes_2.json,updated_comic_metadata.json,0.260638
2,Alley_Oop,1,Alley_Oop_1_refined_scenes_3.json,updated_comic_metadata.json,0.265957
3,Alley_Oop,1,Alley_Oop_1_refined_scenes_4.json,updated_comic_metadata.json,0.265957
4,Alley_Oop,1,Alley_Oop_1_refined_scenes_5.json,updated_comic_metadata.json,0.239362
...,...,...,...,...,...
349,Western_Love,10,Western_Love_10_refined_scenes_6.json,updated_comic_metadata.json,0.442029
350,Western_Love,10,Western_Love_10_refined_scenes_7.json,updated_comic_metadata.json,0.442029
351,Western_Love,10,Western_Love_10_refined_scenes_8.json,updated_comic_metadata.json,0.442029
352,Western_Love,10,Western_Love_10_refined_scenes_9.json,updated_comic_metadata.json,0.442029


In [41]:
comic_stats = pk_scores_df.groupby(["comic_name", "iteration"])["pk_score"].describe()
comic_stats = comic_stats.sort_index(level=["comic_name", "iteration"])
comic_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
comic_name,iteration,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alley_Oop,1,10.0,0.273404,0.04114087,0.239362,0.260638,0.263298,0.265957,0.388298
Alley_Oop,2,9.0,0.285461,0.007036573,0.271277,0.281915,0.287234,0.292553,0.292553
Alley_Oop,3,10.0,0.289894,0.003761206,0.287234,0.287234,0.287234,0.292553,0.297872
Alley_Oop,4,7.0,0.344985,0.01853543,0.303191,0.351064,0.351064,0.351064,0.356383
Alley_Oop,5,10.0,0.354787,0.00437911,0.351064,0.351064,0.353723,0.356383,0.361702
Alley_Oop,6,6.0,0.366135,0.005229745,0.356383,0.367021,0.367021,0.367021,0.37234
Alley_Oop,7,9.0,0.41253,0.04765844,0.37234,0.37234,0.37234,0.462766,0.462766
Alley_Oop,8,5.0,0.452128,6.206335e-17,0.452128,0.452128,0.452128,0.452128,0.452128
Alley_Oop,9,9.0,0.456265,0.002345524,0.452128,0.457447,0.457447,0.457447,0.457447
Alley_Oop,10,8.0,0.458112,0.001880603,0.457447,0.457447,0.457447,0.457447,0.462766


In [42]:
comic_mean = pk_scores_df.groupby("comic_name")["pk_score"].mean()
comic_mean

comic_name
Alley_Oop          0.362728
Champ              0.400916
Treasure_Comics    0.415113
Western_Love       0.394966
Name: pk_score, dtype: float64

In [43]:
comic_stats_first

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
comic_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alley_Oop,10.0,0.434574,0.030817,0.37766,0.426862,0.441489,0.454787,0.473404
Champ,10.0,0.404231,0.019226,0.376923,0.389423,0.401923,0.418269,0.434615
Treasure_Comics,10.0,0.433945,0.014698,0.40367,0.427752,0.433486,0.443807,0.454128
Western_Love,10.0,0.431522,0.015509,0.394928,0.427536,0.434783,0.443841,0.445652


In [44]:
comic_stats_random

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
comic_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alley_Oop,10.0,0.461702,0.030069,0.409574,0.4375,0.462766,0.488032,0.494681
Champ,10.0,0.445385,0.033373,0.4,0.416346,0.453846,0.465385,0.488462
Treasure_Comics,10.0,0.462844,0.026961,0.417431,0.450688,0.470183,0.475917,0.509174
Western_Love,10.0,0.463768,0.023543,0.434783,0.442935,0.461957,0.476449,0.507246


In [45]:
import pandas as pd
import json
import os
from itertools import combinations

names = ["Alley_Oop", "Champ", "Treasure_Comics", "Western_Love"]

all_results = []

scene_data = {}

for name in names:
    scene_data[name] = {}

    for iter in range(1, 11):
        scene_data[name][iter] = []

        for i in range(1, 11):
            file_path = rf"Data\{name}\refined_scenes\{iter}\{name}_{iter}_refined_scenes_{i}.json"

            if not os.path.exists(file_path):
                print(f"Warning: Scene file {file_path} not found.")
                continue

            with open(file_path, "r", encoding="utf-8") as f:
                scene_data[name][iter].append(json.load(f))

for name in names:
    for iter in range(1, 11):
        scenes = scene_data[name].get(iter, [])

        for (i, scene1), (j, scene2) in combinations(enumerate(scenes, 1), 2):
            score = compare_annotations(scene1, scene2)

            all_results.append({
                'comic_name': name,
                'iteration': iter,
                'scene_1': f"{name}_{iter}_refined_scenes_{i}.json",
                'scene_2': f"{name}_{iter}_refined_scenes_{j}.json",
                'pk_score': score,
            })

pk_scores_df = pd.DataFrame(all_results)

pk_scores_df


191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191


Unnamed: 0,comic_name,iteration,scene_1,scene_2,pk_score
0,Alley_Oop,1,Alley_Oop_1_refined_scenes_1.json,Alley_Oop_1_refined_scenes_2.json,0.255319
1,Alley_Oop,1,Alley_Oop_1_refined_scenes_1.json,Alley_Oop_1_refined_scenes_3.json,0.281915
2,Alley_Oop,1,Alley_Oop_1_refined_scenes_1.json,Alley_Oop_1_refined_scenes_4.json,0.281915
3,Alley_Oop,1,Alley_Oop_1_refined_scenes_1.json,Alley_Oop_1_refined_scenes_5.json,0.319149
4,Alley_Oop,1,Alley_Oop_1_refined_scenes_1.json,Alley_Oop_1_refined_scenes_6.json,0.382979
...,...,...,...,...,...
1456,Western_Love,10,Western_Love_10_refined_scenes_7.json,Western_Love_10_refined_scenes_9.json,0.000000
1457,Western_Love,10,Western_Love_10_refined_scenes_7.json,Western_Love_10_refined_scenes_10.json,0.000000
1458,Western_Love,10,Western_Love_10_refined_scenes_8.json,Western_Love_10_refined_scenes_9.json,0.000000
1459,Western_Love,10,Western_Love_10_refined_scenes_8.json,Western_Love_10_refined_scenes_10.json,0.000000


In [46]:
comic_stats = pk_scores_df.groupby(["comic_name", "iteration"])["pk_score"].describe()
comic_stats = comic_stats.sort_index(level=["comic_name", "iteration"])
comic_stats


Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
comic_name,iteration,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alley_Oop,1,45.0,0.129314,0.123251,0.0,0.031915,0.106383,0.143617,0.425532
Alley_Oop,2,36.0,0.007979,0.006031,0.0,0.005319,0.005319,0.010638,0.021277
Alley_Oop,3,45.0,0.006738,0.006651,0.0,0.0,0.005319,0.010638,0.021277
Alley_Oop,4,21.0,0.024316,0.036605,0.0,0.0,0.005319,0.079787,0.085106
Alley_Oop,5,45.0,0.01383,0.010041,0.0,0.005319,0.010638,0.021277,0.031915
Alley_Oop,6,15.0,0.011702,0.007576,0.0,0.007979,0.010638,0.015957,0.026596
Alley_Oop,7,36.0,0.056147,0.050931,0.0,0.0,0.101064,0.101064,0.101064
Alley_Oop,8,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alley_Oop,9,36.0,0.002069,0.00263,0.0,0.0,0.0,0.005319,0.005319
Alley_Oop,10,28.0,0.00133,0.002346,0.0,0.0,0.0,0.00133,0.005319


In [47]:
import pandas as pd
import json
import os
from itertools import combinations

names = ["Alley_Oop", "Champ", "Treasure_Comics", "Western_Love"]

all_results = []

scene_data = {}

for name in names:
    scene_data[name] = {}

    for iter in range(1, 11):
        scene_data[name][iter] = []

        for i in range(1, 11):
            file_path = rf"Data\{name}\refined_scenes\{iter}\{name}_{iter}_refined_scenes_{i}.json"

            if not os.path.exists(file_path):
                print(f"Warning: Scene file {file_path} not found.")
                continue

            with open(file_path, "r", encoding="utf-8") as f:
                scene_data[name][iter].append(json.load(f))

for name in names:
    for (iter1, iter2) in combinations(range(1, 11), 2):
        scenes1 = scene_data[name].get(iter1, [])
        scenes2 = scene_data[name].get(iter2, [])

        for (i, scene1), (j, scene2) in combinations(enumerate(scenes1, 1), 2):
            score = compare_annotations(scene1, scene2)

            all_results.append({
                'comic_name': name,
                'iteration_1': iter1,
                'iteration_2': iter2,
                'scene_1': f"{name}_{iter1}_refined_scenes_{i}.json",
                'scene_2': f"{name}_{iter2}_refined_scenes_{j}.json",
                'pk_score': score,
            })

pk_scores_df = pd.DataFrame(all_results)

pk_scores_df


191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191
191


Unnamed: 0,comic_name,iteration_1,iteration_2,scene_1,scene_2,pk_score
0,Alley_Oop,1,2,Alley_Oop_1_refined_scenes_1.json,Alley_Oop_2_refined_scenes_2.json,0.255319
1,Alley_Oop,1,2,Alley_Oop_1_refined_scenes_1.json,Alley_Oop_2_refined_scenes_3.json,0.281915
2,Alley_Oop,1,2,Alley_Oop_1_refined_scenes_1.json,Alley_Oop_2_refined_scenes_4.json,0.281915
3,Alley_Oop,1,2,Alley_Oop_1_refined_scenes_1.json,Alley_Oop_2_refined_scenes_5.json,0.319149
4,Alley_Oop,1,2,Alley_Oop_1_refined_scenes_1.json,Alley_Oop_2_refined_scenes_6.json,0.382979
...,...,...,...,...,...,...
6762,Western_Love,9,10,Western_Love_9_refined_scenes_7.json,Western_Love_10_refined_scenes_9.json,0.000000
6763,Western_Love,9,10,Western_Love_9_refined_scenes_7.json,Western_Love_10_refined_scenes_10.json,0.000000
6764,Western_Love,9,10,Western_Love_9_refined_scenes_8.json,Western_Love_10_refined_scenes_9.json,0.000000
6765,Western_Love,9,10,Western_Love_9_refined_scenes_8.json,Western_Love_10_refined_scenes_10.json,0.000000


In [48]:
comic_stats = pk_scores_df.groupby("comic_name")["pk_score"].describe()
comic_stats = comic_stats.sort_index(level="comic_name")
comic_stats


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
comic_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alley_Oop,1583.0,0.044099,0.082207,0.0,0.005319,0.010638,0.031915,0.425532
Champ,1616.0,0.054784,0.133278,0.0,0.0,0.011538,0.023077,0.596154
Treasure_Comics,1823.0,0.048685,0.07508,0.0,0.0,0.009174,0.06422,0.334862
Western_Love,1745.0,0.028489,0.036917,0.0,0.003623,0.014493,0.039855,0.130435


In [1]:
import logging
import json
import random
from typing import List, Dict

logging.basicConfig(level=logging.WARNING, format="%(levelname)s: %(message)s")

def pk_score(predicted: List[int], reference: List[int], k: int) -> float:
    def binary_to_incremental(binary: List[int]) -> List[int]:
        segmentation = []
        current_segment = 0
        for marker in binary:
            segmentation.append(current_segment)
            if marker == 1:
                current_segment += 1
        return segmentation

    predicted_seg = binary_to_incremental(predicted)
    reference_seg = binary_to_incremental(reference)

    N = len(predicted_seg)
    if N != len(reference_seg):
        raise ValueError("Predicted and reference segmentations must have the same length.")
    if k <= 0 or k >= N:
        raise ValueError("Window size k must be greater than 0 and less than sequence length.")

    mismatches = sum(
        (predicted_seg[i] == predicted_seg[i + k]) != (reference_seg[i] == reference_seg[i + k])
        for i in range(N - k)
    )

    return mismatches / (N - k) if (N - k) > 0 else 0.0

def flatten_pages(data: Dict) -> List[int]:
    """Extracts and flattens all page arrays into a single list."""
    flattened = []
    for chapter in data.get("comic", {}).get("chapters", []):
        for page in chapter.get("pages", []):
            flattened.extend(page)  # Append all values in page list
    return flattened

def compare_annotations(data_1: Dict, data_2: Dict, use_random: bool = False) -> float:
    original_annotations = flatten_pages(data_1)
    person_annotations = flatten_pages(data_2)

    if use_random:
        person_annotations = [random.choice([0, 1]) for _ in range(len(person_annotations))]

    if len(original_annotations) != len(person_annotations):
        raise ValueError("The datasets must have the same number of annotations for comparison.")

    k = 3  # Window size
    return pk_score(person_annotations, original_annotations, k)
names = ["Liberty", "Sain", "Arab", "Twinkle","Six","Marco"]
for name in names:
    # Example usage:
    with open(rf"C:\Users\derra\Desktop\Paper\{name}.json", "r") as f:
        data_1 = json.load(f)
    
    with open(rf"C:\Users\derra\Desktop\Paper\{name}_1.json", "r") as f:
        data_2 = json.load(f)
    
    with open(rf"C:\Users\derra\Desktop\Paper\{name}_2.json", "r") as f:
        data_3 = json.load(f)
    
    score_1 = compare_annotations(data_1, data_2)
    score_2 = compare_annotations(data_1, data_3)
    score_3 = compare_annotations(data_2,data_3)
    print(f"{name}:   P_k Score 1: {score_1:.4f}           P_k Score 2: {score_2:.4f}           P_k Score 3: {score_3:.4f}")


Liberty:   P_k Score 1: 0.2093           P_k Score 2: 0.0698           P_k Score 3: 0.2791
Sain:   P_k Score 1: 0.0698           P_k Score 2: 0.0000           P_k Score 3: 0.0698
Arab:   P_k Score 1: 0.1667           P_k Score 2: 0.3333           P_k Score 3: 0.2619
Twinkle:   P_k Score 1: 0.2727           P_k Score 2: 0.1636           P_k Score 3: 0.2182
Six:   P_k Score 1: 0.1224           P_k Score 2: 0.3673           P_k Score 3: 0.2857
Marco:   P_k Score 1: 0.0556           P_k Score 2: 0.1944           P_k Score 3: 0.1389
