This Notebook is strongly based on Elisa Michelet, who did a similar project with image detection at the DFVK Paris:
https://github.com/dfk-paris/DFKV-illustrations/blob/main/3_illustration_detection/c_merging_datasets/merge_annotations.ipynb

In [1]:
import json
import random
import shutil
path = "./training/freiheitskampf_training/"
path_bw_images = "./beyond_words_data/"

## Load beyond_words_dataset and filter out headlines

1. Load the data and check which images contain titles
2. Move the images to the final training folder
3. Get the last or maximal id of the images and annotations to make a continous json

In [2]:
f = open(path + "all_beyond_words.json")
data = json.load(f)
f.close()
data["categories"] #id 5 is headline, thats the only category we need

    


[{'id': 0, 'name': 'Photograph', 'supercategory': 'Content'},
 {'id': 1, 'name': 'Illustration', 'supercategory': 'Content'},
 {'id': 2, 'name': 'Map', 'supercategory': 'Content'},
 {'id': 3, 'name': 'Comics/Cartoon', 'supercategory': 'Content'},
 {'id': 4, 'name': 'Editorial Cartoon', 'supercategory': 'Content'},
 {'id': 5, 'name': 'Headline', 'supercategory': 'Content'},
 {'id': 6, 'name': 'Advertisement', 'supercategory': 'Content'}]

In [3]:
for item in data["annotations"]:
    if item["image_id"] == 35:
        print(item)

{'id': 2609, 'bw_id': '5d91fdd7e8c9c80001000d21', 'image_id': 35, 'category_id': 0, 'bbox': [54, 746, 785, 313], 'iscrowd': 0, 'area': 245705}
{'id': 3878, 'bw_id': '59e0ec353e7a180001001bf3', 'image_id': 35, 'category_id': 0, 'bbox': [640, 417, 194, 333], 'iscrowd': 0, 'area': 64602}
{'id': 3879, 'bw_id': '59e0ec7b3e7a180001001bf5', 'image_id': 35, 'category_id': 0, 'bbox': [377, 550, 271, 193], 'iscrowd': 0, 'area': 52303}


In [4]:
# To change the label to 0 
def replace_label(x):
    try: 
        x['category_id'] = 0
        return x
    except:
        return dict()

In [5]:
def change_file_name(name):
    try: 
        name["file_name"] = "images/" + name["file_name"]
        return name
    except:
        return dict()

In [6]:

# Only keep annotations with label 5 
new_annotations = [replace_label(x) for x in data['annotations'] if x['category_id']==5 ]
# Only keep images which have these annotations
images_to_keep = list(set([x['image_id'] for x in new_annotations]))
new_images = [change_file_name(im) for im in data['images'] if im['id'] in images_to_keep]
# One big new category : Title
new_categorie = [{'id' : 0, 'name': "Title"}]


In [7]:
## ALLe annotierungen nochmal neu machen und bei google drive nochmal hochladen

In [8]:

# Creating the new dataset
new_data = data.copy()
new_data['categorie'] = new_categorie
new_data['annotations'] = new_annotations
new_data['images'] = new_images



In [9]:
f = open(path + "result_september.json")
data1 = json.load(f)
f.close()
f = open(path + "result_march.json")
data2 = json.load(f)
f.close()

In [10]:
print(len(data1["annotations"]))
print(len(data2["annotations"]))

219
263


In [11]:
# Function to replace the id of the image
def replace_id_images(x, im_offset):
    try: 
        x['id'] = x['id'] + im_offset
        return x
    except:
        return dict()


In [12]:
# Function to replace the ids in the annotation part of the json
def replace_ids_annotations(x, im_offset, an_offset):
    try: 
        x['id'] = x['id'] + an_offset
        x['image_id'] = x['image_id'] + im_offset 
        return x
    except:
        return dict()

In [13]:
def float_to_int(dic):
    try:
        dic["bbox"] = [round(x) for x in dic["bbox"]]
        dic["area"] = round(dic["area"])
        
        if "segmentation" in dic:
            del dic["segmentation"]
        if "ignore" in dic:
            del dic["ignore"]
        
        
        return dic
    except:
        return dic

In [13]:

# Function to merge two COCO files, changing overlapping indices
def merge_two_files(f1, f2):
    # Ids of last images and annotations of first file
    last_id_image = max([im['id'] for im in f1['images']])
    last_id_annotation = max([an['id'] for an in f1['annotations']])
    
    # Merge images with new indices
  
    new_fk_imgs = [replace_id_images(im, last_id_image + 1) for im in f2['images']]
    new_fk_imgs = [*f1["images"], *new_fk_imgs]
    
    # Merge annotations with new indices
    new_fk_annos = [replace_ids_annotations(im, last_id_image + 1, last_id_annotation + 1) for im in f2['annotations']]
    new_fk_annos = [*f1['annotations'], *new_fk_annos]
    
    new_fk_annos = [replace_label(x) for x in new_fk_annos]

  
    # New merge data file
    new_data = f1.copy()
    new_data['annotations'] = new_fk_annos
    new_data['images'] = new_fk_imgs
    
    #new_data["annotations"] = [float_to_int(item) for item in new_data["annotations"]]
    return new_data



In [14]:
r1 = merge_two_files(data1, data2)
data_final = merge_two_files(r1, new_data)

Now we save the final data 

In [15]:
json_string = json.dumps(data_final)
with open(path + "anno_complete.json", 'w') as outfile:
    outfile.write(json_string)

## Copy all important pictures from Beyond_words to Freiheitskampf Trainingsfolder

In [16]:
file_list = [item["file_name"] for item in new_data["images"] ]

In [17]:
for name in file_list:
    shutil.copyfile(path_bw_images + name, path +name)

## Split Training and Validation Data

In [18]:
# Split into train and validation sets
#Split ist 80% train and 20% validation
#80% of the FK data is in the training set and 20% is in the validation set
#split images ids between Freiheitskampf and beyond_words_data

imgs_ids_bw = [im['id'] for im in data_final['images'] if ".tif.large." not in im["file_name"]] 
imgs_ids_fk = [im['id'] for im in data_final['images'] if ".tif.large." in im["file_name"]]

train_ids_bw = random.sample(imgs_ids_bw, k=2465)
train_ids_fk = random.sample(imgs_ids_fk, k=27)


train_imgs = [im for im in data_final['images'] if im['id'] in train_ids_bw or im['id'] in train_ids_fk]
test_imgs = [im for im in data_final['images'] if im['id'] not in train_ids_bw and im['id'] not in train_ids_fk]

print(len(train_imgs))
print(len(test_imgs))



train_annos = [an for an in data_final['annotations'] if an['image_id'] in train_ids_bw or an['image_id'] in train_ids_fk]
test_annos = [an for an in data_final['annotations'] if an['image_id'] not in train_ids_bw and an['image_id'] not in train_ids_fk]




2492
624


In [19]:
data_final["annotations"]

[{'id': 0,
  'image_id': 0,
  'category_id': 0,
  'segmentation': [],
  'bbox': [285.07444936792996,
   168.78342291669674,
   787.4769686292962,
   64.57831572897109],
  'ignore': 0,
  'iscrowd': 0,
  'area': 50853.93630943575},
 {'id': 1,
  'image_id': 0,
  'category_id': 0,
  'segmentation': [],
  'bbox': [372.2301034479953,
   240.53710705999794,
   246.086552696655,
   49.20252626969229],
  'ignore': 0,
  'iscrowd': 0,
  'area': 12108.080073675184},
 {'id': 2,
  'image_id': 0,
  'category_id': 0,
  'segmentation': [],
  'bbox': [388.63587362777236,
   371.74384377917727,
   197.89460279356007,
   37.92694733288779],
  'ignore': 0,
  'iscrowd': 0,
  'area': 7505.538177614102},
 {'id': 3,
  'image_id': 0,
  'category_id': 0,
  'segmentation': [],
  'bbox': [307.63238336512336,
   289.7396333296902,
   119.9671944396193,
   15.375789459278808],
  'ignore': 0,
  'iscrowd': 0,
  'area': 1844.5903237239497},
 {'id': 4,
  'image_id': 0,
  'category_id': 0,
  'segmentation': [],
  'bbox':

In [20]:
train_data = data_final.copy()
train_data['images'] = train_imgs
train_data['annotations'] = train_annos
test_data = data_final.copy()
test_data['images'] = test_imgs
test_data['annotations'] = test_annos



In [21]:
train_data

{'images': [{'width': 2000,
   'height': 2739,
   'id': 0,
   'file_name': 'images/39a3eeb1-00000016.tif.large.jpg'},
  {'width': 2000,
   'height': 2737,
   'id': 1,
   'file_name': 'images/181d8559-00000015.tif.large.jpg'},
  {'width': 2000,
   'height': 2737,
   'id': 2,
   'file_name': 'images/25a85844-00000014.tif.large.jpg'},
  {'width': 2000,
   'height': 2743,
   'id': 3,
   'file_name': 'images/46c37a9b-00000013.tif.large.jpg'},
  {'width': 2000,
   'height': 2743,
   'id': 4,
   'file_name': 'images/03a90b05-00000012.tif.large.jpg'},
  {'width': 2000,
   'height': 2736,
   'id': 5,
   'file_name': 'images/016c1b2f-00000011.tif.large.jpg'},
  {'width': 2000,
   'height': 2730,
   'id': 7,
   'file_name': 'images/dd00ccb3-00000009.tif.large.jpg'},
  {'width': 2000,
   'height': 2730,
   'id': 8,
   'file_name': 'images/5e766ad3-00000008.tif.large.jpg'},
  {'width': 2000,
   'height': 2739,
   'id': 9,
   'file_name': 'images/34455cdc-00000007.tif.large.jpg'},
  {'width': 2000,


Now safe the training and validation split

In [22]:

json_string = json.dumps(train_data)
with open(path + 'train_annos.json', 'w') as outfile:
    outfile.write(json_string)
    
json_string = json.dumps(test_data)
with open(path + 'val_annos.json', 'w') as outfile:
    outfile.write(json_string)

