# Make one JSON file with all annotations

In [None]:
import os
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import cv2
from PIL import Image
import imageio, skimage

from collections import Counter

In [None]:
# Location of the ARCH dataset; the cells below prefix it with '../' when building paths.
DATA_ROOT = "datasets/ARCH"

In [None]:
# List the top-level entries of the dataset directory.
os.listdir(f'../{DATA_ROOT}')

## Books set

In [None]:
# Directory of the books subset; the cells below read its images and captions.json from here.
book_set_dir = f'../{DATA_ROOT}/books_set'
os.listdir(book_set_dir)

### Readme

In [None]:
!cat ../datasets/ARCH/books_set/README.md

### Images

In [None]:
# Directory holding the books-set image files; show how many entries it contains.
bookset_image_dir = f'{book_set_dir}/images'
len(os.listdir(bookset_image_dir))

In [None]:
# Map each image's UUID (file name without its extension) to that extension.
# os.path.splitext splits on the LAST dot only, so a file name containing extra
# dots keeps its full stem, and entries that have no extension are skipped
# instead of raising IndexError (as indexing `split('.')[1]` would).
bookset_uuids_to_extensions = {
    stem: ext[1:]  # splitext keeps the leading '.' on the extension; drop it
    for stem, ext in map(os.path.splitext, os.listdir(f'{book_set_dir}/images'))
    if ext
}
len(bookset_uuids_to_extensions)

### Captions

In [None]:
# Load the books-set captions and show how many entries there are.
# Decode explicitly as UTF-8 rather than relying on the platform's default
# encoding (assumes captions.json is UTF-8, the standard encoding for JSON).
with open(f'{book_set_dir}/captions.json', 'r', encoding='utf-8') as f:
    bookset_captions = json.load(f)
len(bookset_captions)

In [None]:
# Show the raw books-set captions as loaded from captions.json.
bookset_captions

In [None]:
# Keep only the caption entries whose 'uuid' has a matching file in the images
# directory, then show how many survive.
bookset_captions_all_images_present = {}
for caption_key, caption in bookset_captions.items():
    if caption['uuid'] in bookset_uuids_to_extensions:
        bookset_captions_all_images_present[caption_key] = caption
len(bookset_captions_all_images_present)

In [None]:
# Show the books-set captions that have a matching image file.
bookset_captions_all_images_present

## PubMed Set

In [None]:
# Directory of the PubMed subset; the cells below read its images and captions.json from here.
pubmed_set_dir = f'../{DATA_ROOT}/pubmed_set'

In [None]:
# List the entries of the PubMed subset directory.
os.listdir(pubmed_set_dir)

### Readme

In [None]:
!cat ../datasets/ARCH/pubmed_set/README.md

### Images

In [None]:
# Directory holding the PubMed image files; show how many entries it contains.
pubmed_image_dir = f'{pubmed_set_dir}/images'
len(os.listdir(pubmed_image_dir))

In [None]:
# Map each image's UUID (file name without its extension) to that extension.
# os.path.splitext splits on the LAST dot only, so a file name containing extra
# dots keeps its full stem, and entries that have no extension are skipped
# instead of raising IndexError (as indexing `split('.')[1]` would).
pubmed_uuids_to_extensions = {
    stem: ext[1:]  # splitext keeps the leading '.' on the extension; drop it
    for stem, ext in map(os.path.splitext, os.listdir(f'{pubmed_set_dir}/images'))
    if ext
}
len(pubmed_uuids_to_extensions)

### Captions

In [None]:
# Load the PubMed captions and show them.
# Decode explicitly as UTF-8 rather than relying on the platform's default
# encoding (assumes captions.json is UTF-8, the standard encoding for JSON).
with open(f'{pubmed_set_dir}/captions.json', 'r', encoding='utf-8') as f:
    pubmed_captions = json.load(f)

pubmed_captions

In [None]:
# Keep only the caption entries whose 'uuid' has a matching file in the images
# directory, then show how many survive.
pubmed_captions_all_images_present = {}
for caption_key, caption in pubmed_captions.items():
    if caption['uuid'] in pubmed_uuids_to_extensions:
        pubmed_captions_all_images_present[caption_key] = caption
len(pubmed_captions_all_images_present)

## Unified Set

### Make unified set

In [None]:
def _append_annotations(unified, annotations, source, uuids_to_extensions, extra_fields=None):
    """Append copies of ``annotations`` to ``unified`` under consecutive string keys.

    Each entry is shallow-copied first, so the per-source caption dicts are
    not modified when the extra fields, 'source' and 'path' are added.

    Args:
        unified: Dict being built; new entries get the keys '0', '1', ... in
            insertion order (the next key is ``str(len(unified))``).
        annotations: Dict of annotation dicts, each carrying a 'uuid' key.
        source: Source tag ('books' or 'pubmed'); stored under 'source' and
            used as the prefix of the '<source>_set' image directory.
        uuids_to_extensions: Maps an image UUID to its file extension.
        extra_fields: Optional fields written into every entry before
            'source' and 'path' are set.

    Raises:
        FileNotFoundError: If an entry's image file does not exist on disk.
    """
    for ann in annotations.values():
        entry = dict(ann)
        if extra_fields:
            entry.update(extra_fields)
        entry['source'] = source

        # 'path' is stored relative to the dataset root; the root is only
        # prepended here to verify that the image really exists.
        path = f"{source}_set/images/{ann['uuid']}.{uuids_to_extensions[ann['uuid']]}"
        path_with_root = f"../{DATA_ROOT}/{path}"
        if not os.path.exists(path_with_root):
            raise FileNotFoundError(path_with_root)
        entry['path'] = path

        unified[str(len(unified))] = entry


# Merge both subsets into one dict keyed by a running index ('0', '1', ...):
# books entries first, PubMed entries after them.
arch_captions_all_images_present = {}

_append_annotations(
    arch_captions_all_images_present,
    bookset_captions_all_images_present,
    source='books',
    uuids_to_extensions=bookset_uuids_to_extensions,
)

# PubMed entries get 'letter' and 'figure_id' set to None.
# NOTE(review): presumably so both sources expose the same fields - confirm
# against the books-set caption schema.
_append_annotations(
    arch_captions_all_images_present,
    pubmed_captions_all_images_present,
    source='pubmed',
    uuids_to_extensions=pubmed_uuids_to_extensions,
    extra_fields={'letter': None, 'figure_id': None},
)

arch_captions_all_images_present

In [None]:
# Inspect the first entry of the unified set (keys start at '0').
arch_captions_all_images_present['0']

In [None]:
# Inspect the entry stored under key '4270'.
# NOTE(review): presumably picked to land on a PubMed entry - confirm against the books-set size.
arch_captions_all_images_present['4270']

### Save the unified set

In [None]:
%ls ../datasets/ARCH

In [None]:
# Make sure the output directory for the unified annotations exists.
# `os.path` has no `mkdir` (the original call raised AttributeError whenever the
# directory was missing); `os.makedirs(..., exist_ok=True)` creates the directory
# and any missing parents, and does nothing if it is already there.
annotations_dir = f'../{DATA_ROOT}/annotations'
os.makedirs(annotations_dir, exist_ok=True)

In [None]:
# Write the unified captions dictionary to captions_all.json.
captions_all_path = f'../{DATA_ROOT}/annotations/captions_all.json'
with open(captions_all_path, 'w') as out_file:
    json.dump(arch_captions_all_images_present, out_file)

### Check the unified dataset

In [None]:
import json

# Read the saved file back so the checks below run on what is actually on disk.
captions_all_path = f'../{DATA_ROOT}/annotations/captions_all.json'
with open(captions_all_path, 'r') as in_file:
    arch_captions_all_images_present = json.load(in_file)

In [None]:
import pandas as pd

# The dict is keyed by annotation index, so transpose the frame to get one
# row per annotation.
arch_captions_df = pd.DataFrame(arch_captions_all_images_present).transpose()

# Sanity check: the number of rows must equal the number of distinct 'uuid' values.
uuid_column = arch_captions_df['uuid']
assert len(uuid_column) == uuid_column.nunique()

In [None]:
# Show the unified annotations table.
arch_captions_df

In [None]:
# Count the distinct values in each column of the unified table.
arch_captions_df.nunique()

## Save a mapping of UUIDs to integers

Not sure whether it is better to do this here or on the fly in the dataset class.

In [None]:


# # create the mappings
# uuids_to_ints = {}
# ints_to_uuids = {}

# # fill in the mappings
# for idx, uuid in enumerate(arch_captions_df.uuid):
#     #print(idx, uuid)
#     uuids_to_ints[uuid] = idx
#     ints_to_uuids[idx] = uuid
    
# # save the mappings
# with open('../datasets/ARCH/annotations/uuids_to_ints.json', 'w') as f:
#     json.dump(uuids_to_ints, f)
# with open('../datasets/ARCH/annotations/ints_to_uuids.json', 'w') as f:
#     json.dump(ints_to_uuids, f)
    
# print("Saved the mappings.")

In [None]:
import os
# List the files now present in the annotations directory.
os.listdir(f'../{DATA_ROOT}/annotations/')