# Make one JSON file with all annotations

In [None]:
import os
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import cv2
from PIL import Image
import imageio, skimage

from collections import Counter

In [None]:
# List the entries of the ARCH dataset root directory.
os.listdir('../datasets/ARCH')

## Books set

In [None]:
# Directory of the books set; later cells build the image and caption paths from it.
book_set_dir = '../datasets/ARCH/books_set'
# List the entries of the books-set directory.
os.listdir(book_set_dir)

### Readme

In [None]:
# Print the README that sits in the books-set directory.
!cat ../datasets/ARCH/books_set/README.md

### Images

In [None]:
# Count the entries in the books-set images directory.
len(os.listdir(f'{book_set_dir}/images'))

In [None]:
# Derive a uuid from every entry in the books-set images directory:
# the uuid is the part of the file name before the first '.'.
bookset_image_uuids = [
    image_file.partition('.')[0]
    for image_file in os.listdir(f'{book_set_dir}/images')
]
bookset_image_uuids_set = set(bookset_image_uuids)
# Sanity check: no two image files may map to the same uuid.
assert len(bookset_image_uuids_set) == len(bookset_image_uuids)

# Number of uuids collected from the images directory.
len(bookset_image_uuids)

### Captions

In [None]:
# Parse the books-set captions file and report how many entries it contains.
with open(f'{book_set_dir}/captions.json', 'r') as captions_file:
    bookset_captions = json.loads(captions_file.read())
len(bookset_captions)

In [None]:
# Display the parsed books-set captions.
bookset_captions

In [None]:
# Keep only the caption entries whose 'uuid' was found among the uuids
# collected from the books-set images directory.
bookset_captions_all_images_present = {
    key: record
    for key, record in bookset_captions.items()
    if record['uuid'] in bookset_image_uuids_set
}
# Number of caption entries that survived the filter.
len(bookset_captions_all_images_present)

In [None]:
# Display the books-set caption entries that passed the image-uuid filter.
bookset_captions_all_images_present

## PubMed Set

In [None]:
# Directory of the PubMed set; later cells build the image and caption paths from it.
pubmed_set_dir = '../datasets/ARCH/pubmed_set'

In [None]:
# List the entries of the PubMed-set directory.
os.listdir(pubmed_set_dir)

### Readme

In [None]:
# Print the README that sits in the PubMed-set directory.
!cat ../datasets/ARCH/pubmed_set/README.md

### Images

In [None]:
# Count the entries in the PubMed-set images directory.
len(os.listdir(f'{pubmed_set_dir}/images'))

In [None]:
# Derive a uuid from every entry in the PubMed-set images directory:
# the uuid is the part of the file name before the first '.'.
pubmed_image_uuids = [
    image_file.partition('.')[0]
    for image_file in os.listdir(f'{pubmed_set_dir}/images')
]
pubmed_image_uuids_set = set(pubmed_image_uuids)
# Sanity check: no two image files may map to the same uuid.
assert len(pubmed_image_uuids_set) == len(pubmed_image_uuids)

# Number of uuids collected from the images directory.
len(pubmed_image_uuids)

### Captions

In [None]:
# Parse the PubMed-set captions file, then display the parsed content.
with open(f'{pubmed_set_dir}/captions.json', 'r') as captions_file:
    pubmed_captions = json.loads(captions_file.read())

pubmed_captions

In [None]:
# Keep only the caption entries whose 'uuid' was found among the uuids
# collected from the PubMed-set images directory.
pubmed_captions_all_images_present = {
    key: record
    for key, record in pubmed_captions.items()
    if record['uuid'] in pubmed_image_uuids_set
}
# Number of caption entries that survived the filter.
len(pubmed_captions_all_images_present)

## Unified Set

### Make unified set

In [None]:
# Merge the filtered books and PubMed annotations into a single dict keyed by
# a running string index ('0', '1', ...): books records first, then PubMed.
arch_captions_all_images_present = {}

i = 0
# Every record is shallow-copied before the extra fields are added. Storing
# the original dict object and then assigning into it would also modify the
# per-source caption dicts, so re-inspecting them later would show fields
# that are not in the captions files.
for ann in bookset_captions_all_images_present.values():
    arch_captions_all_images_present[str(i)] = {**ann, 'source': 'books'}
    i += 1

for ann in pubmed_captions_all_images_present.values():
    # 'letter' and 'figure_id' are forced to None on every PubMed record --
    # presumably so both sources share the same set of fields; confirm
    # against the books-set captions schema.
    arch_captions_all_images_present[str(i)] = {
        **ann,
        'letter': None,
        'figure_id': None,
        'source': 'pubmed',
    }
    i += 1

# Display the unified annotations.
arch_captions_all_images_present

In [None]:
# Inspect the unified record stored under key '0' (the first one inserted).
arch_captions_all_images_present['0']

In [None]:
# Inspect the unified record stored under key '4270'.
# NOTE(review): presumably picked to land in the PubMed part of the index range -- confirm.
arch_captions_all_images_present['4270']

### Save the unified set

In [None]:
# List the ARCH dataset root before creating the annotations directory.
%ls ../datasets/ARCH

In [None]:
# Directory that will hold the unified annotations file.
annotations_dir = '../datasets/ARCH/annotations'
# `os.path` has no `mkdir`, so the previous `os.path.mkdir(...)` call raised
# AttributeError whenever the directory was missing. `os.makedirs` with
# `exist_ok=True` creates the directory (and any missing parents) and does
# nothing when it already exists, so no separate existence check is needed.
os.makedirs(annotations_dir, exist_ok=True)

In [None]:
# Serialize the unified annotations dict to a single JSON file.
with open('../datasets/ARCH/annotations/captions_all.json', 'w') as out_file:
    out_file.write(json.dumps(arch_captions_all_images_present))

### Check the unified dataset

In [None]:
# Build a table from the unified annotations. Passing the dict-of-dicts to
# DataFrame puts the record keys on the columns, so transpose to get one row
# per record.
arch_captions_df = pd.DataFrame(arch_captions_all_images_present).transpose()

# Sanity check: the number of distinct uuids must equal the number of rows.
assert arch_captions_df['uuid'].nunique() == len(arch_captions_df['uuid'])

In [None]:
# Display the unified annotations table.
arch_captions_df

In [None]:
# Count the distinct values in each column of the unified table.
arch_captions_df.nunique()