In [None]:
import os
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import cv2
from PIL import Image
import imageio, skimage

from collections import Counter

In [None]:
def show_book_figure(figure_id: str):
    """Show the caption rows and the images that belong to one books-set figure.

    Reads the notebook-level ``bookset_captions_df`` and ``book_set_dir``.
    All rows whose ``figure_id`` equals the argument are displayed as a table;
    afterwards each row's image is rendered, ordered by the ``letter`` column.

    Args:
        figure_id: Value of the ``figure_id`` column selecting the figure.

    Returns:
        None. Everything is emitted through ``display``, ``print`` and matplotlib.
    """
    # Filter once and reuse the result for both the table and the image loop.
    figure_rows = bookset_captions_df[bookset_captions_df['figure_id'] == figure_id]
    display(figure_rows)

    for _, row in figure_rows.sort_values(by='letter').iterrows():
        # Use column labels: integer keys on a label-indexed row only work via
        # a deprecated positional fallback and silently depend on column order.
        letter = row['letter']
        img_uuid = row['uuid']
        print('figure_id:', figure_id)
        print(letter, ':', img_uuid)

        img_path = f"{book_set_dir}/images/{img_uuid}.png"
        if not os.path.isfile(img_path):
            # The captions file can reference images that are absent from the
            # images folder; report them instead of aborting the whole figure.
            print('image file not found:', img_path)
            continue

        img = imageio.imread(img_path)
        plt.imshow(img)
        plt.show()

In [None]:
def show_pubmed_figure(caption: str):
    """Show the rows and the images of the PubMed set that carry a given caption.

    Reads the notebook-level ``pubmed_captions_df`` and ``pubmed_set_dir``.
    The caption is printed, the matching rows are displayed as a table, and
    each matching image is rendered, ordered by the ``uuid`` column.

    Args:
        caption: Exact caption text to look up in the ``caption`` column.

    Returns:
        None. Everything is emitted through ``display``, ``print`` and matplotlib.
    """
    print('caption:', caption)

    # Filter once and reuse the result for both the table and the image loop.
    caption_rows = pubmed_captions_df[pubmed_captions_df['caption'] == caption]
    display(caption_rows)

    for _, row in caption_rows.sort_values(by='uuid').iterrows():
        # Use the column label: an integer key on a label-indexed row only
        # works via a deprecated positional fallback and depends on column order.
        img_uuid = row['uuid']

        img = imageio.imread(f"{pubmed_set_dir}/images/{img_uuid}.jpg")
        plt.imshow(img)
        plt.show()

In [None]:
def counter_to_dataframe(c: Counter):
    """Render a ``Counter`` as a table of bag sizes and their frequencies.

    The counter's keys become the ``bag_size`` index (sorted ascending) and
    its values the ``freq`` column. The table is shown with ``display``;
    nothing is returned.

    Args:
        c: Counter mapping a bag size to how often that size occurs.
    """
    sizes = list(c.keys())
    freqs = list(c.values())

    freq_table = pd.DataFrame({'bag_size': sizes, 'freq': freqs})
    freq_table = freq_table.sort_values(by='bag_size').set_index('bag_size')

    display(freq_table)

In [None]:
os.listdir('../datasets/ARCH')

## Books set

In [None]:
book_set_dir = '../datasets/ARCH/books_set'
os.listdir(book_set_dir)

### Readme

In [None]:
!cat ../datasets/ARCH/books_set/README.md

### Images

In [None]:
len(os.listdir(f'{book_set_dir}/images'))

In [None]:
# Keep only the part of each file name before its first '.'.
bookset_image_uuids = [
    image_file.partition('.')[0]
    for image_file in os.listdir(f'{book_set_dir}/images')
]
# No identifier may occur more than once among the image files.
assert len(set(bookset_image_uuids)) == len(bookset_image_uuids)

len(bookset_image_uuids)

### Captions

In [None]:
with open(f'{book_set_dir}/captions.json', 'r') as f:
    bookset_captions = json.load(f)

bookset_captions_df = pd.DataFrame(bookset_captions).T

In [None]:
bookset_captions_df

In [None]:
set(bookset_image_uuids).issubset(set(bookset_captions_df.uuid))

Captions are available for all available images.

In [None]:
missing_image_uuids = set(bookset_captions_df.uuid) - set(bookset_image_uuids)
len(missing_image_uuids)

In [None]:
print("Total missing:", len(missing_image_uuids))

missing_image_uuids

However, 35 images referenced in `captions.json` are missing from the `images` folder.

In [None]:
bookset_captions_df.nunique()

In [None]:
# Use only the rows whose images are present in ../datasets/ARCH/books_set/images/
bookset_captions_df[bookset_captions_df.uuid.isin(bookset_image_uuids)].letter.value_counts()

**Not all bags start with "A"! Do not use these numbers for information on frequencies of bag sizes.**

In [None]:
bookset_captions_df[bookset_captions_df['figure_id'] == '00']

In [None]:
# Take the first matching row by position. The frame is indexed by the JSON
# keys (strings), so `[0]` is not a label and only works through a deprecated
# positional fallback.
bookset_captions_df[bookset_captions_df['figure_id'] == '00'].caption.iloc[0]

### Examples

#### Single

In [None]:
show_book_figure('01')

#### 2 images

In [None]:
show_book_figure('00')

The `figure_id` determines the **bag**: all images that share a `figure_id` belong to the same bag.

#### Has "L" in the index

In [None]:
L_item_figure_id = bookset_captions_df[bookset_captions_df['letter'] == 'L'].figure_id.item()
L_item_figure_id

In [None]:
show_book_figure(L_item_figure_id)

### Number of images per bag

#### Including entries with missing images

In [None]:
Counter(Counter(bookset_captions_df.figure_id).values())

In [None]:
counter_to_dataframe(Counter(Counter(bookset_captions_df.figure_id).values()))

In [None]:
bookset_captions_df.nunique()

#### w/o missing images

In [None]:
# Use only the rows whose images are present in ../datasets/ARCH/books_set/images/
bookset_captions_all_images_present_df = bookset_captions_df[bookset_captions_df.uuid.isin(bookset_image_uuids)]
counter_to_dataframe(Counter(Counter(bookset_captions_all_images_present_df.figure_id).values()))

In [None]:
bookset_captions_all_images_present_df.nunique()

In [None]:
# Collect, for every figure id, the captions of all its present images.
figids_to_captions = {}
for figid, caption in zip(bookset_captions_all_images_present_df['figure_id'],
                          bookset_captions_all_images_present_df['caption']):
    figids_to_captions.setdefault(figid, []).append(caption)

# True only when every figure id maps to exactly one distinct caption.
all_captions_match = all(
    len(set(caption_list)) == 1
    for caption_list in figids_to_captions.values()
)
print(all_captions_match)

**For each of the figure ids, there is always a single caption.**

In [None]:
# Invert the mapping: collect the figure ids that carry each caption.
captions_to_figids = {}
for figid, caption in zip(bookset_captions_all_images_present_df['figure_id'],
                          bookset_captions_all_images_present_df['caption']):
    captions_to_figids.setdefault(caption, []).append(figid)

all_figids_match = True
total_extra = 0
for caption, figid_list in captions_to_figids.items():
    # Number of distinct figure ids beyond the first one for this caption.
    extra_ids = len(set(figid_list)) - 1
    all_figids_match_in_the_list = (extra_ids == 0)

    if extra_ids != 0:
        print(caption, figid_list)
        print()
        total_extra += extra_ids

    # Print captions shared by three or more figure ids a second time, framed.
    if extra_ids > 1:
        print('-'*80)
        print(caption, figid_list)
        print('-'*80)
        print()

    all_figids_match = all_figids_match and all_figids_match_in_the_list

print('-'*80)
print(all_figids_match, total_extra)
print()
print()

There are 77 captions that are shared by more than one figure id: 76 captions map to 2 different ids, and 1 caption maps to 3 different ids (`['4122', '4122', '4123', '4123', '4124']`). This accounts for the difference between the number of unique figure ids and the number of unique captions in the `books_set`: 78 = 76\*(2-1) + 1\*(3-1).

In [None]:
# Caption of the first row of figure '4122'. `.iloc[0]` selects by position;
# the frame is indexed by the JSON keys (strings), so `[0]` is not a label
# and only works through a deprecated positional fallback.
example_caption_with_multiple_figures = (
    bookset_captions_all_images_present_df
    .loc[bookset_captions_all_images_present_df.figure_id == '4122', 'caption']
    .iloc[0]
)

print(example_caption_with_multiple_figures)
# All rows (across figure ids) that carry this same caption.
bookset_captions_all_images_present_df[bookset_captions_all_images_present_df.caption == example_caption_with_multiple_figures]

In [None]:
# `.iloc[0]` selects the first matching row by position; `[0]` is not a label
# of this string-indexed frame.
print(bookset_captions_all_images_present_df[bookset_captions_all_images_present_df.figure_id=='4122'].caption.iloc[0])

# The values are figure ids. Iterate them in sorted order: a bare set of
# strings has no stable iteration order across kernel sessions, which would
# shuffle the rendered figures between runs.
for figure_id in sorted({'4122', '4123', '4124'}):
    show_book_figure(figure_id)
    print('-'*80)

#### 9 images for the same figure ID

In [None]:
[key for key, value in Counter(bookset_captions_df.figure_id).items() if value == 9]

In [None]:
nine_item_figure_id = '0107'
show_book_figure(nine_item_figure_id)

In [None]:
nine_item_figure_id = '584'
show_book_figure(nine_item_figure_id)

## PubMed Set

In [None]:
pubmed_set_dir = '../datasets/ARCH/pubmed_set'

In [None]:
os.listdir(pubmed_set_dir)

### Readme

In [None]:
!cat ../datasets/ARCH/pubmed_set/README.md

### Captions

In [None]:
with open(f'{pubmed_set_dir}/captions.json', 'r') as f:
    pubmed_captions = json.load(f)

pubmed_captions

Indices appear to be integers from 0 to 3308.

In [None]:
# Duplicate check (redundant for dict keys, kept as an explicit sanity check).
pubmed_keys = list(pubmed_captions.keys())
assert len(set(pubmed_keys)) == len(pubmed_keys)

# The keys must be exactly '0', '1', ..., str(N-1) with no gaps.
expected_keys = [str(idx) for idx in range(len(pubmed_keys))]
assert sorted(pubmed_keys) == sorted(expected_keys)

A simple check confirms that everything is in order.

In [None]:
pubmed_captions_df = pd.DataFrame(pubmed_captions).T
pubmed_captions_df

In [None]:
pubmed_captions_df.nunique()

In [None]:
# (caption, count) pairs for every caption that appears on more than one row.
captions_with_multiple_occurrences = [
    (caption_text, occurrences)
    for caption_text, occurrences in Counter(pubmed_captions_df.caption).items()
    if occurrences > 1
]
captions_with_multiple_occurrences

In [None]:
# Count the extra uuids (subtract 1 since one uuid per caption is expected by default)
sum([num-1 for caption, num in captions_with_multiple_occurrences])

In [None]:
len(captions_with_multiple_occurrences)

In [None]:
Counter(pubmed_captions_df.caption)

In [None]:
caption_of_interest = 'Low-grade peritoneal serous carcinoma. Monomorphic cells associated with psammoma bodies are seen. Hematoxylin and eosin stain, original magnification: x100.'
pubmed_captions_df[pubmed_captions_df.caption == caption_of_interest]

### Images

In [None]:
len(os.listdir(f'{pubmed_set_dir}/images'))

In [None]:
# Keep only the part of each file name before its first '.'.
pubmed_image_ids = [
    image_file.partition('.')[0]
    for image_file in os.listdir(f'{pubmed_set_dir}/images')
]
# No identifier may occur more than once among the image files.
assert len(set(pubmed_image_ids)) == len(pubmed_image_ids)

len(pubmed_image_ids)

In [None]:
set(pubmed_image_ids) == set(pubmed_captions_df.uuid)

All UUIDs in the `captions.json` coincide with the UUIDs of the images in the folder. No Images missing.

In [None]:
for caption_of_interest, count in captions_with_multiple_occurrences:
    show_pubmed_figure(caption_of_interest)

Captions are not split into different images. There are no "A", "B", "C" parts in a caption. There are also no "A", "B", "C" labels on images.

In [None]:
Counter(Counter(pubmed_captions_df.caption).values())

In [None]:
counter_to_dataframe(Counter(Counter(pubmed_captions_df.caption).values()))

In [None]:
pubmed_captions_df.nunique()

In [None]:
pubmed_captions_df['uuid'].nunique() - pubmed_captions_df['caption'].nunique()

In [None]:
# Collect, for every image uuid, the captions listed for it.
uuids_to_captions = {}
for uuid, caption in zip(pubmed_captions_df['uuid'], pubmed_captions_df['caption']):
    uuids_to_captions.setdefault(uuid, []).append(caption)

# True only when every uuid maps to exactly one distinct caption.
all_captions_match = all(
    len(set(caption_list)) == 1
    for caption_list in uuids_to_captions.values()
)
print(all_captions_match)

**For each uuid, there is always a single caption.**

In [None]:
# Invert the mapping: collect the uuids that carry each caption.
captions_to_uuids = {}
for uuid, caption in zip(pubmed_captions_df['uuid'], pubmed_captions_df['caption']):
    captions_to_uuids.setdefault(caption, []).append(uuid)

all_uuids_match = True
total_extra = 0
total_captions_with_multiple_uuids = 0

for caption, uuid_list in captions_to_uuids.items():
    # Number of distinct uuids beyond the first one for this caption.
    extra_uuids = len(set(uuid_list)) - 1
    all_uuids_match_in_the_list = (extra_uuids == 0)

    if extra_uuids != 0:
        print(caption, uuid_list)
        print()
        total_captions_with_multiple_uuids += 1
        total_extra += extra_uuids

    # Print captions shared by three or more uuids a second time, framed.
    if extra_uuids > 1:
        print('-'*80)
        print(caption, uuid_list)
        print('-'*80)
        print()

    all_uuids_match = (all_uuids_match and all_uuids_match_in_the_list)

print('-'*80)
print("all_uuids_match: ", all_uuids_match)
print("Extra uuids:", total_extra)
print("Captions with multiple uuids", total_captions_with_multiple_uuids)
print()

There are 24 "extra" uuids. 