In [None]:
import os
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import cv2
from PIL import Image
import imageio, skimage

from collections import Counter

In [None]:
def show_book_figure(fugire_id: str):
    """Display the caption rows and all images of one books-set figure.

    Prints the figure ID, shows the matching rows of ``bookset_captions_df``
    and then, in ``letter`` order, prints ``letter : uuid`` and renders the
    image stored at ``{book_set_dir}/images/{uuid}.png``.

    Args:
        fugire_id: value of the ``figure_id`` column that selects the figure
            (the misspelled parameter name is kept so existing calls still work).

    Relies on the notebook globals ``bookset_captions_df`` and ``book_set_dir``.
    Rows whose image file does not exist are reported and skipped instead of
    aborting the whole figure.
    """
    print('fugire_id:', fugire_id)

    # Filter once and reuse the result for both the table and the image loop.
    figure_rows = bookset_captions_df[bookset_captions_df['figure_id'] == fugire_id]
    display(figure_rows)

    for _, row in figure_rows.sort_values(by='letter').iterrows():
        # Access by column label: integer keys on a label-indexed Series
        # (row[1], row[3]) are deprecated positional lookups in pandas.
        letter = row['letter']
        img_uuid = row['uuid']
        print(letter, ':', img_uuid)

        img_path = f"{book_set_dir}/images/{img_uuid}.png"
        if not os.path.exists(img_path):
            # captions.json may reference images that are not on disk.
            print('image file not found, skipping:', img_path)
            continue

        img = imageio.imread(img_path)
        plt.imshow(img)
        plt.show()

In [None]:
def counter_to_dataframe(c: Counter):
    """Render a Counter as a frequency table.

    The counter's keys become the ``bag_size`` index (sorted ascending) and
    its counts the single ``freq`` column. The table is shown with
    ``display``; nothing is returned.
    """
    sizes = list(c.keys())
    counts = list(c.values())

    freq_table = (
        pd.DataFrame({'bag_size': sizes, 'freq': counts})
        .sort_values(by='bag_size')
        .set_index('bag_size')
    )
    display(freq_table)

In [None]:
# List the entries of the ARCH dataset root directory.
os.listdir('../datasets/ARCH')

## Books set

In [None]:
# Root directory of the books set; later cells build their paths from it.
book_set_dir = '../datasets/ARCH/books_set'
os.listdir(book_set_dir)

### Readme

In [None]:
!cat ../datasets/ARCH/books_set/README.md

### Images

In [None]:
# Number of entries in the books-set images folder.
len(os.listdir(f'{book_set_dir}/images'))

In [None]:
# Image UUIDs are the file names without their extension. os.path.splitext
# strips only the final extension, so a name that itself contains a dot is
# not truncated the way split('.')[0] would truncate it.
bookset_image_uuids = [os.path.splitext(f_name)[0] for f_name in os.listdir(f'{book_set_dir}/images')]
# Two files must never map to the same UUID.
assert len(bookset_image_uuids) == len(set(bookset_image_uuids)), 'duplicate image UUIDs in the books set'

len(bookset_image_uuids)

### Captions

In [None]:
# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's default locale encoding.
with open(f'{book_set_dir}/captions.json', 'r', encoding='utf-8') as f:
    bookset_captions = json.load(f)

# The JSON maps an entry key to a record; transpose so each entry becomes a row.
bookset_captions_df = pd.DataFrame(bookset_captions).T

In [None]:
# Show the books-set captions table (one row per captions.json entry).
bookset_captions_df

In [None]:
# True when every image file's UUID also appears in the captions table.
set(bookset_image_uuids) <= set(bookset_captions_df.uuid)

Captions are available for all available images.

In [None]:
# UUIDs referenced in the captions table that have no file in the images folder.
missing_image_uuids = set(bookset_captions_df.uuid).difference(bookset_image_uuids)
len(missing_image_uuids)

In [None]:
# Show the UUIDs that have a caption entry but no image file.
missing_image_uuids

Conversely, 35 UUIDs listed in `captions.json` have no corresponding image file in the `images` folder.

In [None]:
# Number of distinct values in each column of the books-set captions table.
bookset_captions_df.nunique()

In [None]:
# Use only the rows whose image is present in ../datasets/ARCH/books_set/images/
# and count how often each letter occurs among them.
bookset_captions_df[bookset_captions_df.uuid.isin(bookset_image_uuids)].letter.value_counts()

**Not all bags start with "A"! Do not use these numbers for information on frequencies of bag sizes.**

In [None]:
# All caption rows that belong to figure '00'.
bookset_captions_df[bookset_captions_df['figure_id'] == '00']

In [None]:
# Full caption text of the first row of figure '00'. .iloc[0] selects by
# position explicitly; the table's index labels come from JSON keys (strings),
# so a bare [0] would rely on pandas' deprecated positional fallback.
bookset_captions_df[bookset_captions_df['figure_id'] == '00'].caption.iloc[0]

### Examples

#### Single

In [None]:
# Show the caption rows and images of figure '01'.
show_book_figure('01')

#### 2 images

In [None]:
# Show the caption rows and images of figure '00'.
show_book_figure('00')

The figure ID (`figure_id`) determines the **bag** (bucket): all images that share a `figure_id` belong to the same bag.

#### Has "L" in the `letter` column

In [None]:
# figure_id of the row whose letter is 'L'; .item() raises unless exactly one row matches.
L_item_figure_id = bookset_captions_df[bookset_captions_df['letter'] == 'L'].figure_id.item()
L_item_figure_id

In [None]:
# Show the figure that contains the image lettered 'L'.
show_book_figure(L_item_figure_id)

### Number of images per bag

In [None]:
# Inner Counter: number of rows per figure_id. Outer Counter: how many figure_ids have each row count.
Counter(Counter(bookset_captions_df.figure_id).values())

In [None]:
# Distribution of rows per figure_id (row count -> number of figure_ids), shown as a sorted table.
counter_to_dataframe(Counter(Counter(bookset_captions_df.figure_id).values()))

In [None]:
# Number of distinct values in each column of the full books-set captions table.
bookset_captions_df.nunique()

In [None]:
# Use only the rows whose image is present in ../datasets/ARCH/books_set/images/,
# then tabulate how many figure_ids have each number of rows.
bookset_captions_all_images_present_df = bookset_captions_df[bookset_captions_df.uuid.isin(bookset_image_uuids)]
counter_to_dataframe(Counter(Counter(bookset_captions_all_images_present_df.figure_id).values()))

In [None]:
# Number of distinct values per column, restricted to rows whose image file exists.
bookset_captions_all_images_present_df.nunique()

In [None]:
# figure_ids that occur in exactly 9 caption rows.
[key for key, value in Counter(bookset_captions_df.figure_id).items() if value == 9]

#### 9 images for the same figure ID

In [None]:
# NOTE(review): '0107' is hard-coded, presumably one of the figure_ids with 9 rows; verify against the data.
nine_item_figure_id = '0107'
show_book_figure(nine_item_figure_id)

In [None]:
# NOTE(review): '584' is hard-coded, presumably another figure_id with 9 rows; verify against the data.
# This rebinds nine_item_figure_id, replacing the value assigned in the earlier cell.
nine_item_figure_id = '584'
show_book_figure(nine_item_figure_id)

## PubMed Set

In [None]:
# Root directory of the PubMed set; later cells build their paths from it.
pubmed_set_dir = '../datasets/ARCH/pubmed_set'

In [None]:
# List the entries of the PubMed set directory.
os.listdir(pubmed_set_dir)

### Readme

In [None]:
!cat ../datasets/ARCH/pubmed_set/README.md

### Captions

In [None]:
# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's default locale encoding.
with open(f'{pubmed_set_dir}/captions.json', 'r', encoding='utf-8') as f:
    pubmed_captions = json.load(f)

# The JSON maps an entry key to a record; transpose so each entry becomes a row.
pubmed_captions_df = pd.DataFrame(pubmed_captions).T

In [None]:
# Show the PubMed captions table (one row per captions.json entry).
pubmed_captions_df

In [None]:
# Number of distinct values in each column of the PubMed captions table.
pubmed_captions_df.nunique()

In [None]:
# Captions shared by more than one PubMed row, as (caption, count) pairs.
captions_with_multiple_occurrences = [
    (text, n_rows)
    for text, n_rows in Counter(pubmed_captions_df.caption).items()
    if n_rows > 1
]
captions_with_multiple_occurrences

In [None]:
# Occurrence count of every caption in the PubMed set (the whole Counter is displayed).
Counter(pubmed_captions_df.caption)

In [None]:
# Rows whose caption equals one specific, hard-coded caption
# (presumably one of the repeated captions; verify against the data).
caption_of_interest = 'Low-grade peritoneal serous carcinoma. Monomorphic cells associated with psammoma bodies are seen. Hematoxylin and eosin stain, original magnification: x100.'
pubmed_captions_df[pubmed_captions_df.caption == caption_of_interest]

### Images

In [None]:
# Number of entries in the PubMed-set images folder.
len(os.listdir(f'{pubmed_set_dir}/images'))

In [None]:
# Image IDs are the file names without their extension. os.path.splitext
# strips only the final extension, so a name that itself contains a dot is
# not truncated the way split('.')[0] would truncate it.
pubmed_image_ids = [os.path.splitext(f_name)[0] for f_name in os.listdir(f'{pubmed_set_dir}/images')]
# Two files must never map to the same ID.
assert len(pubmed_image_ids) == len(set(pubmed_image_ids)), 'duplicate image IDs in the PubMed set'

len(pubmed_image_ids)

In [None]:
# True only if the image-file IDs and the caption UUIDs form exactly the same set.
set(pubmed_image_ids) == set(pubmed_captions_df.uuid)

All UUIDs in `captions.json` coincide with the UUIDs of the images in the folder. No images are missing.

In [None]:
def show_pubmed_figure(caption: str):
    """Display every PubMed row and image that carries the given caption.

    Prints the caption, shows the matching rows of ``pubmed_captions_df`` and
    then, in ``uuid`` order, renders the image stored at
    ``{pubmed_set_dir}/images/{uuid}.jpg``.

    Args:
        caption: exact caption text to match against the ``caption`` column.

    Relies on the notebook globals ``pubmed_captions_df`` and ``pubmed_set_dir``.
    """
    print('caption:', caption)

    # Filter once and reuse the result for both the table and the image loop.
    caption_rows = pubmed_captions_df[pubmed_captions_df['caption'] == caption]
    display(caption_rows)

    for _, row in caption_rows.sort_values(by='uuid').iterrows():
        # Access by column label: an integer key on a label-indexed Series
        # (row[1]) is a deprecated positional lookup in pandas.
        img_uuid = row['uuid']

        img = imageio.imread(f"{pubmed_set_dir}/images/{img_uuid}.jpg")
        plt.imshow(img)
        plt.show()

In [None]:
# Show every group of images that shares a caption. Dedicated loop names are
# used so the loop does not overwrite the notebook-level `caption_of_interest`
# variable, and the unused count is marked as such.
for repeated_caption, _count in captions_with_multiple_occurrences:
    show_pubmed_figure(repeated_caption)

Captions are not split across sub-images: there are no "A", "B", "C" parts in a caption, and there are no "A", "B", "C" labels on the images either.

In [None]:
# Inner Counter: number of rows per caption. Outer Counter: how many captions have each row count.
Counter(Counter(pubmed_captions_df.caption).values())

In [None]:
# Distribution of rows per caption (row count -> number of captions), shown as a sorted table.
counter_to_dataframe(Counter(Counter(pubmed_captions_df.caption).values()))

In [None]:
# Number of distinct values in each column of the PubMed captions table.
pubmed_captions_df.nunique()