In [1]:
import json
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import urllib.request, json
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pickle
from tqdm import tqdm

Generic class to store an image and its IIIF representation

In [2]:
class Terzani_Photo(object):
    def __init__(self, iiif, photo):
        self.iiif = iiif
        self.photo = photo
        
    def get_photo_link(self):
        return self.iiif["images"][0]["resource"]["@id"]

First we need to know all the collections for the Terzani archive. For this we scrape the HTML content from the Fondo Tiziano Terzani (collection 1352). We store the collections number in the ```collections``` list. We just need to discard collections 1353, 1360 and 1450 which are collections of collections

In [3]:
r = requests.get('http://dl.cini.it/collections/show/1353') # /ip: Returns the requester's IP Address.
soup = BeautifulSoup(r.text, "html.parser")
collections = []
unsupported_collections = ['1352', '1360', '1450']
for a in soup.find_all("a", href=True):
    link = a['href']
    if link.startswith("/collections/show/") and link[-4:] not in unsupported_collections:
        collections.append(link[-4:])
    
#Preview
collections[:5]

['1460', '1465', '1451', '1452', '1453']

Now we can read the manifest of every collection. For every entry of the manifest, we only keep it if it is the recto of a photo

We check if an picture is color or black and white and store them seperately.

In [4]:
terzani_recto_iiif_color = list() #to store color photos in Terzani photo collection
terzani_recto_iiif_bw = list() #to store black and white photos in Terzani photo collection

threshold_color = 0.05

iiif_format = "/full/max/0/default.jpg"

for collection in tqdm(collections):
    url = "http://dl.cini.it/oa/collections/"+collection+"/manifest.json"

    response = urllib.request.urlopen(url)

    data = json.loads(response.read())

    for entry in data["sequences"][0]["canvases"]:
        if entry["label"] == None:
            continue
        if  entry["label"].lower().endswith("recto"):
            photo_link = entry["images"][0]["resource"]["service"]["@id"] + iiif_format
            photo = np.asarray(bytearray(urllib.request.urlopen(photo_link).read()), dtype="uint8")
            photo = cv2.imdecode(photo, cv2.IMREAD_COLOR)
            b,g,r = cv2.split(photo)
            isize = r.size
            rg = np.abs(r-g)
            rb = np.abs(r-b)
            gb = np.abs(g-b)
            diff =np.sum(rg+rb+gb)
            diff = diff / isize / (255 * 3)
            if diff > threshold_color: # As gray values have r-g = 0 and r-b = 0 and g-b = 0 diff will be near 0 for grayscale images and > 0 for colored images.
                terzani_recto_iiif_color.append(Terzani_Photo(entry, None))
            else:
                terzani_recto_iiif_bw.append(Terzani_Photo(entry, None))

100%|██████████| 59/59 [3:57:59<00:00, 242.02s/it]  


In [5]:
len(terzani_recto_iiif_color)

5030

In [6]:
len(terzani_recto_iiif_bw)

3494

Let's now dump this list in a pickle file

In [7]:
with open('terzani_recto_iiif_color.pickle', 'wb') as file:
    pickle.dump(terzani_recto_iiif_color, file)

In [8]:
with open('terzani_recto_iiif_bw.pickle', 'wb') as file:
    pickle.dump(terzani_recto_iiif_bw, file)