The target column (Prof. Bell) is `image_links`. However, the column `microfiche_links` contains **scans** of photos and documents.

In order to download pictures to your computer, run all code in the **Preprocessing** section, then go to the **Downloading** section and choose whether to download images from `image_links` or scans from `microfiche_links` by setting the relevant variables. Don't forget to specify the number of objects to download in `OBJECTS_TO_DOWNLOAD` (one object may contain several pictures).

# Preprocessing

In [13]:
import pandas as pd
import re
from config.fucntions import img_downloader as imgd
from config.fucntions import columns_converter as cc

In [14]:
# Load the cleaned dataset (semicolon-separated, UTF-8).
file_name = 'datasets/dataset_CLEAN_ver2.csv'
df = pd.read_csv(file_name, sep=';', encoding='utf-8')

# Load the blueprint file-name markers, one entry per line.
file_name = 'config/variables/blueprint_prefixes.txt'
# Explicit encoding so the result does not depend on the platform default.
with open(file_name, 'r', encoding='utf-8') as f:
    # Blank lines are dropped: an empty string is a substring of every link,
    # so a stray empty entry would make every file look like a blueprint.
    blueprint_prefixes = [line for line in f.read().splitlines() if line.strip()]


Converting the strings in columns `image_links` through `microfiche_archive_links` back into lists of strings

In [15]:
# Slow step: expect roughly 5-20 seconds.
# Every column from position 3 onwards is passed to the converter.
link_columns = df.columns[3:]
cc.strings_to_links(df, link_columns)
df.head()

Unnamed: 0,id_record,id_persistent,label,image_links,archive_links,microfiche_links,microfiche_archive_links
0,http://www.bildindex.de/document/obj20666124,http://id.bildindex.de/thing/0001618283,"Lauenhain (Kreis Hainichen), Fahnenträger eine...",[],[],[http://www.bildindex.de/bilder/d/mi12308g07],[https://www.bildindex.de/media/obj20666124/mi...
1,http://www.bildindex.de/document/obj20727600,http://id.bildindex.de/thing/0001677732,Fenster (Bauelement),[http://www.bildindex.de/bilder/d/fm140030],[https://www.bildindex.de/media/obj20727600/fm...,"[http://www.bildindex.de/bilder/d/mi02117a05, ...",[https://www.bildindex.de/media/obj20727600/mi...
2,http://www.bildindex.de/document/obj20943303,http://id.bildindex.de/thing/0001803584,"Kassel, Infanteriekasernen an der Königsstraße...",[http://www.bildindex.de/bilder/d/STMP_II_1108...,[https://www.bildindex.de/media/obj20943303/ST...,[],[]
3,http://www.bildindex.de/document/obj20948462,http://id.bildindex.de/thing/0001806337,"Waldeck, Domäne, Wohnhaus, Umbauentwurf, Aufrisse",[http://www.bildindex.de/bilder/d/STMP_II_1566...,[https://www.bildindex.de/media/obj20948462/ST...,[],[]
4,http://www.bildindex.de/document/obj20943569,http://id.bildindex.de/thing/0001803733,"Kassel, Opernhaus (Altes Hoftheater), Aborte i...",[http://www.bildindex.de/bilder/d/STMP_II_1383...,[https://www.bildindex.de/media/obj20943569/ST...,[],[]


# Downloading

In [16]:
# print(f'Amount of objects in dataset: {df.shape[0]}')
# print(f'Images to download: {df.image_links.apply(lambda x: len(x)).sum()}')
# print(f'Scans to download: {df.microfiche_links.apply(lambda x: len(x)).sum()}')
# print()
# print('(with blueprints)')


In [17]:
# N_img_clean, N_obj_clean = 0, 0
# blueprint_suffix = ['_r', '_v', 'T1', 'T2', 'V2', 'V1']

# for i in range(df.shape[0]):
    
#     num_img = len(df.iloc[i].image_links)

#     if num_img == 0:    
#         continue
    
#     if df.iloc[i].image_links[0][-2:] not in blueprint_suffix:
#         N_img_clean += num_img
#         N_obj_clean += 1


# print(f'Amount of objects in dataset with images available: {N_obj_clean}')
# print(f'Amount of images in \"image_links\": {N_img_clean}')
# print()
# print('(without blueprints)')


In [18]:
# Two-character link endings that mark a file as a blueprint
blueprint_suffix = ['_r', '_v', 'T1', 'T2', 'V2', 'V1']

# Per-object tallies: blueprints without scans, blueprints with scans,
# scans without blueprints, and the total number of usable pictures.
b_not_s = b_s = not_b_s = imgs = 0

for image_links, scan_links in zip(df['image_links'], df['microfiche_links']):
    k_scans = len(scan_links)
    k_blueprints = sum(1 for link in image_links if link[-2:] in blueprint_suffix)
    k_img = len(image_links) - k_blueprints

    # Regular images count first; scans stand in only when the object
    # has neither regular images nor blueprints.
    if k_img:
        imgs += k_img
    elif not k_blueprints:
        imgs += k_scans

    # Classify the object by which kinds of material it carries
    if k_blueprints:
        if k_scans:
            b_s += 1
        else:
            b_not_s += 1
    elif k_scans:
        not_b_s += 1

print(f'Blueprints and scans: {b_s}')
print(f'Blueprints only: {b_not_s}')
print(f'Scans only: {not_b_s}')
print(f'Images (image or scan, when there is no image): {imgs}')

Blueprints and scans: 7
Blueprints only: 9917
Scans only: 17458
Images (image or scan, when there is no image): 49688


There are only 7 objects that have both blueprints and scans. After checking them manually, we found that they are not related to our task, so we can skip them.

In [19]:
# Number of objects (dataframe rows) to process; one object may consist of many images.
# Maximum value is df.shape[0].
OBJECTS_TO_DOWNLOAD = df.shape[0] // 4

# Set to True to download only scans (links from 'microfiche_links')
is_scan_only = False

# Set to True to download only blueprints (blueprint files from 'image_links')
is_blueprints_only = False

# With both flags False, no blueprints are downloaded: only images ('image_links'),
# or scans for objects that have no images.

In [None]:
# Download loop: saves pictures of the first OBJECTS_TO_DOWNLOAD objects
# into images/<folder_name>/ as <file_type>-<id>-<file>.jpg.
# Mode selection (is_scan_only takes priority over is_blueprints_only):
#   is_scan_only       -> links from 'microfiche_links' only
#   is_blueprints_only -> blueprint files from 'image_links' only
#   neither            -> non-blueprint images, or scans for objects without images
if is_scan_only:
    folder_name, column, file_type = 'scans_only', 'microfiche_links', 'scn'
    is_blueprints_only, is_all = False, False
elif is_blueprints_only:
    folder_name, column, file_type = 'blueprints_only', 'image_links', 'blpnt'
    is_all = False
else:
    folder_name, file_type = 'images', 'img'
    is_all = True


def _last_segment(url):
    """Return the part of `url` after the final '/' (the whole string if it has none)."""
    return url.rsplit('/', 1)[-1]


def _is_blueprint(link):
    """Return True when the file name of `link` contains any entry of `blueprint_prefixes`."""
    file_part = _last_segment(link)
    return any(prefix in file_part for prefix in blueprint_prefixes)


imgd.create_folder('images/' + folder_name)

# Clamp to the dataframe length so an oversized setting cannot index past the end
for i in range(min(OBJECTS_TO_DOWNLOAD, df.shape[0])):
    row = df.iloc[i]

    # For objects without images check scans availability
    if is_all:
        if not len(row['image_links']):
            if not len(row['microfiche_links']):
                continue
            column, file_type = 'microfiche_links', 'scn'
        else:
            column, file_type = 'image_links', 'img'

    # First half of every file name: last path segment of id_persistent.
    # The 'NO_ID_PERSISTENT' placeholder has no '/', so it is used as-is.
    id_part = _last_segment(row.id_persistent)

    # Iteration through the list of links
    for link in row[column]:

        # Blueprint-only mode keeps blueprint files; every other mode drops them
        if is_blueprints_only:
            if not _is_blueprint(link):
                continue
        elif _is_blueprint(link):
            continue

        # Build a name for the image from id_persistent and link
        name = f'{id_part}-{_last_segment(link)}'
        save_path = f'images/{folder_name}/{file_type}-{name}.jpg'

        # Download the image
        try:
            response, status = imgd.download_image(link, save_path)

            if response == 'Failed' and file_type == 'img':
                # Try to download the same object from its scans instead.
                # Separate variables keep the image's own response/status intact
                # for the 'Error' report below.
                for scan_link in row['microfiche_links']:
                    # Skip blueprint files among the scans
                    if _is_blueprint(scan_link):
                        continue

                    scan_name = f'{id_part}-{_last_segment(scan_link)}'
                    scan_path = f'images/{folder_name}/scn-{scan_name}.jpg'

                    scan_response, scan_status = imgd.download_image(scan_link, scan_path)
                    if scan_response == 'Failed':
                        print(f'Failed downloading both image ({link}) and scan ({scan_link}) of object {i}. Status code: {scan_status}')
                    elif scan_response == 'Error':
                        print(f'Error downloading image {scan_link} of object {i}. Status code: {scan_status}')

            if response == 'Error':
                print(f'Error downloading image {link} of object {i}. Status code: {status}')
        except Exception as exc:
            # `Exception` (not a bare except) so the run can still be interrupted from the keyboard
            print(f'Error downloading image {link} of object {i}: {exc!r}')

Folder 'images/blueprints_only' already exists.
Error downloading image http://www.bildindex.de/bilder/d/STMP_II_13832_v of object 42


In [21]:
# Map a saved file name back to its row, column and position in the link list
pic = 'img-0001350498-fm308202'
i, column, j = imgd.filename_to_object(df, pic)
source_link = df.iloc[i][column][j]
(source_link, i, j, column)

('http://www.bildindex.de/bilder/d/fm308202', 2770, 53, 'image_links')

In [49]:
# Show all file names of one object next to the name built for its first image link
i = 1000
record = df.iloc[i]
(
    imgd.object_to_filenames(df, i),
    imgd.link_to_filename(record.id_persistent, record.image_links[0], 'img'),
)

({'Images': ['img-0001486011-fm827854', 'img-0001486011-fm827854'],
  'Scans': ['scn-0001486011-mi04028d01']},
 'img-0001486011-fm827854')