In [163]:
import pandas as pd
import requests


# Image Data Set

I have no idea how to open that file fully. File below is reduced (only first 200 rows of original) and no umlaut (failed utf-8 coding).

| Column Name               | Type        | UNIQUE      | Description |
| -----------               | ----------- | ----------- | ----------- |
| `record identifiers`      | LINK        | NO          | A link to an object (e.g. set of papers of a book)       |
| `persistent identifier`   | LINK        | YES         | A link to an subobject of an object (e.g. one sheet of paper from this book). Sometimes is similar to `record identifiers`       |
| `identification labels`   | TEXT        | NO          |A label, text description |
| `digital images/downloads (files)`   | SET OF LINKS        | PROBABLY          |A set of links to download all coresponding images to this subobject (e.g. all sides of this sheet of paper in each link) |
| `digital images/archival description`   | LINK        | PROBABLY          |A link to another page in archive ??? |
| `microfiche/downloads (files)`   | SET OF LINKS        | PROBABLY          |A link to download pictures from archive. Probably there are scans of original objects, books, photos of sculptures with metadate on them. Could be used as a style transfer and metadata resourse, if it is not in the `identification labels` column |
| `microfiche/archival description`   | LINK        | PROBABLY          |Links to another archive |



<img src="dataset logic.jpg" width="500">

In [164]:
file_name = 'dataset_reduced.csv'

# Amount of data
N = 199 

df = pd.read_csv(file_name, sep=';', encoding='utf-8')[:N]

# In original file I changes all commas to '___' so I could successfully convert it to csv with comas
# Here I change it back to commas
df['identification labels'] = df['identification labels'].str.replace('___', ',')

# Drop useless column. Only 2 values are present in the column
df = df.drop('Unnamed: 7', axis=1)

columns, k = df.columns, 0
print(f'Object 7:')
for i in df.loc[7]:
    print(f'{columns[k]}:\n\t{i}')
    k+=1
print()

Object 7:
record identifiers:
	http://www.bildindex.de/document/obj20104670
persistent identifier:
	http://id.bildindex.de/thing/0001252369
identification labels:
	Mengsberg, Oberf?rster-Etablissement, Bauaufnahmen, Grundrisse, Aufrisse, Querschnitte, L?ngsschnitte und Lageplan Mengsberg, Oberf?rster-Etablissement, Bauaufnahmen, Grundrisse, Aufrisse, Querschnitte, L?ngsschnitte und Lageplan/Mengsberg, Oberf?rster-Etablissement, F?rsterwohnung, Bauaufnahme, Grundriss und Aufriss
digital images/downloads (files):
	["http://www.bildindex.de/bilder/d/STMP_II_12646_002_r" "http://www.bildindex.de/bilder/d/STMP_II_12646_002_v"]
digital images/archival description:
	["https://www.bildindex.de/media/obj20104670/STMP_II_12646_002_r" "https://www.bildindex.de/media/obj20104670/STMP_II_12646_002_v"]
microfiche/downloads (files):
	[]
microfiche/archival description:
	[]



In [165]:
# rename columns
df = df.rename(columns={
    'record identifiers': 'id_record',
    'persistent identifier': 'id_persistent',
    'identification labels': 'label',
    'digital images/downloads (files)': 'image_links',
    'digital images/archival description': 'archive_links',
    'microfiche/downloads (files)': 'microfiche_links',
    'microfiche/archival description': 'microfiche_archive_links',
})

columns = df.columns

I failed open a .csv file with utf-8 encoding in order not to lose Germans special character, so I saved a column with labels from original file to .txt with utf-8 encoding and then added this column to a DataFrame

In [166]:
file_name = 'labels.txt'
with open(f'{file_name}', 'r', encoding='utf-8') as file:
    lines = file.readlines()

for i in range(len(lines)):
    lines[i] = lines[i][:-1]

df['label'] = lines[:N]

In [167]:
# Transfer strings to lists of links
column_to_lists = ['image_links', 'archive_links', 'microfiche_links', 'microfiche_archive_links']

for column in column_to_lists:
    for i in range(df.shape[0]):
        if df.iloc[i][column] == '[]':
            df.at[i, column] = []
        else:
            df.at[i, column] = df.iloc[i][column][2:-2].split('" "')


In [168]:
# Delete rows with no links to upload images neiher in `image_links` nor in `microfiche_links` 
# because we don't want to parse images from pages
df = df[(df.image_links.apply(len) + df.microfiche_links.apply(len) != 0)]

### Incorporate all available links
Rrof. Bell said that target column is `image_links`. Let's investigate also `microfiche_links` and compare them, thus maybe we can use both to extend our dataset and receive more samples to one object including different styles of pictures.
I create a new column `links` where I put all availble links for uploadong pictures

In [171]:
df['links'] = pd.Series(df.loc[:,'image_links'] + df.loc[:,'microfiche_links'])

In [172]:
print(f'Amount of objects left: {df.shape[0]}')
print(f'Total amount of images: {df.links.apply(len).sum()}')

Amount of objects left: 195
Total amount of images: 407


### Download images

In [173]:
def download_image(url, save_path):
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        
        # Check if the request was successful
        if response.status_code == 200:
            # Open a file in binary mode and write the image content
            with open(save_path, 'wb') as file:
                file.write(response.content)
            # print(f"Image successfully downloaded: {save_path}")
        else:
            print(f"Failed to retrieve image from {url}. Status code: {response.status_code}")
    except Exception as e:
        print(f"An error occurred: {e}")

Maybe pictures from `microfiche_links` should be downloaded to another folder :/

In [174]:
# # Upload images to your computer
# for i in range(df_reduced.shape[0]):
#     for j in range(len(df_reduced.iloc[i].links)):
#         image_url = df_reduced.iloc[i].links[j]
#         save_path = f'images/image{i}_{j}.jpg'
#         download_image(image_url, save_path)

There are some photos (e.g. 164 and 168), which are not in out field od interests AT ALL.

### Label preprocessing

In [187]:
print(f'Amount of objects:       {df.shape[0]}')
print(f'Amount of unique labels: {df.label.nunique()}')
print()
print(f'Labels:\n{df.label.unique()}')

Amount of objects:       195
Amount of unique labels: 163

Labels:
['Lauenhain (Kreis Hainichen), Fahnenträger eines internationalen Zeltlagers'
 'Fenster (Bauelement)'
 'Kassel, Infanteriekasernen an der Königsstraße, Bestandsaufnahme, Situationsplan'
 'Waldeck, Domäne, Wohnhaus, Umbauentwurf, Aufrisse'
 'Kassel, Opernhaus (Altes Hoftheater), Aborte im Flügelgebäude, Entwurf, Grundriss'
 'Mengsberg, Oberförster-Etablissement, Bauaufnahmen, Grundrisse, Aufrisse, Querschnitte, Längsschnitte und Lageplan'
 'Mengsberg, Oberförster-Etablissement, Bauaufnahmen, Grundrisse, Aufrisse, Querschnitte, Längsschnitte und Lageplan Mengsberg, Oberförster-Etablissement, Bauaufnahmen, Grundrisse, Aufrisse, Querschnitte, Längsschnitte und Lageplan/Mengsberg, Oberförster-Etablissement, Försterwohnung, Bauaufnahme, Grundrisse'
 'Mengsberg, Oberförster-Etablissement, Bauaufnahmen, Grundrisse, Aufrisse, Querschnitte, Längsschnitte und Lageplan Mengsberg, Oberförster-Etablissement, Bauaufnahmen, Grundrisse,

# Data Set (LIDO format)

In [1]:
import xml.etree.ElementTree as ET

def parse_lido(file_path):
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    # Define namespaces (if any) used in the LIDO file
    namespaces = {'lido': 'http://www.lido-schema.org'}
    
    # Extract information
    for lido in root.findall('lido:lido', namespaces):
        # Extract the object's title
        title = lido.find('lido:descriptiveMetadata/lido:objectIdentificationWrap/lido:titleWrap/lido:titleSet/lido:appellationValue', namespaces)
        if title is not None:
            print(f"Title: {title.text}")
        
        # Extract the object's identifier
        identifier = lido.find('lido:descriptiveMetadata/lido:objectIdentificationWrap/lido:repositoryWrap/lido:repositorySet/lido:workID', namespaces)
        if identifier is not None:
            print(f"Identifier: {identifier.text}")
        
        # Extract the object's description
        description = lido.find('lido:descriptiveMetadata/lido:objectIdentificationWrap/lido:objectDescriptionWrap/lido:objectDescriptionSet/lido:descriptiveNoteValue', namespaces)
        if description is not None:
            print(f"Description: {description.text}")

# Example usage
file_path = 'BKA3_PUBLIC_2024-06-03_Standort-Marburg_combined.xml'
parse_lido(file_path)


Title: Lauenhain (Kreis Hainichen), Fahnenträger eines internationalen Zeltlagers
Title: 20727600
Identifier: 392
Title: Kassel, Infanteriekasernen an der Königsstraße, Bestandsaufnahme, Situationsplan
Identifier: P II 11087/001
Description: DN
Title: Waldeck, Domäne, Wohnhaus, Umbauentwurf, Aufrisse
Identifier: P II 15666/011
Description: DN
Title: Kassel, Opernhaus (Altes Hoftheater), Aborte im Flügelgebäude, Entwurf, Grundriss
Identifier: P II 13833/001
Description: DN
Title: Mengsberg, Oberförster-Etablissement, Bauaufnahmen, Grundrisse, Aufrisse, Querschnitte, Längsschnitte und Lageplan
Description: K3
Title: Mengsberg, Oberförster-Etablissement, Bauaufnahmen, Grundrisse, Aufrisse, Querschnitte, Längsschnitte und Lageplan – Mengsberg, Oberförster-Etablissement, Försterwohnung, Bauaufnahme, Grundrisse
Identifier: P II 12646/001
Description: K3
Title: Mengsberg, Oberförster-Etablissement, Bauaufnahmen, Grundrisse, Aufrisse, Querschnitte, Längsschnitte und Lageplan – Mengsberg, Oberf

In [41]:
import pandas as pd
import xml.etree.ElementTree as ET

def parse_lido(file_path):
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Define namespaces (if any) used in the LIDO file
    namespaces = {'lido': 'http://www.lido-schema.org'}

    # Initialize lists to store extracted data
    resource_ids = []
    rights_resources = []
    object_published_ids = []
    rights_work_wraps = []

    # Extract information
    for lido in root.findall('lido:lido', namespaces):
        # Extract the image address (resourceID)
        resource_id = lido.find('.//lido:resourceID', namespaces)
        resource_ids.append(resource_id.text if resource_id is not None else None)

        # Extract the rights resource
        rights_resource = lido.find('.//lido:rightsResource', namespaces)
        rights_resources.append(rights_resource.text if rights_resource is not None else None)

        # Extract the object published ID (URL)
        object_published_id = lido.find('.//lido:objectPublishedID', namespaces)
        object_published_ids.append(object_published_id.text if object_published_id is not None else None)

        # Extract the rights work wrap
        rights_work_wrap = lido.find('.//lido:rightsWorkWrap', namespaces)
        rights_work_wraps.append(rights_work_wrap.text if rights_work_wrap is not None else None)

    # Create a DataFrame
    df = pd.DataFrame({
        'ResourceID': resource_ids,
        'RightsResource': rights_resources,
        'ObjectPublishedID': object_published_ids,
        'RightsWorkWrap': rights_work_wraps
    })

    return df

# Example usage
file_path = 'BKA3_PUBLIC_2024-06-03_Standort-Marburg_combined.xml'
df = parse_lido(file_path)
print(df)


      ResourceID        RightsResource  \
0           None                  None   
1        140.030  \n                     
2           None                  None   
3           None                  None   
4           None                  None   
...          ...                   ...   
38194       None                  None   
38195       None                  None   
38196    823.249  \n                     
38197    306.085  \n                     
38198       None                  None   

                             ObjectPublishedID  RightsWorkWrap  
0      http://id.bildindex.de/thing/0001618283  \n              
1      http://id.bildindex.de/thing/0001677732  \n              
2      http://id.bildindex.de/thing/0001803584  \n              
3      http://id.bildindex.de/thing/0001806337  \n              
4      http://id.bildindex.de/thing/0001803733  \n              
...                                        ...             ...  
38194  http://id.bildindex.de/thing/0001

In [43]:
df

Unnamed: 0,ResourceID,RightsResource,ObjectPublishedID,RightsWorkWrap
0,,,http://id.bildindex.de/thing/0001618283,\n
1,140.030,\n,http://id.bildindex.de/thing/0001677732,\n
2,,,http://id.bildindex.de/thing/0001803584,\n
3,,,http://id.bildindex.de/thing/0001806337,\n
4,,,http://id.bildindex.de/thing/0001803733,\n
...,...,...,...,...
38194,,,http://id.bildindex.de/thing/0001798123,\n
38195,,,http://id.bildindex.de/thing/0001798124,\n
38196,823.249,\n,http://id.bildindex.de/thing/0001532642,\n
38197,306.085,\n,http://id.bildindex.de/thing/0001603218,\n
