In [1]:
import pandas as pd
import numpy as np
import re
import json
import os
import shutil


# Choosing Clusters
We clustered the first original dataset and obtained 100 clusters. We then gave visualisations of those clusters to ChatGPT and asked it to name each cluster as if it were a taxonomy name (details in Trello). Here we want to:  
- extract those cluster names and choose the relevant ones  
- put them into the MVP

## Retrieve and organize ChatGPT taxonomy


In [2]:
# Parse ChatGPT's free-text answer into
#   cluster_info[cluster_num] = {'name': str, 'keywords': [str, ...], 'description': str}
file_name = 'datasets/taxonomy_clustering100.txt'

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(file_name, 'r', encoding='utf-8') as f:
    chatgpt_taxonomy = f.read().splitlines()

# Drop empty and whitespace-only lines once, so the parser never sees them.
chatgpt_taxonomy = [line for line in chatgpt_taxonomy if line.strip() != '']
cluster_info = {}

cluster_num = None      # id of the cluster currently being filled; None until the first header
is_description = False  # True when the next line is the body of a "Detailed Description:" marker
for line in chatgpt_taxonomy:
    # A header line ("Cluster_<digits>") starts a new cluster entry.
    # Requiring at least one digit avoids int('') on a bare "Cluster_" and
    # does not truncate ids longer than three digits.
    header = re.search(r'Cluster_(\d+)', line)
    if header:
        cluster_num = int(header.group(1))
        cluster_info[cluster_num] = {}
        # A pending description marker must not leak into the new cluster.
        is_description = False
        continue
    if cluster_num is None:
        # Text before the first header cannot belong to any cluster.
        continue
    if 'Cluster Name:' in line:
        cluster_info[cluster_num]['name'] = line.partition('Cluster Name:')[2].strip()
        continue
    if 'Key Words:' in line:
        key_words = line.partition('Key Words:')[2].strip().split(', ')
        cluster_info[cluster_num]['keywords'] = key_words
        continue
    if line.strip() == 'Detailed Description:':
        # Marker on its own line: the description text is on the next line.
        is_description = True
        continue
    if is_description:
        is_description = False
        cluster_info[cluster_num]['description'] = line.strip()
        continue
    if 'Detailed Description:' in line:
        # Marker and description text share one line.
        description = line.partition('Detailed Description:')[2].strip()
        cluster_info[cluster_num]['description'] = description
    

In [3]:
# Sanity check of the parse: total number of clusters, then one sample entry.
n_clusters_retrieved = len(cluster_info)
print('Amount of clusters retrieved:', n_clusters_retrieved)
cluster_info[98]

Amount of clusters retrieved: 100


{'name': 'Architectural Ceilings and Vaults',
 'keywords': ['ceilings',
  'vaults',
  'stained glass',
  'arches',
  'architectural details',
  'religious architecture',
  'geometric patterns'],
 'description': 'This cluster highlights intricate architectural ceilings and vault designs. It includes a variety of styles such as Gothic ribbed vaults, stained glass windows, and ornate decorative elements. The images showcase the intersection of structural engineering and artistic embellishment, often found in religious buildings like cathedrals, churches, and historical monuments. The collection emphasizes the beauty of ceiling craftsmanship, combining geometric precision with artistic flair.'}

In [4]:
# with open('datasets/production/clusters_info.json', 'w') as f:
#     json.dump(cluster_info, f, indent=4)

In [5]:
# Cluster names and descriptions as two parallel lists, in cluster_info order.
names = [info['name'] for info in cluster_info.values()]
descriptions = [info['description'] for info in cluster_info.values()]

# # Save names as list of tuples (cluster_num, name)
# for i in cluster_info:
#     names.append((i, cluster_info[i]['name']))
# # names = sorted(names, key=lambda x: x[1])

# Every distinct space-separated word occurring in any cluster name.
tockens = {word for cluster_name in names for word in cluster_name.split(' ')}

# names[:5], descriptions[:5]

In [6]:
## Attempt to choose tockens that are relevant to architecture 

# relevant_tockens = ['Architecture', 'Church', 'Building', 'Buildings', 
#                     'Construction', 'Exteriors', 'Facades', 'Gothic', 'Landscape', 'Landscapes',
#                     'Monuments', 'Mountainous', 'Nature', 'Residential', 'Rural',
#                     'Settlements', 'Site', 'Sites', 'Spaces', 'Studios', 'Transport',
#                     'Urban',]

# for i in cluster_info:
#     if any(relevant_tocken in cluster_info[i]['name'] for relevant_tocken in relevant_tockens):
#         print(i, cluster_info[i]['name'])

In [7]:
# with open("cluster no = 100  random_state=42/cluster_names.txt", "w", encoding="utf-8") as file:
#     for name in names:
#         file.write(name + '\n')


## Ask GPT to choose relevant

ChatGPT prompt:  
I did a clustering on archive images. Now I have names for those clusters. I need to extract all clusters (based on names) which relate to picutres of outside buildings
..

In [8]:
## Respond 1
# with open('cluster no = 100  random_state=42/gptresponse_relevant_outside_1.txt', 'r') as f:
#     relevant_names = f.read().splitlines()

# relevant_names = [name.split(' - ')[0] for name in relevant_names if 'Yes' in name]

# relevant_outdoors = []
# for i in cluster_info:
#     if cluster_info[i]['name'] in relevant_names:
#         relevant_outdoors.append(i)


In [9]:
# ## Respond 2
# with open('cluster no = 100  random_state=42/gptresponse_relevant_outside_2.txt', 'r') as f:
#     relevant_names = f.read().splitlines()

# relevant_names = [line.split(' - ')[0] for line in relevant_names if line != '' and 'Yes' in line]

# # relevant_outdoors = []
# # for i in cluster_info:
# #     if cluster_info[i]['name'] in relevant_names:
# #         relevant_outdoors.append(i)
# relevant_names

## Manually assign cluster areas

In [10]:
# Manually assigned cluster ids, one list per category.
relevant_outdoors = [
    0, 1, 4, 5, 12, 22, 33,
    40, 49, 65, 66, 74, 83, 99,
]
relevant_indoor = [3, 19, 61, 62, 64, 67, 84]
relevant_details = [35, 51, 68, 2]
relevant_people = [
    11, 14, 24, 44, 47, 48, 50, 56,
    57, 58, 59, 63, 77, 82, 89, 95,
]
# Not sure about 52 (it is in none of the lists above)

In [11]:
# Manual relevance check: list name and keywords of every cluster chosen as outdoor.
for cluster_id, info in cluster_info.items():
    if cluster_id not in relevant_outdoors:
        continue
    print(cluster_id, '\t', info['name'])
    # print(info['description'])
    print(info['keywords'])
    print()

0 	 Architectural Archways
['arches', 'domes', 'interiors', 'bridges', 'structures']

1 	 Historical Landscape Prints
['landscapes', 'black-and-white', 'mountains', 'architecture', 'nature']

4 	 Black-and-White Landscapes
['mountains', 'towns', 'rivers', 'architecture', 'valleys']

5 	 Historical Church Facades
['cathedrals', 'spires', 'Gothic', 'architecture', 'towers']

12 	 Urban Landscapes
['buildings', 'architecture', 'cityscapes', 'aerial', 'landmarks']

22 	 Architectural Facades
['buildings', 'facades', 'architecture', 'houses', 'styles']

33 	 Urban Architectural Styles
['buildings', 'streets', 'facades', 'urban', 'historical']

40 	 Historical Buildings and Landscapes
['buildings', 'landscapes', 'ruins', 'villages', 'architecture']

49 	 Architectural Facades
['buildings', 'facades', 'black-and-white', 'windows', 'roofs']

65 	 Construction Site Documentation
['construction', 'buildings', 'scaffolding', 'architecture', 'cranes']

66 	 Residential Building Exteriors
['houses'

In [12]:
# Use the manually chosen outdoor clusters as the final selection.
# Note: this binds a second name to the same list object; it is not a copy.
final_clusters = relevant_outdoors

EVALUATE RESULT. Seems bullshit
Specify system prompt  

Maybe **manually** is better
Need:
- landscape with nature
- exterior and city
- interior
- people in interior

Or maybe choose keywords with code

## Concatenate df_MVP and clustering label

In [13]:
# Cluster assignments: the 'File Name' / 'Cluster' columns get snake_case names.
df_cluster = pd.read_csv(
    'cluster no = 100  random_state=42/file_clusters_100.csv', sep=';'
).rename(columns={'File Name': 'file_name', 'Cluster': 'cluster'})
# Keep only the part of each file name that precedes the first dot.
df_cluster['file_name'] = [
    raw_name.split('.')[0] for raw_name in df_cluster['file_name']
]

df_mvp = pd.read_csv('datasets/production/dataset_MVP.csv', sep=';')

df_mvp.shape, df_cluster.shape

((37185, 5), (36981, 2))

In [14]:
# Attach the cluster label to every MVP row, drop rows that got no cluster,
# store the label as int and put the '.jpg' suffix on the file names.
df = (
    df_mvp
    .merge(df_cluster, on='file_name', how='left')
    .dropna(subset=['cluster'])
    .astype({'cluster': int})
    .assign(file_name=lambda frame: frame['file_name'].map(lambda stem: stem + '.jpg'))
)
print(df.shape)
df.head()

(36981, 6)


Unnamed: 0,link,id_objects,labels,file_name,id,cluster
0,http://www.bildindex.de/bilder/d/ae00001b05,['9077'],[1617],scn-ae00001b05.jpg,ae00001b05,85
1,http://www.bildindex.de/bilder/d/ae00001b06,['9077'],[1617],scn-ae00001b06.jpg,ae00001b06,85
2,http://www.bildindex.de/bilder/d/ae00001b07,['9077'],[1617],scn-ae00001b07.jpg,ae00001b07,85
3,http://www.bildindex.de/bilder/d/ae00001b08,['9077'],[1617],scn-ae00001b08.jpg,ae00001b08,96
4,http://www.bildindex.de/bilder/d/ae00001b10,['3899'],[3257],scn-ae00001b10.jpg,ae00001b10,1


Add `cluster_type` to our dataset with the relevant type of cluster (outdoor, indoor, etc.)

In [15]:
# Category lookup in priority order: the first list containing the cluster wins.
_CLUSTER_TYPES = (
    ('outdoor', relevant_outdoors),
    ('indoor', relevant_indoor),
    ('details', relevant_details),
    ('people', relevant_people),
)


def _cluster_type(cluster_id):
    """Return the category name for `cluster_id`, or 'notype' if no list contains it."""
    for type_name, cluster_ids in _CLUSTER_TYPES:
        if cluster_id in cluster_ids:
            return type_name
    return 'notype'


df['cluster_type'] = df['cluster'].apply(_cluster_type)

In [16]:
# # # df.to_csv('datasets/production/dataset_MVP_clustered.csv', sep=';', index=False)

## Choose relevant outdoor images

In [17]:
# File names of all images whose cluster is among the final (outdoor) clusters.
in_final_cluster = df['cluster'].isin(final_clusters)
chosen_images_out = df.loc[in_final_cluster, 'file_name'].tolist()
print('Amount images to work with:', len(chosen_images_out))

# if not os.path.exists('images/images_outdoor_compressed'):
#     os.makedirs('images/images_outdoor_compressed')

# for image in chosen_images_out:
#     shutil.copy('images/images_compressed/' + image, 'images/images_outdoor_compressed/' + image)

Amount images to work with: 6796


In [18]:
# # Save as csv
# # #df[df.cluster.isin(final_clusters)].to_csv('datasets/production/outdoor_images.csv', sep=';', index=False)

## Creating MVP dataset for clusters info

In [19]:
# One row per parsed cluster; the keyword list is flattened to a ', '-joined string.
cluster_rows = [
    {
        'cluster_id': cluster_id,
        'name': info['name'],
        'keywords': ', '.join(info['keywords']),
        'description': info['description'],
    }
    for cluster_id, info in cluster_info.items()
]
df_clusters = pd.DataFrame(
    cluster_rows, columns=['cluster_id', 'name', 'keywords', 'description']
)

total_missing = df_clusters.isna().sum().sum()
print('NaN values in clusters:', total_missing)
print('Size of clusters dataframe:', df_clusters.shape)

df_clusters.head()

NaN values in clusters: 0
Size of clusters dataframe: (100, 4)


Unnamed: 0,cluster_id,name,keywords,description
0,0,Architectural Archways,"arches, domes, interiors, bridges, structures",The visualization consists of a large collecti...
1,1,Historical Landscape Prints,"landscapes, black-and-white, mountains, archit...",The visualization features a grid layout of nu...
2,2,Industrial and Transport Imagery,"vehicles, infrastructure, ships, machinery, co...",The visualization contains a collection of bla...
3,3,Interior Architectural Spaces,"rooms, furniture, windows, arches, lighting",This visualization displays a structured colle...
4,4,Black-and-White Landscapes,"mountains, towns, rivers, architecture, valleys",The visualization comprises a collection of bl...


In [21]:
# df_clusters.to_csv('datasets/production/MVP_clusters_info.csv', sep=';', index=False)

## Minor changes in dataset_CLEAN.csv
The labels were stored as text; here I substitute them with the label_id from `dataset_labels.csv`

In [126]:
# Load the cleaned dataset and the label lookup table (both ';'-separated).
clean_path = 'datasets/production/dataset_CLEAN.csv'
labels_path = 'datasets/production/dataset_labels.csv'
df_clean, df_labels = (
    pd.read_csv(csv_path, sep=';') for csv_path in (clean_path, labels_path)
)
df_clean.head(3)

Unnamed: 0,id_record,id_persistent,label,image_links,archive_links,microfiche_links,microfiche_archive_links
0,http://www.bildindex.de/document/obj20666124,http://id.bildindex.de/thing/0001618283,"Lauenhain (Kreis Hainichen), Fahnenträger eine...",[],[],['http://www.bildindex.de/bilder/d/mi12308g07'],['https://www.bildindex.de/media/obj20666124/m...
1,http://www.bildindex.de/document/obj20727600,http://id.bildindex.de/thing/0001677732,Fenster (Bauelement),['http://www.bildindex.de/bilder/d/fm140030'],['https://www.bildindex.de/media/obj20727600/f...,['http://www.bildindex.de/bilder/d/mi02117a05'...,['https://www.bildindex.de/media/obj20727600/m...
2,http://www.bildindex.de/document/obj20943303,http://id.bildindex.de/thing/0001803584,"Kassel, Infanteriekasernen an der Königsstraße...",['http://www.bildindex.de/bilder/d/STMP_II_110...,['https://www.bildindex.de/media/obj20943303/S...,[],[]


In [127]:
# Swap the textual label for the matching `id` from df_labels. The column keeps
# its position and is renamed to `label_id`; the helper `id` column is dropped.
# NOTE: afterwards df_clean has no 'label' column, so this cell can only be
# re-run after re-loading df_clean.
df_clean = (
    df_clean
    .merge(df_labels, on='label', how='left')
    .assign(label=lambda frame: frame['id'])
    .rename(columns={'label': 'label_id'})
    .drop(columns=['id'])
)
df_clean.head(3)

Unnamed: 0,id_record,id_persistent,label_id,image_links,archive_links,microfiche_links,microfiche_archive_links
0,http://www.bildindex.de/document/obj20666124,http://id.bildindex.de/thing/0001618283,0,[],[],['http://www.bildindex.de/bilder/d/mi12308g07'],['https://www.bildindex.de/media/obj20666124/m...
1,http://www.bildindex.de/document/obj20727600,http://id.bildindex.de/thing/0001677732,1,['http://www.bildindex.de/bilder/d/fm140030'],['https://www.bildindex.de/media/obj20727600/f...,['http://www.bildindex.de/bilder/d/mi02117a05'...,['https://www.bildindex.de/media/obj20727600/m...
2,http://www.bildindex.de/document/obj20943303,http://id.bildindex.de/thing/0001803584,2,['http://www.bildindex.de/bilder/d/STMP_II_110...,['https://www.bildindex.de/media/obj20943303/S...,[],[]


In [128]:
### df_clean.to_csv('datasets/production/dataset_CLEAN_UPDATED.csv', sep=';', index=False)