## <font color=green>0. IMPORTS </font>

In [1]:
import pandas as pd
import seaborn as sns
import json
import uuid
import ast

## <font color=green>1. CREATING DATAFRAME FROM SUBSET</font>

In [2]:
### Load the ArtNet object detection CSV into a dataframe
df_objects = pd.read_csv("df_objects_subset.csv")
# Preview the first rows to verify that the columns were parsed as expected
df_objects.head()

Unnamed: 0,filename,confidence,label (numeric),label (text),x1,y1,x2,y2,category
0,KMS638.png,0.836013,43,window,37.446743,28.826166,973.8179,691.58057,Architecture
1,KMS595.png,0.824079,43,window,70.270836,212.37729,160.14937,301.9278,Architecture
2,KMS3465.png,0.790962,43,window,680.9164,258.06458,774.5717,348.03717,Architecture
3,KMS830.png,0.783134,43,window,828.9544,1.786047,1021.73303,205.6354,Architecture
4,KMS1552.png,0.781475,43,window,8.729956,0.59108,1590.5485,912.3697,Architecture


In [3]:
# Size check: (rows, columns) of the freshly loaded detections
df_objects.shape

(10775, 9)

#### <font color=#5A5A5A>1.2 Renaming Columns</font>

In [4]:
# Switch the raw CSV headers to snake_case names; the frame is modified in place
df_objects.rename(
    columns={
        "label (text)": "label_text",
        "label (numeric)": "label_numeric",
        "category": "category_name",
    },
    inplace=True,
)

#### <font color=#5A5A5A>1.3 Creating a column coords with the combined values of x1, y1, x2, y2 </font>

In [5]:
# Expand each bounding box (x1, y1, x2, y2) into a flat list of its four corner
# points, in the order (x1, y1), (x2, y1), (x2, y2), (x1, y2)
df_objects['coords'] = pd.Series(
    [
        [left, top, right, top, right, bottom, left, bottom]
        for left, top, right, bottom in zip(
            df_objects['x1'], df_objects['y1'], df_objects['x2'], df_objects['y2']
        )
    ],
    index=df_objects.index,
)

#### <font color=#5A5A5A>1.4 Creating painting_id column based on filename</font>

In [6]:
def split_function(x):
    """Return a filename without its extension (used as the painting id).

    Only the final dot-separated suffix is removed, so a name that itself
    contains dots keeps everything before the last dot. A name without any
    dot is returned unchanged.
    """
    # rsplit with maxsplit=1 cuts at the LAST dot only; a plain split(".")[0]
    # would truncate names such as "KMS1.a.png" to "KMS1".
    return x.rsplit(".", 1)[0]

In [7]:
# Derive the painting id of every detection from its image filename
df_objects['painting_id'] = df_objects['filename'].map(split_function)

#### <font color=#5A5A5A>1.5 Generating unique object_id </font>

In [8]:
# Give every detection row its own random identifier: the 32-character hex
# form of a uuid4. The ids are random, so they differ on every run.
df_objects['object_id'] = [uuid.uuid4().hex for _ in df_objects.index]

In [9]:
# Keep only the columns that make up the object table
df_objects_clean = df_objects.filter(
    items=[
        'object_id',
        'label_text',
        'coords',
        'painting_id',
        'category_name',
    ]
)

In [10]:
# Preview the reduced object table
df_objects_clean.head()

Unnamed: 0,object_id,label_text,coords,painting_id,category_name
0,f4a25112edf843c18c75e27931e7d05c,window,"[37.446743, 28.826166, 973.8179, 28.826166, 97...",KMS638,Architecture
1,20ae0d67566648e4870f965fe6ea31fb,window,"[70.270836, 212.37729, 160.14937, 212.37729, 1...",KMS595,Architecture
2,b1d694472e1c4f32a8d27f0bcf6c88a1,window,"[680.9164, 258.06458, 774.5717, 258.06458, 774...",KMS3465,Architecture
3,ceda21842afb4927a7df85efa09c8d08,window,"[828.9544, 1.786047, 1021.73303, 1.786047, 102...",KMS830,Architecture
4,8a44b020504340978d844abb317375f3,window,"[8.729956, 0.5910803, 1590.5485, 0.5910803, 15...",KMS1552,Architecture


In [11]:
# Number of distinct paintings that have at least one detected object
df_objects_clean['painting_id'].nunique()

3906

In [12]:
## Check the column datatypes of the object table
df_objects_clean.dtypes

object_id        object
label_text       object
coords           object
painting_id      object
category_name    object
dtype: object

## <font color=green>2. IMPORTING PAINTINGS FROM SMK API AS DATAFRAME </font>

In [13]:
# Load the painting metadata from CSV into a dataframe
df_paintings = pd.read_csv("SMK_Malerier_Filtreret.csv")
# Preview the first two rows
df_paintings.head(2)

Unnamed: 0,id,created,modified,responsible_department,acquisition_date,acquisition_date_precision,dimensions,documentation,object_names,production,...,media_3d,original,media_audio,medium,files_3D,edition,title,author,publication_year,creator
0,1170001003_object,2020-03-21T08:30:01Z,2023-03-17 09:40:56+00:00,Den Kongelige Maleri- og Skulptursamling,2020-01-01T00:00:00Z,2020-12-31,"[{'part': 'Brutto', 'type': 'højde', 'unit': '...",,[{'name': 'Maleri'}],"[{'creator': 'Reuter Christiansen, Ursula', 'c...",...,,,,,,,,,,"Reuter Christiansen, Ursula"
1,1170001004_object,2020-03-21T08:30:02Z,2023-03-17 09:40:57+00:00,Den Kongelige Maleri- og Skulptursamling,2020-01-01T00:00:00Z,2020-12-31,"[{'part': 'Brutto', 'type': 'højde', 'unit': '...",,[{'name': 'Maleri'}],"[{'creator': 'Reuter Christiansen, Ursula', 'c...",...,,,,,,,,,,"Reuter Christiansen, Ursula"


In [14]:
# Size check: (rows, columns) of the raw painting metadata
df_paintings.shape

(6570, 84)

#### <font color=#5A5A5A>2.1 Renaming Columns</font>

In [15]:
# Rename object_number to painting_id (in place) so it shares its name with the
# painting_id column of the object table, which is the key used for the merge
df_paintings.rename(columns={"object_number": "painting_id"}, inplace = True)

#### <font color=#5A5A5A>2.2 Filtering columns in dataframe</font>

In [16]:
# Keep only the metadata columns that are used for the painting table
df_paintings_filtered = df_paintings.filter(
    items=[
        'painting_id',
        'titles',
        'artist',
        'techniques',
        'publication_year',
        'image_width',
        'image_height',
        'image_thumbnail',
        'colors',
        'suggested_bg_color',
    ],
    axis=1,
)

In [17]:
## Replace NaN values in the publication_year column with an empty string.
## Presumably this keeps the later dropna() from discarding paintings that only
## lack a publication year.
df_paintings_filtered['publication_year'] = df_paintings_filtered['publication_year'].fillna('')

#### <font color=#5A5A5A>2.2.1 Dropping NaN values</font>

In [18]:
# Work on an independent copy so the cleaning steps below do not modify df_paintings_filtered
df_paintings_clean = df_paintings_filtered.copy()

In [19]:
# Drop, in place, every row that still has a NaN in any of the retained columns
df_paintings_clean.dropna(inplace = True)

In [20]:
# Preview the first two remaining rows
df_paintings_clean.head(2)

Unnamed: 0,painting_id,titles,artist,techniques,publication_year,image_width,image_height,image_thumbnail,colors,suggested_bg_color
6,KMS8914,"[{'title': 'Skovinteriør med kroget træ', 'typ...",['Mogens Ballin'],['Tempera på pap'],,3780.0,4355.0,https://iip-thumb.smk.dk/iiif/jp2/0z7090448_KM...,"['#222222', '#956830', '#444444', '#795025', '...",['#908d41']
7,KMS8898,"[{'title': 'Uden titel', 'type': 'kunstnertite...",['Ib Geertsen'],['Olie på lærred'],,7205.0,5601.0,https://iip-thumb.smk.dk/iiif/jp2/pn89db79c_km...,"['#cccccc', '#c3d268', '#555555', '#d7eac3', '...",['#77733d']


#### <font color=#5A5A5A>2.3 Formatting string values </font>

In [21]:
def strip_function_1(x):
    """Trim list-literal decoration from both ends of a string.

    Removes any leading/trailing run of the characters ``[``, ``]`` and ``'``,
    turning e.g. ``"['Mogens Ballin']"`` into ``"Mogens Ballin"``. Characters
    inside the string are untouched, so a multi-element list keeps its inner
    quotes and commas.
    """
    return x.strip("['']")


def strip_function_2(x):
    """Trim publication-year decoration from both ends of a string.

    Removes any leading/trailing run of the characters ``[``, ``]``, ``c``,
    ``-``, ``o``, ``p``, ``.`` and the space character.
    """
    return x.strip("[]c-op. ")

In [22]:
# Clean the stringified values column by column with the matching strip helper
for column, cleaner in (
    ('artist', strip_function_1),
    ('techniques', strip_function_1),
    ('suggested_bg_color', strip_function_1),
    ('publication_year', strip_function_2),
):
    df_paintings_clean.loc[:, column] = df_paintings_clean.loc[:, column].apply(cleaner)

In [23]:
# Define a function to extract the 'title' value from the existing column
def extract_title(row):
    """Return the title of the first entry in a row's ``titles`` value.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row['titles']`` holding either a list of dicts or the
        string representation of such a list.

    Returns
    -------
    The ``'title'`` of the first list entry, or ``None`` when the list is
    empty or that entry has no ``'title'`` key.

    Raises
    ------
    ValueError, SyntaxError
        If a string value cannot be parsed as a Python literal.
    """
    raw = row['titles']
    # Only strings need parsing; an already-parsed list is used as is
    entries = ast.literal_eval(raw) if isinstance(raw, str) else raw
    # An empty list has no first entry; report "no title" instead of raising
    if not entries:
        return None
    return entries[0].get('title')

# Run extract_title on every row (axis=1) and store the result in the new 'title' column
df_paintings_clean['title'] = df_paintings_clean.apply(extract_title, axis=1)

In [24]:
# Preview the cleaned string columns and the new title column
df_paintings_clean.head(2)

Unnamed: 0,painting_id,titles,artist,techniques,publication_year,image_width,image_height,image_thumbnail,colors,suggested_bg_color,title
6,KMS8914,"[{'title': 'Skovinteriør med kroget træ', 'typ...",Mogens Ballin,Tempera på pap,,3780.0,4355.0,https://iip-thumb.smk.dk/iiif/jp2/0z7090448_KM...,"['#222222', '#956830', '#444444', '#795025', '...",#908d41,Skovinteriør med kroget træ
7,KMS8898,"[{'title': 'Uden titel', 'type': 'kunstnertite...",Ib Geertsen,Olie på lærred,,7205.0,5601.0,https://iip-thumb.smk.dk/iiif/jp2/pn89db79c_km...,"['#cccccc', '#c3d268', '#555555', '#d7eac3', '...",#77733d,Uden titel


## <font color=green>3. Merging the dataframes</font>

In [25]:
# Inner join: keep only detections whose painting_id also occurs in the cleaned painting table
df_merged = df_objects_clean.merge(df_paintings_clean, how='inner', on='painting_id')

In [26]:
# Keep one row per object_id; when an id occurs more than once, the last occurrence is kept
is_repeated_object = df_merged.duplicated(subset=['object_id'], keep='last')
df_merged_clean = df_merged[~is_repeated_object]

In [27]:
# Sanity check: count rows whose object_id repeats an earlier row (0 means every id is unique)
df_merged_clean.duplicated(subset=['object_id']).sum()

0

In [28]:
# Preview the merged object + painting rows
df_merged_clean.head(2)

Unnamed: 0,object_id,label_text,coords,painting_id,category_name,titles,artist,techniques,publication_year,image_width,image_height,image_thumbnail,colors,suggested_bg_color,title
0,f4a25112edf843c18c75e27931e7d05c,window,"[37.446743, 28.826166, 973.8179, 28.826166, 97...",KMS638,Architecture,"[{'title': 'Slagbillede', 'type': 'museumstite...",Jacob Weyer,Olie på træ,,4833.0,3407.0,https://iip-thumb.smk.dk/iiif/jp2/1r66j221x_KM...,"['#dddddd', '#444444', '#555555', '#333333', '...",#888888,Slagbillede
1,2656fb4cc9124c76a18ff5cdc16d0333,lake,"[93.13501, 403.28055, 966.51013, 403.28055, 96...",KMS638,Nature,"[{'title': 'Slagbillede', 'type': 'museumstite...",Jacob Weyer,Olie på træ,,4833.0,3407.0,https://iip-thumb.smk.dk/iiif/jp2/1r66j221x_KM...,"['#dddddd', '#444444', '#555555', '#333333', '...",#888888,Slagbillede


In [29]:
# Size check: (rows, columns) after merging and de-duplicating
df_merged_clean.shape

(10142, 15)

## <font color=green>4. Create Category Dataframe</font>

In [30]:
# Define the categories dictionary: category name -> list of object labels in that category.
# NOTE(review): the labels here are capitalised ("Window"), while the label_text values
# shown in the detection output are lowercase ("window") - confirm that consumers of the
# exported files compare labels case-insensitively.
category_dict = {'Christianity': ["Jesus Christ", "God", "Angel", "Virgin Mary", "Saint", "Devil", "Cross"],
              'Occultism': ["Star", "Ghost", "Skull", "Skeleton", "Demon"],
              'Nature': ["Fire", "Sea", "Sky", "Lake", "Cloud", "Lightning", "Sun", "Moon", "Tree", "Flower", "Plant", "Mountain", "Bush", "Rock"],
              'Weaponry': ["Firearm", "Sword", "Bow", "Arrow", "Helmet", "Shield", "Spear", "Armor", "Rope", "Hammer"],
              'Architecture': ["House", "Bridge", "Castle", "Mill", "Church", "Pillar", "Window", "Door", "Staircase"],
              'Clothing': ["Shoes", "Dress", "Hat", "Mask", "Jewellery", "Crown", "Tie", "Umbrella", "Belt", "Cane", "Gloves", "Bag"],
              'Vehicles': ["Car", "Boat", "Ship", "Bicycle", "Train", "Airplane", "Wheel", "Carriage"],
              'Furniture': ["Sofa", "Chair", "Table", "Bed", "Bathtub", "Easel"],
              'Interior': ["Lamp", "Vase", "Book", "Mirror", "Drapery", "Bird cage", "Paper", "Globe", "Cup", "Flag", "Bottle", "Bowl"],
              'Food': ["Banana", "Orange", "Melon", "Apple", "Grapes", "Wine", "Lobster", "Vegetable", "Bread", "Cheese", "Pineapple"],
              'Animals': ["Bird", "Reptile", "Fish", "Rabbit", "Horse", "Cat", "Donkey", "Dog", "Chicken", "Cow", "Mouse", "Sheep", "Insect", "Butterfly"],
              'Human': ["Child", "Baby", "Face", "Hand", "Man", "Woman"],
              'Instrument': ["Harp", "Violin", "Flute", "Piano", "Drum", "Guitar"]}

In [31]:
# Split the dictionary into two parallel lists: the category names (keys) and
# the list of object labels belonging to each category (values)
category_name_list = list(category_dict.keys())
object_labels_list = list(category_dict.values())

# Build the category dataframe: one row per category
df_categories = pd.DataFrame(
    {
        'category_name': category_name_list,
        'object_labels': object_labels_list,
    }
)

In [32]:
# Display the category table
df_categories

Unnamed: 0,category_name,object_labels
0,Christianity,"[Jesus Christ, God, Angel, Virgin Mary, Saint,..."
1,Occultism,"[Star, Ghost, Skull, Skeleton, Demon]"
2,Nature,"[Fire, Sea, Sky, Lake, Cloud, Lightning, Sun, ..."
3,Weaponry,"[Firearm, Sword, Bow, Arrow, Helmet, Shield, S..."
4,Architecture,"[House, Bridge, Castle, Mill, Church, Pillar, ..."
5,Clothing,"[Shoes, Dress, Hat, Mask, Jewellery, Crown, Ti..."
6,Vehicles,"[Car, Boat, Ship, Bicycle, Train, Airplane, Wh..."
7,Furniture,"[Sofa, Chair, Table, Bed, Bathtub, Easel]"
8,Interior,"[Lamp, Vase, Book, Mirror, Drapery, Bird cage,..."
9,Food,"[Banana, Orange, Melon, Apple, Grapes, Wine, L..."


## <font color=green>5. Splitting the merged dataframe into an Object and a Painting dataframe</font>

In [33]:
# Object table: one row per detected object, with a reference to its painting
df_objects_final = df_merged_clean.filter(
    items=[
        'object_id',
        'label_text',
        'coords',
        'painting_id',
        'image_thumbnail',
        'category_name',
    ],
    axis=1,
)

In [34]:
# Size check: (rows, columns) of the final object table
df_objects_final.shape

(10142, 6)

In [35]:
# Painting columns taken from the merged frame; still one row per detected object at this point
df_paintings_final = df_merged_clean.filter(
    items=[
        'painting_id',
        'artist',
        'title',
        'techniques',
        'publication_year',
        'image_width',
        'image_height',
        'image_thumbnail',
        'colors',
        'suggested_bg_color',
    ],
    axis=1,
)

In [36]:
# Collapse to one row per painting, keeping the first occurrence of each painting_id
df_paintings_final = df_paintings_final.drop_duplicates(subset='painting_id', keep="first")

In [37]:
# Size check: (rows, columns) of the final painting table
df_paintings_final.shape

(3654, 10)

#### <font color=#5A5A5A>5.1 Creating the column object_labels for the painting dataframe</font>
This column is created for the painting class to store an array of the object labels that are found in each painting.

In [38]:
# Create dictionary mapping painting_id to the array of label_text values of
# all objects detected on that painting.
# One groupby pass over the object table replaces a per-painting boolean-mask
# scan, which re-read the whole object table once for every painting.
# sort=False keeps the groups in order of first appearance; within a group the
# labels keep their original row order.
painting_to_object_dict_labels = {
    painting_id: labels.values
    for painting_id, labels in df_objects_final.groupby('painting_id', sort=False)['label_text']
}

# Attach the label arrays to the painting table. Both tables are derived from
# df_merged_clean, so every painting_id has at least one label.
df_paintings_final['object_labels'] = df_paintings_final['painting_id'].map(painting_to_object_dict_labels)

In [39]:
# Preview the painting table with the new object_labels column
df_paintings_final.head(2)

Unnamed: 0,painting_id,artist,title,techniques,publication_year,image_width,image_height,image_thumbnail,colors,suggested_bg_color,object_labels
0,KMS638,Jacob Weyer,Slagbillede,Olie på træ,,4833.0,3407.0,https://iip-thumb.smk.dk/iiif/jp2/1r66j221x_KM...,"['#dddddd', '#444444', '#555555', '#333333', '...",#888888,"[window, lake, flag, horse, donkey, sky, devil..."
14,KMS595,Nicolai Abildgaard,"En potuaner, hvis reformforslag er billiget, b...",Olie på lærred,1999.0,3131.0,3738.0,https://iip-thumb.smk.dk/iiif/jp2/kk91fp68v_KM...,"['#222222', '#3a351f', '#908d41', '#382510', '...",#908d41,"[window, spear, spear, sword, demon]"


## <font color=green>6. Exporting the dataframes as JSON and CSV</font>

In [40]:
## Export the painting and category tables as JSON files.
## orient="table" writes a JSON Table Schema document (a schema section plus the data);
## index=False leaves the dataframe index out of the file.
df_paintings_final.to_json('painting_class.json', orient="table", index=False)
df_categories.to_json('category_class.json', orient="table", index=False)

In [41]:
## Export the object table as a CSV file, without the dataframe index.
## NOTE(review): the list-valued coords column is presumably written as its text
## representation, so it would have to be parsed again after reading the CSV - confirm.
df_objects_final.to_csv('object_class.csv', index=False)