# Omeka dataset reset (unexplode)

Load the dataset with all its metadata and apply the pre-processing steps.

In [67]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import numpy as np
# Read the Latin-1 encoded metadata CSV; index_col=False stops pandas from
# using the first column as the index.
df = pd.read_csv(
    "../Materials/KVO_META.csv",
    index_col=False,
    encoding='Latin-1',
)

# Add a lower-cased copy of the 'genre' column under the name 'Genre'.
df = df.assign(Genre=df['genre'].str.lower())

# Use seaborn's "Set2" colour palette for plots.
sns.set_palette("Set2")

In [68]:
# Trim leading and trailing whitespace in every column that supports the
# .str accessor; columns where .str raises AttributeError are left untouched.
for col_name in df.columns:
    try:
        trimmed = df[col_name].str.strip()
    except AttributeError:
        continue
    df[col_name] = trimmed

# Cast every column to str.
df = df.astype(str)

In [69]:
# Parse the 'Date' column as day-first dates; values that cannot be parsed
# become NaT because of errors='coerce'.
parsed_dates = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)
df['Date'] = parsed_dates

In [70]:
# Remove these columns from df before the rows are grouped.
columns_to_drop = [
    'Occasion',
    'Multiple bill',
    'Production',
    'Original title',
    'Composer',
    'Original premiere',
    'genre',
    'Season',
    'Directors',
    'Genre',
]
df = df.drop(columns=columns_to_drop)


In [71]:
# Remember the row count of the exploded frame before any grouping.
old_len = len(df)

# Jupyter only renders the last expression of a cell, so the preview has to
# come last to actually be displayed.
df.head()

In [72]:
# Show the last five rows (five is the default for tail()).
df.tail()

Unnamed: 0,Image,Collection,Date,Performances_ID,Normalised title,Original language
4436,1933-1934_00310-20220210_150111.jpg,1933-1934,1934-04-14,803397,Parsifal,DUI
4437,1933-1934_00312-20220210_150129.jpg,1933-1934,1934-04-15,891867,Madame Butterfly,ITA
4438,1933-1934_00314-20220210_150148.jpg,1933-1934,1934-04-15,647299,Driemeisjeshuis,DUI
4439,1933-1934_00314-20220210_150148.jpg,1933-1934,1934-04-15,647299,Mister Wu,DUI
4440,1933-1934_00314-20220210_150148.jpg,1933-1934,1934-04-15,647299,Paganini,DUI


In [73]:
# Inspect the dtype of each remaining column.
df.dtypes

Image                        object
Collection                   object
Date                 datetime64[ns]
Performances_ID              object
Normalised title             object
Original language            object
dtype: object

In [74]:

# Collapse all rows that share a Performances_ID into a single row.
# Titles and languages of the group are concatenated with ' + '; the other
# columns use the 'first' aggregate. sort=False keeps the groups in order of
# first appearance instead of sorting by key.
join_with_plus = ' + '.join
per_performance_agg = {
    'Image': 'first',
    'Collection': 'first',
    'Normalised title': join_with_plus,
    'Date': 'first',
    'Original language': join_with_plus,
}
merged_df = (
    df.groupby('Performances_ID', sort=False)
    .agg(per_performance_agg)
    .reset_index()
)



In [75]:
# Order the rows by index label, then report the number of rows.
merged_df = merged_df.sort_index()
merged_df.shape[0]

3919

In [76]:
# Convert the datetime column to strings so the values can be joined as text.
date_as_text = merged_df['Date'].astype(str)
merged_df['Date'] = date_as_text

In [77]:
# Collapse all performances printed on the same image into a single row.
# Titles, dates and languages are concatenated with ' ; '; the ID and the
# collection use the 'first' aggregate. sort=False keeps the groups in order
# of first appearance.
join_with_semicolon = ' ; '.join
per_image_agg = {
    'Performances_ID': 'first',
    'Collection': 'first',
    'Normalised title': join_with_semicolon,
    'Date': join_with_semicolon,
    'Original language': join_with_semicolon,
}
df2 = (
    merged_df.groupby('Image', sort=False)
    .agg(per_image_agg)
    .reset_index()
)

In [78]:
# Sanity check: every image name occurs exactly once in df2.
df2['Image'].is_unique

True

In [79]:
# Only the last expression of a cell is rendered, so these two expressions are
# evaluated but their results are not shown.
len(merged_df) == old_len
df2.tail()

# Copy the normalised title into a 'Production' column and preview the frame.
df = df.assign(Production=df['Normalised title'])
df.head()

Unnamed: 0,Image,Collection,Date,Performances_ID,Normalised title,Original language,Production
0,K.V.O. programmas 1893-97_00005-20211201_11335...,1893-1897,1893-10-03,466791,De Vrijschutter,DUI,De Vrijschutter
1,K.V.O. programmas 1893-97_00009-20211201_11343...,1893-1897,1893-10-05,445919,Willem Tell,DUI,Willem Tell
2,K.V.O. programmas 1893-97_00011-20211201_11345...,1893-1897,1893-10-10,845496,De Vrijschutter,DUI,De Vrijschutter
3,K.V.O. programmas 1893-97_00013-20211201_11351...,1893-1897,1893-10-12,205093,Willem Tell,DUI,Willem Tell
4,K.V.O. programmas 1893-97_00015-20211201_11352...,1893-1897,1893-11-16,854224,Charlotte Corday,NL,Charlotte Corday


In [80]:
import os
import shutil

# Build the new file name for every row: the collection, an underscore, the
# row's index label and the extension of the current file name.
df2['new_image'] = [
    f"{collection}_{row_label}{os.path.splitext(current_name)[1]}"
    for row_label, current_name, collection in zip(
        df2.index, df2['Image'], df2['Collection']
    )
]


In [81]:
# Drop the old file name and the performance ID, then let the generated file
# name take over the 'Image' column name.
df2 = (
    df2.drop(columns=['Image', 'Performances_ID'])
    .rename(columns={"new_image": "Image"})
)
df2.head()

Unnamed: 0,Collection,Normalised title,Date,Original language,Image
0,1893-1897,De Vrijschutter,1893-10-03,DUI,1893-1897_0.jpg
1,1893-1897,Willem Tell,1893-10-05,DUI,1893-1897_1.jpg
2,1893-1897,De Vrijschutter,1893-10-10,DUI,1893-1897_2.jpg
3,1893-1897,Willem Tell,1893-10-12,DUI,1893-1897_3.jpg
4,1893-1897,Charlotte Corday,1893-11-16,NL,1893-1897_4.jpg


In [82]:

# Write the per-image dataset as UTF-8 CSV without the index column.
# The target directory is created first: to_csv does not create missing
# parent directories, and nothing earlier in the notebook creates this one.
omeka_csv_path = '../../Primair/Volledige_dataset_omeka/OMEKA_volledige_dataset.csv'
os.makedirs(os.path.dirname(omeka_csv_path), exist_ok=True)
df2.to_csv(omeka_csv_path, encoding='utf-8', index=False)



import os
import shutil


# Path to the folder containing the subfolders with images
base_folder = '../../Primair/1926-1927_bis'

# Path to the folder where you want to create the new subfolders
output_folder = '../../Primair/Volledige_dataset_omeka'

# df2['Image'] now holds the *new* file names (the original column was dropped
# and replaced above), so looking those names up in base_folder can never
# match. Recover the original names from merged_df instead: df2 was built with
# merged_df.groupby('Image', sort=False), so row i of df2 corresponds to the
# i-th distinct image name in merged_df.
original_names = merged_df['Image'].drop_duplicates().tolist()
if len(original_names) != len(df2):
    raise ValueError(
        f"df2 has {len(df2)} rows but merged_df has {len(original_names)} "
        "distinct images; the two frames are no longer aligned."
    )

# Walk the source tree once and remember where each file lives, instead of
# re-walking it for every row. setdefault keeps the first location found.
source_paths = {}
for root, dirs, files in os.walk(base_folder):
    for name in files:
        source_paths.setdefault(name, os.path.join(root, name))

copied = 0
missing = []
for original_name, new_filename, collection in zip(
    original_names, df2['Image'], df2['Collection']
):
    src_path = source_paths.get(original_name)
    if src_path is None:
        missing.append(original_name)
        continue

    # Create the new subfolder in the output folder if it doesn't exist
    new_subfolder = os.path.join(output_folder, collection)
    os.makedirs(new_subfolder, exist_ok=True)

    # new_filename is the "<collection>_<index><extension>" name generated earlier.
    shutil.copy(src_path, os.path.join(new_subfolder, new_filename))
    copied += 1

# Report what actually happened rather than claiming success unconditionally.
print(f"{copied} of {len(df2)} images copied and renamed.")
if missing:
    print(f"{len(missing)} images were not found under {base_folder}, e.g. {missing[:5]}")