In [19]:
import pandas as pd

# Every table is read as plain strings (dtype=str) with NA detection switched off
# (na_filter=False), then indexed by the column used as the join key later on.

# images.csv: accession numbers are not unique. They are clean except where an image
# involves multiple works, has no accession number, etc.
# The Q IDs in this table are not unique either.
images = pd.read_csv('images.csv', dtype=str, na_filter=False).set_index('local_filename')

# image_dimensions.csv: accession numbers are neither unique nor clean, because they
# were simply parsed out of the filenames.
image_dimensions = pd.read_csv('image_dimensions.csv', dtype=str, na_filter=False).set_index('name')

# commons_images.csv: local filenames should be unique, but they may have been changed,
# so they are not guaranteed to match the other dataframes.
# There is generally one uploaded image per work, with some exceptions, so Q IDs and
# accession numbers are not guaranteed to be unique.
commons_images = pd.read_csv('commons_images.csv', dtype=str, na_filter=False).set_index('local_filename')

# works_multiprop.csv: this was the upload file for Wikidata, so Q IDs should be unique.
# Q IDs and accession numbers should map one-to-one, so accession numbers should be
# unique too, and clean, since they served as the inventory number for the upload.
# NOTE(review): `works` is read again and re-indexed by 'qid' in a later cell before any
# use visible in this notebook; this load keyed on 'inventory_number' looks redundant.
works = pd.read_csv('../works_multiprop.csv', dtype=str, na_filter=False).set_index('inventory_number')


In [20]:
# Remove the contiguous run of columns from 'accession' through 'subdir'
# (label-based slices include both endpoints).
# NOTE(review): this rebinds `image_dimensions`, so re-running the cell without first
# reloading the CSV will probably fail because 'accession' no longer exists.
image_dimensions = image_dimensions.drop(
    columns=image_dimensions.loc[:, 'accession':'subdir'].columns
)


In [22]:
# Outer-join images with image_dimensions on their indexes, so a filename that appears
# in only one of the two tables is still kept in the result.
# The two indexes carry different names ('local_filename' vs 'name'), so the index of
# the joined frame is explicitly labelled 'local_filename'.
combined_images = (
    images
    .merge(image_dimensions, how='outer', left_index=True, right_index=True)
    .rename_axis('local_filename')
)

combined_images.head()


Unnamed: 0_level_0,qid,local_identifier,rank,label,notes,kilobytes,height,width,photo_inception,subdir,extension
local_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1956-001.tif,Q102961253,1956.001,primary,,,73531,2611,4805,2020-07-23,1956,tif
1956-003.tif,Q103297456,1956.003,primary,,,47806,3600,2265,2020-08-18,1956,tif
1956-026.tif,Q103310082,1956.026,primary,,,11861,1702,1188,2020-07-23,1956,tif
1956-027.tif,Q103308843,1956.027,primary,,,126728,5651,3826,2020-07-23,1956,tif
1956-028.tif,Q102974499,1956.028,primary,,,111022,5078,3730,2020-07-23,1956,tif


In [23]:
# Preview the first five rows of the Commons upload table before joining it.
commons_images.head(n=5)

Unnamed: 0_level_0,qid,commons_id,local_identifier,label_en,directory,rank,image_name,iiif_manifest,notes
local_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1984.021.tif,Q102961225,M113028886,1984.021,"A Conversation with Guido di Brettinoro, (Purg...",1984,,A Conversation with Guido di Brettinoro - Vand...,https://iiif-manifest.library.vanderbilt.edu/g...,"Trial upload by script, linked by Quickstateme..."
1979.0264P.tif,Q102961508,M113040378,1979.0264P,Adoration of the Sheperds,1979,,Adoration of the Sheperds - Vanderbilt Fine Ar...,https://iiif-manifest.library.vanderbilt.edu/g...,
1993.222.tif,Q102961566,M113051913,1993.222,Album painting of two men in a cottage in a mo...,1993,,Album painting of two men in a cottage in a mo...,https://iiif-manifest.library.vanderbilt.edu/g...,
1979.0303P.tif,Q102961830,M113053575,1979.0303P,Autumn River,1979,,Autumn River - Vanderbilt Fine Arts Gallery - ...,https://iiif-manifest.library.vanderbilt.edu/g...,
1973.005.tif,Q102962942,M113053895,1973.005,Caney Fork Cabin,1973,,Caney Fork Cabin - Vanderbilt Fine Arts Galler...,https://iiif-manifest.library.vanderbilt.edu/g...,


In [24]:
# Outer-join the local image table (combined_images) with the Commons upload table
# (commons_images) on their shared 'local_filename' index.
# Columns present in both frames (qid, local_identifier, rank, notes in the previews)
# come out with pandas' default '_x' (left) and '_y' (right) suffixes.
# NOTE(review): the previews show hyphenated names such as '1956-001.tif' on the left and
# dotted names such as '1984.021.tif' on the right; confirm the keys actually line up.
images_with_commons = combined_images.merge(
    commons_images,
    how='outer',
    left_index=True,
    right_index=True,
)

# Write the joined table to disk; the index is written as the first column.
images_with_commons.to_csv('combined_images.csv')

In [25]:
# works_multiprop.csv was the upload file for Wikidata, so Q IDs should be unique.
# Q IDs and accession numbers should map one-to-one, so the accession numbers should be
# unique as well, and clean, since they served as the inventory number for the upload.
# The table is keyed on 'qid' here so it can be joined with the artwork metadata.
works = pd.read_csv('../works_multiprop.csv', dtype=str, na_filter=False).set_index('qid')

# Artwork metadata, read as strings with no NA detection and keyed on 'qid' as well.
artworks = pd.read_csv('artwork_metadata.csv', dtype=str, na_filter=False).set_index('qid')

In [26]:
# Discard 'inventory_number_collection' and every column to its right, keeping only the
# columns that precede it.
# NOTE(review): `works` is rebound here, so this cell cannot be re-run without reloading.
works = works.drop(
    columns=works.loc[:, 'inventory_number_collection':].columns
)
works.head()

Unnamed: 0_level_0,label_en,description_en,inventory_number_uuid,inventory_number
qid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q102305506,The Christ of the Breadlines,print by Fritz Eichenberg,60965535-6F8D-4CB7-B702-D6A84E576DA3,2017.001.062
Q102315563,Seated female funerary figure,ceramics by Artist Unknown,9438F518-B646-4E12-AADD-D5FF50DF3346,1994.413
Q102315787,Head of a Young Girl or a Goddess,sculpture by Artist Unknown,B5F33FEE-A2B6-4D77-8764-C3942DC09E1B,1969.014
Q102949359,"""I Hate You For Hitting My Mother,"" Minneapolis",photograph by Donna Ferrato,AE86C1A8-48F0-4FD4-92D2-6DB10C878E24,2013.042
Q102958387,(An Imaginary) Night Attack in the War Between...,print by Yoshitora,B7A0A58E-8E8B-4B6A-A1E0-1EE854BA62B0,1992.139 a-c


In [27]:
# Discard the contiguous columns from 'label_en' through 'inventory_number_uuid'
# (both endpoints included); per the preview above, only 'inventory_number' is left.
# NOTE(review): `works` is rebound here, so this cell cannot be re-run without reloading.
works = works.drop(
    columns=works.loc[:, 'label_en':'inventory_number_uuid'].columns
)
works.head()

Unnamed: 0_level_0,inventory_number
qid,Unnamed: 1_level_1
Q102305506,2017.001.062
Q102315563,1994.413
Q102315787,1969.014
Q102949359,2013.042
Q102958387,1992.139 a-c


In [28]:
# Outer-join the trimmed works table with the artworks metadata on their shared 'qid'
# index, so a Q ID that appears in only one of the two tables is still kept.
works_clean = works.merge(
    artworks,
    how='outer',
    left_index=True,
    right_index=True,
)

# Write the joined works table to disk; the 'qid' index is written as the first column.
works_clean.to_csv('works_clean.csv')