# Script to screen works for possible Commons upload

## Configuration section

In [122]:
# Import modules
import json
import csv
import math
import datetime
from time import sleep
import requests
import re # regex
# Pandas for data frame management
import pandas as pd
# Fuzzy string matching
from fuzzywuzzy import fuzz # fuzzy logic matching

# Configuration

# Values of the IP status that are accepted as public domain by the screening loop.
# A work whose status is not in this list is skipped.
public_domain_categories = [
    'artist died before copyright cutoff',
    'artist was born before 1800',
    'assessed to be out of copyright',
    'from style or period that ended prior to copyright cutoff',
    'inception prior to copyright cutoff'
]

# Options for filtering by image size
# 'pixsquared' compares height * width of the image with the pixel minimum,
# 'filesize' compares the image size in kilobytes with the file size minimum.
# Any other value (including 'filetype', which is not implemented) applies no size filter.
size_filter = 'pixsquared' # options: filetype, filesize, pixsquared
required_filetype = 'tiff' # not implemented (yet)
minimum_filesize = 1000 # compared with the 'kilobytes' column of the image dimensions data
minimum_pixel_squared = 1000000 # minimum allowed value of height * width

# The two settings above were originally defined under misspelled names. Those names are kept as
# aliases so that code referring to them (e.g. the screening loop reads minimup_pixel_squared) keeps working.
requrired_filetype = required_filetype
minimup_pixel_squared = minimum_pixel_squared

# Load the lookup tables. Every column is read as a string, and empty cells are kept as empty strings
# rather than being turned into NaN (na_filter=False).

# Works metadata, indexed by Q ID. Using qid as the index requires that each row has a unique qid value;
# this should be the case.
works_metadata = pd.read_csv('../works_multiprop.csv', na_filter=False, dtype=str).set_index('qid')

# Image file data (one row per image file); the numeric columns are converted from strings to integers
image_dimensions = pd.read_csv('image_dimensions.csv', na_filter=False, dtype=str)
image_dimensions = image_dimensions.astype({'kilobytes': int, 'height': int, 'width': int})

# The remaining lookup tables are all indexed by Q ID
works_classification = pd.read_csv(
    '../../gallery_buchanan/works_classification.csv', na_filter=False, dtype=str
).set_index('qid')

works_ip_status = pd.read_csv('../items_status_abbrev.csv', na_filter=False, dtype=str).set_index('qid')

existing_images = pd.read_csv('commons_images.csv', na_filter=False, dtype=str).set_index('qid')

# For testing purposes, keep only the first few rows of the works metadata
test_rows = 300
works_metadata = works_metadata.head(test_rows).copy()


In [110]:
# Preview the first two rows of the works metadata table
works_metadata.head(2)

Unnamed: 0_level_0,label_en,description_en,inventory_number_uuid,inventory_number,inventory_number_collection,inventory_number_ref1_hash,inventory_number_ref1_statedIn,inventory_number_ref1_referenceUrl,inventory_number_ref1_retrieved_nodeId,inventory_number_ref1_retrieved_val,...,collection_ref1_retrieved_val,collection_ref1_retrieved_prec,location_uuid,location,location_ref1_hash,location_ref1_statedIn,location_ref1_referenceUrl,location_ref1_retrieved_nodeId,location_ref1_retrieved_val,location_ref1_retrieved_prec
qid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q102305506,The Christ of the Breadlines,print by Fritz Eichenberg,60965535-6F8D-4CB7-B702-D6A84E576DA3,2017.001.062,Q18563658,d1f8238d0b4bf5fadeb75381feceacfdd7f12af5,,https://library.artstor.org/#/asset/26760365,0f364e2fcaf6b3e5870f19adb2cba00e,2020-11-30T00:00:00Z,...,2020-11-30T00:00:00Z,11,87C848D7-1589-4F67-9DBE-12FE22CFF157,Q18563658,d1f8238d0b4bf5fadeb75381feceacfdd7f12af5,,https://library.artstor.org/#/asset/26760365,0f364e2fcaf6b3e5870f19adb2cba00e,2020-11-30T00:00:00Z,11
Q102315563,Seated female funerary figure,ceramics by Artist Unknown,9438F518-B646-4E12-AADD-D5FF50DF3346,1994.413,Q18563658,7f7046f91b3d0ddd7d26cffcc9f1054db778fafa,,https://library.artstor.org/#/asset/26757434,0f364e2fcaf6b3e5870f19adb2cba00e,2020-11-30T00:00:00Z,...,2020-11-30T00:00:00Z,11,83482B6C-AC70-46B8-AC76-2E65F1C42351,Q18563658,7f7046f91b3d0ddd7d26cffcc9f1054db778fafa,,https://library.artstor.org/#/asset/26757434,0f364e2fcaf6b3e5870f19adb2cba00e,2020-11-30T00:00:00Z,11


In [111]:
# Preview the first two rows of the image dimensions table
image_dimensions.head(2)

Unnamed: 0,name,accession,kilobytes,height,width,subdir,extension
0,1956.058.tif,1956.058,197147,6540,10288,1956,tif
1,1956.028.jpg,1956.028,71,428,312,1956,jpg


In [22]:
# Preview the first two rows of the works classification table
works_classification.head(2)

Unnamed: 0_level_0,dimension,type,label
qid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Q103296133,2D,artists book,(Shinzo) Ukiyo E Ruiko; two volumes
Q102961187,2D,artists book,1 Sekunde artist's book


In [24]:
# Preview the first two rows of the IP status table
works_ip_status.head(2)

Unnamed: 0_level_0,accession_number,label_en,description_en,creator,instance_of,inception_val,inception_earliest_date_val,inception_latest_date_val,height_val,width_val,thickness_val,diameter_val,style_period,rights,media_url,status
qid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Q102976613,1970.067,Veduta della Piazza di Monte Cavallo (View of ...,print by Giovanni Battista Piranesi,Q316307,Q11060274,1908-01-01T00:00:00Z,1870-01-01T00:00:00Z,1945-01-01T00:00:00Z,18.5,27.75,,,,COPYRIGHT NOT EVALUATED,https://forum.jstor.org/assets/26753692/repres...,artist died before copyright cutoff
Q102976639,1970.062,Veduta Interna dell' Atrio del Portico di Otta...,print by Giovanni Battista Piranesi,Q316307,Q11060274,1908-01-01T00:00:00Z,1870-01-01T00:00:00Z,1945-01-01T00:00:00Z,16.5,21.75,,,,NO KNOWN COPYRIGHT,https://forum.jstor.org/assets/26753687/repres...,artist died before copyright cutoff


In [50]:
# Preview the first two rows of the table of images that are already in Commons
existing_images.head(2)

Unnamed: 0_level_0,accession_number,label_en,image_name,iiif_manifest,notes
qid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Q90716090,,Emily Thorn Vanderbilt Sloane,Emily Thorn Vanderbilt Sloane White.jpg,,
Q94702774,1979.0648P,A Baptismal Ceremony,Amico Aspertini.1474–1552A Baptismal Ceremony ...,,


In [124]:
# Step through the works one row at a time. The row index is the Q ID (a string) and the row itself is a
# Pandas series, so an item in the row is addressed by its column header, e.g. work['label_en'].
# A work that survives every screen below is printed as an upload candidate.
for index, work in works_metadata.iterrows():
    # Works that already have an image in Commons need no further screening
    if index in existing_images.index:
        continue

    # Skip works that are classified as something other than 2D. Works that are absent from the
    # classification table are not screened out here.
    # Note: looking up a single value by Q ID works because the Q ID index is unique for each row in the lookup table.
    if index in works_classification.index and works_classification.loc[index, 'dimension'] != '2D':
        continue

    # Screen for public domain images
    # NOTE: the IP status was only done for cases where the script was able to match up image file names with accession numbers.
    # It should be done again to pick up more images based on the new image_dimensions.csv file after it's cleaned up.
    # There are at least a thousand works that will get screened out here because they aren't imaged.
    if index not in works_ip_status.index:
        continue
    ip_status = works_ip_status.loc[index, 'status']
    if ip_status not in public_domain_categories:
        continue

    # Screen for high resolution images
    # Collect every image file whose accession number matches the inventory number of the work (a DataFrame)
    matching_images = image_dimensions[image_dimensions['accession'] == work['inventory_number']]
    if matching_images.empty: # no image for this work in the dimensions data
        continue

    # Of the matching files, use the one with the greatest size in kB
    largest_image = matching_images.sort_values(by=['kilobytes'], ascending=False).iloc[0]

    # Decide whether the image falls below the configured minimum size
    if size_filter == 'pixsquared':
        too_small = largest_image['height'] * largest_image['width'] < minimup_pixel_squared
    elif size_filter == 'filesize':
        too_small = largest_image['kilobytes'] < minimum_filesize
    else: # any other setting means no size filter is applied
        too_small = False
    if too_small:
        continue

    # Report the work: Q ID, label and IP status, relative image path, size in kB, height and width
    print(index)
    print(work['label_en'], ip_status)
    print('/'.join([largest_image['subdir'], largest_image['name']]))
    print(largest_image['kilobytes'])
    print(largest_image['height'], largest_image['width'])
    print()
    
    


Q102961225
A Conversation with Guido di Brettinoro, (Purgatorio, Canto 14) from Illustrations to Dante's Divine Comedy inception prior to copyright cutoff
1984/1984.021.tif
71708
3071 3984

Q102961508
Adoration of the Sheperds artist died before copyright cutoff
1979/1979.0264P.tif
47619
2734 2971

Q102961566
Album painting of two men in a cottage in a mountain landscape from style or period that ended prior to copyright cutoff
1993/1993.222.tif
41471
3659 3866

Q102961830
Autumn River artist died before copyright cutoff
1979/1979.0303P.tif
59379
2607 3886

Q102962942
Caney Fork Cabin inception prior to copyright cutoff
1973/1973.005.tif
86592
3065 4820

Q102962954
Capt. Sterrett in the Schr Enterprise paying tribute to Tripoli, August 1801 inception prior to copyright cutoff
1979/1979.0121.tif
10210
1481 2353

Q102963263
Chinese Winter Landscape inception prior to copyright cutoff
1978/1978.014.tif
21022
1890 3790

