In [None]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install Pillow

# Code to check dimensions of images



In [None]:
from PIL import Image
from PIL.ExifTags import TAGS
import os
import datetime
import csv
import json
import exifread # https://github.com/ianare/exif-py

# identity sort key: the elements of a simple list are ordered by their own value
def sort_funct(row):
    """Return ``row`` unchanged so it can be passed as the ``key=`` of a sort."""
    sort_key = row
    return sort_key

# dump a table (iterable of dictionaries) to a CSV file with a header row
def write_dicts_to_csv(table, filename, fieldnames):
    """Write ``table`` to ``filename`` as UTF-8 CSV.

    table      -- iterable of dictionaries, one per output row
    filename   -- path of the CSV file to create (an existing file is overwritten)
    fieldnames -- column names, in output order; written first as the header row
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

def get_list_from_csv(filename):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Only the line terminator is removed from each line; every other character
    (including leading/trailing spaces) is kept exactly as it appears in the file.
    A final line that is not terminated by a newline is returned whole rather
    than losing its last character.
    """
    with open(filename, 'rt', encoding='utf-8') as file_object:
        # text mode already translates '\r\n' to '\n', so a single '\n' check is enough
        return [
            one_line[:-1] if one_line.endswith('\n') else one_line
            for one_line in file_object
        ]

# Directory the notebook is run from; suffixes.csv is read from here.
working_directory = os.getcwd()
#working_directory = str(Path.home()) # gets path to home directory

# Root of the image archive. Keep the trailing slash: subdirectory names are appended to it directly.
image_dir = '/users/baskausj/gallery_digital_image_archive/'
print(image_dir)

# NOTE: do NOT open the suffixes.csv file with spreadsheet software unless you load the column as text!
# Most software will do number conversions and mess up the spaces, parens, etc.
suffixes_path = f'{working_directory}/suffixes.csv'
suffixes = get_list_from_csv(suffixes_path)

In [None]:
# Display the list of suffixes loaded from suffixes.csv
print(suffixes)

In [None]:

# All entries directly under image_dir, in sorted order.
old_subdir_names = os.listdir(image_dir)
old_subdir_names.sort(key = sort_funct)

# Keep only visible directories: hidden entries (leading '.') are dropped, and so are
# regular files, which would otherwise make os.listdir() fail when the entry is scanned.
subdir_names = [
    subdir_name for subdir_name in old_subdir_names
    if not subdir_name.startswith('.')
    and os.path.isdir(os.path.join(image_dir, subdir_name))
]


In [None]:
# Display the subdirectory names that will be scanned
print(subdir_names)

In [None]:
# Build one record (dictionary) per image file found directly inside each subdirectory.
images = []
for subdir_name in subdir_names:
    #if subdir_name != '1999': # Uncomment to test with a single year
    #    continue
    image_subdir = image_dir + subdir_name
    print(image_subdir)
    # keep only regular files; nested directories are not descended into
    image_names = [
        item for item in os.listdir(image_subdir)
        if os.path.isfile(os.path.join(image_subdir, item))
    ]
    for image_name in image_names:
        #if image_name != '1999.172ee.tif': # Uncomment to test with a single file
        #    continue
        if image_name[0] == '.': # skip hidden files
            continue

        # Assumes names shaped like '1999.172ee.tif': a 4-character year, one separator
        # character, then the rest of the accession string, an optional suffix and the extension.
        year = image_name[0:4]
        rest_pieces = image_name[5:].split('.') # separate into pieces by full stops
        extension = rest_pieces[-1] # the last piece is the file extension
        rest = '.'.join(rest_pieces[:-1]) # re-assemble the other pieces, restoring the periods

        # Remove the first listed suffix that ends the accession string (if any).
        accession = year + '.' + rest
        for suffix in suffixes:
            # an empty suffix (blank line in suffixes.csv) has nothing to strip
            if suffix and rest.endswith(suffix):
                accession = year + '.' + rest[:-len(suffix)]
                break # stop checking suffixes

        image_path = image_subdir + '/' + image_name

        # Files that Pillow cannot read as an image are recorded with 0 x 0 dimensions.
        try:
            with Image.open(image_path) as img:
                width, height = img.size
        except Exception:
            width = 0
            height = 0

        try:
            # First try to get the actual image creation date from the EXIF
            # Code from https://stackoverflow.com/questions/23064549/get-date-and-time-when-photo-was-taken-from-exif-data-using-pil
            # NOTE(review): the exifread README shows stop_tag without the 'EXIF ' prefix;
            # confirm whether this value actually stops parsing early.
            with open(image_path, 'rb') as fh:
                tags = exifread.process_file(fh, stop_tag='EXIF DateTimeOriginal')
            date_taken = tags['EXIF DateTimeOriginal'] # KeyError when the tag is absent
            create_date_string = str(date_taken)[:10].replace(':', '-')
            if create_date_string == '0000-00-00':
                raise ValueError('Bad date')
        except Exception:
            # If that's unavailable, then use the file creation date.
            # st_birthtime is not provided on every platform/Python version, so fall back
            # to the last-modified time instead of failing when it is missing.
            file_stat = os.stat(image_path)
            timestamp = getattr(file_stat, 'st_birthtime', file_stat.st_mtime)
            time_object = datetime.datetime.fromtimestamp(timestamp)
            create_date_string = time_object.strftime("%Y-%m-%d")

        # 1969-12-31 is treated as a bogus date: use the last-modified date instead.
        if create_date_string == '1969-12-31':
            timestamp = os.stat(image_path).st_mtime
            time_object = datetime.datetime.fromtimestamp(timestamp)
            create_date_string = time_object.strftime("%Y-%m-%d")

        # Key order here determines the column order of the CSV written later.
        images.append({
            'name': image_name,
            'accession': accession,
            'kilobytes': round(os.path.getsize(image_path)/1024),
            'height': height,
            'width': width,
            'create_date': create_date_string,
            'subdir': subdir_name,
            'extension': extension,
        })

#print(json.dumps(images, indent=2))
print('done')

In [None]:
# Display how many image records were collected
print(len(images))

In [None]:
# Column names (and their order) are taken from the keys of the first image record.
fieldnames = list(images[0])
output_path = f'{working_directory}/image_dimensions.csv'
write_dicts_to_csv(images, output_path, fieldnames)

print('done')

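Hack: in `images.csv`, replace every `create_date` of 1969-12-31 with the `create_date` recorded for the same file name in `image_dimensions.csv` (where such dates were already replaced by the last-modified date).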

In [None]:
import pandas as pd

# Both tables are loaded with every column as a string and with blanks left as empty strings.
image_dimensions = pd.read_csv('image_dimensions.csv', na_filter=False, dtype = str)
images = pd.read_csv('images.csv', na_filter=False, dtype = str)

#images = images.head(200).copy(deep=True)

# Lookup of file name -> create_date; when a name occurs more than once the first row wins.
date_by_name = (
    image_dimensions
    .drop_duplicates(subset='name', keep='first')
    .set_index('name')['create_date']
)

# Rows whose date is the 1969-12-31 placeholder get the date recorded in image_dimensions.csv.
needs_fix = images['create_date'] == '1969-12-31'
corrected = images.loc[needs_fix, 'name'].map(date_by_name).dropna()
if not corrected.empty:
    images.loc[corrected.index, 'create_date'] = corrected

# Names that are missing from image_dimensions.csv are left untouched rather than raising.
unresolved_count = int(needs_fix.sum()) - len(corrected)
if unresolved_count > 0:
    print(unresolved_count, 'rows with 1969-12-31 had no match in image_dimensions.csv')

images.to_csv('images.csv', index = False)