In [1]:
from etltools.image import upload_file, upload_from_url

import os
import pandas as pd

# Guide
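This short notebook uploads the images referenced in a CSV file to Azure. For every image column it adds a `persistent_<column>` column containing the new persistent URL, and it writes the updated copy of the CSV to the `output/` folder (the input file under `data/` is left unchanged).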

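Please note that each cell of an image column is assumed to hold a single image reference (a URL or a local file path); if a record has several images, they must be stored in **separate** columns of the CSV.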

In [2]:
# Input CSV; it is expected to live in the ./data/ folder.
file_name = 'test.csv'
input_path = f'./data/{file_name}'

# Read the table and preview its first rows.
df = pd.read_csv(input_path)
df.head()

Unnamed: 0,id,image
0,1,https://www.dhm.de/datenbank/goering/ccp_img_r...
1,2,https://kunstgraph.blob.core.windows.net/image...


In [3]:
# --- Settings to adjust before running the upload ---

# Name of the source where the images were found; it is handed to the upload
# helpers together with each image.
source_name = 'wccp'

# Column holding the unique identifier of each image.
identifier_column = 'id'

# Columns whose cells reference the images to upload.
image_columns = ['image']

In [4]:
# Upload every referenced image and store the value returned by the upload
# helper in a new "persistent_<column>" column.
for image_column in image_columns:
    # Collect one result per row and assign the column in a single step, so the
    # column is always created and df is not modified while it is iterated.
    persistent_urls = []
    for _, row in df.iterrows():
        image_ref = row[image_column]
        identifier = row[identifier_column]

        if pd.isna(image_ref):
            # Empty cell: there is nothing to upload, and the substring test
            # below would raise a TypeError on a NaN value.
            image_url = None
        elif 'http' in image_ref:
            # Remote image: uploaded straight from its URL.
            image_url = upload_from_url(image_ref, identifier, source_name)
        else:
            # Anything else is treated as a local file path. The upload is
            # named after this row's identifier value (not the name of the
            # identifier column), consistent with the URL branch.
            image_url = upload_file(directory_name=source_name, file_path=image_ref, file_name=identifier)

        persistent_urls.append(image_url)

    df[f"persistent_{image_column}"] = persistent_urls


# Make sure the output folder exists; exist_ok=True makes this a no-op when it
# is already there and avoids the check-then-create race.
os.makedirs('output', exist_ok=True)

# Save the table, including the new persistent-URL columns, under the same
# file name as the input; the DataFrame index is not written.
df.to_csv(f'output/{file_name}', index=False)