In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#%cd /content/drive/MyDrive/cv

/content/drive/MyDrive/cv


In [None]:
#!pip install split-folders
#!pip install pytest-shutil

In [1]:
import pandas as pd
import html
import numpy as np
import string
import re

In [None]:
artwork_df = pd.read_csv('data/raw/artwork.csv')

# Drops rows without any tag associated. A single boolean mask replaces the
# per-row `drop` calls, each of which rebuilt the whole frame.
artwork_df = artwork_df[artwork_df['tag'] != '[]'].copy()

# Decodes html encoding in the free-text columns. Whole columns are assigned
# back at once because chained indexing (`df[col][ind] = value`) is not
# guaranteed to modify the frame. Cells that are not strings (e.g. NaN) are
# passed through untouched, since html.unescape only accepts str.
for col in ('artwork', 'artist', 'serie', 'period', 'gallery'):
  artwork_df[col] = artwork_df[col].map(
    lambda value: html.unescape(value) if isinstance(value, str) else value)

# removes " from every column except 'tag' (the tag list needs its quotes as
# token delimiters); numeric columns have no .str accessor and are skipped
for col in artwork_df:
  if col != 'tag' and not pd.api.types.is_numeric_dtype(artwork_df[col]):
    artwork_df[col] = artwork_df[col].str.replace('"', '', regex=False)

artwork_df.to_csv('data/processed/artwork.csv')

In [None]:
# Stratified 20% sample: the same fraction is drawn from every style.
# GroupBy.sample returns the selected rows with all their columns (including
# 'style'), and the fixed random_state makes the draw repeatable, so a
# Restart & Run All regenerates the same downstream files.
sampling_df = artwork_df.groupby('style').sample(frac=0.20, random_state=42)
sampling_df.to_csv('data/processed/artwork_sampled.csv')

In [2]:
# filters tags
tag_df = pd.read_csv('data/raw/tag_count.csv')

th = 200 # minimum occurrences
tag_df['t.name'] = tag_df['t.name'].str.replace('"', '')

# One boolean mask separates the tags to keep from the rare ones, instead of
# dropping rare rows one at a time inside a Python loop.
is_frequent = tag_df['count'] >= th
removed = int((~is_frequent).sum())        # number of tags below the threshold
count = tag_df.loc[~is_frequent, 'count'].sum()  # their pooled occurrences

# tag name -> occurrences for every kept tag; used later by tag_substitution.
# 'Other' is only registered when at least one tag was actually pooled.
tag_dictionary = dict(zip(tag_df.loc[is_frequent, 't.name'],
                          tag_df.loc[is_frequent, 'count']))
if removed:
  tag_dictionary['Other'] = count

tag_df = tag_df[is_frequent].copy()

# Append the pooled 'Other' row. The kept rows still carry their original
# index labels, so `len(tag_df.index)` may already be taken; writing to it
# would overwrite that tag instead of adding a row. Fall back to a fresh label.
other = ['Other', count]
other_label = len(tag_df.index)
if other_label in tag_df.index:
  other_label = tag_df.index.max() + 1
tag_df.loc[other_label] = other

tag_df = tag_df.sort_values('count', ascending=False)
tag_df.to_csv('data/processed/tag_filtered.csv')
print('{} tags with less than {} occurrences, {} remaining'.format(removed, th, len(tag_dictionary)))

5151 tags with less than 200 occurrences, 274 remaining


In [5]:
def _replace_rare_tags(tag_field, allowed_tags):
    """Return `tag_field` with every double-quoted tag not in `allowed_tags` replaced by Other."""
    if not isinstance(tag_field, str):
        # missing values (NaN) carry no tags; leave them as they are
        return tag_field

    def keep_or_pool(match):
        # group(1) is the tag text without its surrounding quotes
        return match.group(0) if match.group(1) in allowed_tags else 'Other'

    # Each quoted token is matched as a whole, so a rare tag can never be
    # replaced inside a longer tag, and the tag text is never interpreted
    # as a regular expression.
    return re.sub(r'"([^"]*)"', keep_or_pool, tag_field)


def tag_substitution(input, output, allowed_tags=None):
    """Replace rare tags in an artwork CSV with 'Other' and write the result.

    Parameters
    ----------
    input : str
        Path of the CSV to read; it must contain a 'tag' column whose cells
        look like `["tag a", "tag b"]`.
    output : str
        Path the updated CSV is written to.
    allowed_tags : container of str, optional
        Tags to keep. Defaults to the notebook-level `tag_dictionary`, so the
        tag-filter cell must have been run first when this is omitted.

    Every double-quoted tag that is not in `allowed_tags` becomes the
    unquoted token Other; everything else in the cell is left untouched.
    """
    if allowed_tags is None:
        allowed_tags = tag_dictionary

    artwork_df = pd.read_csv(input)

    # replace tags that are not in the allowed set with 'Other'; the column is
    # assigned back as a whole rather than through chained indexing
    artwork_df['tag'] = artwork_df['tag'].map(
        lambda field: _replace_rare_tags(field, allowed_tags))

    artwork_df.to_csv(output)

In [None]:
# Reads the sampled artworks, replaces every tag that is not a key of
# `tag_dictionary` with 'Other' and writes the result to artwork_filtered.csv.
# Requires the tag-filter cell (which builds `tag_dictionary`) to have run first.
tag_substitution(r'data/processed/artwork_sampled.csv', r'data/processed/artwork_filtered.csv')

In [None]:
# drops artworks in raw/artwork_captions.csv if not in processed/artwork_filtered.csv, else decodes html encoding
caption_df = pd.read_csv('data/raw/artwork_captions.csv')
artwork_df = pd.read_csv('data/processed/artwork_filtered.csv')
artwork_list = artwork_df['art_file'].values.tolist()

# `isin` filters in one pass; the previous per-row `drop` combined with a
# list membership test was quadratic in the number of rows.
caption_df = caption_df[caption_df['image'].isin(artwork_list)].copy()

# The column is assigned back as a whole because chained indexing
# (`df['name'][ind] = ...`) is not guaranteed to modify the frame. Cells that
# are not strings (e.g. NaN) are passed through, since html.unescape only
# accepts str.
caption_df['name'] = caption_df['name'].map(
  lambda value: html.unescape(value) if isinstance(value, str) else value)

caption_df = caption_df.sort_values('image', ascending=True)
caption_df.to_csv('data/processed/artwork_caption.csv')

# drops artworks in processed/artwork_filtered.csv if not in processed/artwork_caption.csv
caption_list = caption_df['image'].values.tolist()
artwork_df = artwork_df[artwork_df['art_file'].isin(caption_list)]

artwork_df = artwork_df.sort_values('art_file', ascending=True)
artwork_df.to_csv('data/processed/artwork_filtered.csv')

In [None]:
# builds the one-sentence textual description of a single artwork
def generate_description(df, ind):
  """Compose the description sentence for row `ind` of `df`.

  The sentence always opens with the artwork title. A clause is then added
  for each of style, genre, media and emotion, in that order, unless the
  corresponding cell holds the placeholder '[]'. A final '.' closes it.
  """
  # (column, clause template) pairs, in the order the clauses appear
  clauses = (
    ('style', ', painted following the {} art style'),
    ('genre', ', belongs to the {} genre'),
    ('media', ', made of {}'),
    ('emotion', ', elicits {} emotion'),
  )

  title = df['artwork'][ind]
  # read every attribute up front so a missing column fails before any text is built
  values = [df[column][ind] for column, _ in clauses]

  pieces = ['The artwork is entitled [{}]'.format(title)]
  for (_, template), value in zip(clauses, values):
    if value != '[]':
      pieces.append(template.format(value))
  pieces.append('.')

  return ''.join(pieces)

In [None]:
# merge description and caption
caption_df = pd.read_csv('data/processed/artwork_caption.csv')
artwork_df = pd.read_csv('data/processed/artwork_filtered.csv')

# column layout of the output file
info_frame = {'image': [], 'artwork': [], 'caption' : [], 'info' : [], 'tag' : []}

# strips the list brackets and the quotes around each tag, keeping the commas
pattern = r'"?\[?\'?(.*?)\'?(,)*\]?"?'
replacement = r'\1\2'

# Captions are looked up by image name rather than by comparing the two frames
# row by row: the frames are loaded independently, so equal index labels are
# not guaranteed to describe the same image (and a label missing from
# caption_df would raise KeyError). When an image has several captions the
# first one is used.
unique_captions = caption_df.drop_duplicates('image')
caption_by_image = dict(zip(unique_captions['image'], unique_captions['caption']))

# Rows are collected in a list and turned into a frame once; appending to a
# DataFrame row by row re-allocates it on every insertion.
records = []
for ind in artwork_df.index:
  art_file = artwork_df['art_file'][ind]
  if art_file not in caption_by_image:
    continue
  records.append({
    'image': art_file,
    'artwork': artwork_df['artwork'][ind],
    'caption': caption_by_image[art_file],
    'info': generate_description(artwork_df, ind), # description
    'tag': re.sub(pattern, replacement, artwork_df['tag'][ind]),
  })

info_df = pd.DataFrame(records, columns=list(info_frame))
info_df.to_csv('data/processed/artwork_info.csv')

In [None]:
import os
import pandas as pd
from datetime import datetime
import shutil
import pathlib

def copy_artworks(info_path, art_path, target_path):
    """Copy the artworks listed in an info CSV into `target_path`.

    Parameters
    ----------
    info_path : str
        CSV file with an 'image' column holding the file names to keep.
    art_path : str
        Folder containing the source images; only '*.jpg' files are considered.
    target_path : str
        Destination folder. It is created when missing. Files that already
        exist there are not copied again, so the call can be repeated safely.
    """
    df = pd.read_csv(info_path)
    # a set gives constant-time membership tests while scanning the image folder
    artworks = set(df['image'].values.tolist())

    print("[LOG] {} fetching artwork dataset...".format(datetime.now()))
    art = [f.name for f in pathlib.Path(art_path).glob('*.jpg')]

    print("[LOG] {} copying artworks to {}...".format(datetime.now(), target_path))
    # Without an existing directory shutil.copy2 would treat target_path as the
    # destination *file* name and keep overwriting that single file.
    os.makedirs(target_path, exist_ok=True)
    copied = 0
    for f in art:
        if f in artworks and not os.path.isfile(os.path.join(target_path, f)):
            shutil.copy2(os.path.join(art_path, f), target_path)
            copied = copied+1
    print("[LOG] {} {} artworks copied to {}!".format(datetime.now(), copied, target_path))

# splits the artwork info .csv into one .csv per split folder (e.g. training, test and validation), according to their respective files
def split_frame(frame, set):
    """Write one metadata CSV per split folder found under `set`.

    Parameters
    ----------
    frame : str
        Path of the info CSV with the columns 'image', 'artwork', 'caption',
        'info' and 'tag'.
    set : str
        Root folder of the split dataset. Every sub-folder `<fold>` that
        contains an `images-resized/` directory gets a `<fold>/<fold>.csv`
        listing the metadata of the images stored in it, sorted by image name.
        The 'info' column is written out under the name 'description'.

    Entries of `set` without an `images-resized/` directory are skipped, and
    so are image files that have no row in `frame` (their number is logged).
    """
    # read the file the caller asked for (the argument used to be ignored in
    # favour of a hard-coded path)
    df = pd.read_csv(frame)

    # image name -> metadata of its first row, for constant-time lookups
    metadata = (df.drop_duplicates('image')
                  .set_index('image')[['artwork', 'caption', 'info', 'tag']]
                  .to_dict('index'))
    columns = ['image', 'artwork', 'caption', 'description', 'tag']

    folders = os.listdir(set)
    for fold in folders:
        images_dir = os.path.join(set, f'{fold}/images-resized/')
        if not os.path.isdir(images_dir):
            # stray files or unrelated folders next to the splits
            continue

        print("[LOG] {} fetching {} data...".format(datetime.now(), fold))
        rows = []
        skipped = 0
        # the context manager closes the directory iterator deterministically
        with os.scandir(images_dir) as files:
            for f in files:
                record = metadata.get(f.name)
                if record is None:
                    skipped += 1
                    continue
                rows.append([f.name, # image
                             record['artwork'], # artwork
                             record['caption'], # caption
                             record['info'], # description
                             record['tag']]) #tag
        if skipped:
            print("[LOG] {} {} files in {} have no metadata and were skipped".format(datetime.now(), skipped, fold))

        set_df = pd.DataFrame(rows, columns=columns)
        set_df = set_df.sort_values('image', ascending=True)
        set_df.to_csv(os.path.join(set, f'{fold}/{fold}.csv'))
        print("[LOG] {} {} data successfully fetched!".format(datetime.now(), fold))

In [None]:
# Copies every .jpg from raw-images/images-resized whose file name appears in the
# 'image' column of artwork_info.csv into raw-images/images-filtered; files that
# are already present in the target are not copied again.
copy_artworks(info_path=r'data/processed/artwork_info.csv', art_path=r'raw-images/images-resized', target_path=r'raw-images/images-filtered')

In [None]:
# third-party package (see the commented-out `pip install split-folders` cell at the top)
import splitfolders

# Splits the filtered images into dataset/ with a fixed seed and the ratio
# (.8, 0.0, 0.2) -- presumably train / val / test; confirm against the
# split-folders documentation.
# NOTE(review): split_frame looks for images under dataset/<fold>/images-resized/,
# so the images presumably have to sit in an `images-resized` sub-folder of the
# splitfolders input -- TODO confirm the layout of raw-images/images-filtered.
splitfolders.ratio(input=r'raw-images/images-filtered', output='dataset', seed=42, ratio=(.8, 0.0, 0.2))
# Writes dataset/<fold>/<fold>.csv with the metadata of the images in each fold.
split_frame(frame=r'data/processed/artwork_info.csv', set=r'dataset/')