## **Magic functions**

In [107]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## **Required installation**

In [108]:
!pip install fastai fastbook nbdev



## **Necessary imports**

In [109]:
import os
import numpy as np
import pandas as pd
from fastai import *
from fastbook import *
from fastai.vision.all import *

## **Mounting drive**

In [110]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Folder initialization**

In [111]:
%cd /content/drive/MyDrive/artwork_description_generator/data/

/content/drive/MyDrive/artwork_description_generator/data


## **csv files path**

In [112]:
csv = "csvFiles/"

In [113]:
files = os.listdir(f'{csv}')

In [114]:
files

['Netherlands.csv', 'UnitedStates.csv']

## **Creation of the dataframe**

In [115]:
df = pd.DataFrame()
for file in files:
  data_frame = pd.read_csv(f"{csv}/{file}")
  df = pd.concat([df, data_frame], ignore_index = True)

## **Dataframe shape**

In [116]:
def df_shape(df):
  print(f'Number of rows: {df.shape[0]}')
  print(f'Number of columns: {df.shape[1]}')

## **Viewing dataframe head**

In [117]:
df.head()

Unnamed: 0,ids,artists,mediums,titles,descriptions,urls
0,nl_1,Piet Mondrian,Oil on canvas,"Lozenge Composition with Yellow, Black, Blue, Red, and Gray","Piet Mondrian, a painter of the revolutionary international movement De Stijl (the Style), argued that “the straight line tells the truth.” Why, then, we might wonder, would he choose to hang a painting off axis, where its edges imply dynamic diagonals? Among other motivations, rotating the canvas allowed Mondrian to reconsider a question he spent his career exploring, namely, the relationship between the contents of a painting and what contains them. In Lozenge Composition, the squared-off black lines imply enclosure, while a single line (above the blue area) extends to the slanted edge, ...",https://www.artic.edu/artworks/109819/lozenge-composition-with-yellow-black-blue-red-and-gray
1,nl_2,Jan Sanders van Hemessen,Oil on panel,Judith,"Judith was considered one of the most heroic women of the Old Testament. According to the biblical story, when her city was besieged by the Assyrian army, the beautiful young widow gained access to the quarters of the general Holofernes. After winning his confidence and getting him drunk, she took his sword and cut off his head, thereby saving the Jewish people. Although Judith was often shown richly and exotically clothed, Jan Sanders van Hemessen chose to present her as a monumental nude, aggressively brandishing her sword even after severing Holofernes’s head.Van Hemessen was one of the...",https://www.artic.edu/artworks/4575/judith
2,nl_3,Joachim Antonisz. Wtewael,Oil on copper,The Battle between the Gods and the Giants,"The subject of the victory of the gods of Olympus over the ancient race of giants provided Joachim Wtewael with the opportunity to depict exaggerated athletic poses and striking contrasts of space and light. From the clouds, the Olympian gods wield their attributes as weapons: Jupiter hurls thunderbolts; Neptune brandishes his triton; and Mercury uses his caduceus as a spear. The helmeted figure on the right is Minerva, the goddess of wisdom and war. The painting’s gemlike effect results from the use of a copper support and from its small scale. The artist’s self-conscious display of his s...",https://www.artic.edu/artworks/105466/the-battle-between-the-gods-and-the-giants
3,nl_4,Paulus Potter,Oil on panel,Two Cows and a Young Bull beside a Fence in a Meadow,"Paulus Potter, a prolific painter and etcher during his short life, elevated images of cows, oxen, and other domestic animals to majestic emblems of nature. His lavish attention to the physical appearances of such beasts—the varied texture and coloring of their hair, their characteristic poses, their bulky contours—borders on portraiture and likely derived from drawings he made from life. With Potter, animal painting blossomed into an independent genre in the Dutch Republic.",https://www.artic.edu/artworks/146953/two-cows-and-a-young-bull-beside-a-fence-in-a-meadow
4,nl_5,Pieter Jansz. Quast,Etching in black on paper,"Lame Beggar Asking for Alms, from T is al verwart-gaern (It’s already confusing)",,https://www.artic.edu/artworks/81/lame-beggar-asking-for-alms-from-t-is-al-verwart-gaern-it-s-already-confusing


In [118]:
df.isna().sum()

ids                0
artists          127
mediums            2
titles             3
descriptions    3871
urls               0
dtype: int64

In [119]:
images_path = 'images'

In [120]:
img_folders = os.listdir(f'{images_path}')

In [121]:
img_folders

['UnitedStates', 'Netherlands']

In [122]:
str(get_image_files(f"{images_path}/{img_folders[0]}")[0]).split("/")[2].split(".")[0]

'usa_1987'

In [123]:
get_image_files_sorted(f"{images_path}/{img_folders[0]}")

(#2970) [Path('images/UnitedStates/usa_1.jpg'),Path('images/UnitedStates/usa_10.jpg'),Path('images/UnitedStates/usa_100.jpg'),Path('images/UnitedStates/usa_1000.jpg'),Path('images/UnitedStates/usa_1001.jpg'),Path('images/UnitedStates/usa_1002.jpg'),Path('images/UnitedStates/usa_1003.jpg'),Path('images/UnitedStates/usa_1004.jpg'),Path('images/UnitedStates/usa_1005.jpg'),Path('images/UnitedStates/usa_1006.jpg')...]

In [124]:
img_ids = []

In [125]:
for folder in img_folders:
  temp = [str(path).split("/")[2].split(".")[0] for path in get_image_files_sorted(f"{images_path}/{folder}")]
  img_ids.extend(temp)

In [126]:
len(img_ids)

4981

In [127]:
df.columns

Index(['ids', 'artists', 'mediums', 'titles', 'descriptions', 'urls'], dtype='object')

In [128]:
corrupted_image_indices_to_drop = [index for index in range(len(df)) if df.iloc[index]['ids'] not in img_ids]

In [129]:
df.iloc[72]

ids                                                                                   nl_73
artists                                                                       Evert Pieters
mediums                                                           Graphite drawing on paper
titles                                              Little Girl Cutting an Apple in Kitchen
descriptions                                                                            NaN
urls            https://www.artic.edu/artworks/7481/little-girl-cutting-an-apple-in-kitchen
Name: 72, dtype: object

In [130]:
corrupted_image_indices_to_drop

[72,
 271,
 285,
 310,
 315,
 323,
 324,
 332,
 341,
 343,
 345,
 354,
 368,
 479,
 496,
 679,
 725,
 782,
 1770,
 1771,
 1807,
 1959,
 2006,
 2150,
 2331,
 2630,
 2728,
 2873,
 2880,
 3205,
 3375,
 3409,
 3525,
 3863,
 3880,
 3893,
 3907,
 4022,
 4085,
 4262,
 4279,
 4350,
 4397,
 4443,
 4615,
 4635,
 4687,
 4812,
 4819,
 4882,
 4917]

In [131]:
df = df.drop(corrupted_image_indices_to_drop).reset_index(drop=True)

In [132]:
df_shape(df)

Number of rows: 4981
Number of columns: 6


In [133]:
for path in get_image_files_sorted(f"{images_path}/{img_folders[0]}"):
  print(path)
  print(str(path).split("/")[2].split(".")[0])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
images/UnitedStates/usa_1425.jpg
usa_1425
images/UnitedStates/usa_1426.jpg
usa_1426
images/UnitedStates/usa_1427.jpg
usa_1427
images/UnitedStates/usa_1428.jpg
usa_1428
images/UnitedStates/usa_1429.jpg
usa_1429
images/UnitedStates/usa_143.jpg
usa_143
images/UnitedStates/usa_1430.jpg
usa_1430
images/UnitedStates/usa_1431.jpg
usa_1431
images/UnitedStates/usa_1432.jpg
usa_1432
images/UnitedStates/usa_1433.jpg
usa_1433
images/UnitedStates/usa_1434.jpg
usa_1434
images/UnitedStates/usa_1435.jpg
usa_1435
images/UnitedStates/usa_1436.jpg
usa_1436
images/UnitedStates/usa_1437.jpg
usa_1437
images/UnitedStates/usa_1438.jpg
usa_1438
images/UnitedStates/usa_1439.jpg
usa_1439
images/UnitedStates/usa_144.jpg
usa_144
images/UnitedStates/usa_1440.jpg
usa_1440
images/UnitedStates/usa_1441.jpg
usa_1441
images/UnitedStates/usa_1442.jpg
usa_1442
images/UnitedStates/usa_1443.jpg
usa_1443
images/UnitedStates/usa_1444.jpg
usa_1444
images/UnitedSt

In [134]:
df['descriptions'].value_counts().sum()

1155

In [135]:
df['artists'].value_counts()

Hendrick Goltzius            206
Irving Penn                  154
Jan Saenredam                119
Ivan Albright                103
Irma Boom (Designer)          87
                            ... 
Marguerite Zorach              1
Harriet Hosmer (Sculptor)      1
Mariko Mori                    1
Wilson Henry Irvine            1
Christina Ramberg              1
Name: artists, Length: 1200, dtype: int64

In [136]:
df['titles'].value_counts()

Untitled               98
Self-Portrait          15
Sponge Box             14
Fragment               14
anonymous histories    10
                       ..
Harvest Talk            1
Croquet Scene           1
View of Cotopaxi        1
The Room No. VI         1
Bar, New York City      1
Name: titles, Length: 4298, dtype: int64

In [137]:
df['titles'].value_counts().sum()

4978

In [138]:
df['mediums'].value_counts().sum()

4979

In [139]:
df['mediums'].value_counts()

Gelatin silver print                                                                                                        508
Engraving on ivory laid paper                                                                                               386
Oil on canvas                                                                                                               368
Etching on ivory laid paper                                                                                                 227
Engraving on laid paper                                                                                                     130
                                                                                                                           ... 
Color video, sound; 29 min. loop                                                                                              1
Oil on silk                                                                                             

## **Images Root folder path**

In [140]:
images_root_path = "/content/drive/MyDrive/artwork_description_generator/data/images/"

## **Mapping abbreviations with root folder name**

In [160]:
path_dict = {
    'usa' : 'UnitedStates',
    'nl' : 'Netherlands',
    'ch' : 'China'
}

## **Listing image paths**

In [142]:
image_paths = [ f"{images_root_path}{path_dict[df.iloc[i]['ids'].split('_')[0]]}/{df.iloc[i]['ids']}.jpg" for i in range(len(df))]

## **New column with image paths**

In [143]:
df['images_path'] = image_paths

In [144]:
# for i in range(len(df)):
#   prefix = df.iloc[i]['ids'].split("_")[0]
#   country = path_dict[prefix]
#   image_paths.append(f"{images_root_path}{country}/{df.iloc[i]['ids']}.jpg")

In [145]:
# image_paths

In [146]:
titles_dict = df['titles'].value_counts().to_dict()

In [147]:
multiple_same_titles = {key: 0 for key, value in titles_dict.items() if value > 1}

In [148]:
indices_to_drop = []

In [149]:
for i in range(len(df)):
  if df.iloc[i]['titles'] in multiple_same_titles.keys() and multiple_same_titles[df.iloc[i]['titles']] == 0:
    multiple_same_titles[df.iloc[i]['titles']] = 1
  elif df.iloc[i]['titles'] in multiple_same_titles.keys() and multiple_same_titles[df.iloc[i]['titles']] == 1:
    indices_to_drop.append(i)

In [150]:
len(indices_to_drop)

680

In [151]:
df = df.drop(indices_to_drop).reset_index(drop=True)
df.head()

In [159]:
df_shape(df)

Number of rows: 4301
Number of columns: 2


In [157]:
df = df.drop(columns=['ids', 'artists', 'mediums', 'descriptions', 'urls'], axis=1)

In [158]:
df.head()

Unnamed: 0,titles,images_path
0,"Lozenge Composition with Yellow, Black, Blue, Red, and Gray",/content/drive/MyDrive/artwork_description_generator/data/images/Netherlands/nl_1.jpg
1,Judith,/content/drive/MyDrive/artwork_description_generator/data/images/Netherlands/nl_2.jpg
2,The Battle between the Gods and the Giants,/content/drive/MyDrive/artwork_description_generator/data/images/Netherlands/nl_3.jpg
3,Two Cows and a Young Bull beside a Fence in a Meadow,/content/drive/MyDrive/artwork_description_generator/data/images/Netherlands/nl_4.jpg
4,"Lame Beggar Asking for Alms, from T is al verwart-gaern (It’s already confusing)",/content/drive/MyDrive/artwork_description_generator/data/images/Netherlands/nl_5.jpg


In [161]:
df.to_csv("image_title_generation_data.csv", index=False)