## Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import os
import random
import seaborn as sns
import cv2

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import tensorflow as tf

from keras.applications.vgg19 import VGG19, preprocess_input
from keras.callbacks import EarlyStopping
from keras.layers import (BatchNormalization, 
                                     Dense, 
                                     Dropout, 
                                     Flatten, 
                                     Input,
                                     RandomFlip, 
                                     RandomRotation,
                                     RandomTranslation,
                                     RandomZoom)

from keras.models import Model, Sequential


# Apply seaborn's 'whitegrid' style to the figures drawn in this notebook
sns.set_style(style='whitegrid')

ImportError: cannot import name 'OrderedDict' from 'typing' (C:\Users\user\.pyenv\pyenv-win\versions\3.7.0\lib\typing.py)

#### Config

In [None]:
# --- Paths -------------------------------------------------------------------
# Root folder of the extracted dataset, relative to the notebook.
DATA_PATH = '../data/extracted/'
# Full-size paintings; read below as one sub-folder per artist.
DATA_PATH_IMAGE = os.path.join(DATA_PATH, 'images', 'images')
# Down-scaled paintings; read below as a flat folder of '<Artist_Name>_<n>' files.
DATA_PATH_RESIZED = os.path.join(DATA_PATH, 'resized', 'resized')

# --- Data selection ----------------------------------------------------------
# Presumably the minimum number of paintings an artist needs to be kept
# (same value as the threshold used in the data-preparation cell).
IN_NO_PAINTINGS = 100

# --- Model -------------------------------------------------------------------
# Whether the weights of the pretrained base model are updated during training.
BASE_MODEL_TRAIN = False
# Spatial size every image is resized to before it is fed to the network.
IMG_HEIGHT, IMG_WIDTH = 256, 256
# Derived from the image size so the two can never drift apart (3 = colour channels).
TRAIN_INPUT_SHAPE = (IMG_HEIGHT, IMG_WIDTH, 3)

### Read data

In [None]:
# Show which files and folders the extracted dataset directory contains
data_dir_entries = os.listdir(DATA_PATH)
print(data_dir_entries)

In [None]:
# Load the artist metadata table and report its (rows, columns) size
artists_csv_path = os.path.join(DATA_PATH, 'artists.csv')
df = pd.read_csv(artists_csv_path, encoding='utf-8')
df.shape

## Data Preparation

### 2.1 Exploration

In [None]:
# Remove Albrecht Dürer from the artist table.
# The name contains a non-ASCII character, so it can show up either correctly
# decoded ('Dürer') or as mojibake ('DÃ¼rer', i.e. UTF-8 bytes read as Latin-1).
# Both spellings are excluded so the filter works whichever way it was decoded.
is_durer = (df['name'] == 'Albrecht Dürer') | (df['name'] == 'Albrecht DÃ¼rer')
df = df[~is_durer]

In [None]:
# Order the artists from most to fewest paintings and preview the first ten
df = df.sort_values('paintings', ascending=False)
df.head(n=10)

In [None]:
# Drop the free-text columns; errors='ignore' keeps the cell re-runnable
# after the columns have already been removed.
df = df.drop(columns=['bio', 'wikipedia'], errors='ignore')

# Sort artists by number of paintings (most paintings first)
df = df.sort_values(by='paintings', ascending=False)

# Keep only artists with at least IN_NO_PAINTINGS paintings.
# The fresh 0..n-1 index is what the following cells use as the class id.
artists_top = df[df['paintings'] >= IN_NO_PAINTINGS].reset_index(drop=True)
artists_top = artists_top[['name', 'paintings']]

# Weight each artist inversely to their painting count: the artist with the
# most paintings gets 1.0, artists with fewer paintings get larger weights.
artists_top['class_weight'] = max(artists_top['paintings']) / artists_top['paintings']
artists_top

Because the data set is very imbalanced, a class weight is calculated for each artist as the largest painting count among the selected artists divided by that artist's own painting count. These weights are later used to weight the loss and somewhat counteract the imbalance.

In [None]:

# Lookup tables keyed by the row index of artists_top, which acts as class id
name_mapping = artists_top['name'].to_dict()             # class id -> artist name
class_weights = artists_top['class_weight'].to_dict()    # class id -> loss weight
# Inverse lookup: artist name -> class id
name_mapping_reverse = dict(zip(name_mapping.values(), name_mapping.keys()))
n_classes = len(class_weights)


In [None]:
# Number of classes that received a weight
len(class_weights.keys())

In [None]:
# Names of all selected artists, in class-id order
label_list = list(artists_top['name'])
print(f"Number of artists: {len(label_list)}")

### Print a few random paintings

In [None]:
# Show a few randomly chosen paintings side by side
n = 5
fig, axes = plt.subplots(nrows=1, ncols=n, figsize=(20, 10))

# Artist folder names use underscores where the artist names have spaces
artist_folders = artists_top['name'].str.replace(' ', '_').values

for ax in axes:
    # Pick a random artist, then a random painting from that artist's folder
    artist_folder = random.choice(artist_folders)
    artist_dir = os.path.join(DATA_PATH_IMAGE, artist_folder)
    picked_file = random.choice(os.listdir(artist_dir))

    ax.imshow(plt.imread(os.path.join(artist_dir, picked_file)))
    ax.set_title(artist_folder.replace('_', ' '))
    ax.axis('off')

plt.show()


### Images

In [None]:
# Load every resized painting that belongs to one of the selected artists.
# File names are parsed as '<Artist_Name>_<suffix>': everything before the
# last underscore is the artist name with spaces written as underscores.
img_list = []
label_list = []

# Sorted because os.listdir returns entries in arbitrary order; a fixed order
# is needed for the seeded train/validation/test split to be reproducible.
for file in sorted(os.listdir(DATA_PATH_RESIZED)):
    idx = file.rfind('_')
    name = file[:idx].replace('_', ' ')
    if name not in name_mapping_reverse:
        # Not one of the selected artists
        continue

    img = cv2.imread(os.path.join(DATA_PATH_RESIZED, file))
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be decoded
        print(f"Skipping unreadable image: {file}")
        continue

    # OpenCV decodes to BGR channel order, but VGG19's preprocess_input
    # expects RGB input (it performs the RGB -> BGR conversion itself).
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # cv2.resize takes the target size as (width, height)
    img = cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT))

    # Append image and label together so both lists always stay aligned
    img_list.append(img)
    label_list.append(name_mapping_reverse[name])

images = np.asarray(img_list)
labels = np.asarray(label_list)

In [None]:
# Dimensions of the stacked image array
np.shape(images)

In [None]:
# Dimensions of the label array (should have one entry per image)
np.shape(labels)

Because of the small amount of data available, an 80-10-10 split is used to maximize the number of images available for training.

- 80 - train
- 10 - validation
- 10 - test

In [None]:
# 80/10/10 split: first hold out 10 % of all samples for testing, then take
# 1/9 of the remaining 90 % (= 10 % of the total) for validation.
# Stratifying on the labels keeps every artist's share roughly the same in all
# three subsets, which matters because the classes are heavily imbalanced.
X, X_test, y, y_test = train_test_split(
    images, labels, test_size=0.1, random_state=13, stratify=labels)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=1/9, random_state=13, stratify=y)

## 3. Implementation and Training

### 3.1 Loading the Pretrained Model

In [None]:
# Convolutional part of VGG19 with ImageNet weights, used as feature extractor.
# `classes` is intentionally not passed: it only applies when include_top=True,
# and the classification head is built separately in this notebook.
base_model = VGG19(include_top=False,
                   weights='imagenet',
                   input_shape=TRAIN_INPUT_SHAPE)

# Freeze (or unfreeze) the pretrained weights according to the config flag
base_model.trainable = BASE_MODEL_TRAIN
base_model.summary()

In [None]:
# Random image transformations used as data augmentation.
data_augmentation = Sequential([
    RandomFlip('horizontal'),
    RandomFlip('vertical'),
    RandomRotation(0.2),
    RandomZoom(0.1),
    RandomTranslation(0.1, 0.1),
])

# Classification head placed on top of the base model's feature maps.
# The hidden Dense layers use ReLU: without an activation the Dense layers
# would be purely linear, so stacking them would add no non-linear capacity
# over a single Dense layer.
prediction = Sequential([
    Flatten(),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    Dense(512, activation='relu'),
    Dense(n_classes, activation='softmax'),
])

In [None]:
# Wire the full network: augmentation -> VGG19 preprocessing -> base model -> head
inputs = Input(shape=TRAIN_INPUT_SHAPE)
x = base_model(preprocess_input(data_augmentation(inputs)))
outputs = prediction(x)
model = Model(inputs=inputs, outputs=outputs)

In [None]:
# Labels are integer class ids, hence the sparse variant of cross-entropy
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

### 3.2 Training the Model

In [None]:
# Training hyper-parameters
epochs = 50
batch_size = 16

# Stop when the validation loss has not improved for 20 epochs and roll the
# model back to the weights of its best epoch.
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=20,
                               verbose=2,
                               restore_best_weights=True)

# class_weight maps class id -> loss weight to counteract the class imbalance.
# callbacks is passed as a list, which is the form documented for Model.fit.
history = model.fit(X_train,
                    y_train,
                    validation_data=(X_val, y_val),
                    class_weight=class_weights,
                    epochs=epochs,
                    batch_size=batch_size,
                    callbacks=[early_stopping])

### 4.1 Performance on Testing Data