# Wikiart data cleanup:
## The images and metadata were downloaded using the amazing script from https://github.com/lucasdavid/wikiart, which was adapted to export additional fields (artist name and title) to the final CSV file

### First, load the dataset (only the metadata, not the images)

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas
# Read the scraped metadata table; only the CSV is loaded here, no image files.
data = pandas.read_csv(filepath_or_buffer='wikiart.csv', encoding='utf-8')
# Preview the first rows to check that the columns were parsed as expected.
data.head()

### Only keep what is necessary for the app / model and remove NaNs

In [None]:
# Columns kept for the rest of the notebook.
keep_columns = ['contentId', 'title', 'artistName', 'url', 'style', 'artistUrl']
labels = data.loc[:, keep_columns].copy()
# Rows without a style value are dropped.
labels = labels.dropna(subset=["style"])
# Only keep the 25 most prolific styles (enough data left for training).
# value_counts() sorts by frequency, so the first 25 entries are the largest.
style_count = labels['style'].value_counts()
Styles = style_count.head(25)
Styles

### Remove parentheses from labels (they will cause trouble when calling descriptions later in the app)

In [None]:
# Style labels containing parentheses, mapped to their parenthesis-free form.
style_renames = {
    'Naïve Art (Primitivism)': 'Naïve Art',
    'Art Nouveau (Modern)': 'Modern',
    'Mannerism (Late Renaissance)': 'Late Renaissance',
}
# The nested-dict form restricts the replacement to the 'style' column, so a
# title or artist name that happens to equal one of these strings is untouched.
labels = labels.replace({'style': style_renames})
# Recompute the 25 most frequent styles using the renamed labels.
style_count = labels['style'].value_counts()
Styles = style_count.head(25)
Styles

### Visualize the data distribution

In [None]:
# Horizontal bar chart of the artwork count per style, drawn in xkcd style.
with plt.xkcd():
    fig = plt.figure(
        num=None,
        figsize=(8, 10),
        dpi=100,
        facecolor='w',
        edgecolor='k',
    )
    # Axes rectangle is (left, bottom, width, height) in figure fractions.
    ax = fig.add_axes((0.2, 0.9, 0.9, 0.5))
    # Draw on the axes created above (the current axes at this point).
    Styles.plot(kind="barh", ax=ax)
    ax.set_xlabel('NUMBER OF ARTWORKS')
    ax.set_title("PROLIFIC STYLES FROM WIKIART DATA")

    plt.show()
    # bbox_inches='tight' crops the saved page to the drawn artists.
    fig.savefig('Wikiart_data_overview_25.pdf', bbox_inches='tight')


### Some URLs were not found, so their images were never downloaded: those rows need to be removed

In [None]:
# Identify corrupted images:
import cv2
import os

# Folder holding the downloaded images, one '<contentId>.jpg' file per row.
IMAGE_DIR = '/Users/lizbaldo/Desktop/wikiart-master/wikiart/images'

# Drop rows not containing the above styles.
# From here on `Styles` holds only the style names (the index of the counts).
Styles = Styles.index
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of `labels` (chained assignment).
DATA = labels[labels['style'].isin(Styles)].copy()
# contentId is used as a file name, so it has to be text.
DATA['contentId'] = DATA['contentId'].apply(str)

total_images = len(DATA)
bad_images = []
for i, filename in enumerate(DATA['contentId']):
    imagePath = os.path.join(IMAGE_DIR, filename + '.jpg')
    if not os.path.exists(imagePath):
        # The download failed for this row: there is no file at all.
        print('no path: ' + imagePath)
        bad_images.append(filename)
    else:
        # cv2.imread returns None instead of raising when it cannot decode a
        # file, so check for that explicitly.
        image = cv2.imread(imagePath)
        readable = image is not None
        if readable:
            try:
                # Resizing is a cheap check that the pixel data is usable.
                cv2.resize(image, (128, 128))
            except cv2.error:
                readable = False
        if not readable:
            print('corrupted: ' + imagePath)
            bad_images.append(filename)

    # show an update every 200 images until the last image
    if i > 0 and ((i + 1) % 200 == 0 or i == total_images - 1):
        print("[INFO] processed {}/{}".format(i + 1, total_images))

# Keep only the rows whose image exists and can be decoded.
images_to_remove = set(bad_images)
DATA = DATA[~DATA['contentId'].isin(images_to_remove)]

### Separate and move images from the download folder into training and validation folders

In [None]:
# Split data into 700 images for training and 300 images for testing for each class:
WIKIART_ROOT = '/Users/lizbaldo/Desktop/wikiart-master/wikiart'
TRAIN_PER_STYLE = 700
VAL_PER_STYLE = 300


def _move_images(content_ids, style, subset):
    """Move the images named by content_ids from images/ into <subset>/<style>/.

    content_ids: iterable of contentId strings (file names without '.jpg').
    style: style label, used as the destination sub-folder name.
    subset: 'train' or 'val'.
    A file that cannot be moved is reported and skipped.
    """
    target_dir = os.path.join(WIKIART_ROOT, subset, style)
    # os.rename does not create missing folders, so create the destination first.
    os.makedirs(target_dir, exist_ok=True)
    for content_id in content_ids:
        file_name = content_id + '.jpg'
        try:
            os.rename(os.path.join(WIKIART_ROOT, 'images', file_name),
                      os.path.join(target_dir, file_name))
        except OSError as err:
            print('{} not moved: {}'.format(content_id, err))


n_styles = len(Styles)
for i, str_style in enumerate(Styles):
    print(str_style)
    DATA_tmp = DATA[DATA['style'] == str_style]
    if len(DATA_tmp) < TRAIN_PER_STYLE + VAL_PER_STYLE:
        # sample() raises if asked for more rows than exist; cap the sizes
        # below and warn, so one small class does not abort the whole run.
        print("[WARN] only {} usable images for {}".format(len(DATA_tmp), str_style))

    training = DATA_tmp.sample(n=min(TRAIN_PER_STYLE, len(DATA_tmp)))
    train_ids = training['contentId'].apply(str)
    _move_images(train_ids, str_style, 'train')

    # Validation images are drawn only from rows not already used for training.
    DATA_tmp = DATA_tmp[~DATA_tmp['contentId'].isin(train_ids)]
    testing = DATA_tmp.sample(n=min(VAL_PER_STYLE, len(DATA_tmp)))
    _move_images(testing['contentId'].apply(str), str_style, 'val')

    # The loop runs once per style, so progress is reported per style.
    print("[INFO] processed {}/{} styles".format(i + 1, n_styles))

### Clean up artist names

In [None]:
def _artist_name_from_slug(slug):
    """Turn a URL slug such as 'vincent-van-gogh' into 'Vincent Van Gogh'.

    Returns None when slug is not a string (for example a missing value).
    """
    if not isinstance(slug, str):
        return None
    return slug.replace("-", " ").title()


# Derive the display name from the artist URL slug for the whole column at
# once, rather than writing row by row through DATA['artistName'].iloc[i],
# which is chained assignment and may not update DATA at all.
clean_names = DATA['artistUrl'].map(_artist_name_from_slug)

# Report the slugs that could not be converted.
for name in DATA.loc[clean_names.isna(), 'artistUrl']:
    print(name)

# Rows without a usable slug keep their existing artistName.
DATA = DATA.assign(artistName=clean_names.fillna(DATA['artistName']))
DATA.head()

### Save into new csv file

In [None]:
# Save the cleaned frame `DATA`. Lowercase `data` is the raw table as read
# from wikiart.csv and contains none of the filtering or renaming done above.
DATA.to_csv('cleaned_wikiart_data.csv', encoding='utf-8')