In [227]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [228]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [229]:
artworks.head(3)

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,ThumbnailURL,Circumference (cm),Depth (cm),Diameter (cm),Height (cm),Length (cm),Weight (kg),Width (cm),Seat Height (cm),Duration (sec.)
0,"Ferdinandsbrücke Project, Vienna, Austria, Ele...",Otto Wagner,6210,"(Austrian, 1841–1918)",(Austrian),(1841),(1918),(Male),1896,Ink and cut-and-pasted painted pages on paper,...,http://www.moma.org/media/W1siZiIsIjU5NDA1Il0s...,,,,48.6,,,168.9,,
1,"City of Music, National Superior Conservatory ...",Christian de Portzamparc,7470,"(French, born 1944)",(French),(1944),(0),(Male),1987,Paint and colored pencil on print,...,http://www.moma.org/media/W1siZiIsIjk3Il0sWyJw...,,,,40.6401,,,29.8451,,
2,"Villa near Vienna Project, Outside Vienna, Aus...",Emil Hoppe,7605,"(Austrian, 1876–1957)",(Austrian),(1876),(1957),(Male),1903,"Graphite, pen, color pencil, ink, and gouache ...",...,http://www.moma.org/media/W1siZiIsIjk4Il0sWyJw...,,,,34.3,,,31.8,,


In [230]:
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

In [231]:
# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()*1
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()*1

In [232]:
# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']

In [233]:
# Drop missing data.
artworks = artworks.dropna()

In [234]:
artworks.head(3)

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,1,1,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,1,1,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,1,1,34.3,31.8


In [235]:
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [236]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

# Final column drops and NA drop.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)
X.head(3)

Unnamed: 0,Gender,URL,ThumbnailURL,Height (cm),Width (cm),YearAcquired
0,(Male),1,1,48.6,168.9,1996
1,(Male),1,1,40.6401,29.8451,1995
2,(Male),1,1,34.3,31.8,1997


In [250]:
artworks["Gender"].unique

<bound method Series.unique of 0                       (Male)
1                       (Male)
2                       (Male)
3                       (Male)
4                       (Male)
5                       (Male)
6                       (Male)
7                       (Male)
8                       (Male)
9                       (Male)
10                      (Male)
11                      (Male)
12                      (Male)
13                      (Male)
14                      (Male)
15                      (Male)
16                      (Male)
17                      (Male)
18                      (Male)
19                      (Male)
20                      (Male)
21                      (Male)
22                      (Male)
23                      (Male)
24                      (Male)
25                      (Male)
26                      (Male)
27                      (Male)
28                      (Male)
29                      (Male)
                  ...         
135050  

In [237]:
# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

In [239]:
# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

y = artworks.Department

In [240]:
X.shape

(105745, 314)

In [241]:
y.unique()

array(['Architecture & Design', 'Prints & Illustrated Books', 'Drawings',
       'Painting & Sculpture', 'Photography'], dtype=object)

In [242]:
# encode output variable
#from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#encoder = LabelEncoder()
#encoded_y = encoder.fit_transform(y)
#ohe = OneHotEncoder(categorical_features=[0])
#y = ohe.fit_transform(encoded_y.reshape(-1,1))

In [243]:
#from sklearn.model_selection import train_test_split

#X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [244]:
#from keras.models import Sequential
#from keras.layers import Dense

#classifier = Sequential()

In [245]:
#classifier.add(Dense(units=158,kernel_initializer='uniform',activation="relu",input_dim=314))

In [246]:
#classifier.add(Dense(units=158,kernel_initializer='uniform',activation="relu"))

In [247]:
#classifier.add(Dense(units=158,kernel_initializer='uniform',activation="sigmoid"))

In [248]:
#classifier.compile(optimizer = "adam",loss = 'binary_crossentropy',metrics=["accuracy"])

In [251]:
#classifier.fit(X_train,y_train,batch_size=10,epochs=100)

In [252]:
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X, y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [254]:
mlp.score(X, y)

0.5042980755591281

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp,X,y,cv=5)



Increasing the number of hidden layers beyond the sufficient number of layers will cause accuracy in the test set to decrease because it will cause the network to overfit the training set (it will learn the training data but it will not be able to generalize to new unseen data)