# Drill Playing with layers

Now it's your turn. Using the space below, experiment with different hidden layer structures. You can try this on a subset of the data to improve runtime. See how things vary. See what seems to matter the most. Feel free to manipulate other parameters as well. It may also be beneficial to do some real feature selection work...

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import Perceptron

In [146]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [147]:
artworks.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

In [148]:
# Select Columns.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']


# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]
artworks['Date'] = pd.to_numeric(artworks['Date'], errors='coerce')

# Drop missing data.
artworks = artworks.dropna()

In [149]:
artworks.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896.0,Architecture & Design,1996-04-09,True,True,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987.0,Architecture & Design,1995-01-17,True,True,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903.0,Architecture & Design,1997-01-15,True,True,34.3,31.8
3,Bernard Tschumi,(),(Male),1980.0,Architecture & Design,1995-01-17,True,True,50.8,50.8
4,Emil Hoppe,(Austrian),(Male),1903.0,Architecture & Design,1997-01-15,True,True,38.4,19.1


In [150]:
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [151]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '(multiple_persons)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '(multiple_nationalities)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'



# Final column drops and NA drop.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Gender'], 1)

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
gender = pd.get_dummies(artworks.Gender)

# No need to get dates for dummies?
#dates = pd.get_dummies(artworks.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
#X = pd.get_dummies(sparse=True)
X = pd.concat([X, nationalities], axis=1)
X = pd.concat([X, gender], axis=1)
Y = artworks.Department



In [152]:
X.Date.isnull().sum()

0

In [153]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103885 entries, 0 to 133542
Columns: 115 entries, Date to (multiple_persons)
dtypes: bool(2), float64(3), int64(1), uint8(109)
memory usage: 15.0 MB


In [154]:
# Reduce number of observations to 1000 improve runtime

# Add y back in
X_y = pd.concat([X, artworks.Department], axis=1)
# Take samples
X_small= X_y.sample(n=10000, replace=False)
# Split y out again
y = X_small['Department']
X_small.drop(['Department'], inplace=True, axis=1)
print(X_small.columns)

Index(['Date', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)',
       'YearAcquired', '()', '(Albanian)', '(Algerian)', '(American)',
       ...
       '(Venezuelan)', '(Yugoslav)', '(Zimbabwean)',
       '(multiple_nationalities)', '(nationality unknown)', '()', '(Female)',
       '(Male)', '(male)', '(multiple_persons)'],
      dtype='object', length=115)


## The basic NN

In [155]:
# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 100 perceptron layer.
# Reduce iterations to 200
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200)
mlp.fit(X_small, y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [156]:
mlp.score(X_small, y)

0.5207

In [157]:
# No test train split, so overfitting is likely

from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X_small, y, cv=5)

array([0.51622566, 0.58      , 0.5555    , 0.54127064, 0.52352352])

## Adding hidden layers and reducing feature set

In [160]:
# Create instance with 2 layers of 100 each
mlp = MLPClassifier(hidden_layer_sizes=(100,100,), alpha=0.0001, max_iter=200)
mlp.fit(X_small, y)
mlp.score(X_small, y)

0.494

In [161]:
# No test train split, so overfitting is likely

from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X_small, y, cv=5)

array([0.32401398, 0.2355    , 0.526     , 0.52976488, 0.24024024])

In [54]:
#Create instance with 3 layers of 10 each
# Higher alpha to reduce runtime
mlp = MLPClassifier(hidden_layer_sizes=(100,100,100,), alpha=0.01, max_iter=100)
mlp.fit(X_small, y)
mlp.score(X_small, y)

0.3921

In [164]:
mlp = MLPClassifier(hidden_layer_sizes=(500,), max_iter=200)
mlp.fit(X_small, y)
mlp.score(X_small, y)

0.2298