# Supervised Neural Nets

In [1]:
# Import modules.
import time
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

In [2]:
# Load data.
raw_data = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [3]:
def viewDf(dataframe):
    """Prints general dataframe info."""
    print('General info:')
    print(dataframe.info())
    print('\nNull Counts:\n', dataframe.isnull().sum())
    print('\nStatistics:\n', dataframe.describe())
    print('\nViewing...')
    return dataframe.head()

viewDf(raw_data)

General info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136759 entries, 0 to 136758
Data columns (total 29 columns):
Title                 136720 non-null object
Artist                135301 non-null object
ConstituentID         135301 non-null object
ArtistBio             131293 non-null object
Nationality           135301 non-null object
BeginDate             135301 non-null object
EndDate               135301 non-null object
Gender                135301 non-null object
Date                  134348 non-null object
Medium                125694 non-null object
Dimensions            125749 non-null object
CreditLine            133870 non-null object
AccessionNumber       136759 non-null object
Classification        136759 non-null object
Department            136759 non-null object
DateAcquired          129922 non-null object
Cataloged             136759 non-null object
ObjectID              136759 non-null int64
URL                   79218 non-null object
ThumbnailURL          

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,ThumbnailURL,Circumference (cm),Depth (cm),Diameter (cm),Height (cm),Length (cm),Weight (kg),Width (cm),Seat Height (cm),Duration (sec.)
0,"Ferdinandsbrücke Project, Vienna, Austria, Ele...",Otto Wagner,6210,"(Austrian, 1841–1918)",(Austrian),(1841),(1918),(Male),1896,Ink and cut-and-pasted painted pages on paper,...,http://www.moma.org/media/W1siZiIsIjU5NDA1Il0s...,,,,48.6,,,168.9,,
1,"City of Music, National Superior Conservatory ...",Christian de Portzamparc,7470,"(French, born 1944)",(French),(1944),(0),(Male),1987,Paint and colored pencil on print,...,http://www.moma.org/media/W1siZiIsIjk3Il0sWyJw...,,,,40.6401,,,29.8451,,
2,"Villa near Vienna Project, Outside Vienna, Aus...",Emil Hoppe,7605,"(Austrian, 1876–1957)",(Austrian),(1876),(1957),(Male),1903,"Graphite, pen, color pencil, ink, and gouache ...",...,http://www.moma.org/media/W1siZiIsIjk4Il0sWyJw...,,,,34.3,,,31.8,,
3,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,"(French and Swiss, born Switzerland 1944)",(),(1944),(0),(Male),1980,Photographic reproduction with colored synthet...,...,http://www.moma.org/media/W1siZiIsIjEyNCJdLFsi...,,,,50.8,,,50.8,,
4,"Villa, project, outside Vienna, Austria, Exter...",Emil Hoppe,7605,"(Austrian, 1876–1957)",(Austrian),(1876),(1957),(Male),1903,"Graphite, color pencil, ink, and gouache on tr...",...,http://www.moma.org/media/W1siZiIsIjEyNiJdLFsi...,,,,38.4,,,19.1,,


In [4]:
print(raw_data.columns)
raw_data.head()

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')


Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,ThumbnailURL,Circumference (cm),Depth (cm),Diameter (cm),Height (cm),Length (cm),Weight (kg),Width (cm),Seat Height (cm),Duration (sec.)
0,"Ferdinandsbrücke Project, Vienna, Austria, Ele...",Otto Wagner,6210,"(Austrian, 1841–1918)",(Austrian),(1841),(1918),(Male),1896,Ink and cut-and-pasted painted pages on paper,...,http://www.moma.org/media/W1siZiIsIjU5NDA1Il0s...,,,,48.6,,,168.9,,
1,"City of Music, National Superior Conservatory ...",Christian de Portzamparc,7470,"(French, born 1944)",(French),(1944),(0),(Male),1987,Paint and colored pencil on print,...,http://www.moma.org/media/W1siZiIsIjk3Il0sWyJw...,,,,40.6401,,,29.8451,,
2,"Villa near Vienna Project, Outside Vienna, Aus...",Emil Hoppe,7605,"(Austrian, 1876–1957)",(Austrian),(1876),(1957),(Male),1903,"Graphite, pen, color pencil, ink, and gouache ...",...,http://www.moma.org/media/W1siZiIsIjk4Il0sWyJw...,,,,34.3,,,31.8,,
3,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,"(French and Swiss, born Switzerland 1944)",(),(1944),(0),(Male),1980,Photographic reproduction with colored synthet...,...,http://www.moma.org/media/W1siZiIsIjEyNCJdLFsi...,,,,50.8,,,50.8,,
4,"Villa, project, outside Vienna, Austria, Exter...",Emil Hoppe,7605,"(Austrian, 1876–1957)",(Austrian),(1876),(1957),(Male),1903,"Graphite, color pencil, ink, and gouache on tr...",...,http://www.moma.org/media/W1siZiIsIjEyNiJdLFsi...,,,,38.4,,,19.1,,


# Process Data

In [5]:
# Select Columns.
artworks = raw_data[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

In [6]:
# To avoid setting with copy warning...
artworks = artworks.copy()

In [7]:
# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

In [8]:
# Drop films and some other tricky rows.
artworks = artworks[artworks['Department'] != 'Film']
artworks = artworks[artworks['Department'] != 'Media and Performance Art']
artworks = artworks[artworks['Department'] != 'Fluxus Collection']

In [9]:
# Drop missing data.
artworks = artworks.dropna()

In [10]:
# View dataframe.
artworks.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3,31.8
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8,50.8
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4,19.1


## Building the Model

Use Multi-Layer Perceptron modeling (MLP) to see if we can classify the department a piece should go into using everything but the department name.

First, let's ensure correct typing for our data & do other cleaning.

In [11]:
# Get data types.
artworks.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

In [12]:
# DateAcquired column is an object, let's transform that to datetime object
# & add a feature for just the year the artwork was acquired.
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [13]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

In [14]:
artworks.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm),YearAcquired
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6,168.9,1996
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451,1995
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3,31.8,1997
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8,50.8,1995
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4,19.1,1997


In [15]:
# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

In [16]:
# Final column drops and NA drop.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)

In [17]:
# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

In [18]:
# Concat with other variables, but artists slows this way down so we'll 
# keep it out for now.
X = pd.get_dummies(X, sparse = True)
X = pd.concat([X, nationalities, dates], axis = 1)
Y = artworks.Department

### Prep complete, let's build the model.
Neural networks are computationally intensive, this may take several minutes to run.

In [22]:
# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes = (1000,))
mlp.fit(X, Y)
mlp.score(X, Y)
Y.value_counts() / len(Y)
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv = 5)

array([0.67050809, 0.7695143 , 0.50046659, 0.5864402 , 0.48756474])

### Results

Model seems to overfit, though there is still remaining performance when validated with cross validation. This occurs in neural networks that aren't given enough data for the number of features present. 

Increasing the layer size will increase runtime. 

Note we created bools for artist's name but left them out. Both of the above points are the reason for that - it would take much longer to run and it would be much more prone to overfitting.

### Model Parameters
1. Hidden Layer Size
2. Alpha
3. Activation Function

We included 1 parameter: **Hidden Layer Size**. This tells us how many and how big to make our layers. Pass in a tuple that specifies each layer's size. Our network is 1000 neurons wide and 1 layer. (100, 4, ) would create a network with 2 layers, one 100 wide and the other 4. 

How many layers? Determined by computational resources & cross validation searching for convergence. It is generally less than the number of input variables you have. 

You can also set an **Alpha**, in which NN like this use as a regularization parameter that penalizes large coefficients just like we discussed in the advanced regression section. Alpha scales that penalty. 

**Activation Function**: determines whether the output from an individual perceptron is binary or continuous. By default, this is a 'relu', or 'rectified linear unit function' function. In this exercise, we used this binary function, but we discussed the *sigmoid* as a reasonable alternative. The *sigmoid* (called 'logistic' by SKLearn because it's a 'logistic sigmoid function') allows for continuous variables btwn 0 and 1, which allows for a more nuanced model. It does come at the cost of increased computational complexity. 

### Experiment with different hidden layer structures

Try on a subset to improve runtime. 

In [31]:
# Reduce size for runtime improvement.
df_sample = artworks.sample(frac = 0.1)

# Final column drops and NA drop.
X_sample = df_sample.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)

# Create dummies separately.
#artists_sample = pd.get_dummies(df_sample.Artist)
nationalities_sample = pd.get_dummies(df_sample.Nationality)
dates_sample = pd.get_dummies(df_sample.Date)

# Concat with other variables, keeping out artists.
X_sample = pd.get_dummies(X_sample, sparse = True)
X_sample = pd.concat([X_sample, nationalities_sample, dates_sample], axis = 1)
Y_sample = df_sample.Department

In [115]:
"""
# hidden_layer_sizes : tuple, length = n_layers - 2, default (100,)
# alpha : float, optional, default 0.0001
# activation : {‘identity’, ‘logistic’, ‘tanh’, ‘relu’}, default ‘relu’
"""

# Try the different layers.
def runMLP(layerSize, alphaSize, activationFunct):
    print('\nUsing parameters: {}, {}, {}'.format(layerSize, alphaSize, activationFunct))
    start_time = time.time()
    from sklearn.neural_network import MLPClassifier
    mlp = MLPClassifier(hidden_layer_sizes=layerSize, alpha=alphaSize, activation=activationFunct)
    mlp.fit(X_sample, Y_sample)
    print(mlp.score(X_sample, Y_sample))
    from sklearn.model_selection import cross_val_score
    print(cross_val_score(mlp, X_sample, Y_sample, cv=5))
    print('Time taken: {} seconds'.format('%.1f' % (time.time() - start_time)))

In [116]:
# Parameter inputs.
list_layer = [50, 100, 150]
list_alpha = [0.00001, 0.0001, 0.001]
list_activation = ['identity', 'relu', 'logistic']

In [117]:
for a in list_layer:
    for b in list_alpha:
        for c in list_activation:
            runMLP(a, b, c)


Using parameters: 50, 1e-05, identity
0.7055804404628593
[0.64398882 0.65251866 0.65702287 0.64845938 0.67772069]
Time taken: 4.8 seconds

Using parameters: 50, 1e-05, relu
0.7242441209406495
[0.67148183 0.70615672 0.69062063 0.67553688 0.68425969]
Time taken: 8.2 seconds

Using parameters: 50, 1e-05, logistic
0.6422172452407615
[0.63280522 0.63899254 0.6364909  0.70868347 0.62727697]
Time taken: 18.3 seconds

Using parameters: 50, 0.0001, identity
0.6921425905188503
[0.67520969 0.59375    0.68735418 0.69561158 0.67164876]
Time taken: 5.5 seconds

Using parameters: 50, 0.0001, relu
0.6928891377379619
[0.68126747 0.65858209 0.62529165 0.66666667 0.66697805]
Time taken: 9.1 seconds

Using parameters: 50, 0.0001, logistic
0.7243374393430384
[0.71342032 0.72108209 0.69108726 0.71195145 0.65390005]
Time taken: 27.6 seconds

Using parameters: 50, 0.001, identity
0.6749720044792833
[0.64305685 0.63386194 0.67102193 0.6372549  0.60532461]
Time taken: 5.4 seconds

Using parameters: 50, 0.001, 

100 layers, 0.00001 alpha, and logistic parameters were best performing in terms of score. 