# Art Museum Staff Demographics Dashboard: 2015 and 2018

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import category_encoders as ce
import matplotlib.pyplot as pl

This is a dataset that I compiled beginning in 2014, with support from the Mellon Foundation.  In this initiative, I surveyed on behalf of Mellon several hundred museums in the US to gather demographic data. The data that was submitted comes from their HR systems. I received ~250 excel spreadsheets which were appended into a single dataset in 2015. In 2018, another cycle of data was collected. An anonymized version of the data will be published with the National Archive for Data on Arts and Culture (NADAC), which is part of ICPSR. 

In this project, I will prepare the data for analysis, explore the data with visualizations, fit the data to a decision tree, tune the parameters of the decision tree in order to maximize its predictions, explore the data using a partial dependence plot, and fit the data to a gradient boosted model. I will then prepare a python file to be hosted on github with a pickled model, and deploy a dashboard as a web application on streamlit. 

- Prep
- Vis
- Fit
- PDP
- GBM
- Dash

## Prep

In [13]:
df = pd.read_csv('/Users/liamsweeney/IthakaSR/LS_NADAC_1_29_2022.csv')

In [16]:
df.head()

Unnamed: 0,job_title,job_type,eeo_job_c,exemption,employment,ft/pt,education,race,ethnicity,gender,...,Unnamed: 245,Unnamed: 246,Unnamed: 247,Unnamed: 248,Unnamed: 249,Unnamed: 250,Unnamed: 251,Unnamed: 252,Unnamed: 253,Unnamed: 254
0,administrative associate,support/administrator,administrative support,non-exempt,regular employee,full time,bachelor's degree,white,no,female,...,,,,,,,,,,
1,curator,curators,professionals,exempt,regular employee,full time,master's degree,white,no,male,...,,,,,,,,,,
2,facility manager,facilities,professionals,exempt,regular employee,full time,master's degree,white,no,female,...,,,,,,,,,,
3,museum registrar,registrar,professionals,exempt,regular employee,full time,master's degree,white,no,female,...,,,,,,,,,,
4,educational programs manager,education,professionals,exempt,regular employee,full time,phd,white,no,female,...,,,,,,,,,,


In [18]:
df.dropna(how='all', axis=1)

Unnamed: 0,job_title,job_type,eeo_job_c,exemption,employment,ft/pt,education,race,ethnicity,gender,...,year,affiliation,count,size,anonymized,state,zip,type,budget,Unnamed: 254
0,administrative associate,support/administrator,administrative support,non-exempt,regular employee,full time,bachelor's degree,white,no,female,...,2018,AAM,1,,musuem 296,,,,,
1,curator,curators,professionals,exempt,regular employee,full time,master's degree,white,no,male,...,2018,AAM,2,,musuem 296,,,,,
2,facility manager,facilities,professionals,exempt,regular employee,full time,master's degree,white,no,female,...,2018,AAM,3,,musuem 296,,,,,
3,museum registrar,registrar,professionals,exempt,regular employee,full time,master's degree,white,no,female,...,2018,AAM,4,,musuem 296,,,,,
4,educational programs manager,education,professionals,exempt,regular employee,full time,phd,white,no,female,...,2018,AAM,5,,musuem 296,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52819,,membership/development (includes event planning),decline to state,decline to state,decline to state,decline to state,decline to state,white,no,male,...,2015,AAM,52820,small,musuem 284,,,,,
52820,,curators,decline to state,decline to state,decline to state,decline to state,decline to state,white,no,female,...,2015,AAM,52821,small,musuem 284,,,,,
52821,,membership/development (includes event planning),decline to state,decline to state,decline to state,decline to state,decline to state,white,no,female,...,2015,AAM,52822,small,musuem 284,,,,,
52822,,membership/development (includes event planning),decline to state,decline to state,decline to state,decline to state,decline to state,white,no,female,...,2015,AAM,52823,small,musuem 284,,,,,


In [17]:
df.drop("origin_filename", axis = 1)

KeyError: "['origin_filename'] not found in axis"

In [None]:
df.isnull().sum()

In [23]:
df.dtypes

job_title             object
job_type              object
eeo_job_c             object
exemption             object
employment            object
ft_pt                 object
education             object
race                  object
ethnicity             object
gender                object
protected             object
disability            object
year_of_b             object
year_of_h             object
origin_filename       object
repeat_participant    object
year                   int64
affiliation           object
count                  int64
size                  object
anonymized            object
dtype: object

In [24]:
grouping_Year = df.groupby(['year'])

In [25]:
df['year'] = df['year'].apply(str)

In [26]:
df.dtypes

job_title             object
job_type              object
eeo_job_c             object
exemption             object
employment            object
ft_pt                 object
education             object
race                  object
ethnicity             object
gender                object
protected             object
disability            object
year_of_b             object
year_of_h             object
origin_filename       object
repeat_participant    object
year                  object
affiliation           object
count                  int64
size                  object
anonymized            object
dtype: object

In [27]:
df.head()

Unnamed: 0,job_title,job_type,eeo_job_c,exemption,employment,ft_pt,education,race,ethnicity,gender,protected,disability,year_of_b,year_of_h,origin_filename,repeat_participant,year,affiliation,count,size,anonymized
0,Administrative Associate,Support/Administrator,Administrative Support,Non-exempt,Regular employee,Full time,Bachelor's Degree,White,No,Female,No,Person without a disability,1982,2015,Art Museum of West Virginia University,0,2018,AAM,1,,Musuem 296
1,Curator,Curators,Professionals,Exempt,Regular employee,Full time,Master's Degree,White,No,Male,No,Person without a disability,1962,2010,Art Museum of West Virginia University,0,2018,AAM,2,,Musuem 296
2,Facility Manager,Facilities,Professionals,Exempt,Regular employee,Full time,Master's Degree,White,No,Female,No,Person without a disability,1982,2015,Art Museum of West Virginia University,0,2018,AAM,3,,Musuem 296
3,Museum Registrar,Registrar,Professionals,Exempt,Regular employee,Full time,Master's Degree,White,No,Female,No,Person without a disability,1980,2018,Art Museum of West Virginia University,0,2018,AAM,4,,Musuem 296
4,Educational Programs Manager,Education,Professionals,Exempt,Regular employee,Full time,PhD,White,No,Female,No,Person without a disability,1982,2015,Art Museum of West Virginia University,0,2018,AAM,5,,Musuem 296


In [38]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


TypeError: 'DataFrame' object is not callable

In [39]:
print(df.columns.tolist())

['job_title', 'job_type', 'eeo_job_c', 'exemption', 'employment', 'ft_pt', 'education', 'race', 'ethnicity', 'gender', 'protected', 'disability', 'year_of_b', 'year_of_h', 'origin_filename', 'repeat_participant', 'year', 'affiliation', 'count', 'size', 'anonymized']


In [29]:
new_df=df.dropna(how='all', axis=1)

In [30]:
new_df.columns = new_df.columns.str.replace(' ', '')

In [31]:
new_df['job_type']=new_df['job_type'].apply(lambda x: x.strip())

In [33]:
new_df.head()

Unnamed: 0,job_title,job_type,eeo_job_c,exemption,employment,ft_pt,education,race,ethnicity,gender,protected,disability,year_of_b,year_of_h,origin_filename,repeat_participant,year,affiliation,count,size,anonymized
0,Administrative Associate,Support/Administrator,Administrative Support,Non-exempt,Regular employee,Full time,Bachelor's Degree,White,No,Female,No,Person without a disability,1982,2015,Art Museum of West Virginia University,0,2018,AAM,1,,Musuem 296
1,Curator,Curators,Professionals,Exempt,Regular employee,Full time,Master's Degree,White,No,Male,No,Person without a disability,1962,2010,Art Museum of West Virginia University,0,2018,AAM,2,,Musuem 296
2,Facility Manager,Facilities,Professionals,Exempt,Regular employee,Full time,Master's Degree,White,No,Female,No,Person without a disability,1982,2015,Art Museum of West Virginia University,0,2018,AAM,3,,Musuem 296
3,Museum Registrar,Registrar,Professionals,Exempt,Regular employee,Full time,Master's Degree,White,No,Female,No,Person without a disability,1980,2018,Art Museum of West Virginia University,0,2018,AAM,4,,Musuem 296
4,Educational Programs Manager,Education,Professionals,Exempt,Regular employee,Full time,PhD,White,No,Female,No,Person without a disability,1982,2015,Art Museum of West Virginia University,0,2018,AAM,5,,Musuem 296


In [36]:
new_df['job_type'].unique()

array(['Support/Administrator', 'Curators', 'Facilities', 'Registrar',
       'Education', 'Preparators/Handlers',
       'Museum Leadership (includes executive positions)',
       'Visitor Services', 'Decline to state',
       'Exhibition Design and Construction (includes Fabrication)',
       'Marketing/Public Relations',
       'Membership/Development (includes Event Planning)',
       'Rights/Reproduction (includes Photography)',
       'Information Technology/Web Development',
       'Finance/Human Resources', 'Retail and Store', 'Archive/Library',
       'Security', 'Publication/Editorial', 'Conservators', 'education',
       'curators', 'decline to state', 'Preparators/handlers',
       'retail and Store', 'visitor Services', 'preparators/Handlers',
       'Development', 'security',
       'rights/Reproduction (includes Photography)',
       'finance/Human Resources',
       'membership/Development (includes Event Planning)', 'registrar',
       'Decline to State', 'FACILITIES',

In [57]:
grouping_gender_origin=new_df.groupby(['gender', 'origin_filename'])

In [58]:
new_df['eeo_job_c'].unique()

array(['Administrative Support', 'Professionals',
       'Executive/Sr Officials & Mgrs', 'Service Workers', 'Technicians',
       'Laborers & Helpers', 'First/Mid Officials & Mgrs',
       'Service workers', 'Sales workers', 'Craft workers',
       'Decline to state', 'Sales Workers', 'Operatives', 'Craft Workers',
       'Administrative support', 'laborers & helpers',
       'laborers & Helpers', 'decline to state', 'administrative Support',
       'service Workers', 'ADministrative Support',
       'first/Mid Officials & Mgrs', 'sales workers', 'craft workers',
       'professionals', 'Administrative Support ',
       'Executive/SR Officials & Mgrs'], dtype=object)

In [59]:
new_df['exemption']=new_df['exemption'].apply(lambda x: x.strip())

In [60]:
new_df['exemption'].unique()

array(['Non-exempt', 'Exempt', 'Decline to state', 'Non-Exempt',
       'non-exempt', 'exempt', 'decline to state', 'Decline to State',
       'EXEMPT'], dtype=object)

In [61]:
new_df['employment']=new_df['employment'].apply(lambda x:x.strip())

In [62]:
new_df['education']=new_df['education'].apply(lambda x:x.strip())

In [63]:
new_df['ethnicity']=new_df['ethnicity'].apply(lambda x:x.strip())

In [64]:
new_df['disability']=new_df['disability'].apply(lambda x:x.strip())

In [65]:
new_df['disability'].unique()

array(['Person without a disability', 'Person with a disability',
       'Decline to state', 'Decline to State', 'decline to state',
       'person without a disability', 'Person with a disabilty'],
      dtype=object)

In [66]:
new_df['race']=new_df['race'].apply(lambda x: x.strip())

In [67]:
new_df['race'].unique()

array(['White', 'Black or African American', 'Asian',
       'Native Hawaiian or Pacific Islander', 'Decline to state',
       'Two or more races', 'American Indian or Alaskan Native', 'ASIAN',
       'WHITE', 'Two or More Races', 'white', 'asian',
       'Two or more Races', 'decline to state',
       'black or African American', 'American Indian Or Alaskan Native',
       'Two Or More Races', 'Decline to State'], dtype=object)

In [68]:
new_df['gender']=new_df['gender'].apply(lambda x: x.strip())

In [69]:
new_df['gender'].unique()

array(['Female', 'Male', 'Decline to state', 'Non-binary',
       'Not specified', 'decline to state', 'Not Specified', 'male',
       'female', 'Decline to State', 'FEMALE', 'MALE'], dtype=object)