# US Museums

Here we would like to investigate the income of US museums.

Data is provided by [Kaggle](https://www.kaggle.com/imls/museum-directory).

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import ipywidgets as wg

from lets_plot import *
from lets_plot.geo_data import *

The geodata is provided by © OpenStreetMap contributors and is made available here under the Open Database License (ODbL).


In [2]:
# Set up Lets-Plot's HTML output for this notebook; run this before displaying any plot.
LetsPlot.setup_html()

## Preparation

In [3]:
# Load the museum directory, keep only the columns used below, and retain the
# museums that report a positive income and have both coordinates.
museums_df = (
    pd.read_csv(
        "https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/museums.csv",
        low_memory=False,
    )
    .rename(columns={
        'State (Administrative Location)': 'State',
        'City (Administrative Location)': 'City',
    })
    .loc[:, ['Museum Name', 'Museum Type', 'State', 'City', 'Longitude', 'Latitude', 'Income']]
    .assign(
        City=lambda frame: frame['City'].str.capitalize(),
        # Missing income is treated as zero, which the next step filters out.
        Income=lambda frame: frame['Income'].fillna(0),
    )
    .loc[lambda frame: frame['Income'] > 0]
    .loc[lambda frame: frame['Latitude'].notna() & frame['Longitude'].notna()]
)

## Total and Mean Income of Museums by State

In [4]:
def _state_income_bars(data, column, label, title, outline):
    """Bar chart of one per-state income statistic, coloured by museum count.

    Parameters
    ----------
    data : DataFrame with 'State', 'size' and `column` columns, already in
        the row order the bars should appear in.
    column : name of the income column to plot on the (log10) y axis.
    label : human-readable name of `column`, used for the y axis and tooltip.
    title : plot title.
    outline : width of the black outline drawn around each bar.

    Returns the Lets-Plot plot specification.
    """
    return (
        ggplot()
        + geom_bar(aes(x='State', y=column, fill='size'), data=data,
                   stat='identity', size=outline, color='black',
                   # Keep one bar per row of `data`; a fixed n would silently
                   # drop bars if the data ever held more states than expected.
                   sampling=sampling_pick(n=max(len(data), 1)),
                   tooltips=layer_tooltips().line('@State')
                                            .format('@' + column, '.3~s')
                                            .line(label + '|@' + column)
                                            .line('number of museums|@size'))
        + scale_fill_gradient(name='number of museums', low='#006d2c', high='#edf8e9')
        + ggtitle(title)
        + scale_y_log10(name=label) + xlab('state')
        + theme_classic() + theme(axis='blank', axis_title=element_text())
    )


# Number of museums, total income and mean income per state.
state_income_df = museums_df.groupby('State').agg(
    size=('Income', 'size'),
    sum=('Income', 'sum'),
    mean=('Income', 'mean')
).reset_index()

# Bars follow the row order of the data, so sort by the plotted statistic
# (state code breaks ties) before building each plot.
state_income_df = state_income_df.sort_values(by=['sum', 'State'])
p1 = _state_income_bars(state_income_df, 'sum', 'total income',
                        'Total Income of Museums by State', outline=2)

state_income_df = state_income_df.sort_values(by=['mean', 'State'])
p2 = _state_income_bars(state_income_df, 'mean', 'mean income',
                        'Mean Income of Museums by State', outline=1)

gggrid([p1, p2], ncol=1) + ggsize(800, 400)

## More Explicit Relation Between Income and Number of Museums

In [5]:
# Order states by museum count before plotting.
state_income_df = state_income_df.sort_values(by='size')

# One scatter plot per income statistic: income on a square-root x axis,
# museum count on the y axis, one point per state (state shown in the tooltip).
p1, p2 = [
    ggplot()
    + geom_point(aes(x=income_column, y='size', fill='State'),
                 data=state_income_df, shape=21, color='white', size=5,
                 tooltips=layer_tooltips().line('@State'))
    + scale_fill_discrete(guide='none')
    + scale_x_continuous(axis_label, trans="sqrt", format='.3~s')
    + ylab('number of museums')
    for income_column, axis_label in (('sum', 'total income'), ('mean', 'mean income'))
]

gggrid([p1, p2])

## Total and Mean Income of Museums by Type

In [6]:
# Number of museums, total income and mean income per museum type.
type_income_df = (
    museums_df
    .groupby('Museum Type')
    .agg(
        size=pd.NamedAgg(column='Income', aggfunc='size'),
        sum=pd.NamedAgg(column='Income', aggfunc='sum'),
        mean=pd.NamedAgg(column='Income', aggfunc='mean'),
    )
    .reset_index()
)

# Bars follow the row order of the data, so sort by total income first.
type_income_df = type_income_df.sort_values(by='sum')

total_tooltips = (
    layer_tooltips()
    .line('^x')
    .format('@sum', '.3~s')
    .line('total income|@sum')
    .line('number of museums|@size')
)
total_bars = geom_bar(aes(x='Museum Type', y='sum', fill='size'), data=type_income_df,
                      stat='identity', color='black', size=3,
                      tooltips=total_tooltips)
p1 = (
    ggplot()
    + total_bars
    + scale_fill_gradient(low='#006d2c', high='#edf8e9')
    + scale_y_log10(name='total income', format='.3~s')
    + xlab('museum type')
    + theme(legend_position='none', axis_text_x='blank', axis_ticks='blank', axis_line='blank')
)

# Re-sort by mean income for the second chart.
type_income_df = type_income_df.sort_values(by='mean')

mean_tooltips = (
    layer_tooltips()
    .line('^x')
    .format('@mean', '.3~s')
    .line('mean income|@mean')
    .line('number of museums|@size')
)
mean_bars = geom_bar(aes(x='Museum Type', y='mean', fill='size'), data=type_income_df,
                     stat='identity', color='black', size=3,
                     tooltips=mean_tooltips)
p2 = (
    ggplot()
    + mean_bars
    + scale_fill_gradient(low='#006d2c', high='#edf8e9')
    + scale_y_log10(name='mean income', format='.3~s')
    + xlab('museum type')
    + theme(legend_position='none', axis_text_x='blank', axis_ticks='blank', axis_line='blank')
)

gggrid([p1, p2])

## Relation Between Income and Number of Museums of Given Type

In [7]:
# Order museum types by museum count before plotting.
type_income_df = type_income_df.sort_values(by='size')

# Total income vs. number of museums, one point per museum type
# (the type is shown in the tooltip).
p1 = (
    ggplot()
    + geom_point(aes(x='sum', y='size', color='Museum Type'),
                 data=type_income_df, size=5,
                 tooltips=layer_tooltips().line('^color'))
    + scale_x_continuous('total income', format='.3~s')
    + scale_y_continuous('number of museums', format='d')
    + scale_color_discrete(name='')
)

# Mean income vs. number of museums. The x axis shows the 'mean' column,
# so it is labelled 'mean income'.
p2 = (
    ggplot()
    + geom_point(aes(x='mean', y='size', color='Museum Type'),
                 data=type_income_df, size=5,
                 tooltips=layer_tooltips().line('^color'))
    + scale_x_continuous('mean income', format='.3~s')
    + scale_y_continuous('number of museums', format='d')
    + scale_color_discrete(guide='none')
)

gggrid([
    p1 + theme(legend_position='none'),
    p2
])

## Museums on Map

Let's start with Washington, D.C.

In [8]:
def plot_museums_on_map(data):
    """Plot the museums in `data` as points on a Lets-Plot live map.

    `data` must provide 'Longitude', 'Latitude', 'Museum Type' and 'Income'
    columns. Points are placed by coordinates, filled by museum type and
    sized by income; the tooltip shows the type, the coordinates and the
    income. Returns the plot specification.
    """
    museum_tooltips = (
        layer_tooltips()
        .title('^fill')
        .format('@Longitude', '.4f').line('longitude|@Longitude')
        .format('@Latitude', '.4f').line('latitude|@Latitude')
        .format('@Income', '.3~s')
        .line('income|@Income')
    )
    museum_points = geom_point(
        aes(x='Longitude', y='Latitude', fill='Museum Type', size='Income'),
        data=data, shape=21, color='black',
        tooltips=museum_tooltips,
    )
    return (
        ggplot()
        + geom_livemap()
        + museum_points
        + scale_size(range=[2, 7], guide="none")
        + scale_fill_discrete(name = "")
        + ggsize(600, 450)
        + theme(legend_position='bottom', legend_direction="vertical")
    )

In [9]:
# Map only the museums whose recorded state is DC.
plot_museums_on_map(museums_df.loc[museums_df['State'] == 'DC'])

There is a problem with the data: some museums are assigned to the wrong state.

We can fix this with geocoding: look up each museum's coordinates in the state boundaries and reassign its state accordingly.

In [10]:
# Table of US state codes; merged with the boundaries below on its 'state' column.
# NOTE(review): presumably this table is what supplies the 'code' column — confirm.
us_sc_df = pd.read_csv('https://raw.githubusercontent.com/JetBrains/lets-plot-docs/refs/heads/master/data/us_state_codes.csv')

# State boundaries from the Lets-Plot geocoder for the 'US-48' request.
# NOTE(review): presumably the contiguous states only (no AK/HI) — confirm.
states_gdf = geocode_states('US-48').get_boundaries(resolution=4)

# Attach the state code to each boundary and keep only the columns used later.
states_df = states_gdf.merge(us_sc_df, on='state')
states_df = states_df[['state', 'code', 'geometry']]

# Wrap the result as a GeoDataFrame with 'geometry' as its geometry column.
states_gdf = gpd.GeoDataFrame(states_df, geometry='geometry')

In [11]:
def find_state(lon, lat):
    """Return the code of the state whose boundary contains (lon, lat).

    The point is looked up in the module-level `states_gdf`. Returns None
    unless exactly one boundary contains the point.
    """
    containing = states_gdf.loc[states_gdf.contains(Point(lon, lat))]
    if len(containing) != 1:
        return None
    return containing.iloc[0].code

def fix_state(record):
    """Replace `record.State` with the state found from the record's coordinates.

    The existing state is kept when `find_state` returns nothing for the
    coordinates. Modifies `record` and returns it.
    """
    located = find_state(record.Longitude, record.Latitude)
    if not located:
        located = record.State
    record.State = located
    return record

# Correct the state of every museum, one row at a time.
museums_df = museums_df.apply(func=fix_state, axis='columns')

Another try:

In [12]:
# Map the museums whose state is DC after the correction.
plot_museums_on_map(museums_df.loc[museums_df['State'] == 'DC'])

Now it looks just as expected.

So then we make the same plots for a few other US states.

In [13]:
def _state_map(state):
    """Museum map for one state code, titled with the code, legend on the right."""
    in_state = museums_df[museums_df.State == state]
    return plot_museums_on_map(in_state) + theme(legend_position='right') + ggtitle(state)


# One map per state, stacked in a single column.
gggrid(list(map(_state_map, ['WA', 'NY', 'MT', 'TX'])), ncol=1)

## Which Museum Types Bring the Greatest Income to the State?

In [14]:
# Income summed for every (state, museum type) pair.
income_by_state_and_type = museums_df.groupby(['State', 'Museum Type'])["Income"].sum()
total_income_df = (
    income_by_state_and_type
    .reset_index()
    .sort_values(by=['State', 'Income'], ascending=[True, False])
    .reset_index(drop=True)
)

# Rows are ordered by descending income within each state, so the first row
# of each state is its highest-income museum type.
top_by_income_museums_df = total_income_df.groupby('State', as_index=False).first()

# Attach the state boundary by matching the museum's state to the boundary's code.
states_with_top_type = top_by_income_museums_df.merge(states_gdf, left_on='State', right_on='code')
top_by_income_museums_gdf = gpd.GeoDataFrame(states_with_top_type, geometry='geometry')

In [15]:
# Tooltip: state, its top museum type, and that type's total income.
top_type_tooltips = (
    layer_tooltips()
    .line('@State')
    .line('most profitable museum type|^fill')
    .format('^alpha', '.3~s')
    .line('total income for current museum type|^alpha')
)

# State polygons filled by top museum type; opacity follows income on a log10 scale.
(
    ggplot()
    + geom_polygon(aes(fill='Museum Type', alpha='Income'),
                   data=top_by_income_museums_gdf, color='white',
                   tooltips=top_type_tooltips)
    + scale_alpha(name='', range=[.2, 1], trans='log10', format='.3~s')
    + ggtitle('Most Profitable Museum Types by State')
    + ggsize(750, 350)
    + theme_classic() + theme(axis='blank')
)