# Name Map

Continue to explore the SSA name data by plotting name rankings by state on a choropleth map. The SSA name data is only available at the state level, so there is no need for a complex map with many geo features - just the state outlines are needed.

Input box for manually inputting one name.
Slider to select year to display so you can animate ranking by state over time.
Use min, max ranking of name over full date range (1910,2020) to set min, max color ranges.


In [1]:
import pandas as pd
import plotly.express as px
import json

In [2]:
# Read the state-outline GeoJSON file into a plain Python dict.
# NOTE: files_path is an absolute path on the author's machine.
files_path = 'E:/UserLo/source/repos/learning/Name Surfer/'
states_geojson = "gz_2010_us_040_00_500k.json"
with open(f"{files_path}{states_geojson}", "r") as f_h:
    states_geojson_dict = json.loads(f_h.read())

In [73]:
# Show the top-level keys of the parsed GeoJSON dict
states_geojson_dict.keys()

dict_keys(['type', 'features'])

In [74]:
# Show the value stored under the 'type' key
states_geojson_dict['type']

'FeatureCollection'

In [96]:
# Inspect the first feature: a dict with 'type', 'properties' and 'geometry' (the coordinate list is long)
states_geojson_dict['features'][0]

{'type': 'Feature',
 'properties': {'GEO_ID': '0400000US23',
  'STATE': '23',
  'NAME': 'Maine',
  'LSAD': '',
  'CENSUSAREA': 30842.923},
 'geometry': {'type': 'MultiPolygon',
  'coordinates': [[[[-67.619761, 44.519754],
     [-67.61541, 44.521973],
     [-67.587738, 44.516196],
     [-67.582113, 44.513459],
     [-67.589259, 44.50084],
     [-67.590627, 44.49415],
     [-67.580288, 44.488068],
     [-67.562651, 44.472104],
     [-67.569189, 44.455531],
     [-67.571774, 44.453403],
     [-67.574206, 44.45173],
     [-67.588346, 44.449754],
     [-67.592755, 44.458572],
     [-67.604919, 44.502056],
     [-67.607199, 44.503576],
     [-67.614954, 44.503576],
     [-67.619211, 44.506009],
     [-67.619761, 44.519754]]],
   [[[-68.498637, 44.369686],
     [-68.478785, 44.319563],
     [-68.489641, 44.313705],
     [-68.515173, 44.324797],
     [-68.523857, 44.322397],
     [-68.533481, 44.329958],
     [-68.530394, 44.333583],
     [-68.528635, 44.344605],
     [-68.520671, 44.358357],


In [3]:
# Gather the 'properties' dict of every feature in the GeoJSON
state_prop_list = [feature['properties'] for feature in states_geojson_dict['features']]

In [4]:
# One row per feature, ordered by the NAME column
state_prop_df = pd.DataFrame(data=state_prop_list).sort_values(by="NAME")

In [5]:
# Preview the first rows of the feature-properties table
state_prop_df.head()

Unnamed: 0,GEO_ID,STATE,NAME,LSAD,CENSUSAREA
18,0400000US01,1,Alabama,,50645.326
19,0400000US02,2,Alaska,,570640.95
20,0400000US04,4,Arizona,,113594.084
21,0400000US05,5,Arkansas,,52035.477
22,0400000US06,6,California,,155779.22


In [6]:
# State names and their two-letter abbreviations (50 states plus the District of Columbia).
# Written as (name, abbreviation) pairs so each name sits next to its abbreviation;
# zip() then splits the pairs back into the two parallel lists stored under
# the 'Name' and 'Abbreviation' keys.
all_state_names_dict = dict(zip(
    ('Name', 'Abbreviation'),
    map(list, zip(
        ('Alabama', 'AL'), ('Alaska', 'AK'), ('Arizona', 'AZ'), ('Arkansas', 'AR'),
        ('California', 'CA'), ('Colorado', 'CO'), ('Connecticut', 'CT'), ('Delaware', 'DE'),
        ('District of Columbia', 'DC'), ('Florida', 'FL'), ('Georgia', 'GA'), ('Hawaii', 'HI'),
        ('Idaho', 'ID'), ('Illinois', 'IL'), ('Indiana', 'IN'), ('Iowa', 'IA'),
        ('Kansas', 'KS'), ('Kentucky', 'KY'), ('Louisiana', 'LA'), ('Maine', 'ME'),
        ('Maryland', 'MD'), ('Massachusetts', 'MA'), ('Michigan', 'MI'), ('Minnesota', 'MN'),
        ('Mississippi', 'MS'), ('Missouri', 'MO'), ('Montana', 'MT'), ('Nebraska', 'NE'),
        ('Nevada', 'NV'), ('New Hampshire', 'NH'), ('New Jersey', 'NJ'), ('New Mexico', 'NM'),
        ('New York', 'NY'), ('North Carolina', 'NC'), ('North Dakota', 'ND'), ('Ohio', 'OH'),
        ('Oklahoma', 'OK'), ('Oregon', 'OR'), ('Pennsylvania', 'PA'), ('Rhode Island', 'RI'),
        ('South Carolina', 'SC'), ('South Dakota', 'SD'), ('Tennessee', 'TN'), ('Texas', 'TX'),
        ('Utah', 'UT'), ('Vermont', 'VT'), ('Virginia', 'VA'), ('Washington', 'WA'),
        ('West Virginia', 'WV'), ('Wisconsin', 'WI'), ('Wyoming', 'WY'),
    )),
))


In [7]:
# Two-column table (Name, Abbreviation) built from all_state_names_dict
states_df = pd.DataFrame.from_dict(all_state_names_dict)
states_df.head()

Unnamed: 0,Name,Abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [8]:
# The Name column must be the index so that it can be used as the right-hand key of DataFrame.join
states_iname_df = states_df.set_index('Name')
states_iname_df.head()

Unnamed: 0_level_0,Abbreviation
Name,Unnamed: 1_level_1
Alabama,AL
Alaska,AK
Arizona,AZ
Arkansas,AR
California,CA


In [23]:
# Left-join the abbreviations onto the GeoJSON properties and list the rows left without a match.
# Puerto Rico is in the GeoJSON but not in the list of states that have name data,
# so it has to be removed from states_geojson_dict.
state_prop_abb_df_l = state_prop_df.merge(
    states_iname_df, left_on='NAME', right_index=True, how='left')
state_prop_abb_df_l.loc[state_prop_abb_df_l.isna().any(axis='columns')]

Unnamed: 0,GEO_ID,STATE,NAME,LSAD,CENSUSAREA,Abbreviation
16,0400000US72,72,Puerto Rico,,3423.775,


In [28]:
# Count the missing values in each column of the left-joined table
state_prop_abb_df_l.isna().sum()

GEO_ID          0
STATE           0
NAME            0
LSAD            0
CENSUSAREA      0
Abbreviation    1
dtype: int64

## Remove Puerto Rico from states_geojson_dict

Since the SSA doesn't provide name occurrences for Puerto Rico, remove it from states_geojson_dict.


In [16]:
# The value of the 'features' key is a list of 52 dicts - 50 states plus Washington DC and Puerto Rico.
# There is data for DC so it can stay, but PR (index 16 in the joined table) needs to be removed.
len(states_geojson_dict['features'])

52

In [17]:
# Pop PR out of the list.
# Locate it by NAME rather than by a hard-coded position, so that re-running this cell
# (or loading a GeoJSON with a different feature order) can never remove another state.
pr_index = next(
    (position for position, feature in enumerate(states_geojson_dict['features'])
     if feature['properties']['NAME'] == 'Puerto Rico'),
    None)
# pr_dict is None when Puerto Rico has already been removed
pr_dict = states_geojson_dict['features'].pop(pr_index) if pr_index is not None else None
pr_dict

{'type': 'Feature',
 'properties': {'GEO_ID': '0400000US72',
  'STATE': '72',
  'NAME': 'Puerto Rico',
  'LSAD': '',
  'CENSUSAREA': 3423.775},
 'geometry': {'type': 'MultiPolygon',
  'coordinates': [[[[-65.587335, 18.381994],
     [-65.591215, 18.383793],
     [-65.592667, 18.387243],
     [-65.593393, 18.391237],
     [-65.588128, 18.391237],
     [-65.584537, 18.386939],
     [-65.583571, 18.383751],
     [-65.587335, 18.381994]]],
   [[[-67.477976, 18.378945],
     [-67.485499, 18.382224],
     [-67.489346, 18.387441],
     [-67.482204, 18.39147],
     [-67.477922, 18.390544],
     [-67.472368, 18.382801],
     [-67.477976, 18.378945]]],
   [[[-65.572248, 18.381757],
     [-65.568979, 18.382125],
     [-65.566355, 18.378212],
     [-65.571788, 18.375623],
     [-65.572248, 18.381757]]],
   [[[-65.571523, 18.351635],
     [-65.569013, 18.352794],
     [-65.565744, 18.349524],
     [-65.565072, 18.347499],
     [-65.568702, 18.342399],
     [-65.571523, 18.351635]]],
   [[[-65.266169

In [14]:
# Inner-join keeps only the features that have an abbreviation (drops Puerto Rico)
state_prop_abb_df = state_prop_df.merge(
    states_iname_df, left_on='NAME', right_index=True, how='inner')
state_prop_abb_df.head()

Unnamed: 0,GEO_ID,STATE,NAME,LSAD,CENSUSAREA,Abbreviation
18,0400000US01,1,Alabama,,50645.326,AL
19,0400000US02,2,Alaska,,570640.95,AK
20,0400000US04,4,Arizona,,113594.084,AZ
21,0400000US05,5,Arkansas,,52035.477,AR
22,0400000US06,6,California,,155779.22,CA


In [115]:
# Look up the row whose GEO_ID is Arizona's
state_prop_abb_df.loc[state_prop_abb_df['GEO_ID'].eq('0400000US04')]

Unnamed: 0,GEO_ID,STATE,NAME,LSAD,CENSUSAREA,Abbreviation
20,0400000US04,4,Arizona,,113594.084,AZ


In [18]:
# Each row of state_prop_abb_df becomes a dict that will replace 'properties' in
# states_geojson_dict, so the feature also carries the 'Abbreviation' item.
# Try it with Arizona first.
state_prop_abb_dict = (
    state_prop_abb_df
    .loc[state_prop_abb_df['GEO_ID'].eq('0400000US04')]
    .to_dict(orient='records')[0]
)
state_prop_abb_dict

{'GEO_ID': '0400000US04',
 'STATE': '04',
 'NAME': 'Arizona',
 'LSAD': '',
 'CENSUSAREA': 113594.084,
 'Abbreviation': 'AZ'}

In [19]:
# Put each row in state_prop_abb_df into a dict that replaces 'properties' in states_geojson_dict,
# so every feature also contains the 'Abbreviation' item. Perform for all states.
# The rows are indexed by GEO_ID once, instead of re-filtering the DataFrame for every feature.
props_by_geo_id = {
    record['GEO_ID']: record
    for record in state_prop_abb_df.to_dict(orient='records')
}
for s_dict in states_geojson_dict['features']:
    # Match up the GEO_ID values so the matching one is replaced.
    # A KeyError here names a feature that has no row in state_prop_abb_df.
    new_prop_dict = props_by_geo_id[s_dict['properties']['GEO_ID']]
    s_dict['properties'] = new_prop_dict

In [21]:
# See if it worked for AZ (position 19 in the features list at this point)
states_geojson_dict['features'][19]

{'type': 'Feature',
 'properties': {'GEO_ID': '0400000US04',
  'STATE': '04',
  'NAME': 'Arizona',
  'LSAD': '',
  'CENSUSAREA': 113594.084,
  'Abbreviation': 'AZ'},
 'geometry': {'type': 'Polygon',
  'coordinates': [[[-109.045223, 36.999084],
    [-109.045244, 36.969489],
    [-109.045272, 36.968871],
    [-109.045407, 36.874998],
    [-109.045433, 36.874589],
    [-109.045431, 36.500001],
    [-109.046183, 36.181751],
    [-109.045729, 36.117028],
    [-109.045973, 36.002338],
    [-109.046011, 35.925896],
    [-109.046054, 35.92586],
    [-109.046055, 35.888721],
    [-109.046024, 35.8798],
    [-109.046295, 35.616517],
    [-109.046296, 35.614251],
    [-109.046509, 35.54644],
    [-109.046481, 35.546326],
    [-109.046796, 35.363606],
    [-109.046084, 35.250025],
    [-109.045851, 34.959718],
    [-109.046072, 34.828566],
    [-109.045624, 34.814226],
    [-109.046104, 34.799981],
    [-109.045363, 34.785406],
    [-109.046086, 34.771016],
    [-109.046156, 34.579291],
    [-109.

## Save clean GeoJSON

Now that Puerto Rico has been removed from states_geojson_dict and the two-letter state abbreviations have been added to the 'properties', dump states_geojson_dict as JSON text and write it to a file for safekeeping.


In [29]:
# Save the clean states_geojson_dict with the added state abbreviations to a GeoJSON file for safekeeping
files_path = 'E:/UserLo/source/repos/learning/Name Surfer/'
states_geojson = "gz_2010_us_040_00_500k_removePR_added_state_abbreviations.json"
# Serialize before opening the file: opening in "w" mode truncates it, so a failure in
# json.dumps must not be able to leave an empty file behind.
states_geojson_clean = json.dumps(states_geojson_dict)
with open(files_path+states_geojson, "w") as f_h:
    f_h.write(states_geojson_clean)

## Create name rankings for plotting on choropleth

Let's compute name rankings for all years, states and names into one (large) multi-index Series so that when it comes to plotting it will just be a simple matter of selecting the data. Also, the ranking Series can be saved as a feather binary for faster loading than dumping it as JSON or some other format.

**Multi-index Series** is a good data structure for this type of info. (Ignore sex for now. It will slightly increase the occurrence count for names that are common to  males and females.) See <https://pandas.pydata.org/docs/user_guide/advanced.html>

Index('Year','StateAbbreviation', 'Name')  Rank

Name will be the superset of all names for the entire range of years.



In [31]:
from pathlib import Path
import numpy as np

In [74]:
# Let's play with multi-index for a small range of indices populated with random ints
from numpy.random import default_rng
# Fixed seed so the demo values are the same on every run
RNG_SEED = 42
rng = default_rng(RNG_SEED)

years = [str(year) for year in range(1910,1915,1)]
st_abb = ['AK','AL','AR']
names = ['John','Paul', 'George']
# One random value per (year, state, name) combination
n_vals = len(years)*len(st_abb)*len(names)
vals = rng.integers(low=0, high=1000, size=n_vals)

multi_idx = pd.MultiIndex.from_product([years, st_abb,names],
                           names=['years', 'st_abb','names'])
rank_series = pd.Series(vals,name='Rank',index = multi_idx)

In [75]:
# Preview the first five entries of the demo Series
rank_series.head(5)

years  st_abb  names 
1910   AK      John      445
               Paul      157
               George    120
       AL      John      813
               Paul      302
Name: Rank, dtype: int64

In [76]:
# We will eventually plot the rank for all states for a particular name. Easy to get the data using a multi-index slice:
# fix the 'years' and 'names' levels and take every value of 'st_abb'.
idx = pd.IndexSlice
rank_series.loc[idx['1911',:,'John']]

st_abb
AK    402
AL    119
AR    668
Name: Rank, dtype: int64

### Load name occurrence files

In [35]:
def load_name_files(file_limit=np.inf, files_dir=None):
    """
    Read the names-by-state text files into one pandas DataFrame.

    The files have no header row, so the column names are supplied here.

    Parameters
    ----------
    file_limit : int or float, optional
        Maximum number of files to read. The default (np.inf) reads every file.
    files_dir : str or pathlib.Path, optional
        Directory that holds the '*.txt' files. When omitted, the local
        NamesByState folder hard-coded below is used.

    Returns
    -------
    pandas.DataFrame
        Columns ['State', 'Sex', 'Year', 'Name', 'NumOccurrences'], with 'Year'
        and 'NumOccurrences' cast to int32. Row labels are kept as read, so they
        restart at 0 for each file. An empty frame with the same columns is
        returned when no file is read.
    """
    columns = ['State', 'Sex', 'Year', 'Name', 'NumOccurrences']

    if files_dir is None:
        files_dir = 'E:/UserLo/source/repos/learning/Name Surfer/NamesByState'
        #'https://github.com/MrLRTripp/NameSurfer/tree/main/NamesByState')
    # Sort so that file_limit always selects the same files, whatever order glob yields
    files_list = sorted(Path(files_dir).glob('*.txt'))

    # Collect the per-file frames and concatenate once; growing a DataFrame with
    # concat inside the loop re-copies all rows read so far on every iteration.
    state_frames = []
    for file_count, f in enumerate(files_list):
        if file_count >= file_limit:
            break
        state_frames.append(pd.read_csv(f, header=None, names=columns))

    if state_frames:
        names_df = pd.concat(state_frames, axis=0)
    else:
        names_df = pd.DataFrame(columns=columns)

    names_df = names_df.astype({'Year': 'int32', 'NumOccurrences': 'int32'})
    return names_df

In [77]:
# Load only the first three state files to keep the experiments fast
names_3_df = load_name_files(file_limit=3)

### Generate ranks for each state

The function **compute_for_year_ranges** is going to work out very nicely.

By giving it the full range start, stop, step, you will get all the names and ranks. The names will be the superset of all names in the 
SSA occurrence files for the state and sexes. 

Call **compute_for_year_ranges** for each state. Then transform the results into the desired multi-index Series

Index('Year','StateAbbreviation', 'Name')  Rank

The color range will be the min and max values over all states for a particular year and name

In [78]:
# Functions used to create name rank history DataFrame
def compute_name_occurences(df, states, sexes, years):
    """
    Total the occurrences of every name within the selected states, sexes and years.

    df is the names DataFrame that has columns
    ['State', 'Sex', 'Year', 'Name', 'NumOccurrences'], Year and NumOccurrences are int32
    states, sexes and years can be list, range, or set

    Returns a DataFrame with columns ['Name', 'NumOccurrences'], one row per
    distinct name in the selection, sorted by NumOccurrences in descending order
    with a fresh 0..n-1 index.

    Note:
    Between 10% and 20% Male and Female names are the same so be aware when using both sexes
    """
    selected = (
        df['State'].isin(states)
        & df['Sex'].isin(sexes)
        & df['Year'].isin(years)
    )

    # Sum only NumOccurrences: totalling the other columns (State, Sex, Year) is
    # meaningless and is discarded by the final column selection anyway.
    name_occurrences_df = (
        df.loc[selected]
        .groupby(by=['Name'])[['NumOccurrences']]
        .sum()
        .sort_values('NumOccurrences', ascending=False)
        .reset_index()
    )

    return name_occurrences_df[['Name', 'NumOccurrences']]

In [79]:
def compute_for_year_ranges(df, year_range, states, sexes):
    """
    Rank every name by number of occurrences for each window of years.

    All the names with the same count will get the same rank. There are many names that have same count.
    A name's rank is 1 plus the number of names with a strictly higher count.

    df is the names DataFrame that has columns
    ['State', 'Sex', 'Year', 'Name', 'NumOccurrences'], Year and NumOccurrences are int32

    year_range must be a range object. Each value it yields starts a window of
    year_range.step consecutive years; the window is not clipped to
    year_range.stop, so the last one can extend past it.

    states is a list of state codes, or ['All'] to use every state found in df.
    sexes is passed through to compute_name_occurences.

    Returns a DataFrame indexed by Name (sorted) with one column per window,
    labelled with the window's start year as a string. A name with no
    occurrences in a window is NaN there. An empty DataFrame is returned when
    year_range is empty.
    """
    # list() so that an array-like holding the single value 'All' is also accepted
    if list(states) == ['All']:
        states = df['State'].unique()  # All the states in the input DataFrame

    # TODO: if year_range == 'All' then set range(1900,2020)

    yearly_rank_frames = []  # one single-column frame of ranks per window
    for yr in year_range:
        # yr is the start year of the window; the window itself always steps by 1
        sub_year_range = range(yr, yr+year_range.step, 1)

        # Find number of occurrences for each name in this window
        name_occurrences_df = compute_name_occurences(
            df, states, sexes, sub_year_range)

        # method='min' gives tied names the same rank and makes the next distinct
        # count skip ahead by the number of tied names.
        window_ranks = (
            name_occurrences_df
            .set_index('Name')['NumOccurrences']
            .rank(method='min', ascending=False)
            .astype('int64')
        )
        yearly_rank_frames.append(window_ranks.to_frame(name=f'{yr}'))

    if not yearly_rank_frames:
        return pd.DataFrame()

    # Align all windows on the union of names in one step.
    # Don't replace NaN with 0. Plotly handles nan, by just skipping those values which is what we want.
    name_rank_year_ranges_df = pd.concat(yearly_rank_frames, axis=1).sort_index()

    return name_rank_year_ranges_df

In [105]:

# Rank the AK names for each single year from 1911 through 1920
name_rank_AK_yr = compute_for_year_ranges(
    names_3_df, range(1911, 1921), ['AK'], ['M', 'F'])

In [119]:
# Keep the first five names that have a rank in every year (no NaN in the row)
sample_5_AK_df = name_rank_AK_yr.dropna(axis=0, how='any').head(5)
sample_5_AK_df

Unnamed: 0_level_0,1911,1912,1913,1914,1915,1916,1917,1918,1919,1920
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
George,11.0,3.0,5.0,12.0,16.0,5.0,5.0,13.0,3.0,6.0
Helen,6.0,6.0,8.0,12.0,6.0,5.0,7.0,8.0,11.0,7.0
James,3.0,14.0,5.0,3.0,11.0,4.0,7.0,15.0,3.0,7.0
John,1.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0
Margaret,4.0,14.0,5.0,4.0,11.0,18.0,5.0,5.0,5.0,3.0


In [120]:
# The sampled names, as a plain list
name_idx_list = sample_5_AK_df.index.tolist()
name_idx_list

['George', 'Helen', 'James', 'John', 'Margaret']

In [124]:
# Build a two-level index with the state as the first level and the name as the second
state_name_idx = pd.MultiIndex.from_arrays(
    [['AK'] * len(name_idx_list), name_idx_list],
    names=['st_abb', 'names'])

state_name_idx

MultiIndex([('AK',   'George'),
            ('AK',    'Helen'),
            ('AK',    'James'),
            ('AK',     'John'),
            ('AK', 'Margaret')],
           names=['st_abb', 'names'])

In [125]:
# Swap the name-only index for the (state, name) index
sample_5_AK_df = sample_5_AK_df.set_axis(state_name_idx, axis=0)
sample_5_AK_df

Unnamed: 0_level_0,Unnamed: 1_level_0,1911,1912,1913,1914,1915,1916,1917,1918,1919,1920
st_abb,names,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AK,George,11.0,3.0,5.0,12.0,16.0,5.0,5.0,13.0,3.0,6.0
AK,Helen,6.0,6.0,8.0,12.0,6.0,5.0,7.0,8.0,11.0,7.0
AK,James,3.0,14.0,5.0,3.0,11.0,4.0,7.0,15.0,3.0,7.0
AK,John,1.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0
AK,Margaret,4.0,14.0,5.0,4.0,11.0,18.0,5.0,5.0,5.0,3.0


In [126]:
# Cross-section on both index levels: the AK row for James
sample_5_AK_df.xs(("AK", "James"), level=("st_abb", "names"), axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,1911,1912,1913,1914,1915,1916,1917,1918,1919,1920
st_abb,names,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AK,James,3.0,14.0,5.0,3.0,11.0,4.0,7.0,15.0,3.0,7.0


In [129]:
# Rank the same names in AR, index them by (state, name) and stack them under the AK sample
name_rank_AR_yr = compute_for_year_ranges(
    names_3_df, range(1911, 1921), ['AR'], ['M', 'F'])
sample_5_AR_df = name_rank_AR_yr.loc[name_idx_list]
state_name_idx = pd.MultiIndex.from_arrays(
    [['AR'] * len(sample_5_AR_df.index), sample_5_AR_df.index.tolist()],
    names=['st_abb', 'names'])
sample_5_AR_df = sample_5_AR_df.set_axis(state_name_idx, axis=0)

result_df = pd.concat([sample_5_AK_df, sample_5_AR_df])

result_df

Unnamed: 0_level_0,Unnamed: 1_level_0,1911,1912,1913,1914,1915,1916,1917,1918,1919,1920
st_abb,names,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AK,George,11.0,3.0,5.0,12.0,16.0,5.0,5.0,13.0,3.0,6.0
AK,Helen,6.0,6.0,8.0,12.0,6.0,5.0,7.0,8.0,11.0,7.0
AK,James,3.0,14.0,5.0,3.0,11.0,4.0,7.0,15.0,3.0,7.0
AK,John,1.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0
AK,Margaret,4.0,14.0,5.0,4.0,11.0,18.0,5.0,5.0,5.0,3.0
AR,George,7.0,10.0,9.0,10.0,8.0,11.0,11.0,10.0,10.0,12.0
AR,Helen,13.0,9.0,14.0,13.0,11.0,14.0,9.0,12.0,11.0,10.0
AR,James,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
AR,John,5.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0
AR,Margaret,41.0,28.0,18.0,21.0,20.0,26.0,19.0,15.0,20.0,16.0


In [167]:
def concat_state_names_rank(names_df, year_range, state_list, name_sample=None):
    """
    Build one (state, name)-indexed table of name ranks covering several states.

    For every state in state_list, compute_for_year_ranges is called for that
    single state and both sexes ('M' and 'F'); the rows for the names in
    name_sample are kept and stacked under a ('st_abb', 'names') MultiIndex.

    names_df and year_range are passed straight to compute_for_year_ranges.
    state_list is an iterable of state codes.
    name_sample is an optional list of names to keep; it defaults to
    ['George', 'Helen', 'James', 'John', 'Margaret'].

    Returns a DataFrame with one row per (state, name) and the columns produced
    by compute_for_year_ranges. A sampled name that a state does not have gets a
    row of NaN rather than raising KeyError. An empty state_list returns an
    empty frame that still carries the ('st_abb', 'names') index.
    """
    if name_sample is None:
        name_sample = ['George', 'Helen', 'James', 'John', 'Margaret']
    name_sample = list(name_sample)

    # Collect the per-state frames and concatenate once after the loop
    state_frames = []
    for state in state_list:
        # compute_for_year_ranges expects a list of states so convert individual state into a list
        name_rank_1_state_df = compute_for_year_ranges(names_df, year_range, [state], ['M','F'])
        # reindex (unlike .loc) tolerates names that are absent for this state
        sample_df = name_rank_1_state_df.reindex(name_sample)
        sample_df.index = pd.MultiIndex.from_product([[state], name_sample],
                                   names=['st_abb','names'])
        state_frames.append(sample_df)

    if not state_frames:
        # Same empty, MultiIndexed result a caller would get from an empty state_list
        return pd.DataFrame(
            index=pd.MultiIndex.from_arrays([[],[]], names=['st_abb','names']))

    return pd.concat(state_frames)

In [156]:
# Preview the raw occurrence rows loaded from the three files
names_3_df.head()

Unnamed: 0,State,Sex,Year,Name,NumOccurrences
0,AK,F,1910,Mary,14
1,AK,F,1910,Annie,12
2,AK,F,1910,Anna,10
3,AK,F,1910,Margaret,8
4,AK,F,1910,Helen,7


In [157]:
# The distinct state codes present in the loaded data, in order of first appearance
state_list = names_3_df['State'].unique().tolist()
state_list

['AK', 'AL', 'AR']

In [168]:
# Rank the sampled names in every loaded state for 1911 through 1920
result_df = concat_state_names_rank(
    names_3_df, range(1911, 1921), state_list)

In [169]:
# Display the combined (state, name) rank table
result_df

Unnamed: 0_level_0,Unnamed: 1_level_0,1911,1912,1913,1914,1915,1916,1917,1918,1919,1920
st_abb,names,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AK,George,11.0,3.0,5.0,12.0,16.0,5.0,5.0,13.0,3.0,6.0
AK,Helen,6.0,6.0,8.0,12.0,6.0,5.0,7.0,8.0,11.0,7.0
AK,James,3.0,14.0,5.0,3.0,11.0,4.0,7.0,15.0,3.0,7.0
AK,John,1.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0
AK,Margaret,4.0,14.0,5.0,4.0,11.0,18.0,5.0,5.0,5.0,3.0
AL,George,10.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
AL,Helen,66.0,39.0,46.0,35.0,40.0,37.0,30.0,25.0,22.0,18.0
AL,James,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
AL,John,4.0,4.0,3.0,3.0,3.0,5.0,3.0,3.0,4.0,3.0
AL,Margaret,22.0,27.0,14.0,23.0,13.0,17.0,11.0,9.0,13.0,10.0


In [170]:
# Slice out one name: the rows that remain are that name's rankings for every state and year
result_df.xs("James", level="names")

Unnamed: 0_level_0,1911,1912,1913,1914,1915,1916,1917,1918,1919,1920
st_abb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AK,3.0,14.0,5.0,3.0,11.0,4.0,7.0,15.0,3.0,7.0
AL,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
AR,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [171]:
# The color should be scaled over all states: take the extremes across every state and year for the name
james_rank_by_state = result_df.xs("James", level="names")
color_min = james_rank_by_state.min().min()
color_max = james_rank_by_state.max().max()
print (f'{color_min=}  {color_max=}\nNote: Need to invert colors and legend since rank of 1.0 is highest.')

color_min=1.0  color_max=15.0
Note: Need to invert colors and legend since rank of 1.0 is highest.


In [172]:
# Then select a particular year to display
result_df.xs("James", level="names")['1918']

st_abb
AK    15.0
AL     2.0
AR     1.0
Name: 1918, dtype: float64

In [145]:
import plotly.express as px

# Load Plotly's bundled election sample data to try out a GeoJSON-based choropleth
df = px.data.election()

In [146]:
# Preview the election sample rows
df.head()

Unnamed: 0,district,Coderre,Bergeron,Joly,total,winner,result,district_id
0,101-Bois-de-Liesse,2481,1829,3024,7334,Joly,plurality,101
1,102-Cap-Saint-Jacques,2525,1163,2675,6363,Joly,plurality,102
2,11-Sault-au-Récollet,3348,2770,2532,8650,Coderre,plurality,11
3,111-Mile-End,1734,4782,2514,9030,Bergeron,majority,111
4,112-DeLorimier,1770,5933,3044,10747,Bergeron,majority,112


In [152]:
# Same election data, indexed by the 'district' column
df_district_df = df.set_index('district')
df_district_df.head()

Unnamed: 0_level_0,Coderre,Bergeron,Joly,total,winner,result,district_id
district,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
101-Bois-de-Liesse,2481,1829,3024,7334,Joly,plurality,101
102-Cap-Saint-Jacques,2525,1163,2675,6363,Joly,plurality,102
11-Sault-au-Récollet,3348,2770,2532,8650,Coderre,plurality,11
111-Mile-End,1734,4782,2514,9030,Bergeron,majority,111
112-DeLorimier,1770,5933,3044,10747,Bergeron,majority,112


In [147]:
# GeoJSON that accompanies the election sample data
geojson = px.data.election_geojson()

In [150]:
# The 'properties' of the first feature show which key the data rows have to match on
geojson['features'][0]['properties']

{'district': '11-Sault-au-Récollet'}

In [151]:
# Election demo: color each district by its "Bergeron" column, matching the rows
# to the GeoJSON features through properties.district
fig = px.choropleth(
    df,
    geojson=geojson,
    locations="district",
    featureidkey="properties.district",
    color="Bergeron",
    projection="mercator",
)
# Zoom to the plotted districts and hide the base map
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()

In [181]:
# Rank of "James" in 1918 for every state, drawn on the built-in USA state map
james_1918_rank = result_df.xs("James", level="names")['1918']

fig = px.choropleth(
    locations=james_1918_rank.index.tolist(),
    locationmode="USA-states",
    color=james_1918_rank,
    # Pin the scale to the name's min/max rank over all states and years, so the
    # colors stay comparable when a different year is displayed
    range_color=(color_min, color_max),
    # Reversed scale: rank 1.0 is the highest rank
    color_continuous_scale="Viridis_r",
    scope="usa")

# Plotly Express draws the colorbar from the layout's shared coloraxis, so the
# title has to be set there rather than on the trace
fig.update_layout(coloraxis_colorbar_title_text='Rank')

fig.show()