In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

plt.style.use('fivethirtyeight')
sns.set_context("notebook")

Now let's play around a bit with the large baby names dataset we saw in lecture 1. We'll start by loading that dataset from the social security administration's website.

To keep the data small enough to avoid crashing datahub, we're going to look at only California rather than looking at the national dataset.

In [2]:
import urllib.request
import os.path
import zipfile

data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "babynamesbystate.zip"
if not os.path.exists(local_filename): # if the data exists don't download again
    with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
        f.write(resp.read())

zf = zipfile.ZipFile(local_filename, 'r')

ca_name = 'CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
with zf.open(ca_name) as fh:
    babynames = pd.read_csv(fh, header=None, names=field_names)

babynames.sample(5)

Unnamed: 0,State,Sex,Year,Name,Count
217517,CA,F,2017,Aubree,267
174791,CA,F,2006,Chantelle,10
41798,CA,F,1959,Danae,6
309690,CA,M,1992,Dawit,6
255002,CA,M,1958,Walt,13


Goal 1: Find the most popular baby name in California in 2018

In [3]:
babynames[babynames["Year"] == 2018].sort_values(by = "Count", ascending = False).head(5)

Unnamed: 0,State,Sex,Year,Name,Count
221131,CA,F,2018,Emma,2722
378377,CA,M,2018,Noah,2555
221132,CA,F,2018,Mia,2484
221133,CA,F,2018,Olivia,2456
378378,CA,M,2018,Liam,2405


Goal 2: Baby names that start with j. 

In [47]:
starts_with_j = babynames["Name"].str.startswith('J')
starts_with_j.head(10)

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: Name, dtype: bool

In [48]:
pd.DataFrame(babynames[starts_with_j]["Name"].unique()).sample(10)

Unnamed: 0,0
817,Jennessy
1219,Jaziel
861,Jeylin
155,Jolie
618,Jadyn
626,Jaina
529,Jasminne
619,Jailene
1518,Jasir
138,Justina


Goal 2: Sort names by their length.

Approach 1a: Create a new series of only the lengths. Then add that series to the dataframe as a column. Then sort by that column. Then drop that column.

In [46]:
#create a new series of only the lengths
babyname_lengths = babynames["Name"].map(len)

#add that series to the dataframe as a column
babynames["name_lengths"] = babyname_lengths

#sort by that column
babynames_by_length = babynames.sort_values(by = "name_length")

#drop that column
babynames_by_length = babynames_by_length.drop("name_length", 1)
babynames_by_length.head(5)

Unnamed: 0,State,Sex,Year,Name,Count,name_lengths
118370,CA,F,1991,Vy,11,2
69928,CA,F,1974,Jo,29,2
288078,CA,M,1982,Rj,5,2
254904,CA,M,1958,Ty,18,2
289422,CA,M,1983,Cy,7,2


Approach 1b: Same as 1a, but use str.len() to generate the lengths of the strings.

In [49]:
#create a new series of only the lengths
babyname_lengths = babynames["Name"].str.len()

#add that series to the dataframe as a column
babynames["name_lengths"] = babyname_lengths

#sort by that column
babynames_by_length = babynames.sort_values(by = "name_length")

#drop that column
babynames_by_length = babynames_by_length.drop("name_length", 1)
babynames_by_length.head(5)

Unnamed: 0,State,Sex,Year,Name,Count,name_lengths
118370,CA,F,1991,Vy,11,2
69928,CA,F,1974,Jo,29,2
288078,CA,M,1982,Rj,5,2
254904,CA,M,1958,Ty,18,2
289422,CA,M,1983,Cy,7,2


Approach 2: Generate an index that is in the order we want. Pass that index to loc.

In [57]:
babynames.loc[babynames["Name"].str.len().sort_values().index].head(5)

Unnamed: 0,State,Sex,Year,Name,Count,name_length,name_lengths
118370,CA,F,1991,Vy,11,2,2
69928,CA,F,1974,Jo,29,2,2
288078,CA,M,1982,Rj,5,2,2
254904,CA,M,1958,Ty,18,2,2
289422,CA,M,1983,Cy,7,2,2


How does this work exactly? Let's break it into pieces.

In [58]:
lengths_sorted_by_length = babynames["Name"].str.len().sort_values()
lengths_sorted_by_length.head(5)

118370    2
69928     2
288078    2
254904    2
289422    2
Name: Name, dtype: int64

In [60]:
index_sorted_by_length = lengths_sorted_by_length.index
index_sorted_by_length

Int64Index([118370,  69928, 288078, 254904, 289422,  87850, 125467, 139792,
            268976, 125511,
            ...
            324728, 297806, 312618, 307053, 329695, 322733, 322558, 299240,
            319423, 312731],
           dtype='int64', length=381214)

In [64]:
# now pass the index to loc. This is yet another way 
# that loc can be used that we did not discuss in lecture 4.
babynames.loc[index_sorted_by_length].head(5)

Unnamed: 0,State,Sex,Year,Name,Count,name_length,name_lengths
118370,CA,F,1991,Vy,11,2,2
69928,CA,F,1974,Jo,29,2,2
288078,CA,M,1982,Rj,5,2,2
254904,CA,M,1958,Ty,18,2,2
289422,CA,M,1983,Cy,7,2,2


Goal 4: Name whose popularity has changed the most. Also tough.

First we need to define change in popularity. For simplicity, for this problem, let's define it as the difference between a name's maximum occurrence and minimum occurrence. For example, let's consider the name Jennifer.

In [67]:
jennifer_counts = babynames.query("Name == 'Jennifer'")["Count"]
jennifer_counts.head(5)

13609     5
16322     5
16990     6
17530    13
18206    24
Name: Count, dtype: int64

In [68]:
max(jennifer_counts) - min(jennifer_counts)

6061

In [73]:
def domain_size(series):
    return max(series) - min(series)

In [74]:
domain_size(jennifer_counts)

6061

In [77]:
name_domain_sizes = babynames.groupby("Name").agg(domain_size)
name_domain_sizes.head(5)

Unnamed: 0_level_0,Year,Count,name_length,name_lengths
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aadan,6,2,0,0
Aaden,11,138,0,0
Aadhav,3,2,0,0
Aadhira,1,4,0,0
Aadhya,11,45,0,0


In [76]:
name_domain_sizes.sort_values("Count", ascending=False)

Unnamed: 0_level_0,Year,Count,name_length,name_lengths
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Michael,108,8257,0,0
David,108,8111,0,0
John,108,7320,0,0
Robert,108,7095,0,0
Jessica,81,6946,0,0
Linda,106,6754,0,0
Christopher,103,6714,0,0
Jennifer,84,6061,0,0
Daniel,108,5761,0,0
Mark,106,5538,0,0


These goals are hard with our tools so far. Will discuss next ime.