# Lecture 4 –Fall 2023

A demonstration of advanced `pandas` syntax to accompany Lecture 4.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [75]:
import numpy as np
import pandas as pd
import plotly.express as px

## Dataset: California baby names

In today's lecture, we'll work with the `babynames` dataset, which contains information about the names of infants born in California.

The cell below pulls census data from a government website and then loads it into a usable form. The code shown here is outside of the scope of Data 100, but you're encouraged to dig into it if you are interested!

In [95]:
import urllib.request
import os.path
import zipfile

data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "babynamesbystate.zip"
if not os.path.exists(local_filename): # If the data exists don't download again
    with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
        f.write(resp.read())

zf = zipfile.ZipFile(local_filename, 'r')

ca_name = 'CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
with zf.open(ca_name) as fh:
    babynames = pd.read_csv(fh, header=None, names=field_names)

babynames.head()

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134


### Exercises
We want to obtain the first three baby names with `count > 250`.

1.Code this using head()

2.Code this using loc

3.Code this using iloc

4.Code this using []


In [96]:
# Answer Here

babynames[babynames["Count"]>250].head(3)

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
233,CA,F,1911,Mary,390
484,CA,F,1912,Mary,534


In [97]:
# Answer Here
babynames.loc[babynames["Count"]>250].head(3)

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
233,CA,F,1911,Mary,390
484,CA,F,1912,Mary,534


In [98]:
# Answer Here
babynames[babynames["Count"]>250].iloc[:3]

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
233,CA,F,1911,Mary,390
484,CA,F,1912,Mary,534


In [99]:
# Answer Here
display(babynames[["Name", "Count", "State", "Year"]][babynames["Count"]>250].head(3))

Unnamed: 0,Name,Count,State,Year
0,Mary,295,CA,1910
233,Mary,390,CA,1911
484,Mary,534,CA,1912


### `.isin` for Selection based on a list, array, or `Series`

In [100]:
# Note: The parentheses surrounding the code make it possible to break the code into multiple lines for readability
babynames[(babynames["Name"] == "Bella") |
              (babynames["Name"] == "Alex") |
              (babynames["Name"] == "Narges") |
              (babynames["Name"] == "Lisa")]


Unnamed: 0,State,Sex,Year,Name,Count
6289,CA,F,1923,Bella,5
7512,CA,F,1925,Bella,8
12368,CA,F,1932,Lisa,5
14741,CA,F,1936,Lisa,8
17084,CA,F,1939,Lisa,5
...,...,...,...,...,...
399773,CA,M,2019,Alex,438
402648,CA,M,2020,Alex,379
405452,CA,M,2021,Alex,334
408335,CA,M,2022,Alex,345


In [101]:
# A more concise method to achieve the above: .isin
#Answer Here
babynames[babynames["Name"].isin(["Bella","Alex","Nargas", "lisa"])]

Unnamed: 0,State,Sex,Year,Name,Count
6289,CA,F,1923,Bella,5
7512,CA,F,1925,Bella,8
35480,CA,F,1955,Bella,5
43422,CA,F,1960,Alex,7
45526,CA,F,1961,Alex,5
...,...,...,...,...,...
399773,CA,M,2019,Alex,438
402648,CA,M,2020,Alex,379
405452,CA,M,2021,Alex,334
408335,CA,M,2022,Alex,345


### `.str` Functions for Defining a Condition

In [102]:
# What if we only want names that start with "J"?
#Answer Here
babynames[babynames["Name"].str.startswith("J")]

Unnamed: 0,State,Sex,Year,Name,Count
16,CA,F,1910,Josephine,66
44,CA,F,1910,Jean,35
46,CA,F,1910,Jessie,32
59,CA,F,1910,Julia,28
66,CA,F,1910,Juanita,25
...,...,...,...,...,...
413714,CA,M,2023,Jj,5
413715,CA,M,2023,Johnathon,5
413716,CA,M,2023,Jorden,5
413717,CA,M,2023,Jozef,5


# Custom Sort

In [103]:
# Sort a Series Containing Names
babynames["Name"].sort_values()

387660      Aadan
369654      Aadan
372774      Aadan
401876    Aadarsh
388799      Aaden
           ...   
232190      Zyrah
220708      Zyrah
217445      Zyrah
197542      Zyrah
408216      Zyrus
Name: Name, Length: 413894, dtype: object

In [104]:
# Sort a DataFrame – there are lots of Michaels in California
babynames[babynames["Name"]=="Michael"]

Unnamed: 0,State,Sex,Year,Name,Count
16200,CA,F,1938,Michael,7
16853,CA,F,1939,Michael,9
17559,CA,F,1940,Michael,12
18484,CA,F,1941,Michael,8
19060,CA,F,1942,Michael,25
...,...,...,...,...,...
399687,CA,M,2019,Michael,1238
402558,CA,M,2020,Michael,1079
405352,CA,M,2021,Michael,1089
408242,CA,M,2022,Michael,1046


### Approach 1: Create a temporary column

In [105]:
# Create a Series of the length of each name
babynames["name_length"]= babynames["Name"].str.len()

# Add the Series as a new column to the DataFrame

# Sort the DataFrame by the new column
babynames.sort_values("name_length")

Unnamed: 0,State,Sex,Year,Name,Count,name_length
83016,CA,F,1979,Ji,5,2
331174,CA,M,1993,Vu,5,2
298821,CA,M,1978,Al,13,2
277555,CA,M,1962,Ty,55,2
404824,CA,M,2020,Jj,6,2
...,...,...,...,...,...,...
337819,CA,M,1996,Franciscojavier,8,15
325562,CA,M,1991,Franciscojavier,6,15
316193,CA,M,1987,Franciscojavier,5,15
317627,CA,M,1988,Franciscojavier,10,15


In [106]:
# drop new column
babynames=babynames.drop('name_length', axis=1)
babynames

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134
...,...,...,...,...,...
413889,CA,M,2023,Ziah,5
413890,CA,M,2023,Ziaire,5
413891,CA,M,2023,Zidane,5
413892,CA,M,2023,Zyan,5


### Approach 2: Sorting using the `key` argument

---



In [107]:
# Answer Here
babynames.sort_values("Name", key=lambda x: x.str.len())

Unnamed: 0,State,Sex,Year,Name,Count
83016,CA,F,1979,Ji,5
331174,CA,M,1993,Vu,5
298821,CA,M,1978,Al,13
277555,CA,M,1962,Ty,55
404824,CA,M,2020,Jj,6
...,...,...,...,...,...
337819,CA,M,1996,Franciscojavier,8
325562,CA,M,1991,Franciscojavier,6
316193,CA,M,1987,Franciscojavier,5
317627,CA,M,1988,Franciscojavier,10


### Approach 3: Sorting Using the `map` Function

We can also use the Python map function if we want to use an arbitrarily defined function. Suppose we want to sort by the number of occurrences of "dr" plus the number of occurences of "ea".

In [108]:

# Define a function to count occurrences of 'dr' and 'ea'
def dr_ea_count(string):
    return string.count("dr")+ string.count("ea")

# Apply the function to each name in the "Name" column and add as a new column
babynames["dr_ea_count"]= babynames["Name"].map(dr_ea_count)
# Sort the DataFrame by the new column in descending order
babynames=babynames.sort_values(by= "dr_ea_count" , ascending = False ) 
# Display the top rows
babynames.head()

Unnamed: 0,State,Sex,Year,Name,Count,dr_ea_count
115965,CA,F,1990,Deandrea,5,3
311780,CA,M,1985,Deandrea,6,3
108738,CA,F,1988,Deandrea,5,3
131037,CA,F,1994,Leandrea,5,3
101982,CA,F,1986,Deandrea,6,3


In [109]:
# Drop the `dr_ea_count` column
babynames= babynames.drop('dr_ea_count', axis=1)
display(babynames)

Unnamed: 0,State,Sex,Year,Name,Count
115965,CA,F,1990,Deandrea,5
311780,CA,M,1985,Deandrea,6
108738,CA,F,1988,Deandrea,5
131037,CA,F,1994,Leandrea,5
101982,CA,F,1986,Deandrea,6
...,...,...,...,...,...
141303,CA,F,1997,Lindy,6
141302,CA,F,1997,Lilyanna,6
141301,CA,F,1997,Layne,6
141300,CA,F,1997,Latrice,6


## Grouping

Group rows that share a common feature, then aggregate data across the group.

In this example, we count the total number of babies born in each year (considering only a small subset of the data, for simplicity).

<img src="images/groupby.png" width="800"/>

In [110]:
# DataFrame with baby gril names only
babynames[babynames["Sex"]== 'F' ].head()
# Answer Here
#Groupby similar features like year and apply aggregate
babynames= babynames.groupby("Year")["Count"].sum()
display(babynames)

# Answer Here
# Sort by Count

# Sort by Count in descending order

# Answer Here


Year
1910      9163
1911      9983
1912     17946
1913     22094
1914     26926
         ...  
2019    387325
2020    363307
2021    363206
2022    361960
2023    342550
Name: Count, Length: 114, dtype: int64

In [None]:
# print first 10 entries


In [None]:
# the total baby count in each year
# Answer Here


There are many different aggregation functions we can use, all of which are useful in different applications.

In [None]:
# What is the earliest year in which each name appeared?
# Answer Here

In [None]:
# What is the largest single-year count of each name?
# Answer Here

In [None]:
#Can you find the most popular baby name in the state of California (CA) for each year? use idxmax function.
#Provide a list of years along with the corresponding most popular names."
result = babynames.groupby("Year")['Count'].idxmax()
#Answer Here

## Case Study: Name "Popularity"

In this exercise, let's find the name with sex "F" that has dropped most in popularity since its peak usage. We'll start by filtering `babynames` to only include names corresponding to sex "F".

In [None]:
#Answer Here

In [None]:
# We sort the data by year

To build our intuition on how to answer our research question, let's visualize the prevalence of the name "Jennifer" over time.

In [None]:
# We'll talk about how to generate plots in a later lecture
fig = px.line(f_babynames[f_babynames["Name"] == "Jennifer"],
              x = "Year", y = "Count")
fig.update_layout(font_size = 18,
                  autosize=False,
                 width=1000,
                  height=400)

We'll need a mathematical definition for the change in popularity of a name.

Define the metric "ratio to peak" (RTP). We'll calculate this as the count of the name in 2022 (the most recent year for which we have data) divided by the largest count of this name in *any* year.

A demo calculation for Jennifer:

In [None]:
# Find the highest Jennifer 'count'


In [None]:
# Remember that we sorted f_babynames by year.
# This means that grabbing the final entry gives us the most recent count of Jennifers: 114
# In 2022, the most recent year for which we have data, 114 Jennifers were born


In [None]:
# Compute the RTP


We can also write a function that produces the `ratio_to_peak`for a given `Series`. This will allow us to use `.groupby` to speed up our computation for all names in the dataset.

In [None]:
# define the function for RTP
"""
Compute the RTP for a Series containing the counts per year for a single name
"""


In [None]:
# Construct a Series containing our Jennifer count data

# Then, find the RTP using the function define above


Now, let's use `.groupby` to compute the RTPs for *all* names in the dataset.

You may see a warning message when running the cell below. As discussed in lecture, `pandas` can't apply an aggregation function to non-numeric data (it doens't make sense to divide "CA" by a number). By default, `.groupby` will drop any columns that cannot be aggregated.

In [None]:
# Results in a TypeError
#rtp_table = f_babynames.groupby("Name").agg(ratio_to_peak)
#rtp_table

In [None]:
# Find the RTP fro all names at once using groupby as describe in lec slides


To avoid the warning message above, we explicitly extract only the columns relevant to our analysis before using `.agg`.

In [None]:
# Recompute the RTPs, but only performing the calculation on the "Count" column


In [None]:
# Rename "Count" to "Count RTP" for clarity


In [None]:
# What name has fallen the most in popularity?


We can visualize the decrease in the popularity of the name "?:"

In [None]:
def plot_name(*names):
    fig = px.line(f_babynames[f_babynames["Name"].isin(names)],
                  x = "Year", y = "Count", color="Name",
                  title=f"Popularity for: {names}")
    fig.update_layout(font_size = 18,
                  autosize=False,
                  width=1000,
                  height=400)
    return fig
# pass the name into plot_name
plot_name("-")

In [None]:
# Find the 10 names that have decreased the most in popularity
# Answer Here

In [None]:
plot_name(*top10)

For fun, try plotting your name or your friends' names.