# Baby Names Analysis - Tell a Story

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
%matplotlib inline

### Load Data

In [None]:
# Forward slashes resolve on Windows, macOS and Linux alike; the previous
# backslash path was only valid on Windows.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
baby_names = pd.read_pickle('./DATA/names/yob_all.pkl')

In [None]:
baby_names.head(2)

### Total Population

In [None]:
baby_names.groupby('sex').births.sum()

In [None]:
# Vectorised Series.sum() instead of the built-in sum(), which walks the
# column one Python object at a time.
baby_names.births.sum()

### Exploring the Data
* Total Births by sex and year

In [None]:
baby_names.groupby(['year','sex']).births.sum()

In [None]:
# Total births per year (rows) split by sex (columns).
# The string alias 'sum' is used instead of np.sum: newer pandas releases emit
# a FutureWarning when a NumPy/built-in reducer is passed as aggfunc.
birthdate = baby_names.pivot_table(values='births', index=['year'],columns=['sex'], aggfunc='sum')
birthdate.head(10)

In [None]:
birthdate.plot(title='Total Births by sex and year')

## Adding Proportion

*  The proportion is the number of births for a name divided by the total births for that year and sex.

In [None]:
#find out what proportion of the baby population for each year is represented by each name.
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group, so for a non-zero total the values add up to 1.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain a numeric ``births`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows plus ``prop``. The frame passed in is
        left unmodified.
    """
    # Cast to float so the ratio is a true division even where "/" on
    # integers would floor.
    births = group.births.astype(float)
    # assign() builds a new frame instead of writing into the slice that
    # groupby hands to this function.
    return group.assign(prop=births / births.sum())


# Share of each (year, sex) total contributed by every row.
# transform('sum') broadcasts the group total back onto the original rows, so
# the frame keeps its index, its row order and its 'year'/'sex' columns.
# groupby(...).apply(add_prop) computes the same ratio, but the layout of its
# result depends on the pandas version (newer releases prepend the group keys
# to the index), which makes later groupby(['year','sex']) calls ambiguous.
baby_names = baby_names.assign(
    prop=baby_names.births.astype(float)
    / baby_names.groupby(['year','sex']).births.transform('sum')
)

baby_names.head()

In [None]:
# Total births for females in 1880, using the vectorised Series.sum() rather
# than the built-in sum().
baby_names.births[(baby_names.year == 1880) & (baby_names.sex == 'F')].sum()

In [None]:
# Hand check of one prop value. NOTE(review): 90993 is presumably the 1880
# female total from the previous cell and 7065 the births of a single name -
# confirm against the outputs above.
7065 / 90993

In [None]:
baby_names.groupby(['year','sex']).prop.sum()

In [None]:
baby_names.sort_values(by='prop',ascending=False)

### Subsetting Top 1000 Names

In [None]:
# subset top 1000 births

def get_top1000(group, n=1000):
    """Return the ``n`` rows of ``group`` with the largest ``prop`` values.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame containing a ``prop`` column.
    n : int, optional
        Maximum number of rows to keep; defaults to 1000.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``prop`` in descending order and truncated to ``n``
        rows (every row when the group holds fewer than ``n``).
    """
    return group.sort_values(by='prop', ascending=False)[:n]

# Build the per-group top-1000 slices explicitly and concatenate them.
# ignore_index=True yields a flat integer index. groupby(...).apply(get_top1000)
# would instead return a (year, sex, row) MultiIndex whose 'year'/'sex' levels
# collide with the columns of the same name, and the later groupby/pivot_table
# calls on those keys then raise an "ambiguous" ValueError.
top1000 = pd.concat(
    [get_top1000(group) for _, group in baby_names.groupby(['year','sex'])],
    ignore_index=True,
)

top1000.head(3)


In [None]:
top1000.groupby(['year','sex']).count()

In [None]:
baby_names[(baby_names.year == 1880) & (baby_names.sex == 'F')].count()

### Which spelling is more popular?

In [None]:
# Births per year (rows) for every name (columns).
# The string alias 'sum' replaces the built-in sum, which newer pandas
# releases flag with a FutureWarning when passed as aggfunc.
birthrate = baby_names.pivot_table('births',index='year',columns='name',aggfunc='sum')
birthrate.head()

In [None]:
myNames = birthrate[['Holly','Hollie']]
myNames

In [None]:
myNames.plot(title='Which spelling is more popular?')

In [None]:
myNames.plot(subplots=True,title='Which spelling is more popular?')

### analyze naming trends

In [None]:
# Births per year (rows) for each name in the top-1000 subset (columns).
# 'sum' is passed as a string alias; np.sum as aggfunc triggers a
# FutureWarning on newer pandas releases.
total_births = pd.pivot_table(top1000, values='births', index=['year'],columns=['name'], aggfunc='sum')
total_births

In [None]:
#names growing out of favor? 
subset = total_births[['John','Harry','Mary','Marilyn']]

subset.plot(subplots=True, figsize=(12,10),grid=False,title='Number of births per year')

# Name Diversity

* the drop in births for certain names has something to do with name **diversity**:
    * what parents choose to name their child.
    * fewer parents choosing common names for children
    * The trend changes from the 1950s onwards.

In [None]:
# exploring increases in naming diversity:
# share of all births covered by the top-1000 names, per year and sex.
# 'sum' (string alias) avoids the FutureWarning raised for aggfunc=np.sum.
table = top1000.pivot_table(values='prop', index=['year'],columns=['sex'], aggfunc='sum')
table.head(10)

In [None]:
# The plotted values come from the top1000 frame; the title previously named a
# non-existent "table1000".
table.plot(title='Sum of top1000.prop by year and sex', yticks=np.linspace(0,1.2,13), xticks=range(1880,2020,10))

#### Story:
* The total proportion starts at 1.0 in 1880, then slowly drops from 1960 for females and from 1970 for males.
* The proportion of births accounted for by the top 1000 names has declined to about 74% for females and 85% for males by 2010.
* **That means the share of births for other names outside of the top 1000 has risen**. 
* More parents are choosing different, more uncommon names to call their newborns.

### top 50% of names covered with 25 names

In [None]:
# Rows of top1000 where sex is 'F'.
girls = top1000[top1000.sex == 'F']
# NOTE: a notebook cell only renders its last expression, so this preview is
# evaluated but never displayed.
girls.head()

# Rows of top1000 where sex is 'M'.
boys = top1000[top1000.sex == 'M']
boys.head(2)

In [None]:
# Take the boys' top-1000 rows for 2010; the following cells accumulate their
# prop values to see how many names are needed to cover half of the births.

df = boys[boys.year==2010]
df.head()

In [None]:
prop_cumsum = df.sort_values(by='prop',ascending=False).prop.cumsum()
prop_cumsum[:10]

In [None]:
prop_cumsum.searchsorted(0.5)

#### Story:
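* `searchsorted` returns index 116, the position where the cumulative proportion reaches 0.5 (50%)
* the index is zero-based, so **117** names cover the top 50% of births in 2010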

In [None]:
# Repeat the cumulative-share calculation on the boys' rows for the year 1900.
df1900 = boys[boys.year==1900]
prop1900 = df1900.sort_values(by='prop',ascending=False).prop.cumsum()
# +1 turns the zero-based insertion position into a count of names.
prop1900.searchsorted(0.5)+1


#### Story:
* in 1900, top 50% of names covered with **25 names**
* so there is a large increase in name diversity

In [None]:
def get_quantile(group, q=0.5):
    """Count how many of the largest ``prop`` rows are needed to reach ``q``.

    The rows are ordered by ``prop`` from largest to smallest and accumulated;
    the result is the 1-based position of the first row whose running total is
    at least ``q``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame containing a numeric ``prop`` column.
    q : float, optional
        Cumulative share to reach; defaults to 0.5.

    Returns
    -------
    int
        Number of rows required to reach the cumulative share ``q``.
    """
    cumulative = group.sort_values(by='prop', ascending=False).prop.cumsum()

    # Search the underlying NumPy array: ndarray.searchsorted returns a scalar
    # for a scalar q. Series.searchsorted returned a one-element array in old
    # pandas and a scalar in current releases, so indexing its result with [0]
    # is not portable.
    return int(cumulative.values.searchsorted(q)) + 1

diversity = top1000.groupby(['year','sex']).apply(get_quantile)

diversity


In [None]:
diversity = diversity.unstack('sex')
diversity

In [None]:
diversity.plot(title='Number of popular names in top 50%')


## More Analysis - last letter of a name

* Male and Female Baby Name Last Letter Proportions

In [None]:
# small helper used with Series.map below
def get_last_letter(x):
    """Return the final character of the string ``x``."""
    return x[-1]

baby_names['last_letter'] = baby_names.name.map(get_last_letter)
baby_names.head()

In [None]:
# create pivot table with last_letter in rows and (sex, year) in columns;
# 'sum' is passed as a string alias because the built-in sum as aggfunc
# triggers a FutureWarning on newer pandas releases.

table = baby_names.pivot_table(values='births', index=['last_letter'], columns=['sex', 'year'], aggfunc='sum')
table.head()


In [None]:
# subset only certain years
   
subtable = table.reindex(columns=[1910,1960,2010], level='year')
subtable.head()


In [None]:
#create proportion out of total in each year and sex

subtable.sum()


In [None]:
letter_prop = subtable / subtable.sum().astype(float)
letter_prop

In [None]:
# One bar chart per sex, stacked vertically in a single figure.
# matplotlib.pyplot is already imported as plt in the notebook's first cell,
# so it is not re-imported here.
fig, axes = plt.subplots(2, 1, figsize=(10,8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')

# The legend is drawn once on the upper panel only.
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)


### Story
* From the three selected years 1910, 1960, and 2010, we see different distributions in last letters between male and female.
* For females, 'a', 'e', 'y', and 'n' were mostly popular throughout the years.
* For Males, 'n', 'e' , and 'y' were mostly popular throughout the years.
* **Hmm:** we do see an unusual surge in male names ending in 'n' in 2010.
* Male last letters were more **evenly distributed** across the alphabet, compared to female last letters.

### Select Letters Throughout the Years

In [None]:
table.head() 

In [None]:
# normalize by year and sex
letter_prop = table/table.sum().astype(float)

letter_prop.head()

In [None]:
# subset last letters of boy names
# .loc replaces .ix, which was removed in pandas 1.0; the selection is
# label-based on both axes, so the result is unchanged.
dny_ts = letter_prop.loc[['d','n','y'],'M'].T

dny_ts.head()

In [None]:
dny_ts.plot(title='Selected last letters of male baby names')
# last letter female names
# .loc replaces .ix, which was removed in pandas 1.0.
lny_ts = letter_prop.loc[['l','n','y'],'F'].T
lny_ts.plot(title='Selected last letters of female baby names')

### Name Flipping

In [None]:
# boy names that became girl names and reverse

all_names = top1000.name.unique()
all_names.shape


In [None]:
mask = np.array(['lesl' in x.lower() for x in all_names])
lesley_like = all_names[mask]

lesley_like

In [None]:
filtered = top1000[top1000.name.isin(lesley_like)]
filtered.tail()

In [None]:
filtered.groupby('name').births.sum()


In [None]:
# Births of the Lesley-like names per year, split by sex.
# 'sum' (string alias) avoids the FutureWarning newer pandas raises when the
# built-in sum is passed as aggfunc.
table = filtered.pivot_table(values='births', index=['year'], columns=['sex'], aggfunc='sum')
table



In [None]:
#table = table.div(table.sum(1), axis=0)
#table.tail(1000)


In [None]:

table.plot(style={'M': 'k-', 'F': 'k--'})