In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.core.display import display
% matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

sns.set()
sns.set_context('talk')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 8)
pd.set_option('precision', 2)

def display_df(df, rows=pd.options.display.max_rows,
               cols=pd.options.display.max_columns):
	"""Render ``df`` with temporary row/column display limits.

	The default limits are read from the pandas options once, at the moment
	this function is defined. The global options are restored as soon as the
	object has been displayed.

	Parameters
	----------
	df : object
		Object handed to ``display``.
	rows :
		Value applied to ``display.max_rows`` while rendering.
	cols :
		Value applied to ``display.max_columns`` while rendering.
	"""
	temporary_options = ('display.max_rows', rows, 'display.max_columns', cols)
	with pd.option_context(*temporary_options):
		display(df)


Data Acquisition and Data Cleaning

In [None]:
import pandas as pd

# Read the class roster from the CSV file in the working directory.
roster_path = 'roster.csv'
students = pd.read_csv(roster_path)
students

We see that some entries differ only in features that we are not
interested in, e.g., capitalization. We can convert everything to lower case.


In [None]:
# Normalise capitalisation so that names differing only in case compare equal.
lowercased_names = students['Name'].str.lower()
students['Name'] = lowercased_names
students


We are going to explore our data.

In [None]:
# Show the roster again as the starting point for the exploration.
students

In [None]:
# One row per roster entry, so the row count is the number of students.
roster_size = len(students)
print("There are", roster_size, "students on the roster.")

To understand the meaning of the fields, look at the values they take, e.g., the counts of each value in the `Role` column:

In [None]:
# Tally how often each distinct role occurs and show the result as a table.
role_counts = students['Role'].value_counts()
role_counts.to_frame()

How long are the names?

In [None]:
# Distribution of the number of characters per name, with a rug mark for
# every individual name.
name_lengths = students['Name'].str.len()
length_bins = np.arange(12)
sns.distplot(name_lengths, rug=True, bins=length_bins,
             axlabel="Number of Characters")
plt.xlim(0, 12)
plt.xticks(length_bins)
plt.ylabel('Proportion per character');

What is a specific name in the list?

In [None]:
# Name stored under row label 5 of the roster.
students.loc[5, 'Name']

The text asks these questions...

1. "Do the first names of students in Data 100 tell us the distribution of sex in the class?"
1. "Do the first names of students in Data 100 tell us the distribution of ages in the class?"

And it follows the steps below to answer those questions.

In [None]:
import os.path
import urllib.request
import zipfile

data_url = "https://www.ssa.gov/oact/babynames/names.zip"
local_filename = "babynames.zip"

if not os.path.exists(local_filename):  # if the data exists don't download again
	# Write to a temporary name and rename only after the transfer has
	# finished. An interrupted download then never leaves a truncated archive
	# under `local_filename` that the existence check would accept next time.
	partial_filename = local_filename + ".part"
	with urllib.request.urlopen(data_url) as resp, \
			open(partial_filename, 'wb') as out:
		out.write(resp.read())
	os.replace(partial_filename, local_filename)


def extract_year_from_filename(fn):
	"""Return the year encoded in characters 3-6 of an archive member name.

	Assumes names with a three-character prefix followed by a four-digit
	year (presumably ``yobYYYY.txt`` -- confirm against the archive listing).
	"""
	return int(fn[3:7])


# Collect one frame per data file, then concatenate once at the end. The list
# has its own name so `babynames` only ever refers to the final DataFrame.
yearly_frames = []
with zipfile.ZipFile(local_filename, "r") as zf:
	data_files = [member for member in zf.filelist
	              if member.filename.endswith(".txt")]
	for member in data_files:
		year = extract_year_from_filename(member.filename)
		with zf.open(member) as fp:
			# Column names are supplied here, so the first line of each file
			# is read as data rather than as a header.
			df = pd.read_csv(fp, names=["Name", "Sex", "Count"])
			df["Year"] = year
			yearly_frames.append(df)
babynames = pd.concat(yearly_frames)
babynames

In [None]:
# NOTE(review): this cell repeats the previous cell line for line, so running
# the notebook top to bottom parses the whole archive twice. Consider deleting
# one of the two copies.
import urllib.request
import os.path

data_url = "https://www.ssa.gov/oact/babynames/names.zip"
local_filename = "babynames.zip"
if not os.path.exists(
		local_filename):  # if the data exists don't download again
	with urllib.request.urlopen(data_url) as resp, open(local_filename,
	                                                    'wb') as f:
		f.write(resp.read())

import zipfile

# `babynames` starts as a list of per-file frames and is rebound to the
# concatenated DataFrame at the end of the cell.
babynames = []
with zipfile.ZipFile(local_filename, "r") as zf:
	# Keep only the archive members whose names end in "txt".
	data_files = [f for f in zf.filelist if f.filename[-3:] == "txt"]

	def extract_year_from_filename(fn):
		# Characters 3-6 of the member name are parsed as the year.
		return int(fn[3:7])

	for f in data_files:
		year = extract_year_from_filename(f.filename)
		with zf.open(f) as fp:
			# Column names are supplied here, so the first line of each file
			# is read as data rather than as a header.
			df = pd.read_csv(fp, names=["Name", "Sex", "Count"])
			df["Year"] = year
			babynames.append(df)
# Stack the per-file frames into a single table.
babynames = pd.concat(babynames)
babynames

As the book explains, "it looks like the dataset contains names, the
sex given to the baby, the number of babies with that name, and the
year of birth for those babies. To be sure, we check the dataset description from the SSN Office ([https://www.ssa.gov/oact/babynames/background.html](https://www.ssa.gov/oact/babynames/background.html)). "

They begin by plotting the number of male and female babies born each
year:

In [None]:
# Sum of `Count` per year, one column per sex. The string alias 'sum' is the
# same aggregation as np.sum, matches the other pivot table in this notebook,
# and avoids the warning newer pandas emits for NumPy callables.
pivot_year_name_count = pd.pivot_table(babynames, index='Year',
                                       columns='Sex', values='Count',
                                       aggfunc='sum')

pink_blue = ["#E188DB", "#334FFF"]
# sns.color_palette already returns a palette usable as a context manager, so
# a single call is enough.
with sns.color_palette(pink_blue):
	pivot_year_name_count.plot(marker=".")
	plt.title("Registered Names vs Year Stratified by Sex")
	plt.ylabel('Names Registered that Year')

In [None]:
# Lower-case the names so they can be matched against the lower-cased roster.
lowercased_babynames = babynames['Name'].str.lower()
babynames['Name'] = lowercased_babynames
babynames

Then, we count up how many male and female babies were born in total for each name:

In [None]:
# Total count per name and sex across all years. `margins=True` adds an 'All'
# row and an 'All' column holding the totals.
sex_counts = babynames.pivot_table(
	index='Name',
	columns='Sex',
	values='Count',
	aggfunc='sum',
	fill_value=0.,
	margins=True,
)
sex_counts

"To determine whether a name is more popular for male or female babies, we can compute the proportion of times the name was given to a female baby."

In [None]:
# Share of each name's total count that was recorded as female.
female_counts = sex_counts['F']
total_counts = sex_counts['All']
prop_female = female_counts / total_counts
sex_counts['prop_female'] = prop_female
sex_counts

Using the proportion of female babies for a name, we can infer the sex most often associated with that name:

In [None]:
def sex_from_name(name):
	"""Guess the sex most often associated with a first name.

	Looks ``name`` up in ``sex_counts`` and returns ``'F'`` when more than
	half of the recorded babies with that name were female, ``'M'``
	otherwise, and ``'Name not in dataset'`` when the name is absent.

	String input is lower-cased before the lookup, matching the lower-cased
	names used as the index of ``sex_counts``. This also keeps the ``'All'``
	margin row of the pivot table from being treated as a name.
	"""
	key = name.lower() if isinstance(name, str) else name
	if key not in sex_counts.index:
		return 'Name not in dataset'
	prop = sex_counts.loc[key, 'prop_female']
	return 'F' if prop > 0.5 else 'M'

# Example lookup for a single name.
sex_from_name('sam')

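To estimate age from a name, compute the average birth year for each name, weighted by the number of babies given that name each year: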

In [None]:
def avg_year(group):
	"""Return the mean of ``group['Year']`` weighted by ``group['Count']``."""
	years = group['Year']
	counts = group['Count']
	return np.average(years, weights=counts)

# One weighted-average year per name, stored as a single-column frame.
rows_by_name = babynames.groupby('Name')
avg_year_by_name = rows_by_name.apply(avg_year)
avg_years = avg_year_by_name.rename('avg_year').to_frame()
avg_years

As previously mentioned, the authors develop a function to determine
 the average birth year for a given name.

In [None]:
def year_from_name(name):
	"""Return the average birth year recorded for ``name``, or ``None``.

	String input is lower-cased before the lookup so that free-text input
	such as ``'Fernando'`` matches the lower-cased index of ``avg_years``.
	Non-string input (for example a missing value) is looked up unchanged,
	and ``None`` is returned whenever the key is not in the index.
	"""
	key = name.lower() if isinstance(name, str) else name
	if key in avg_years.index:
		return avg_years.loc[key, 'avg_year']
	return None

# Interactive lookup: the widget calls year_from_name with the entered name,
# starting from the default 'fernando'.
interact(year_from_name, name='fernando');

Then mark each name in Data 100 with its inferred birth year.

In [None]:
# Look up the average birth year for every student name. Names that are not
# in `avg_years` yield None from year_from_name.
inferred_years = students['Name'].apply(year_from_name)
students['year'] = inferred_years
students

Plot the distribution of the inferred birth years:

In [None]:
# Drop students without an inferred year before plotting the distribution.
known_years = students['year'].dropna()
sns.distplot(known_years);

The average year:

In [None]:
# Mean inferred birth year across the roster.
students['year'].mean()

Based on this average inferred birth year, the class would have an average age of about 35 years.

In [None]:
# Index by name and order by year so that one name's history can be selected
# and plotted chronologically.
names = babynames.set_index('Name').sort_values('Year')
john = names.loc['john']
is_male = john['Sex'] == 'M'
john[is_male].plot(x='Year', y='Count')
plt.title('Frequency of "John"');

As the text explains, "It appears that the average birth year does not
provide an accurate estimate for a given person's age in general. In a few cases, however, a person's first name is quite revealing!"

In [None]:
# Same lookup as for "John": rows for one name, restricted to male records,
# plotted as count against year.
names = babynames.set_index('Name').sort_values('Year')
kanye = names.loc['kanye']
male_rows = kanye['Sex'] == 'M'
kanye.loc[male_rows].plot(x='Year', y='Count')
plt.title('Frequency of "Kanye"');