# Analyzing Baby Names

In [None]:
import pandas as pd
from matplotlib.pylab import plt 
%matplotlib inline
import numpy as np
from random import sample

# Read and Write data

### Read the file yob2000.txt 

In [None]:
# names= supplies the column labels, so the first line of the file is read as data.
df_2000 = pd.read_csv(
    'yob2000.txt',
    header=None,
    names=['name', 'gender', 'births'],
)
df_2000

### Print the first 10 entries

In [None]:
# Keep the first ten rows in file order.
first_10 = df_2000.iloc[:10]
first_10

### Write the data to a different file

In [None]:
# to_csv returns None when it is given a path, so df_write ends up holding None.
df_write = first_10.to_csv(path_or_buf='df_2000_first_10.txt')

# Calculate the total births

### Calculate the sum of the birth count column in the file yob2000.txt

In [None]:
# Total number of births recorded in the year-2000 file.
total_births = df_2000.births.sum()
total_births

# Separate boys / girls

### Calculate separate sums for boys and girls

In [None]:
# Sum births per gender. Selecting [['births']] keeps the result a one-column
# DataFrame and stops sum() from also "summing" the string column `name`
# (on pandas >= 2.0 that concatenates every name into one huge string).
boys_girls = df_2000.groupby('gender')[['births']].sum()
boys_girls

### Plot both sums in a bar plot

In [None]:
# One bar per gender for every numeric column of boys_girls.
boys_girls.plot(kind='bar')

# Frequent names

### Count how many names occur at least 1000 times in the file yob2000.txt

In [None]:
# count() reports the number of non-null entries per column among the rows
# with at least 1000 births.
df_2000.loc[df_2000['births'] >= 1000].count()

# Relative amount

### Create a new column containing each name's percentage of the total births in a given year

In [None]:
# Share of each row's births in the yearly total, expressed in percent.
temp = df_2000['births'].div(total_births)
df_2000['percentage'] = temp.mul(100)
df_2000

### Verify that the sum of percentages is 100%.

In [None]:
# Should come out at (approximately) 100 if the percentages are consistent.
df_2000.loc[:, 'percentage'].sum()

### Calculate the percentage of all births accounted for by the top 10 names

In [None]:
# Rank all rows by their share of births, highest first.
top_10 = df_2000.sort_values(by='percentage', ascending=False)
# Sum the `percentage` column (not the raw `births`) of the ten largest rows,
# which gives the combined percentage of the top 10 names.
top_10.head(10)['percentage'].sum()

# Search your name

### Identify and print all lines containing your name in the year 2000

In [None]:
# Index by name so that every row for a given name can be selected by label.
name_search = df_2000.set_index(keys='name')
name_search.loc['Mary']

# Bar plot

### Create a bar plot showing 5 selected names for the year 2000

In [None]:
# Draw five random rows and show their birth counts as bars.
name_plot = df_2000.sample(5)
plot_5 = name_plot.plot(kind='bar', x='name', y='births')
plot_5

# Read all names

### To read the complete dataset, we need to loop through all file names

In [None]:
# Read one file per year (1880 up to and including 2016), tag each frame with
# its year, and stack them into a single DataFrame.
years = range(1880, 2017)
columns = ['name', 'gender', 'births']
data = []

for year in years:
    yearly = pd.read_csv(f'yob{year}.txt', names=columns)
    yearly['years'] = year
    data.append(yearly)

df = pd.concat(data)
df

# Plot a time series

### Extract all rows containing your name from the variable df

In [None]:
# Index the full dataset by name and pull out every row for 'Mary'.
my_name = df.set_index(keys='name')
search_mary = my_name.loc['Mary']
search_mary

### Plot the number of babies having your name and gender over time
### Make the plot nicer by adding row/column labels and a title
### Change the color and thickness of the line

In [None]:
# Split the 'Mary' rows by gender. Columns are renamed by label (as in the
# celebrity cell) instead of by position, so this keeps working after extra
# columns have been added to df.
mary_by_gender = search_mary.set_index('gender')

male_mary = mary_by_gender.loc['M'].rename(
    columns={'births': 'Male', 'years': 'Year'})[['Male', 'Year']]

female_mary = mary_by_gender.loc['F'].rename(
    columns={'births': 'Female', 'years': 'Year'})[['Female', 'Year']]

# Inner join: keeps only the years in which both genders have an entry.
all_mary = pd.merge(male_mary, female_mary, on='Year')

ax = plt.gca()

all_mary.plot(kind = 'line', x='Year', y='Male', ax=ax, lw=4, color = 'blue')
all_mary.plot(kind = 'line', x='Year', y='Female', secondary_y=True, ax=ax, color = 'red')

# Label both y-axes; pandas exposes the secondary axis as ax.right_ax.
ax.set_xlabel('Year')
ax.set_ylabel('Male births')
ax.right_ax.set_ylabel('Female births')

plt.title('Mary 1880-2017')

### Save the plot as a high-resolution diagram

In [None]:
# With `%matplotlib inline` the figure is closed once the plotting cell has
# finished, so plt.savefig() here would write a new, empty figure. Save through
# the Axes object created in the plotting cell instead.
ax.figure.savefig('Mary.png', dpi=1200)

# Celebrities

### Plot time lines of names of celebrities
### Try actors, presidents, princesses, Star Wars, GoT, motorcycles, boot camp participants … 

In [None]:
# Select all rows for 'Cristiano' and give the plotted columns display names.
celeb_name = df.set_index(keys='name')
celebrity = celeb_name.loc['Cristiano']
cristiano = celebrity.rename(columns={'births': 'Births', 'years': 'Year'})

ax = plt.gca()
cristiano.plot.line(x='Year', y='Births', ax=ax)
ax.set_title('Cristiano')

# Normalize

### Divide the number of births by the total number of births in that year to obtain the relative frequency

In [None]:
# Yearly totals as a two-column table (years, births).
lookup_table = df.groupby('years')['births'].sum().reset_index()
# After the merge, births_x is the yearly total and births_y the per-row count.
df_new = lookup_table.merge(df, on='years')
df_new['relative_frequency'] = df_new['births_y'].div(df_new['births_x'])

### Plot the time series of your name or the celebrity names again

In [None]:
# Same selection as before, now on the normalized table.
new_celeb_name = df_new.set_index(keys='name')
celebrity = new_celeb_name.loc['Cristiano']

ax = plt.gca()
celebrity.plot.bar(x='years', y='relative_frequency', ax=ax)
ax.set_title('Cristiano_Relative_Frequency')

# Name diversity

### Have the baby names become more diverse over time?

In [None]:
# Number of distinct names recorded per year. nunique() avoids counting a name
# twice when it has both an 'M' and an 'F' row in the same year.
diversity = df.groupby('years')['name'].nunique()
diversity.plot.line()

# Long names

### Add an extra column that contains the length of the name

In [None]:
# Number of characters in each name.
df['Name_length'] = df['name'].map(len)

### Print the 10 longest names to the screen

In [None]:
# longest_10 holds the whole table ordered by name length; only the first ten
# rows are displayed.
longest_10 = df.sort_values('Name_length', ascending=False)
longest_10.iloc[:10]

# First letter statistics

### Add an extra column that contains the first letter of the name

In [None]:
# Initial character of each name.
df['first_letter'] = df['name'].map(lambda name: name[0])

### Count how many names start with ‘A’

In [None]:
# Select every row whose name starts with 'A'.
new_count = df.set_index('first_letter')
count_A = new_count.loc['A']
# Display the number of matching rows rather than the rows themselves,
# mirroring the last-letter cell.
count_A.count()

### Plot the relative occurrence of initials over time

In [None]:
ax = plt.gca()

# count_A has one row per (name, gender, year), so plotting it directly draws
# many points per year. Aggregate to one value per year and divide by the
# total births of that year to obtain the relative occurrence.
births_A_per_year = count_A.groupby('years')['births'].sum()
births_per_year = df.groupby('years')['births'].sum()
share_A = (births_A_per_year / births_per_year).rename('relative_frequency')

share_A.plot(kind='line', ax=ax)
plt.title('First Occurance of A')

# Last letter statistics

### Add an extra column that contains the last letter of the name

In [None]:
# Final character of each name.
df['last_letter'] = df['name'].map(lambda name: name[-1])

### Count how many names end with ‘A’

In [None]:
# Last letters are lowercase in the data, hence the lookup key 'a'.
new_count_last = df.set_index(keys='last_letter')
count_A_last = new_count_last.loc['a']
# Per-column count of non-null entries among the matching rows.
count_A_last.count()

### Plot the relative occurrence of last letters over time

In [None]:
ax = plt.gca()

# count_A_last has one row per (name, gender, year), so plotting it directly
# draws many points per year. Aggregate to one value per year and divide by
# the total births of that year to obtain the relative occurrence.
births_a_last_per_year = count_A_last.groupby('years')['births'].sum()
births_per_year = df.groupby('years')['births'].sum()
share_a_last = (births_a_last_per_year / births_per_year).rename('relative_frequency')

share_a_last.plot(kind='line', ax=ax)
plt.title('Last Occurance of A')

### Separate by boys/girls

In [None]:
ax = plt.gca()

# Rows for names ending in 'a', split by gender.
no_of_males = count_A_last.set_index('gender').loc['M']
no_of_females = count_A_last.set_index('gender').loc['F']

# Sum births per year so that each gender is drawn as a single time series,
# and label the lines so the legend tells them apart.
males_per_year = no_of_males.groupby('years')['births'].sum()
females_per_year = no_of_females.groupby('years')['births'].sum()

males_per_year.plot(kind='line', ax=ax, color='black', label='M', legend=True)
females_per_year.plot(kind='line', ax=ax, label='F', legend=True)

# e-rich Names

### Find all names that contain the character ‘e’ at least four times.

In [None]:
# Occurrences of lowercase 'e' in each name (an uppercase 'E' is not matched).
df['e_count'] = df['name'].str.count('e')

most_e = df.sort_values('e_count', ascending=False)
e_name = df.loc[df['e_count'] >= 4]

# Only the last expression of a cell is displayed, so this count is not shown.
e_name.count()
e_name