In [1]:
%matplotlib notebook
import pandas as pd
from matplotlib import pyplot as pp
import seaborn as sns
import numpy as np
import json
from collections import defaultdict
from collections import Counter

# 3) US Baby Names 1880–2017

Podatki: <https://www.ssa.gov/oact/babynames/limits.html>.

In [2]:
# Peek at the first 10 raw lines of the 1880 file to see its layout before parsing.
!head -n 10 data/DATA_03_names/yob1880.txt

Mary,F,7065
Anna,F,2604
Emma,F,2003
Elizabeth,F,1939
Minnie,F,1746
Margaret,F,1578
Ida,F,1472
Alice,F,1414
Bertha,F,1320
Sarah,F,1288


In [4]:
# The raw file has no header row, so the column labels are supplied here.
names1880=pd.read_csv(
    'data/DATA_03_names/yob1880.txt',
    names=['ime','spol','stevilo'],
)
names1880.head()

Unnamed: 0,ime,spol,stevilo
0,Mary,F,7065
1,Anna,F,2604
2,Emma,F,2003
3,Elizabeth,F,1939
4,Minnie,F,1746


In [6]:
# Same layout as the 1880 file: no header row, labels supplied explicitly.
names1881=pd.read_csv(
    'data/DATA_03_names/yob1881.txt',
    names=['ime','spol','stevilo'],
)
names1881.head()

Unnamed: 0,ime,spol,stevilo
0,Mary,F,6919
1,Anna,F,2698
2,Emma,F,2034
3,Elizabeth,F,1852
4,Margaret,F,1658


In [14]:
# Inclusive span of birth years to load (one yobYYYY.txt file per year).
first_year,last_year=1880,2017
years=range(first_year,last_year+1)
# Column labels applied to every per-year file.
columns=['ime','spol','stevilo']
# Container for the per-year DataFrames.
pieces=[]

In [15]:
# Read one file per year and tag every row with its year in a 'leto' column.
# The list is rebuilt from scratch in this cell, so re-running the cell gives
# the same result instead of appending a second copy of every year to an
# already-filled list.
pieces=[
    pd.read_csv('data/DATA_03_names/yob'+str(year)+'.txt',names=columns).assign(leto=year)
    for year in years
]


In [16]:
# Sanity check: number of per-year frames collected.
len(pieces)

138

In [17]:
# Row count of the first frame in the list.
len(pieces[0])

2000

In [18]:
# Preview the first frame, including the 'leto' column added while loading.
pieces[0].head()

Unnamed: 0,ime,spol,stevilo,leto
0,Mary,F,7065,1880
1,Anna,F,2604,1880
2,Emma,F,2003,1880
3,Elizabeth,F,1939,1880
4,Minnie,F,1746,1880


In [19]:
# Stack all per-year frames row-wise into one frame with a fresh 0..N-1 index.
names=pd.concat(objs=pieces,axis=0,ignore_index=True)

In [21]:
# Summary of the combined frame: row count, column dtypes, memory usage.
names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1924665 entries, 0 to 1924664
Data columns (total 4 columns):
ime        object
spol       object
stevilo    int64
leto       int64
dtypes: int64(2), object(2)
memory usage: 58.7+ MB


In [25]:
# Sum of 'stevilo' per year (rows) and sex (columns).
total_births=names.pivot_table(
    values='stevilo',
    index='leto',
    columns='spol',
    aggfunc='sum',
)
total_births.head()

spol,F,M
leto,Unnamed: 1_level_1,Unnamed: 2_level_1
1880,90993,110491
1881,91953,100743
1882,107847,113686
1883,112319,104627
1884,129020,114442


In [28]:
# Line plot of the per-year totals, one line per column of total_births.
total_births.plot(title='Stevilo rojstev')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f0125b95ef0>

In [29]:
def add_prob(group):
    """Return a copy of ``group`` with a ``prob`` column added.

    ``prob`` is each row's ``stevilo`` divided by the sum of ``stevilo``
    over all rows of ``group``, i.e. the row's share of the group total.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``stevilo`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``group`` plus the
        ``prob`` column (an existing ``prob`` column is overwritten in the
        copy). The frame passed in is not modified.
    """
    # assign() builds a new frame rather than writing into `group`; mutating
    # the object handed to a groupby.apply callback is discouraged by pandas
    # and would also silently alter a frame passed in directly.
    return group.assign(prob=group['stevilo']/group['stevilo'].sum())


In [30]:
# Share of each name within its (leto, spol) group.
# transform('sum') returns the group total aligned to every original row, so
# the result keeps the flat row index and the 'leto'/'spol' columns. A
# transform-like groupby.apply on pandas >= 2.0 instead prepends the group
# keys to the index, which makes a later groupby on those columns ambiguous.
group_totals=names.groupby(['leto','spol'])['stevilo'].transform('sum')
names=names.assign(prob=names['stevilo']/group_totals)

In [31]:
# Preview the frame to check the 'prob' column.
names.head()

Unnamed: 0,ime,spol,stevilo,leto,prob
0,Mary,F,7065,1880,0.077643
1,Anna,F,2604,1880,0.028618
2,Emma,F,2003,1880,0.022013
3,Elizabeth,F,1939,1880,0.021309
4,Minnie,F,1746,1880,0.019188


In [35]:
def get_top(group,n=1000):
    """Return the first ``n`` rows of ``group`` ordered by descending ``stevilo``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a ``stevilo`` column to rank by.
    n : int, default 1000
        Upper bound of the row slice taken from the sorted frame.

    Returns
    -------
    pandas.DataFrame
        The sorted rows, truncated to ``n``; original index labels are kept.
    """
    by_count_desc=group.sort_values(by='stevilo',ascending=False)
    # Plain slice: yields every row when the group has fewer than n rows.
    return by_count_desc[:n]

In [38]:
# Group rows by year and sex; reused below to pick the top names per group.
group_cols=['leto','spol']
grouped=names.groupby(by=group_cols)

In [43]:
# Keep the highest-count rows of every (leto, spol) group using get_top's
# default n, then discard the index produced by apply in favour of 0..N-1.
topN=grouped.apply(get_top).reset_index(drop=True)
topN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275877 entries, 0 to 275876
Data columns (total 5 columns):
ime        275877 non-null object
spol       275877 non-null object
stevilo    275877 non-null int64
leto       275877 non-null int64
prob       275877 non-null float64
dtypes: float64(1), int64(2), object(2)
memory usage: 10.5+ MB


In [45]:
# Split the top-N table by the value of the 'spol' column.
is_male=topN['spol'].eq('M')
is_female=topN['spol'].eq('F')
boys=topN.loc[is_male]
girls=topN.loc[is_female]
boys.head()

Unnamed: 0,ime,spol,stevilo,leto,prob
942,John,M,9655,1880,0.087383
943,William,M,9532,1880,0.086269
944,James,M,5927,1880,0.053642
945,Charles,M,5348,1880,0.048402
946,George,M,5126,1880,0.046393


In [48]:
# Sum of 'stevilo' per year (rows) and name (columns); a name absent from the
# top-N table in a given year shows up as NaN.
# NOTE(review): this rebinds `total_births`, which earlier held the per-sex
# totals built from `names` - consider a distinct name if both are needed.
total_births=topN.pivot_table(
    values='stevilo',
    index='leto',
    columns='ime',
    aggfunc='sum',
)
total_births.head()

ime,Aaden,Aadhya,Aaliyah,Aanya,Aarav,Aaron,Aarush,Ab,Abagail,Abb,...,Zoe,Zoey,Zoie,Zola,Zollie,Zona,Zora,Zula,Zuri,Zyaire
leto,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1880,,,,,,102.0,,,,,...,23.0,,,7.0,,8.0,28.0,27.0,,
1881,,,,,,94.0,,,,,...,22.0,,,10.0,,9.0,21.0,27.0,,
1882,,,,,,85.0,,,,,...,25.0,,,9.0,,17.0,32.0,21.0,,
1883,,,,,,105.0,,,,,...,23.0,,,10.0,,11.0,35.0,25.0,,
1884,,,,,,97.0,,,,,...,31.0,,,14.0,6.0,8.0,58.0,27.0,,


In [55]:
# Columns (names) to compare over time.
selected_names=['John','Harry','Mary','Khaleesi','Jon','Arya']
subset=total_births[selected_names]

In [56]:
# One stacked subplot per selected name, all sharing a single figure.
subset.plot(
    subplots=True,
    figsize=[12,16],
    grid=False,
    title='Number of births per year',
)

<IPython.core.display.Javascript object>

array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f0123cea400>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f0123d16400>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f0123cc0518>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f0123c26940>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f0123c51da0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f0123bff240>],
      dtype=object)

In [59]:
# Total 'prob' covered by the top-N rows, per year (rows) and sex (columns).
table=topN.pivot_table(
    values='prob',
    index='leto',
    columns='spol',
    aggfunc='sum',
)

In [60]:
# Preview the summed 'prob' per year and sex.
table.head()

spol,F,M
leto,Unnamed: 1_level_1,Unnamed: 2_level_1
1880,1.0,0.997375
1881,1.0,1.0
1882,0.998702,0.995646
1883,0.997596,0.998566
1884,0.993156,0.994539


In [62]:
# Plot the summed 'prob' per year for each sex, with the y-axis limited to 0.7-1.
table.plot(ylim=[0.7,1])

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f01249bc710>