## Introduction To The Dataset

In [1]:
# Read the whole CSV as text. The context manager closes the file handle
# as soon as the contents are in memory instead of leaving it open for the
# lifetime of the kernel.
with open("US_births_1994-2003_CDC_NCHS.csv", "r") as file:
    data = file.read()

# One string per line of the file; element 0 is the header line.
rows = data.split("\n")

In [2]:
# Preview the header line and the first nine data rows.
rows[:10]

['year,month,date_of_month,day_of_week,births',
 '1994,1,1,6,8096',
 '1994,1,2,7,7772',
 '1994,1,3,1,10142',
 '1994,1,4,2,11248',
 '1994,1,5,3,11053',
 '1994,1,6,4,11406',
 '1994,1,7,5,11251',
 '1994,1,8,6,8653',
 '1994,1,9,7,7910']

## Converting Data Into A List Of Lists

In [3]:
def read_csv(i_csv):
    """Load a headered, all-integer CSV file into a list of lists.

    Args:
        i_csv: Path of the CSV file to read.

    Returns:
        One list of ints per data row, in file order. The first line of the
        file is treated as a header and dropped.

    Raises:
        ValueError: If a data field cannot be converted with ``int()``.
    """
    # The context manager closes the file even if reading fails.
    with open(i_csv, "r") as file:
        lines = file.read().split("\n")

    final_list = []
    for line in lines[1:]:
        # A file that ends with a newline yields an empty last element;
        # that is not a data row, so skip it rather than fail in int("").
        if not line.strip():
            continue
        final_list.append([int(field) for field in line.split(",")])

    return final_list

In [4]:
# Parse the CDC file into a list of integer rows used by the analyses below.
cdc_list = read_csv("US_births_1994-2003_CDC_NCHS.csv")

In [5]:
# Preview the first ten parsed rows.
cdc_list[:10]

[[1994, 1, 1, 6, 8096],
 [1994, 1, 2, 7, 7772],
 [1994, 1, 3, 1, 10142],
 [1994, 1, 4, 2, 11248],
 [1994, 1, 5, 3, 11053],
 [1994, 1, 6, 4, 11406],
 [1994, 1, 7, 5, 11251],
 [1994, 1, 8, 6, 8653],
 [1994, 1, 9, 7, 7910],
 [1994, 1, 10, 1, 10498]]

## Calculating Number Of Births Each Month

In [6]:
def month_births(i_list):
    """Total the births recorded for each month.

    Rows are read positionally: element 1 is the grouping key (month) and
    element 4 is the birth count added to that key's running total.

    Args:
        i_list: Iterable of row sequences with at least five elements.

    Returns:
        dict mapping each month value to the sum of its birth counts, with
        keys in order of first appearance.
    """
    totals = {}

    for row in i_list:
        month_key, count = row[1], row[4]
        try:
            totals[month_key] += count
        except KeyError:
            # First row seen for this month starts its running total.
            totals[month_key] = count

    return totals

In [7]:
# Aggregate the parsed CDC rows into total births per month.
cdc_month_births = month_births(cdc_list)

In [8]:
# Display the month -> total births mapping.
cdc_month_births

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

## Calculating Number Of Births Each Day Of Week

In [9]:
def dow_births(i_list):
    """Total the births recorded for each day of the week.

    Rows are read positionally: element 3 is the grouping key (day of week)
    and element 4 is the birth count added to that key's running total.

    Args:
        i_list: Iterable of row sequences with at least five elements.

    Returns:
        dict mapping each day-of-week value to the sum of its birth counts,
        with keys in order of first appearance.
    """
    totals = {}

    for row in i_list:
        weekday, count = row[3], row[4]

        if weekday not in totals:
            # First row seen for this weekday starts its running total.
            totals[weekday] = count
            continue

        totals[weekday] += count

    return totals

In [10]:
# Aggregate the parsed CDC rows into total births per day of week.
cdc_day_births = dow_births(cdc_list)

In [11]:
# Display the day-of-week -> total births mapping.
cdc_day_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

## Creating A More General Function

In [12]:
def calc_counts(i_list, i_column, i_value_column=4):
    """Sum one column of the rows, grouped by the values of another column.

    Args:
        i_list: Iterable of row sequences (for this notebook:
            [year, month, date_of_month, day_of_week, births]).
        i_column: Index of the column whose values become the dictionary
            keys (0 = year, 1 = month, 2 = day of month, 3 = day of week).
        i_value_column: Index of the column to total. Defaults to 4, the
            births column, so existing two-argument calls are unchanged.

    Returns:
        dict mapping each distinct value found in ``i_column`` to the sum of
        ``i_value_column`` over the rows carrying that value, with keys in
        order of first appearance.
    """
    births_per_column = {}

    for row in i_list:
        key = row[i_column]
        # dict.get supplies 0 for a key seen for the first time.
        births_per_column[key] = births_per_column.get(key, 0) + row[i_value_column]

    return births_per_column

In [13]:
# Columns 0-3 are year, month, day of month and day of week, in that order,
# so one pass over the column indexes fills all four result dictionaries.
cdc_year_births, cdc_month_births, cdc_dom_births, cdc_dow_births = (
    calc_counts(cdc_list, column) for column in range(4)
)

In [14]:
# Display total births per year.
cdc_year_births

{1994: 3952767,
 1995: 3899589,
 1996: 3891494,
 1997: 3880894,
 1998: 3941553,
 1999: 3959417,
 2000: 4058814,
 2001: 4025933,
 2002: 4021726,
 2003: 4089950}

In [15]:
# Display total births per month.
cdc_month_births

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

In [16]:
# Display total births per day of month.
cdc_dom_births

{1: 1276557,
 2: 1288739,
 3: 1304499,
 4: 1288154,
 5: 1299953,
 6: 1304474,
 7: 1310459,
 8: 1312297,
 9: 1303292,
 10: 1320764,
 11: 1314361,
 12: 1318437,
 13: 1277684,
 14: 1320153,
 15: 1319171,
 16: 1315192,
 17: 1324953,
 18: 1326855,
 19: 1318727,
 20: 1324821,
 21: 1322897,
 22: 1317381,
 23: 1293290,
 24: 1288083,
 25: 1272116,
 26: 1284796,
 27: 1294395,
 28: 1307685,
 29: 1223161,
 30: 1202095,
 31: 746696}

In [17]:
# Display total births per day of week.
cdc_dow_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

## Bonus 1: Min vs Max

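The function inspects only the dictionary's values and ignores its keys, so it reports the smallest and largest totals but not which key they belong to.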

In [18]:
def find_min_max(i_dic):
    """Return the smallest and largest values stored in a dictionary.

    Only the values are inspected; the keys are ignored, so the result does
    not say which key holds either extreme.

    Args:
        i_dic: Dictionary whose values can be compared with one another.

    Returns:
        dict with two entries, ``"min"`` and ``"max"``. Both are ``None``
        when ``i_dic`` is empty, because there is nothing to compare.
    """
    values = list(i_dic.values())

    # An empty dictionary has no first value to seed a comparison with.
    if not values:
        return {"min": None, "max": None}

    return {"min": min(values), "max": max(values)}

In [19]:
# Apply find_min_max to each aggregate; results unpack in the same order as
# the inputs (year, month, day of month, day of week).
(
    min_max_year_births,
    min_max_month_births,
    min_max_dom_births,
    min_max_dow_births,
) = map(
    find_min_max,
    (cdc_year_births, cdc_month_births, cdc_dom_births, cdc_dow_births),
)

In [20]:
# Report the min/max totals for each grouping, one line per grouping.
print(f"By year: {min_max_year_births}")
print(f"By month: {min_max_month_births}")
print(f"By day of month: {min_max_dom_births}")
print(f"By day of week: {min_max_dow_births}")

By year: {'min': 3880894, 'max': 4089950}
By month: {'min': 3018140, 'max': 3525858}
By day of month: {'min': 746696, 'max': 1326855}
By day of week: {'min': 4079723, 'max': 6446196}


## Bonus 2: Dynamics of the birth rate

In [21]:
def rate_dynamics(p_csv, p_index, p_header=False):
    """Total the births in a CSV file, grouped by one of its columns.

    Args:
        p_csv: Path of the CSV file. Every data field must be an integer and
            the birth count must be in column 4.
        p_index: Column to group by:
            0 - by year
            1 - by month
            2 - by day of month
            3 - by day of week
        p_header: True if the first line of the file is a header line that
            must be skipped.

    Returns:
        dict mapping each distinct value of column ``p_index`` to the sum of
        births over the rows carrying that value, with keys in order of
        first appearance in the file.

    Raises:
        ValueError: If a field cannot be converted with ``int()`` (for
            example when the file has a header but ``p_header`` is False).
    """
    # The context manager closes the file as soon as it has been read.
    with open(p_csv, "r") as l_file:
        l_rows = l_file.read().split("\n")

    if p_header:
        l_rows = l_rows[1:]

    l_dict = {}

    for each in l_rows:
        # A file that ends with a newline yields an empty last element;
        # that is not a data row, so skip it rather than fail in int("").
        if not each.strip():
            continue

        int_list = [int(x) for x in each.split(",")]
        l_key = int_list[p_index]
        l_dict[l_key] = l_dict.get(l_key, 0) + int_list[4]

    return l_dict

In [22]:
# Columns 0-3 are year, month, day of month and day of week, in that order;
# the file is re-read once per grouping.
dyn_year_dict, dyn_month_dict, dyn_dom_dict, dyn_dow_dict = (
    rate_dynamics("US_births_1994-2003_CDC_NCHS.csv", column, p_header=True)
    for column in range(4)
)

In [23]:
# Total births per year; print() keeps the keys in first-appearance order.
print(dyn_year_dict)

{1994: 3952767, 1995: 3899589, 1996: 3891494, 1997: 3880894, 1998: 3941553, 1999: 3959417, 2000: 4058814, 2001: 4025933, 2002: 4021726, 2003: 4089950}


In [24]:
# Total births per month.
print(dyn_month_dict)

{1: 3232517, 2: 3018140, 3: 3322069, 4: 3185314, 5: 3350907, 6: 3296530, 7: 3498783, 8: 3525858, 9: 3439698, 10: 3378814, 11: 3171647, 12: 3301860}


In [25]:
# Total births per day of month.
print(dyn_dom_dict)

{1: 1276557, 2: 1288739, 3: 1304499, 4: 1288154, 5: 1299953, 6: 1304474, 7: 1310459, 8: 1312297, 9: 1303292, 10: 1320764, 11: 1314361, 12: 1318437, 13: 1277684, 14: 1320153, 15: 1319171, 16: 1315192, 17: 1324953, 18: 1326855, 19: 1318727, 20: 1324821, 21: 1322897, 22: 1317381, 23: 1293290, 24: 1288083, 25: 1272116, 26: 1284796, 27: 1294395, 28: 1307685, 29: 1223161, 30: 1202095, 31: 746696}


In [26]:
# Total births per day of week; keys appear in the order they first occur in
# the file, which is why the output starts at 6 rather than 1.
print(dyn_dow_dict)

{6: 4562111, 7: 4079723, 1: 5789166, 2: 6446196, 3: 6322855, 4: 6288429, 5: 6233657}


## Bonus 3: Combine CDC with SSA data

Combine U.S. births data:
<br>[CDC NCHS] Centers for Disease Control and Prevention's National Center for Health Statistics (1994-2003);
<br>[SSA] Social Security Administration (2000 - 2014).

For years that appear in both datasets, we use the arithmetic mean of the two reported totals.

In [27]:
def dicts_combine(p_dict_1, p_dict_2):
    """Merge two dictionaries of totals, averaging the keys they share.

    A key present in only one dictionary keeps its value unchanged. A key
    present in both gets the arithmetic mean of the two values, which is
    always a float. Neither input is modified.

    Args:
        p_dict_1: First mapping of key -> numeric total.
        p_dict_2: Second mapping of key -> numeric total.

    Returns:
        A new dict. The dictionary whose first key is smaller contributes
        its keys first, followed by the keys found only in the other one.
        If either input is empty, the result is a copy of the other.
    """
    # An empty dictionary has no first key to compare, and there is nothing
    # to average, so the result is simply a copy of the non-empty input.
    if not p_dict_1 or not p_dict_2:
        return dict(p_dict_1 or p_dict_2)

    # Start with the dictionary that begins earlier so that overlapping
    # ranges (e.g. 1994-2003 and 2000-2014) come out in ascending order.
    if next(iter(p_dict_1)) < next(iter(p_dict_2)):
        first_dict, second_dict = p_dict_1, p_dict_2
    else:
        first_dict, second_dict = p_dict_2, p_dict_1

    final_dict = {}

    for key, value in first_dict.items():
        if key in second_dict:
            final_dict[key] = (value + second_dict[key]) / 2
        else:
            final_dict[key] = value

    # Keys shared by both inputs were already averaged above; only add the
    # ones that exist solely in the second dictionary.
    for key, value in second_dict.items():
        if key not in final_dict:
            final_dict[key] = value

    return final_dict

In [28]:
# Yearly totals (column 0) from each source; both files have a header line.
cdc_dict = rate_dynamics("US_births_1994-2003_CDC_NCHS.csv", 0, p_header=True)
ssa_dict = rate_dynamics("US_births_2000-2014_SSA.csv", 0, p_header=True)

In [29]:
# Merge the two yearly series; years present in both sources are averaged.
combine_dict = dicts_combine(cdc_dict, ssa_dict)

In [30]:
# Averaged years show up as floats; years from a single source stay ints.
print(combine_dict)

{1994: 3952767, 1995: 3899589, 1996: 3891494, 1997: 3880894, 1998: 3941553, 1999: 3959417, 2000: 4104206.0, 2001: 4068448.0, 2002: 4060519.5, 2003: 4126505.0, 2004: 4186863, 2005: 4211941, 2006: 4335154, 2007: 4380784, 2008: 4310737, 2009: 4190991, 2010: 4055975, 2011: 4006908, 2012: 4000868, 2013: 3973337, 2014: 4010532}
