**Introduction to the Dataset**

In [1]:
# Read the raw CSV text. The context manager closes the file handle as soon
# as the read finishes instead of leaving it open for the kernel's lifetime.
with open('US_births_1994-2003_CDC_NCHS.csv', 'r') as f:
    r = f.read()
# One string per line of the file; the first entry is the header line.
rows = r.split('\n')
rows[0:10]

['year,month,date_of_month,day_of_week,births',
 '1994,1,1,6,8096',
 '1994,1,2,7,7772',
 '1994,1,3,1,10142',
 '1994,1,4,2,11248',
 '1994,1,5,3,11053',
 '1994,1,6,4,11406',
 '1994,1,7,5,11251',
 '1994,1,8,6,8653',
 '1994,1,9,7,7910']

**Converting Data into a List of Lists**

In [2]:
def read_csv(file_name):
    """Load a headered CSV of integers into a list of lists.

    The first line is treated as a header and discarded. Every remaining
    non-blank line is split on commas and each field is converted with int().

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A list containing one inner list of ints per data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field on a non-blank line is not a valid integer.
    """
    # The context manager closes the handle even if reading raises.
    with open(file_name, 'r') as file:
        rows = file.read().split('\n')
    final_list = []
    for row in rows[1:]:
        # A file ending in a newline (or containing a stray blank line)
        # produces an empty string here, which would crash int('') below.
        if not row.strip():
            continue
        final_list.append([int(each) for each in row.split(',')])
    return final_list

cdc_list = read_csv("US_births_1994-2003_CDC_NCHS.csv")
cdc_list[0:10]

[[1994, 1, 1, 6, 8096],
 [1994, 1, 2, 7, 7772],
 [1994, 1, 3, 1, 10142],
 [1994, 1, 4, 2, 11248],
 [1994, 1, 5, 3, 11053],
 [1994, 1, 6, 4, 11406],
 [1994, 1, 7, 5, 11251],
 [1994, 1, 8, 6, 8653],
 [1994, 1, 9, 7, 7910],
 [1994, 1, 10, 1, 10498]]

**Calculating Number of Births Each Month**

In [3]:
def month_births(lstoflst):
    """Total the births recorded for each month.

    For every row in ``lstoflst`` the value at index 1 is used as the month
    key and the value at index 4 as the birth count. Returns a dict mapping
    each month key to the sum of its birth counts.
    """
    totals = {}
    for record in lstoflst:
        key, count = record[1], record[4]
        # First sighting stores the count; later sightings accumulate onto it.
        totals[key] = totals[key] + count if key in totals else count
    return totals

cdc_month_births = month_births(cdc_list)
cdc_month_births

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

**Calculating Number of Births Each Day of Week**

In [4]:
def dow_births(lstoflst):
    """Total the births recorded for each day of the week.

    For every row in ``lstoflst`` the value at index 3 is used as the
    day-of-week key and the value at index 4 as the birth count. Returns a
    dict mapping each day-of-week key to the sum of its birth counts.
    """
    by_weekday = {}
    for entry in lstoflst:
        weekday = entry[3]
        count = entry[4]
        if weekday not in by_weekday:
            # New key: start its running total with this row's count.
            by_weekday[weekday] = count
            continue
        by_weekday[weekday] += count
    return by_weekday

cdc_day_births = dow_births(cdc_list)
cdc_day_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

**Creating a More General Function**

In [5]:
def calc_counts(data, column, value_column=4):
    """Sum one column of ``data`` grouped by the values of another column.

    Args:
        data: Iterable of indexable rows (e.g. the list of lists built by
            read_csv).
        column: Index of the field to group by.
        value_column: Index of the numeric field to sum. Defaults to 4, the
            births column used throughout this notebook, so existing
            two-argument calls behave exactly as before.

    Returns:
        A dict mapping each distinct ``row[column]`` value to the sum of
        ``row[value_column]`` over the rows that share it, keyed in order of
        first appearance.
    """
    births_dict = {}
    for row in data:
        key = row[column]
        # .get supplies 0 for a key seen for the first time.
        births_dict[key] = births_dict.get(key, 0) + row[value_column]
    return births_dict

cdc_year_births = calc_counts(cdc_list, 0)
cdc_month_births = calc_counts(cdc_list, 1)
cdc_dom_births = calc_counts(cdc_list, 2)
cdc_dow_births = calc_counts(cdc_list, 3)

In [6]:
cdc_year_births 

{1994: 3952767,
 1995: 3899589,
 1996: 3891494,
 1997: 3880894,
 1998: 3941553,
 1999: 3959417,
 2000: 4058814,
 2001: 4025933,
 2002: 4021726,
 2003: 4089950}

In [7]:
cdc_month_births

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

In [8]:
cdc_dom_births

{1: 1276557,
 2: 1288739,
 3: 1304499,
 4: 1288154,
 5: 1299953,
 6: 1304474,
 7: 1310459,
 8: 1312297,
 9: 1303292,
 10: 1320764,
 11: 1314361,
 12: 1318437,
 13: 1277684,
 14: 1320153,
 15: 1319171,
 16: 1315192,
 17: 1324953,
 18: 1326855,
 19: 1318727,
 20: 1324821,
 21: 1322897,
 22: 1317381,
 23: 1293290,
 24: 1288083,
 25: 1272116,
 26: 1284796,
 27: 1294395,
 28: 1307685,
 29: 1223161,
 30: 1202095,
 31: 746696}

In [9]:
cdc_dow_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

**Creating a Min/Max Function**

In [10]:
def min_max(dictionary, return_min = True):
    """Return the smallest or largest value stored in ``dictionary``.

    Args:
        dictionary: Mapping whose values are mutually comparable
            (for example the birth totals produced by calc_counts).
        return_min: When truthy (the default) the minimum value is returned;
            otherwise the maximum value is returned.

    Returns:
        The minimum or maximum of the mapping's values, or None when the
        mapping is empty.
    """
    if not dictionary:
        # Nothing to compare; avoid returning a made-up sentinel number.
        return None
    values = dictionary.values()
    # The built-ins examine every value independently. The previous
    # hand-rolled loop used `elif`, so a value that set a new minimum was
    # never considered for the maximum (e.g. descending or single-item input
    # reported a maximum of 0), and its fixed sentinels broke for negative or
    # very large values.
    if return_min:
        return min(values)
    return max(values)
    
min_cdc_dom_births = min_max(cdc_dom_births)
max_cdc_dom_births = min_max(cdc_dom_births, False)

In [11]:
min_cdc_dom_births

746696

In [12]:
max_cdc_dom_births

1326855

**Extracting Values Across Years Function**

In [13]:
def across_years(i_year, f_year, column, column_value, data=None, total=False):
    """Collect one births figure per year for rows matching a column value.

    Args:
        i_year: First year to include (inclusive).
        f_year: Last year to include (inclusive).
        column: Index of the row field to filter on.
        column_value: Value that ``row[column]`` must equal for a row to count.
        data: Rows with the year at index 0 and the births count at index 4.
            When omitted, the notebook-level ``cdc_list`` is used, which is
            what the original four-argument calls relied on.
        total: When False (the default, matching the original behaviour) each
            year maps to the births value of the LAST matching row for that
            year in ``data`` order; earlier matches are overwritten. When
            True, each year maps to the sum of births over all matching rows.

    Returns:
        A dict mapping year -> births, with keys in ascending year order.
        Years in the range that have no matching row are absent.

    NOTE(review): with the default ``total=False`` a query such as
    "day_of_week == 6" yields a single day's births per year, not the yearly
    total. If yearly totals were intended, pass ``total=True``. Year-to-year
    differences can be derived from the returned dict by subtracting the
    value for ``year`` from the value for ``year + 1``.
    """
    if data is None:
        # Fall back to the notebook global so existing calls keep working.
        data = cdc_list
    per_year = {}
    # Single pass over the rows instead of rescanning the data once per year.
    for row in data:
        year = row[0]
        if not (i_year <= year <= f_year) or row[column] != column_value:
            continue
        if total and year in per_year:
            per_year[year] += row[4]
        else:
            per_year[year] = row[4]
    # Emit keys in ascending year order regardless of the order of ``data``.
    return {year: per_year[year] for year in sorted(per_year)}

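**Example of Extraction Function: Saturday births from 1994 to 2003 (one value per year — the births figure of the last Saturday row found for that year, not the yearly total):**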

In [14]:
Saturday_births_1994_to_2003 = across_years(1994, 2003, 3, 6)
Saturday_births_1994_to_2003

{1994: 8809,
 1995: 9093,
 1996: 9103,
 1997: 8679,
 1998: 7735,
 1999: 6674,
 2000: 9177,
 2001: 9365,
 2002: 8953,
 2003: 8646}

**Combining CDC Data and SSA Data**

In [15]:
# Load both sources through the same parser so their rows share one layout.
cdc_list = read_csv("US_births_1994-2003_CDC_NCHS.csv")
ssa_list = read_csv("US_births_2000-2014_SSA.csv")

# Judging by the file names, the two files overlap from 2000 onward. Either
# one could be trimmed; here the CDC rows are cut off before 2000 and every
# SSA row is kept.
new_data_list = [row for row in cdc_list if row[0] < 2000]
new_data_list.extend(ssa_list)

In [16]:
new_data_list[0:10]

[[1994, 1, 1, 6, 8096],
 [1994, 1, 2, 7, 7772],
 [1994, 1, 3, 1, 10142],
 [1994, 1, 4, 2, 11248],
 [1994, 1, 5, 3, 11053],
 [1994, 1, 6, 4, 11406],
 [1994, 1, 7, 5, 11251],
 [1994, 1, 8, 6, 8653],
 [1994, 1, 9, 7, 7910],
 [1994, 1, 10, 1, 10498]]

In [17]:
length_list = len(new_data_list)
new_data_list[length_list-10:length_list]
# Here we see the two are linked successfully.

[[2014, 12, 22, 1, 12799],
 [2014, 12, 23, 2, 12604],
 [2014, 12, 24, 3, 9308],
 [2014, 12, 25, 4, 6749],
 [2014, 12, 26, 5, 10386],
 [2014, 12, 27, 6, 8656],
 [2014, 12, 28, 7, 7724],
 [2014, 12, 29, 1, 12811],
 [2014, 12, 30, 2, 13634],
 [2014, 12, 31, 3, 11990]]