In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### BIRTHS 2016

In [None]:
# Open the 2016 births text file in read mode. The handle stays open across
# the following cells (they read from it) until `fi.close()` is called.
fi = open('data/us_births_2016.txt', 'r')

In [None]:
# Read a single line from the file. This advances the file position, so the
# later `for row in fi` parsing loop starts at the second line and this first
# line never reaches `results`.
# NOTE(review): fine if the first line is a header or was only read for
# inspection; if it is a data record, one birth is silently dropped - confirm.
row = fi.readline()

In [None]:
# Fixed-width layout of the fields to extract, keyed by output column name.
# Each entry is (first_column, last_column); positions are 1-based and
# inclusive, matching the `row[start-1:stop]` slice used by the parsing loop.
_field_columns = {
    'dob_year': (9, 12),
    'mothers_age': (75, 76),
    'mothers_age_grouped': (79, 79),
    'mothers_race': (117, 117),
    'marital_status': (120, 120),
    'mothers_education': (124, 124),
    'wic': (251, 251),
}

# The (start, stop, label) triples, in declaration order, that the parser unpacks.
values = [
    (first, last, name) for name, (first, last) in _field_columns.items()
]

In [None]:
# Parse every remaining line of the open file into a dict of raw string
# slices, one entry per configured field. `start`/`stop` are 1-based inclusive
# column positions, hence the `start - 1` when slicing. Entries labelled
# 'FILLER' are skipped.
results = []
for row in fi:
    results.append({
        label: row[start - 1:stop]
        for start, stop, label in values
        if label != 'FILLER'
    })

In [None]:
# Build one DataFrame row per parsed line. Columns are the field labels used
# as dict keys in `results`; every value is still the raw string slice.
us_births_2016 = pd.DataFrame(results)

In [None]:
# Release the file handle opened earlier; when the cells run in order the
# parsing loop has already read the file to the end.
fi.close()

### Organizing DFs

In [None]:
# Convert the mother's age column from the raw string slices to integers so
# that it sorts numerically in the age tables.
us_births_2016['mothers_age'] = us_births_2016['mothers_age'].astype(int)

### AGE 

In [None]:
# Share of all births by mother's single-year age, ordered by age.
# `pct_births` is a fraction of all records (normalize=True), not 0-100.
#
# The column names are assigned explicitly with rename_axis /
# reset_index(name=...) instead of renaming pandas' auto-generated 'index'
# and 'mothers_age' columns: pandas 2.0 changed those auto-generated names,
# which made the former sort_values('index') raise KeyError.
us_births_2016_age_counts = (
    us_births_2016['mothers_age']
    .value_counts(normalize=True)
    .rename_axis('age')
    .reset_index(name='pct_births')
    .sort_values('age')
)

In [None]:
# Add a constant `year` column so every row carries the data year.
us_births_2016_age_counts['year'] = 2016

In [None]:
# Export the age table. No `index=` argument is passed, so pandas' default
# applies and the DataFrame index is written as an unnamed first column.
# Assumes the organized_data/ directory already exists.
us_births_2016_age_counts.to_csv('organized_data/us_births_2016_age_counts.csv')

In [None]:
# Share of all births per age-group code, ordered by code.
# `pct_births` is a fraction of all records (normalize=True), not 0-100.
#
# Columns are named explicitly with rename_axis / reset_index(name=...)
# rather than by renaming the auto-generated 'index' / 'mothers_age_grouped'
# columns, whose names changed in pandas 2.0 (the former
# sort_values('index') raised KeyError there).
us_births_2016_grouped_age_counts = (
    us_births_2016['mothers_age_grouped']
    .value_counts(normalize=True)
    .rename_axis('age_group_code')
    .reset_index(name='pct_births')
    .sort_values('age_group_code')
)

In [None]:
# Add a constant `year` column so every row carries the data year.
us_births_2016_grouped_age_counts['year'] = 2016

In [None]:
# Human-readable label for each age-group code. The codes are strings
# because they are raw slices of the fixed-width file.
age_group_labels = {
    "1": 'Under 15 years',
    "2": '15-19 years',
    "3": '20-24 years',
    "4": '25-29 years',
    "5": '30-34 years',
    "6": '35-39 years',
    "7": '40-44 years',
    "8": '45-49 years',
    "9": '50-54 years',
}

# Vectorised lookup instead of an iterrows loop writing one cell at a time;
# codes missing from the mapping become NaN.
us_births_2016_grouped_age_counts['age_group'] = (
    us_births_2016_grouped_age_counts['age_group_code'].map(age_group_labels)
)

In [None]:
# Export the grouped-age table. No `index=` argument is passed, so pandas'
# default applies and the DataFrame index is written as an unnamed first
# column. Assumes the organized_data/ directory already exists.
us_births_2016_grouped_age_counts.to_csv('organized_data/us_births_2016_grouped_age_counts.csv')

### EDUCATION

In [None]:
# Share of all births per mother's-education code, ordered by code, keeping
# only the first eight codes after sorting.
# The fractions are computed over all records before the head(8) cut, so the
# kept rows need not sum to 1.
# NOTE(review): head(8) presumably drops an "unknown" code - confirm.
#
# Columns are named explicitly with rename_axis / reset_index(name=...)
# rather than by renaming the auto-generated 'index' / 'mothers_education'
# columns, whose names changed in pandas 2.0 (the former
# sort_values('index') raised KeyError there).
us_births_2016_ed_counts = (
    us_births_2016['mothers_education']
    .value_counts(normalize=True)
    .rename_axis('ed_code')
    .reset_index(name='pct_births')
    .sort_values('ed_code')
    .head(8)
)

In [None]:
# Add a constant `year` column so every row carries the data year.
us_births_2016_ed_counts['year'] = 2016

In [None]:
# Human-readable label for each education code. The codes are strings
# because they are raw slices of the fixed-width file.
education_labels = {
    "1": '8th grade or less',
    "2": '9th through 12th grade with no diploma',
    "3": 'High school graduate or GED completed',
    "4": 'Some college credit, but not a degree',
    "5": 'Associate degree (AA,AS)',
    "6": 'Bachelor’s degree (BA, AB, BS)',
    "7": 'Master’s degree (MA, MS, MEng, MEd, MSW, MBA)',
    "8": 'Doctorate (PhD, EdD) or Professional Degree (MD, DDS,DVM, LLB, JD)',
}

# Vectorised lookup instead of an iterrows loop writing one cell at a time;
# codes missing from the mapping become NaN.
# NOTE(review): the label column is called 'age_group' although it holds
# education levels - looks like a copy-paste leftover. The name is kept
# because it ends up in the exported CSV, which other code may read.
us_births_2016_ed_counts['age_group'] = (
    us_births_2016_ed_counts['ed_code'].map(education_labels)
)

In [None]:
# Export the education table. No `index=` argument is passed, so pandas'
# default applies and the DataFrame index is written as an unnamed first
# column. Assumes the organized_data/ directory already exists.
us_births_2016_ed_counts.to_csv('organized_data/us_births_2016_ed_counts.csv')

### WIC

In [None]:
# Share of all births for the two most frequent WIC codes (value_counts
# sorts by frequency, head(2) keeps the top two). Fractions are relative to
# all records, so the two rows need not sum to 1.
#
# Columns are named explicitly with rename_axis / reset_index(name=...).
# The previous rename({'index': 'wic', 'wic': 'pct_births'}) depended on the
# auto-generated column names; with the names pandas 2.0 generates it
# silently labelled the code column 'pct_births'.
us_births_2016_wic_counts = (
    us_births_2016['wic']
    .value_counts(normalize=True)
    .head(2)
    .rename_axis('wic')
    .reset_index(name='pct_births')
)

In [None]:
# Add a constant `year` column so every row carries the data year.
us_births_2016_wic_counts['year']=2016

In [None]:
# Export the WIC table. No `index=` argument is passed, so pandas' default
# applies and the DataFrame index is written as an unnamed first column.
# Assumes the organized_data/ directory already exists.
us_births_2016_wic_counts.to_csv('organized_data/us_births_2016_wic_counts.csv')

### DEMO

In [None]:
# Share of all births per mother's-race code, most frequent first
# (value_counts order). `pct_births` is a fraction, not 0-100.
#
# Columns are named explicitly with rename_axis / reset_index(name=...).
# The previous rename of 'index' / 'mothers_race' depended on the
# auto-generated column names; with the names pandas 2.0 generates it left
# no 'demo_code' column, which the following cells need.
us_births_2016_demo_counts = (
    us_births_2016['mothers_race']
    .value_counts(normalize=True)
    .rename_axis('demo_code')
    .reset_index(name='pct_births')
)

In [None]:
# Human-readable label for each race/ethnicity code. The codes are strings
# because they are raw slices of the fixed-width file.
demo_labels = {
    "1": 'White',
    "2": 'Black or African American',
    "3": 'AIAN',
    "4": 'Asian',
    "5": 'NHOPI',
    "6": 'More than one race',
    "7": 'Hispanic',
    "8": 'Unknown/Not Stated',
}

# Vectorised lookup instead of an iterrows loop writing one cell at a time;
# codes missing from the mapping become NaN.
us_births_2016_demo_counts['demo'] = (
    us_births_2016_demo_counts['demo_code'].map(demo_labels)
)

In [None]:
# Add a constant `year` column so every row carries the data year.
us_births_2016_demo_counts['year']=2016

In [None]:
# Absolute number of births per mother's-race code (no normalisation).
#
# Columns are named explicitly with rename_axis / reset_index(name=...).
# The previous rename of 'index' / 'mothers_race' depended on the
# auto-generated column names; with the names pandas 2.0 generates it left
# no 'demo_code' column, which the merge on 'demo_code' needs.
us_births_2016_demo_birth_count = (
    us_births_2016['mothers_race']
    .value_counts()
    .rename_axis('demo_code')
    .reset_index(name='births_count')
)

In [None]:
# Attach the raw birth counts to the share/label table, matching rows on
# `demo_code`; the inner join keeps only codes present in both frames.
# The result is bound back to `us_births_2016_demo_counts`, so this cell is
# not meant to be re-run on its own.
us_births_2016_demo_counts = us_births_2016_demo_birth_count.merge(
    us_births_2016_demo_counts,
    how='inner',
    on='demo_code',
)

In [None]:
# Export the race/ethnicity table. No `index=` argument is passed, so pandas'
# default applies and the DataFrame index is written as an unnamed first
# column. Assumes the organized_data/ directory already exists.
us_births_2016_demo_counts.to_csv('organized_data/us_births_2016_demo_counts.csv')

### MARITAL STATUS

In [None]:
# Share of all births per marital-status code, most frequent first
# (value_counts order). `pct_births` is a fraction, not 0-100.
#
# Columns are named explicitly with rename_axis / reset_index(name=...).
# The previous rename of 'index' / 'marital_status' depended on the
# auto-generated column names; with the names pandas 2.0 generates it left
# no 'status_code' column, which the labelling cell needs.
us_births_2016_marital_status = (
    us_births_2016['marital_status']
    .value_counts(normalize=True)
    .rename_axis('status_code')
    .reset_index(name='pct_births')
)

In [None]:
# Human-readable label for each marital-status code. The codes are strings
# because they are raw slices of the fixed-width file.
marital_status_labels = {
    "1": 'Married',
    "2": 'Unmarried',
}

# Vectorised lookup instead of an iterrows loop writing one cell at a time;
# codes missing from the mapping become NaN.
# NOTE(review): the label column is called 'demo' although it holds marital
# status - looks like a copy-paste leftover. The name is kept because it
# ends up in the exported CSV, which other code may read.
us_births_2016_marital_status['demo'] = (
    us_births_2016_marital_status['status_code'].map(marital_status_labels)
)

In [None]:
# Add a constant `year` column so every row carries the data year.
us_births_2016_marital_status['year']=2016

In [None]:
# Export the marital-status table. No `index=` argument is passed, so pandas'
# default applies and the DataFrame index is written as an unnamed first
# column. Assumes the organized_data/ directory already exists.
us_births_2016_marital_status.to_csv('organized_data/us_births_2016_marital_status.csv')