# Data Sourcing, Profiling, & Cleaning

### This script contains the following points:
1. Importing libraries
2. Importing data
3. Data cleaning
4. Combining data sets
5. Remaining data cleaning & deriving new column
6. Data profiling & consistency checks

### 1. Importing libraries

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

### 2. Importing data

In [2]:
path = r'C:\Users\keely\Documents\Courses\CareerFoundry\Immersion\Achievement 6 - Advanced Analytics_Dashboard\2019-2021 CDC Natality'

In [3]:
df_2019 = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'Natality 2019.csv'), index_col = False)

In [4]:
df_2019.head()

Unnamed: 0,State,Year,Month,Month Code,Mother's Education,Month Prenatal Care Began,Births,Average Birth Weight,Average Age of Mother,Average LMP Gestational Age
0,Alabama,2019,January,1,8th grade or less,No prenatal care,34.0,3223.15,27.44,38.82
1,Alabama,2019,January,1,8th grade or less,2nd month,30.0,3482.77,29.93,39.07
2,Alabama,2019,January,1,8th grade or less,3rd month,25.0,3251.96,30.76,38.64
3,Alabama,2019,January,1,8th grade or less,4th month,25.0,3120.4,28.64,38.12
4,Alabama,2019,January,1,8th grade or less,5th month,12.0,2938.08,27.92,37.5


In [5]:
df_2019.tail()

Unnamed: 0,State,Year,Month,Month Code,Mother's Education,Month Prenatal Care Began,Births,Average Birth Weight,Average Age of Mother,Average LMP Gestational Age
29286,Wyoming,2019,December,12,"Associate degree (AA, AS)",3rd month,29.0,3057.72,29.55,37.9
29287,Wyoming,2019,December,12,"Bachelor's degree (BA, AB, BS)",2nd month,42.0,3202.02,31.79,39.1
29288,Wyoming,2019,December,12,"Bachelor's degree (BA, AB, BS)",3rd month,26.0,3143.0,31.31,38.96
29289,Wyoming,2019,December,12,"Master's degree (MA, MS, MEng, MEd, MSW, MBA)",2nd month,16.0,3039.25,33.75,39.38
29290,Wyoming,2019,December,12,"Master's degree (MA, MS, MEng, MEd, MSW, MBA)",3rd month,11.0,3038.64,32.91,38.36


In [6]:
df_2019.shape

(29291, 10)

In [7]:
df_2020 = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'Natality 2020.csv'), index_col = False)

In [8]:
df_2020.head()

Unnamed: 0,State,Year,Month,Month Code,Mother's Education,Month Prenatal Care Began,Births,Average Birth Weight,Average Age of Mother,Average LMP Gestational Age
0,Alabama,2020,January,1,8th grade or less,No prenatal care,49.0,3147.1,26.37,38.26
1,Alabama,2020,January,1,8th grade or less,2nd month,23.0,3258.74,27.39,39.04
2,Alabama,2020,January,1,8th grade or less,3rd month,32.0,3206.38,28.53,38.56
3,Alabama,2020,January,1,8th grade or less,4th month,27.0,3171.63,28.19,38.41
4,Alabama,2020,January,1,8th grade or less,5th month,15.0,3165.0,25.87,38.07


In [9]:
df_2020.tail()

Unnamed: 0,State,Year,Month,Month Code,Mother's Education,Month Prenatal Care Began,Births,Average Birth Weight,Average Age of Mother,Average LMP Gestational Age
28487,Wyoming,2020,December,12,"Associate degree (AA, AS)",2nd month,28.0,3099.25,29.18,38.54
28488,Wyoming,2020,December,12,"Associate degree (AA, AS)",3rd month,23.0,3287.52,28.57,38.61
28489,Wyoming,2020,December,12,"Bachelor's degree (BA, AB, BS)",2nd month,58.0,3247.53,30.78,38.95
28490,Wyoming,2020,December,12,"Bachelor's degree (BA, AB, BS)",3rd month,29.0,3407.24,30.59,39.62
28491,Wyoming,2020,December,12,"Master's degree (MA, MS, MEng, MEd, MSW, MBA)",2nd month,16.0,2916.25,33.5,38.12


In [10]:
df_2020.shape

(28492, 10)

In [11]:
df_2021 = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'Natality 2021.csv'), index_col = False)

In [12]:
df_2021.head()

Unnamed: 0,State,Year,Month,Month Code,Mother's Education,Month Prenatal Care Began,Births,Average Birth Weight,Average Age of Mother,Average LMP Gestational Age
0,Alabama,2021,January,1,8th grade or less,No prenatal care,39,3263.77,25.72,38.54
1,Alabama,2021,January,1,8th grade or less,2nd month,16,3105.69,27.69,38.06
2,Alabama,2021,January,1,8th grade or less,3rd month,26,3214.77,29.04,38.85
3,Alabama,2021,January,1,8th grade or less,4th month,22,3070.23,27.55,38.27
4,Alabama,2021,January,1,8th grade or less,5th month,26,3086.15,27.85,37.73


In [13]:
df_2021.tail()

Unnamed: 0,State,Year,Month,Month Code,Mother's Education,Month Prenatal Care Began,Births,Average Birth Weight,Average Age of Mother,Average LMP Gestational Age
28650,Wyoming,2021,December,12,"Bachelor's degree (BA, AB, BS)",2nd month,56,3292.62,29.02,38.75
28651,Wyoming,2021,December,12,"Bachelor's degree (BA, AB, BS)",3rd month,29,3372.9,32.28,38.48
28652,Wyoming,2021,December,12,"Bachelor's degree (BA, AB, BS)",4th month,11,3488.64,34.73,38.73
28653,Wyoming,2021,December,12,"Master's degree (MA, MS, MEng, MEd, MSW, MBA)",2nd month,22,3173.05,32.68,38.27
28654,Wyoming,2021,December,12,"Master's degree (MA, MS, MEng, MEd, MSW, MBA)",3rd month,15,3091.07,33.33,38.47


In [14]:
df_2021.shape

(28655, 10)

### 3. Data cleaning

#### There was a limit on the size of the data set that could be imported from CDC's WONDER Search at one time, so three separate data sets were downloaded, converted to CSV, and concatenated. 

In [15]:
# This aspect of cleaning/consistency checking is done now to avoid potential issue with blank rows at the beginning 
# or end of the dataframe, which causes issues when concatenating.

df_2019.isnull().sum()  

State                          0
Year                           0
Month                          0
Month Code                     0
Mother's Education             0
Month Prenatal Care Began      0
Births                         0
Average Birth Weight           0
Average Age of Mother          0
Average LMP Gestational Age    0
dtype: int64

In [16]:
df_2020.isnull().sum()

State                          0
Year                           0
Month                          0
Month Code                     0
Mother's Education             0
Month Prenatal Care Began      0
Births                         0
Average Birth Weight           0
Average Age of Mother          0
Average LMP Gestational Age    0
dtype: int64

In [17]:
df_2021.isnull().sum()

State                          0
Year                           0
Month                          0
Month Code                     0
Mother's Education             0
Month Prenatal Care Began      0
Births                         0
Average Birth Weight           0
Average Age of Mother          0
Average LMP Gestational Age    0
dtype: int64

In [18]:
# Checking for duplicate rows in each dataframe.

df_2019_dups = df_2019[df_2019.duplicated()]

df_2019_dups

Unnamed: 0,State,Year,Month,Month Code,Mother's Education,Month Prenatal Care Began,Births,Average Birth Weight,Average Age of Mother,Average LMP Gestational Age


In [19]:
df_2020_dups = df_2020[df_2020.duplicated()]

df_2020_dups

Unnamed: 0,State,Year,Month,Month Code,Mother's Education,Month Prenatal Care Began,Births,Average Birth Weight,Average Age of Mother,Average LMP Gestational Age


In [20]:
df_2021_dups = df_2021[df_2021.duplicated()]

df_2021_dups

Unnamed: 0,State,Year,Month,Month Code,Mother's Education,Month Prenatal Care Began,Births,Average Birth Weight,Average Age of Mother,Average LMP Gestational Age


In [21]:
# Checking for mixed data types in data frames.

for col in df_2019.columns.tolist():
  weird = (df_2019[[col]].applymap(type) != df_2019[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_2019[weird]) > 0:
    print (col, ' mixed')
  else: print(col, ' consistent')

State  consistent
Year  consistent
Month  consistent
Month Code  consistent
Mother's Education  consistent
Month Prenatal Care Began  consistent
Births  consistent
Average Birth Weight  consistent
Average Age of Mother  consistent
Average LMP Gestational Age  consistent


In [22]:
for col in df_2020.columns.tolist():
  weird = (df_2020[[col]].applymap(type) != df_2020[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_2020[weird]) > 0:
    print (col, ' mixed')
  else: print(col, ' consistent')

State  consistent
Year  consistent
Month  consistent
Month Code  consistent
Mother's Education  consistent
Month Prenatal Care Began  consistent
Births  consistent
Average Birth Weight  consistent
Average Age of Mother  consistent
Average LMP Gestational Age  consistent


In [23]:
for col in df_2021.columns.tolist():
  weird = (df_2021[[col]].applymap(type) != df_2021[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_2021[weird]) > 0:
    print (col, ' mixed')
  else: print(col, ' consistent')

State  consistent
Year  consistent
Month  consistent
Month Code  consistent
Mother's Education  consistent
Month Prenatal Care Began  consistent
Births  consistent
Average Birth Weight  consistent
Average Age of Mother  consistent
Average LMP Gestational Age  consistent


In [24]:
# Checking data types.

df_2019.dtypes

State                           object
Year                             int64
Month                           object
Month Code                       int64
Mother's Education              object
Month Prenatal Care Began       object
Births                         float64
Average Birth Weight           float64
Average Age of Mother          float64
Average LMP Gestational Age    float64
dtype: object

In [25]:
df_2020.dtypes

State                           object
Year                             int64
Month                           object
Month Code                       int64
Mother's Education              object
Month Prenatal Care Began       object
Births                         float64
Average Birth Weight           float64
Average Age of Mother          float64
Average LMP Gestational Age    float64
dtype: object

In [26]:
df_2021.dtypes

State                           object
Year                             int64
Month                           object
Month Code                       int64
Mother's Education              object
Month Prenatal Care Began       object
Births                           int64
Average Birth Weight            object
Average Age of Mother          float64
Average LMP Gestational Age     object
dtype: object

In [27]:
# Changing data types to optimize running scripts and ensure they are consistent across all three dataframes
# before concatenating.

df_2019['Year']=df_2019['Year'].astype('int16')
df_2019['Month Code']=df_2019['Month Code'].astype('int16')
df_2019['Births']=df_2019['Births'].astype('int32')
df_2019['Average Birth Weight']=df_2019['Average Birth Weight'].astype('float32')
df_2019['Average Age of Mother']=df_2019['Average Age of Mother'].astype('float32')
df_2019['Average LMP Gestational Age']=df_2019['Average LMP Gestational Age'].astype('float32')


In [28]:
df_2020['Year']=df_2020['Year'].astype('int16')
df_2020['Month Code']=df_2020['Month Code'].astype('int16')
df_2020['Births']=df_2020['Births'].astype('int32')
df_2020['Average Birth Weight']=df_2020['Average Birth Weight'].astype('float32')
df_2020['Average Age of Mother']=df_2020['Average Age of Mother'].astype('float32')
df_2020['Average LMP Gestational Age']=df_2020['Average LMP Gestational Age'].astype('float32')

In [29]:
df_2021['Year']=df_2021['Year'].astype('int16')
df_2021['Month Code']=df_2021['Month Code'].astype('int16')
df_2021['Births']=df_2021['Births'].astype('int32')
df_2021['Average Birth Weight']=df_2021['Average Birth Weight'].astype('float32')
df_2021['Average Age of Mother']=df_2021['Average Age of Mother'].astype('float32')
df_2021['Average LMP Gestational Age']=df_2021['Average LMP Gestational Age'].astype('float32')

ValueError: could not convert string to float: 'Not Applicable'

In [30]:
# Finding string 'Not Applicable' in Average Birth Weight preventing data type change. Looking at specific row
# with value(s).

df_2021.loc[df_2021['Average Birth Weight'] == 'Not Applicable']


Unnamed: 0,State,Year,Month,Month Code,Mother's Education,Month Prenatal Care Began,Births,Average Birth Weight,Average Age of Mother,Average LMP Gestational Age
11852,Massachusetts,2021,January,1,Unknown or Not Stated,Unknown or Not Stated,17,Not Applicable,32.88,Not Applicable


In [31]:
# Because these 17 births are missing so many key categorical values, such as mother's education, month prenatal
# care began, and because it it makes up such a small percentage of the total births in Massachusettes, this record
# will be deleted.

df_2021_clean = df_2021.loc[df_2021['Average Birth Weight'] != 'Not Applicable' ]


In [32]:
# Make sure row does not exist in cleaned 2021 dataframe.

df_2021_clean.loc[df_2021_clean['Average Birth Weight'] == 'Not Applicable']

Unnamed: 0,State,Year,Month,Month Code,Mother's Education,Month Prenatal Care Began,Births,Average Birth Weight,Average Age of Mother,Average LMP Gestational Age


In [34]:
# Trying to change df_2021_clean data types now that row with string is removed.

df_2021_clean['Year']=df_2021_clean['Year'].astype('int16')
df_2021_clean['Month Code']=df_2021_clean['Month Code'].astype('int16')
df_2021_clean['Births']=df_2021_clean['Births'].astype('int32')
df_2021_clean['Average Birth Weight']=df_2021_clean['Average Birth Weight'].astype('float32')
df_2021_clean['Average Age of Mother']=df_2021_clean['Average Age of Mother'].astype('float32')
df_2021_clean['Average LMP Gestational Age']=df_2021_clean['Average LMP Gestational Age'].astype('float32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021_clean['Year']=df_2021_clean['Year'].astype('int16')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021_clean['Month Code']=df_2021_clean['Month Code'].astype('int16')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021_clean['Births']=df_2021_clean['Births'].astype('int32')
A value i

In [35]:
df_2021_clean.dtypes

State                           object
Year                             int16
Month                           object
Month Code                       int16
Mother's Education              object
Month Prenatal Care Began       object
Births                           int32
Average Birth Weight           float32
Average Age of Mother          float32
Average LMP Gestational Age    float32
dtype: object

### 4. Combining data sets

#### The CDC natality data had download limited to a maximum number of records, so selected data were downloaded by three different years. Now, they will be combined via concatenation.

In [36]:
frames = [df_2019, df_2020, df_2021_clean]
df_concat = pd.concat(frames)

In [37]:
df_concat.head()

Unnamed: 0,State,Year,Month,Month Code,Mother's Education,Month Prenatal Care Began,Births,Average Birth Weight,Average Age of Mother,Average LMP Gestational Age
0,Alabama,2019,January,1,8th grade or less,No prenatal care,34,3223.149902,27.440001,38.82
1,Alabama,2019,January,1,8th grade or less,2nd month,30,3482.77002,29.93,39.07
2,Alabama,2019,January,1,8th grade or less,3rd month,25,3251.959961,30.76,38.639999
3,Alabama,2019,January,1,8th grade or less,4th month,25,3120.399902,28.639999,38.119999
4,Alabama,2019,January,1,8th grade or less,5th month,12,2938.080078,27.92,37.5


In [38]:
df_concat.tail()

Unnamed: 0,State,Year,Month,Month Code,Mother's Education,Month Prenatal Care Began,Births,Average Birth Weight,Average Age of Mother,Average LMP Gestational Age
28650,Wyoming,2021,December,12,"Bachelor's degree (BA, AB, BS)",2nd month,56,3292.620117,29.02,38.75
28651,Wyoming,2021,December,12,"Bachelor's degree (BA, AB, BS)",3rd month,29,3372.899902,32.279999,38.48
28652,Wyoming,2021,December,12,"Bachelor's degree (BA, AB, BS)",4th month,11,3488.639893,34.73,38.73
28653,Wyoming,2021,December,12,"Master's degree (MA, MS, MEng, MEd, MSW, MBA)",2nd month,22,3173.050049,32.68,38.27
28654,Wyoming,2021,December,12,"Master's degree (MA, MS, MEng, MEd, MSW, MBA)",3rd month,15,3091.070068,33.330002,38.470001


In [39]:
# Verifying concatenated dataframe is equal to sum of rows of the three orginal dataframes.

df_concat.shape

(86437, 10)

In [40]:
len(df_2019)+len(df_2020)+len(df_2021_clean)

86437

In [41]:
# Rechecking data types of combined data frame.

df_concat.dtypes

State                           object
Year                             int16
Month                           object
Month Code                       int16
Mother's Education              object
Month Prenatal Care Began       object
Births                           int32
Average Birth Weight           float32
Average Age of Mother          float32
Average LMP Gestational Age    float32
dtype: object

In [42]:
# Ensuring mixed data types were resolved in concatenated dataframe.

for col in df_concat.columns.tolist():
  weird = (df_concat[[col]].applymap(type) != df_concat[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_concat[weird]) > 0:
    print (col, ' mixed')
  else: print(col, ' consistent')

State  consistent
Year  consistent
Month  consistent
Month Code  consistent
Mother's Education  consistent
Month Prenatal Care Began  consistent
Births  consistent
Average Birth Weight  consistent
Average Age of Mother  consistent
Average LMP Gestational Age  consistent


### 5. Remaining data cleaning & deriving new column

In [43]:
# Changing column names:

df_concat.rename(columns  = {'State' : 'state'}, inplace = True)
df_concat.rename(columns  = {'Year' : 'year'}, inplace = True)
df_concat.rename(columns  = {'Month' : 'month'}, inplace = True)
df_concat.rename(columns  = {'Month Code' : 'month_code'}, inplace = True)
df_concat.rename(columns  = {'Mother\'s Education' : 'mother_ed'}, inplace = True)
df_concat.rename(columns  = {'Month Prenatal Care Began' : 'prenatal_start'}, inplace = True)
df_concat.rename(columns  = {'Births' : 'births'}, inplace = True)
df_concat.rename(columns  = {'Average Birth Weight' : 'birth_wt_avg'}, inplace = True)
df_concat.rename(columns  = {'Average Age of Mother' : 'mother_age_avg'}, inplace = True)
df_concat.rename(columns  = {'Average LMP Gestational Age' : 'gest_age_avg'}, inplace = True)

In [44]:
df_concat.head()

Unnamed: 0,state,year,month,month_code,mother_ed,prenatal_start,births,birth_wt_avg,mother_age_avg,gest_age_avg
0,Alabama,2019,January,1,8th grade or less,No prenatal care,34,3223.149902,27.440001,38.82
1,Alabama,2019,January,1,8th grade or less,2nd month,30,3482.77002,29.93,39.07
2,Alabama,2019,January,1,8th grade or less,3rd month,25,3251.959961,30.76,38.639999
3,Alabama,2019,January,1,8th grade or less,4th month,25,3120.399902,28.639999,38.119999
4,Alabama,2019,January,1,8th grade or less,5th month,12,2938.080078,27.92,37.5


In [45]:
# Creating a date column combining month and year.

# df_concat['date'] = pd.to_datetime(df_concat[['year', 'month_code']].assign(DAY=1)) 

df_concat['date'] = pd.to_datetime(df_concat.year.astype(str) + '/' + df_concat.month_code.astype(str) + '/01')

In [46]:
pd.options.display.max_rows = None

In [47]:
df_concat.head()

Unnamed: 0,state,year,month,month_code,mother_ed,prenatal_start,births,birth_wt_avg,mother_age_avg,gest_age_avg,date
0,Alabama,2019,January,1,8th grade or less,No prenatal care,34,3223.149902,27.440001,38.82,2019-01-01
1,Alabama,2019,January,1,8th grade or less,2nd month,30,3482.77002,29.93,39.07,2019-01-01
2,Alabama,2019,January,1,8th grade or less,3rd month,25,3251.959961,30.76,38.639999,2019-01-01
3,Alabama,2019,January,1,8th grade or less,4th month,25,3120.399902,28.639999,38.119999,2019-01-01
4,Alabama,2019,January,1,8th grade or less,5th month,12,2938.080078,27.92,37.5,2019-01-01


In [48]:
df_concat.tail()

Unnamed: 0,state,year,month,month_code,mother_ed,prenatal_start,births,birth_wt_avg,mother_age_avg,gest_age_avg,date
28650,Wyoming,2021,December,12,"Bachelor's degree (BA, AB, BS)",2nd month,56,3292.620117,29.02,38.75,2021-12-01
28651,Wyoming,2021,December,12,"Bachelor's degree (BA, AB, BS)",3rd month,29,3372.899902,32.279999,38.48,2021-12-01
28652,Wyoming,2021,December,12,"Bachelor's degree (BA, AB, BS)",4th month,11,3488.639893,34.73,38.73,2021-12-01
28653,Wyoming,2021,December,12,"Master's degree (MA, MS, MEng, MEd, MSW, MBA)",2nd month,22,3173.050049,32.68,38.27,2021-12-01
28654,Wyoming,2021,December,12,"Master's degree (MA, MS, MEng, MEd, MSW, MBA)",3rd month,15,3091.070068,33.330002,38.470001,2021-12-01


In [49]:
# Shortening mother's education descriptions:

df_concat.loc[df_concat['mother_ed'] == "9th through 12th grade with no diploma", 'mother_ed'] = "grade 9-12 no diploma"
df_concat.loc[df_concat['mother_ed'] == "High school graduate or GED completed", 'mother_ed'] = "high school grad/GED"
df_concat.loc[df_concat['mother_ed'] == "Associate degree (AA, AS)", 'mother_ed'] = "associate degree"
df_concat.loc[df_concat['mother_ed'] == "Some college credit, but not a degree", 'mother_ed'] = "some college but no degree"
df_concat.loc[df_concat['mother_ed'] == "Bachelor's degree (BA, AB, BS)", 'mother_ed'] = "bachelor\'s degree"
df_concat.loc[df_concat['mother_ed'] == "Master's degree (MA, MS, MEng, MEd, MSW, MBA)", 'mother_ed'] = "master\'s degree"
df_concat.loc[df_concat['mother_ed'] == "Doctorate (PhD, EdD) or Professional Degree (MD, DDS, DVM, LLB, JD)", 'mother_ed'] = "doctorate or professional degree"


#df_concat["mother_ed"] = df_concat["mother_ed"].replace("9th through 12th grade with no diploma","grade 9-12 no diploma")
#df_concat["mother_ed"] = df_concat["mother_ed"].replace("High school graduate or GED completed","high school grad/GED")
#df_concat["mother_ed"] = df_concat["mother_ed"].replace("Associate degree (AA,AS)","associate degree")
#df_concat["mother_ed"] = df_concat["mother_ed"].replace("Some college credit, but not a degree","some college but no degree")
#df_concat["mother_ed"] = df_concat["mother_ed"].replace("Bachelor's degree (BA,AB,BS)","bachelor's degree")
#df_concat["mother_ed"] = df_concat["mother_ed"].replace("Master's degree (MA,MS,MEng,MEd,MSW,MBA)","master's degree")
#df_concat["mother_ed"] = df_concat["mother_ed"].replace("Doctorate (PhD, EdD) or Professional Degree (MD, DDS, DVM, LLB, JD)","doctorate or professional degree")


In [50]:
df_concat.head()

Unnamed: 0,state,year,month,month_code,mother_ed,prenatal_start,births,birth_wt_avg,mother_age_avg,gest_age_avg,date
0,Alabama,2019,January,1,8th grade or less,No prenatal care,34,3223.149902,27.440001,38.82,2019-01-01
1,Alabama,2019,January,1,8th grade or less,2nd month,30,3482.77002,29.93,39.07,2019-01-01
2,Alabama,2019,January,1,8th grade or less,3rd month,25,3251.959961,30.76,38.639999,2019-01-01
3,Alabama,2019,January,1,8th grade or less,4th month,25,3120.399902,28.639999,38.119999,2019-01-01
4,Alabama,2019,January,1,8th grade or less,5th month,12,2938.080078,27.92,37.5,2019-01-01


In [51]:
df_concat.tail()

Unnamed: 0,state,year,month,month_code,mother_ed,prenatal_start,births,birth_wt_avg,mother_age_avg,gest_age_avg,date
28650,Wyoming,2021,December,12,bachelor's degree,2nd month,56,3292.620117,29.02,38.75,2021-12-01
28651,Wyoming,2021,December,12,bachelor's degree,3rd month,29,3372.899902,32.279999,38.48,2021-12-01
28652,Wyoming,2021,December,12,bachelor's degree,4th month,11,3488.639893,34.73,38.73,2021-12-01
28653,Wyoming,2021,December,12,master's degree,2nd month,22,3173.050049,32.68,38.27,2021-12-01
28654,Wyoming,2021,December,12,master's degree,3rd month,15,3091.070068,33.330002,38.470001,2021-12-01


### 6. Data profiling & consistency checks

In [52]:
# Getting descriptive statistics of data set variables. 

df_concat.describe()

Unnamed: 0,year,month_code,births,birth_wt_avg,mother_age_avg,gest_age_avg
count,86437.0,86437.0,86437.0,86437.0,86437.0,86437.0
mean,2019.99263,6.527112,124.560917,3219.151855,29.00535,38.39344
std,0.818734,3.443856,264.989882,164.164566,2.959844,0.711688
min,2019.0,1.0,10.0,840.0,19.0,27.0
25%,2019.0,4.0,19.0,3138.429932,26.709999,38.110001
50%,2020.0,7.0,39.0,3237.469971,28.9,38.5
75%,2021.0,10.0,106.0,3322.51001,31.299999,38.799999
max,2021.0,12.0,4661.0,4152.5,38.700001,41.110001


In [53]:
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86437 entries, 0 to 28654
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   state           86437 non-null  object        
 1   year            86437 non-null  int16         
 2   month           86437 non-null  object        
 3   month_code      86437 non-null  int16         
 4   mother_ed       86437 non-null  object        
 5   prenatal_start  86437 non-null  object        
 6   births          86437 non-null  int32         
 7   birth_wt_avg    86437 non-null  float32       
 8   mother_age_avg  86437 non-null  float32       
 9   gest_age_avg    86437 non-null  float32       
 10  date            86437 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float32(3), int16(2), int32(1), object(4)
memory usage: 5.6+ MB


In [54]:
df_concat['state'].value_counts()

Texas                   3184
California              3139
Florida                 3041
New York                2998
Pennsylvania            2789
Illinois                2554
North Carolina          2538
Georgia                 2500
Washington              2482
New Jersey              2467
Virginia                2433
Ohio                    2407
Michigan                2379
Arizona                 2366
Maryland                2317
Tennessee               2280
Missouri                2117
Colorado                1995
Louisiana               1983
Indiana                 1976
Kentucky                1910
Massachusetts           1897
Alabama                 1877
South Carolina          1875
Wisconsin               1855
Oklahoma                1733
Minnesota               1714
Nevada                  1615
Utah                    1506
Arkansas                1505
Mississippi             1466
Oregon                  1417
New Mexico              1390
Kansas                  1284
Iowa          

In [55]:
df_concat['year'].value_counts()

2019    29291
2021    28654
2020    28492
Name: year, dtype: int64

In [56]:
len(df_2019)

29291

In [57]:
len(df_2020)

28492

In [58]:
len(df_2021)

28655

In [59]:
df_concat['month'].value_counts()

July         7400
August       7365
October      7323
September    7258
December     7223
June         7172
January      7171
March        7160
November     7139
May          7136
April        7082
February     7008
Name: month, dtype: int64

In [60]:
df_concat['mother_ed'].value_counts()

high school grad/GED                15509
some college but no degree          13896
grade 9-12 no diploma               12732
bachelor's degree                   11915
associate degree                     9188
master's degree                      8418
8th grade or less                    7906
doctorate or professional degree     4600
Unknown or Not Stated                2273
Name: mother_ed, dtype: int64

In [61]:
df_concat.head()

Unnamed: 0,state,year,month,month_code,mother_ed,prenatal_start,births,birth_wt_avg,mother_age_avg,gest_age_avg,date
0,Alabama,2019,January,1,8th grade or less,No prenatal care,34,3223.149902,27.440001,38.82,2019-01-01
1,Alabama,2019,January,1,8th grade or less,2nd month,30,3482.77002,29.93,39.07,2019-01-01
2,Alabama,2019,January,1,8th grade or less,3rd month,25,3251.959961,30.76,38.639999,2019-01-01
3,Alabama,2019,January,1,8th grade or less,4th month,25,3120.399902,28.639999,38.119999,2019-01-01
4,Alabama,2019,January,1,8th grade or less,5th month,12,2938.080078,27.92,37.5,2019-01-01


In [62]:
df_concat['births'].value_counts()

10      3210
11      2898
12      2651
13      2479
14      2352
15      2157
17      1859
16      1854
18      1661
19      1627
20      1545
22      1441
21      1402
23      1336
24      1299
26      1170
25      1166
28      1098
27      1096
29      1055
31       991
30       950
32       909
34       869
33       861
35       827
38       753
37       750
36       738
41       682
40       659
39       651
42       600
43       595
46       589
45       577
44       572
47       516
48       513
50       487
49       473
51       471
55       454
52       452
53       422
56       417
57       409
54       407
58       390
61       381
59       375
60       361
62       358
63       340
68       334
70       330
65       321
69       312
66       311
64       309
67       305
74       302
73       291
72       284
75       282
76       272
71       264
81       261
80       256
79       254
78       247
77       246
83       237
85       227
82       218
84       216
86       211

In [63]:
df_concat['prenatal_start'].value_counts()

2nd month                14484
3rd month                14468
4th month                11323
5th month                 8549
1st month                 7846
6th month                 6816
7th month                 6087
Unknown or Not Stated     5200
8th month                 4953
No prenatal care          4790
9th month                 1921
Name: prenatal_start, dtype: int64

In [64]:
df_concat['date'].value_counts()

2019-07-01    2503
2019-08-01    2501
2021-12-01    2468
2021-08-01    2466
2021-07-01    2464
2019-10-01    2464
2019-01-01    2455
2019-05-01    2454
2021-10-01    2442
2021-09-01    2438
2019-03-01    2438
2019-09-01    2434
2020-07-01    2433
2019-12-01    2430
2019-06-01    2420
2020-10-01    2417
2019-04-01    2406
2021-11-01    2405
2020-01-01    2405
2020-08-01    2398
2019-02-01    2393
2019-11-01    2393
2021-06-01    2391
2020-09-01    2386
2020-03-01    2380
2020-06-01    2361
2020-02-01    2360
2020-05-01    2348
2021-03-01    2342
2020-11-01    2341
2020-04-01    2338
2021-04-01    2338
2021-05-01    2334
2020-12-01    2325
2021-01-01    2311
2021-02-01    2255
Name: date, dtype: int64

In [65]:
df_concat.head()

Unnamed: 0,state,year,month,month_code,mother_ed,prenatal_start,births,birth_wt_avg,mother_age_avg,gest_age_avg,date
0,Alabama,2019,January,1,8th grade or less,No prenatal care,34,3223.149902,27.440001,38.82,2019-01-01
1,Alabama,2019,January,1,8th grade or less,2nd month,30,3482.77002,29.93,39.07,2019-01-01
2,Alabama,2019,January,1,8th grade or less,3rd month,25,3251.959961,30.76,38.639999,2019-01-01
3,Alabama,2019,January,1,8th grade or less,4th month,25,3120.399902,28.639999,38.119999,2019-01-01
4,Alabama,2019,January,1,8th grade or less,5th month,12,2938.080078,27.92,37.5,2019-01-01


In [66]:
df_concat['state'].nunique()

51

In [67]:
df_concat['year'].nunique()

3

In [68]:
df_concat['month'].nunique()

12

In [69]:
df_concat['mother_ed'].nunique()

9

In [70]:
df_concat['prenatal_start'].nunique()

11

In [71]:
df_concat['births'].nunique()

1822

In [72]:
df_concat['birth_wt_avg'].nunique()

43376

In [73]:
df_concat['mother_age_avg'].nunique()

1579

In [74]:
df_concat['gest_age_avg'].nunique()

676

In [75]:
df_concat['date'].nunique()

36

In [186]:
df_concat.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'natality_concat.pkl'))

In [76]:
df_concat.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'natality_concat.csv'))