# Dataframes


In [2]:
#Importing the nsfg library to work with its data and the functions of its modules
import nsfg

In [3]:
#Create a DataFrame with the nsfg function that reads the NSFG pregnancy data from 2002
df = nsfg.ReadFemPreg()

In [4]:
#Call df.head() to display only the first five rows of the table
df.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,9.125
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,7.0
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,6.1875


In [5]:
#The columns attribute returns the sequence of column names (Unicode strings) in a pandas data structure called "Index"
df.columns

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'laborfor_i', 'religion_i', 'metro_i', 'basewgt', 'adj_mod_basewgt',
       'finalwgt', 'secu_p', 'sest', 'cmintvw', 'totalwgt_lb'],
      dtype='object', length=244)

In [6]:
#For now, let's treat the Index like a list and select one column name by position
df.columns[45]

'paybirth3'

In [7]:
#To access a specific column of the DataFrame, use the column name as a key
pregordr = df['pregordr']
#You can also access columns with dot notation
df.pregordr  #<- This one only works if the column name is a valid Python identifier -> it has to begin with a letter, can't contain spaces, etc.
#Check the column's type
type(pregordr)

pandas.core.series.Series

In [8]:
#Series is another pandas data structure. It is like a list but with some additional features. When you display a Series you get the indices and the corresponding values
pregordr

0        1
1        2
2        1
3        2
4        3
5        1
6        2
7        3
8        1
9        2
10       1
11       1
12       2
13       3
14       1
15       2
16       3
17       1
18       2
19       1
20       2
21       1
22       2
23       1
24       2
25       3
26       1
27       1
28       2
29       3
        ..
13563    2
13564    3
13565    1
13566    1
13567    1
13568    2
13569    1
13570    2
13571    3
13572    4
13573    1
13574    2
13575    1
13576    1
13577    2
13578    1
13579    2
13580    1
13581    2
13582    3
13583    1
13584    2
13585    1
13586    2
13587    3
13588    1
13589    2
13590    3
13591    4
13592    5
Name: pregordr, Length: 13593, dtype: int64

In [9]:
#Here, the indices are the integers 0 through 13592 (in general they can be any sortable type); the elements are also integers (in general they can be any type).
#The last line includes the variable name, the Series length, and the data type: int64 (a NumPy type). On a 32-bit machine the data type may be int32.

In [10]:
#Access a single element through its index
pregordr[5]

1

In [11]:
#Slices work too: this selects the elements at positions 54 through 58
pregordr[54:59]

54    3
55    1
56    2
57    1
58    2
Name: pregordr, dtype: int64

# Variables

In [12]:
#We know that there are 244 variables in the DataFrame because of the length reported when we display its columns
df.columns

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'laborfor_i', 'religion_i', 'metro_i', 'basewgt', 'adj_mod_basewgt',
       'finalwgt', 'secu_p', 'sest', 'cmintvw', 'totalwgt_lb'],
      dtype='object', length=244)

## Here we will use the following variables:
- caseid: integer ID of the respondent.
- prglngth: integer duration of the pregnancy in weeks.
- outcome: integer code for the outcome of the pregnancy; 1 = live birth.
- pregordr: pregnancy serial number for the respondent. 1 = her first pregnancy, 2 = her second pregnancy, and so on.
- birthord: serial number for the respondent's live births. Like pregordr, but left blank if the outcome is anything other than a live birth.
- birthwgt_lb and birthwgt_oz: the baby's birth weight in pounds and ounces.
- agepreg: mother's age at the end of the pregnancy.
- finalwgt: statistical weight associated with the respondent. A floating-point value that represents the number of people in the U.S. population that the respondent represents.

In [13]:
#The variables used here are recodes, i.e. values calculated from the raw data. Example:
#prglngth = wksgest (raw variable), when it is available.
#If wksgest is not available in the raw data:
    #prglngth = mosgest * 4.33 (months of gestation times the average number of weeks in a month)

# Transformation

In [14]:
#Data Cleaning = check for errors, deal with special values, convert data into different formats, and perform calculations.
#nsfg.py includes CleanFemPreg, a function that cleans the values that we are planning to use.
def CleanFemPreg(df):
    """Recode the pregnancy variables used in this notebook, modifying df itself.

    df: DataFrame with agepreg, birthwgt_lb and birthwgt_oz columns.

    Overwrites those three columns and adds (or overwrites) totalwgt_lb.
    Returns None.
    """
    #agepreg contains the mother's age at the end of the pregnancy, encoded as an integer
    #number of centiyears. Dividing each observation by 100 yields a floating-point value in years.
    df['agepreg'] = df['agepreg'] / 100.0

    #birthwgt_lb and birthwgt_oz hold the baby's birth weight in pounds and ounces.
    #They use several special codes: 97 = NOT ASCERTAINED; 98 = REFUSED; 99 = DON'T KNOW
    na_vals = [97, 98, 99]
    for column in ('birthwgt_lb', 'birthwgt_oz'):
        #Assign the result back with dictionary syntax so the DataFrame itself is updated.
        df[column] = df[column].replace(na_vals, np.nan)

    #Combine pounds and ounces into a single weight expressed in pounds.
    df['totalwgt_lb'] = df['birthwgt_lb'] + df['birthwgt_oz'] / 16.0

#We have to be careful with special values: if they are not handled properly they can generate bogus results, like a 99-pound baby.
#The replace method replaces these "Specially Dangerous Values" with np.nan, a special floating-point value that represents "not a number".
#replace returns a new Series, so the result is stored back into the column rather than relying on an in-place modification of a Series taken from the DataFrame.

In [15]:
#As part of the IEEE floating-point standard, an arithmetic operation returns nan if any of its arguments is nan:
#for argument in AnyMathematicalOperation:
#    if argument is nan:
#        return nan
import numpy as np
np.nan / 100.0

nan

Here you see that computations with nan yield nan instead of failing, and most pandas functions handle nan appropriately. 
Get used to dealing with missing data; it will be a recurring issue.

In [16]:
#The last line of CleanFemPreg(df) creates a new column called totalwgt_lb that combines pounds and ounces into just pounds.
#IMPORTANT NOTE: when you add a new column to a DataFrame, you must use dictionary syntax:

#WRONG!! (kept as a comment so that running this cell does not execute it)
#df.totalwgt_lb = df.birthwgt_lb + df.birthwgt_oz / 16.0  #<- When the column does not exist yet, this version adds an attribute to the df, but that attribute is NOT treated as a new column.

#CORRECT!!
df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0


# Validation

## Errors can be introduced when:
   - Data is exported from one software environment to another software environment.
   - When you are getting familiar with a new dataset:
     * You might interpret data incorrectly or introduce other misunderstandings.

## Validating data can save us time later and avoid errors.
   - One way to do so is to compute basic stats and compare them with published results.
     * In our case, we have tables that summarize each variable in the NSFG codebook/documentation. 
     * Here we have the table of outcome for each pregnancy:
            value label               Total 
            1 LIVE BIRTH              9148 
            2 INDUCED ABORTION        1862 
            3 STILLBIRTH               120 
            4 MISCARRIAGE             1921 
            5 ECTOPIC PREGNANCY        190 
            6 CURRENT PREGNANCY        352
       - The Series class provides a method (value_counts) that counts the number of times each value appears. So if we select the outcome Series from the DataFrame,
       - we can compare it with the published data using value_counts():

In [17]:
#Count how many pregnancies have each outcome code, ordered by code, to compare with the codebook table
df.outcome.value_counts().sort_index()

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

In [18]:
#value_counts() produces a Series; sort_index() sorts the Series by Index so the value appears in order. 

In [19]:
#Here is the codebook table for birthwgt_lb:
        #value label                  Total
        #. INAPPLICABLE               4449 
        #0-5 UNDER 6 POUNDS           1125 
        #6 6 POUNDS                   2223 
        #7 7 POUNDS                   3049 
        #8 8 POUNDS                   1889 
        #9-95 9 POUNDS OR MORE        799 
#Let's use value_counts() to compare (the codebook groups several values, so add up the matching rows):
df.birthwgt_lb.value_counts().sort_index()

0.0        8
1.0       40
2.0       53
3.0       98
4.0      229
5.0      697
6.0     2223
7.0     3049
8.0     1889
9.0      623
10.0     132
11.0      26
12.0      10
13.0       3
14.0       3
15.0       1
Name: birthwgt_lb, dtype: int64

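#### IMPORTANT: the counts for 0-5 pounds (1125 in total) and for 6, 7 and 8 pounds match the codebook. The rows for 9 pounds or more add up to 798, one short of the codebook's 799, which suggests that one out-of-range value has already been set to NaN when the data was loaded. If we did find a weird value, like a 51-pound baby, we could add a simple line to CleanFemPreg():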
       
       df.birthwgt_lb[df.birthwgt_lb > 20] = np.nan
       

In [None]:
#It would look like this:
def CleanFemPreg(df):
    """Recode the pregnancy variables, also discarding implausible birth weights.

    df: DataFrame with agepreg, birthwgt_lb and birthwgt_oz columns.

    Overwrites those three columns and adds (or overwrites) totalwgt_lb.
    Returns None.
    """
    #agepreg is encoded in centiyears; convert it to years.
    df['agepreg'] = df['agepreg'] / 100.0

    #97, 98 and 99 are special codes, not real weights; turn them into missing values.
    na_vals = [97, 98, 99]
    for column in ('birthwgt_lb', 'birthwgt_oz'):
        df[column] = df[column].replace(na_vals, np.nan)

    #Treat any weight above 20 pounds as missing. mask() returns a new Series with NaN
    #wherever the condition is True, and storing it back with dictionary syntax avoids
    #chained-indexing assignment, which may silently fail to update the DataFrame.
    df['birthwgt_lb'] = df['birthwgt_lb'].mask(df['birthwgt_lb'] > 20)

    #Combine pounds and ounces into a single weight expressed in pounds.
    df['totalwgt_lb'] = df['birthwgt_lb'] + df['birthwgt_oz'] / 16.0

# Interpretation

In [22]:
#Let's look at the sequence of outcomes for a few respondents. We need a function to collect the pregnancy data for each respondent:
def MakePregMap(df):
    """Group the rows of df by respondent.

    df: DataFrame with a caseid column.

    returns: defaultdict(list) that maps from each caseid to the list of
             index labels of the rows that have that caseid
    """
    from collections import defaultdict
    d = defaultdict(list)
    #Series.items() yields (index, value) pairs. It is used instead of iteritems(),
    #which was removed in pandas 2.0; items() is available in older versions too.
    for index, caseid in df.caseid.items():
        d[caseid].append(index)
    return d

In [23]:
#d is a dictionary that maps from each case ID to a list of indices.
#The iteritems() method (spelled items() in pandas 2.0 and later, where iteritems() was removed) enumerates the index (row number) and caseid for each pregnancy.

In [24]:
#This example looks up one respondent and displays the outcomes of her pregnancies:
preg_map = MakePregMap(df)
caseid = 10229
#Row indices of every pregnancy recorded for this caseid
indices = preg_map[caseid]
df.outcome[indices].values

array([4, 4, 4, 4, 4, 4, 1], dtype=int64)

In [25]:
#indices is the list of indices for the pregnancies that correspond to respondent 10229.
#When we use this list as an index into df.outcome, it selects the indicated rows and yields a Series.
#The .values attribute is a NumPy array, so instead of displaying the whole Series (a column of 7 rows with their respective indices), it just displays the values in one row.
#Outcome code 1 indicates a live birth. Code 4 indicates a miscarriage = a pregnancy that ended spontaneously (usually with no known medical cause).
#This is not an uncommon respondent. But if we remember the context, it gets worse, because this is the story of a woman who was pregnant 7 times and finally gave birth on the 7th.
#Beautiful for the mother, but sad to see that there were that many miscarriages.