## DataFrames

In [1]:
# Returns a DataFrame (df)
import nsfg
df = nsfg.ReadFemPreg()
df

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.8750
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.301740,8567.549110,12999.542264,2,12,,9.1250
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.301740,8567.549110,12999.542264,2,12,,7.0000
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.301740,8567.549110,12999.542264,2,12,,6.1875
5,6,1,,,,,6.0,,1.0,,...,0,0,0,4870.926435,5325.196999,8874.440799,1,23,,8.5625
6,6,2,,,,,6.0,,1.0,,...,0,0,0,4870.926435,5325.196999,8874.440799,1,23,,9.5625
7,6,3,,,,,6.0,,1.0,,...,0,0,0,4870.926435,5325.196999,8874.440799,1,23,,8.3750
8,7,1,,,,,5.0,,1.0,,...,0,0,0,3409.579565,3787.539000,6911.879921,2,14,,7.5625
9,7,2,,,,,5.0,,1.0,,...,0,0,0,3409.579565,3787.539000,6911.879921,2,14,,6.6250


In [2]:
# Returns sequence of column names as Unicode strings
# Result is an Index, another Pandas data structure
df.columns

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'laborfor_i', 'religion_i', 'metro_i', 'basewgt', 'adj_mod_basewgt',
       'finalwgt', 'secu_p', 'sest', 'cmintvw', 'totalwgt_lb'],
      dtype='object', length=244)

In [3]:
# Can be treated like a list
df.columns[1]

'pregordr'

In [4]:
# Access a column from a df by using column name as a key
pregordr = df['pregordr']
# OR using dot notation
# pregordr = df.pregordr
# notation only works if column name is valid Python identifier
# so it has to begin with a letter, can't contain spaces, etc.

type(pregordr)

pandas.core.series.Series

In [5]:
# Series is another Pandas structure.
# It's like a Python list with additional features such as
# indices and corresponding values
pregordr

0        1
1        2
2        1
3        2
4        3
5        1
6        2
7        3
8        1
9        2
10       1
11       1
12       2
13       3
14       1
15       2
16       3
17       1
18       2
19       1
20       2
21       1
22       2
23       1
24       2
25       3
26       1
27       1
28       2
29       3
        ..
13563    2
13564    3
13565    1
13566    1
13567    1
13568    2
13569    1
13570    2
13571    3
13572    4
13573    1
13574    2
13575    1
13576    1
13577    2
13578    1
13579    2
13580    1
13581    2
13582    3
13583    1
13584    2
13585    1
13586    2
13587    3
13588    1
13589    2
13590    3
13591    4
13592    5
Name: pregordr, Length: 13593, dtype: int64

In [6]:
# Variable name, Series length, and data type
# int64 is a type provided by NumPy
# if it's run on 32bit maching, you might see
# int32

# Access elements of Series using integer indicies and slices
pregordr[0]

1

In [7]:
# result of slice is another Series
pregordr[2:5]

2    1
3    2
4    3
Name: pregordr, dtype: int64

## Variables

* __caseid__ integer ID of the respondent
* __prglength__ integer duration of the pregnancy in weeks
* __outcome__ integer code for the outcome of the pregnancy.  Code 1 = live birth
* __pregordr__ pregnancy serial number.  1st preg = 1, 2nd preg = 2
* __birthord__ serial number for live births 1st live birth = 1 etc.  If no live births, it's blank
* __birthwgt\_lb__ and __birthwgt\_oz__ contain pounds and ounces parts of the birth weight of the baby
* __agepreg__ age of mother at end of pregnancy
* __finalwgt__ statistical weight associated with respondant.  Floating point value indicates number of people in Us population this respondent represents

## Transformation

In [8]:
# Data cleaning

# def CleanFemPreg(df):
#     df.agepreg /= 100.0  # integer number of centiyears, converting to float in years
#     
#     na_vals = [97, 98, 99] # special codes for 97=Not Ascertained, 98=Refused, and 99=Don't Know
#     df.birthwgt_lb.replace(na_vals, np.nan, inplace=True) # Replace with NaN (Not A Number)
#     df.birthwgt_oz.replace(na_vals, np.nan, inplace=True) # Replace with NaN (Not A Number)
#     
#     df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0 # Creates new column combining total weight

### NOTE ###
# When adding new columns, MUST use dictionary syntax, not dot notation
### CORRECT ###
#    df['totalwgt_lb'] = df.birth...
### WRONG! ###
#    df.totalwgt_lb = df.birth...

In [10]:
# If either argument is NaN, numpy returns NaN
import numpy as np
np.nan / 100.0

nan

## Validation

In [13]:
# Just make sure things add up especially when exporting/importing from different environments

# NSFG codebook has tables to summarize each variable
# Table for OUTCOME, encoding outcome of each pregnancy

# value  label       Total
# 1 Live Birth       9148
# 2 Induced Abortion 1862
# 3 Stillbirth        120
# 4 Miscarriage      1921
# 5 Ectopic Pregnancy 190
# 6 Current Pregnancy 352

# Series class provides value_counts
# counts number of times each value appears
# Compare with published data

df.outcome.value_counts().sort_index()
# value_counts is a series
# sort_index sorts Series by index

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

In [16]:
# value  label    Total
# . Inapplicable  4449
# 0-5 -6 Pounds   1125
# 6 6 Pounds      2223
# 7 7 Pounds      3049
# 8 8 Pounds      1889
# 9-95 9+ Pounds   799

df.birthwgt_lb.value_counts(sort=False)

8.0     1889
7.0     3049
6.0     2223
4.0      229
5.0      697
10.0     132
12.0      10
14.0       3
3.0       98
1.0       40
2.0       53
0.0        8
9.0      623
11.0      26
13.0       3
15.0       1
Name: birthwgt_lb, dtype: int64

In [17]:
# Supposed to have a 51lb baby.  It's an error.

# expression in [] yields a Series of type bool
# where True indicates the condition is true
# When bool used as index, it selects only 
# elements that satisfy the condition

# line added to def CleanFemPreg
# df.birthwgt_lb[df.birthwgt_lb > 20] = np.nan

## Interpretation

Think on 2 levels at the same time:

1. level of Statistics

2. level of Context

In [23]:
def MakePregMap(df):
    d = defaultdict(list)
    for index, caseid in df.caseid.interitems():
        d[caseid].append(index)
    return d

# df = DataFrame with preg data
# interitems = enumerates index (row numbers) and caseid for each pregnancy
# d = dictionary that maps from each caseID to list of indices.
# 'defaultdict' is in Python 'collections' module


NameError: name 'defaultdict' is not defined

In [22]:
caseid = 10229
indices = preg_map[caseid]
df.outcome[indices].values

NameError: name 'preg_map' is not defined