# Preparing CPS data for Employment-to-Population ratio exercise

##### Goal:
- Explore CPS raw data

In [2]:
import pandas as pd
from pathlib import Path
from datetime import datetime as dt
today = dt.today().strftime("%d-%b-%y")

In [3]:
RAW_DATA_FOLDER = Path("../data/raw/")
INTERIM_DATA_FOLDER = Path("../data/interim/")
FINAL_DATA_FOLDER = Path("../data/final")
EXTERNAL_DATA_FOLDER = Path("../data/external")

In [8]:
# from realpython.com/
def tree(directory):
    print(f'+ {directory}')
    for path in sorted(directory.rglob('*')):
        depth = len(path.relative_to(directory).parts)
        spacer = '    ' * depth
        print(f'{spacer}+ {path.name}')

In [10]:
tree(RAW_DATA_FOLDER)

+ ..\data\raw
    + cps_00016.dta.gz
    + tl_2018_us_county.zip


### Load the data

This is an example of how you could load files if they are _zipped_. If your data is not zipped pandas can read it in directly.

In [41]:
with gzip.open(RAW_DATA_FOLDER / 'cps_00016.dta.gz',) as file:
    data = pd.read_stata(file)

In [42]:
data.shape

(1367425, 10)

In [43]:
data.head()

Unnamed: 0,year,month,statefip,county,wtfinl,age,sex,empstat,labforce,educ
0,2004,march,alabama,1073,2180.8989,41,female,at work,"yes, in the labor force","associate's degree, occupational/vocational pr..."
1,2004,march,alabama,1073,3795.1238,23,male,"nilf, unable to work","no, not in the labor force",high school diploma or equivalent
2,2004,march,alabama,0,3039.8985,53,female,at work,"yes, in the labor force","associate's degree, academic program"
3,2004,march,alabama,0,2891.3329,36,female,"unemployed, experienced worker","yes, in the labor force",high school diploma or equivalent
4,2004,march,alabama,0,2420.0073,17,male,"nilf, other","no, not in the labor force",grade 10


In [44]:
data['month'].unique()

[march]
Categories (1, object): [march]

In [47]:
# like checks if the string you're passing it is **in** the axis labels you want to filter
data.filter(axis = 'columns', like = 'e').head()

Unnamed: 0,year,statefip,age,sex,empstat,labforce,educ
0,2004,alabama,41,female,at work,"yes, in the labor force","associate's degree, occupational/vocational pr..."
1,2004,alabama,23,male,"nilf, unable to work","no, not in the labor force",high school diploma or equivalent
2,2004,alabama,53,female,at work,"yes, in the labor force","associate's degree, academic program"
3,2004,alabama,36,female,"unemployed, experienced worker","yes, in the labor force",high school diploma or equivalent
4,2004,alabama,17,male,"nilf, other","no, not in the labor force",grade 10


In [56]:
# regex uses re.search(arg, col). for example,  columns that end with t
data.filter(axis = 'columns', regex = 't$').head()

Unnamed: 0,empstat
0,at work
1,"nilf, unable to work"
2,at work
3,"unemployed, experienced worker"
4,"nilf, other"


In [58]:
# or start with S
data.filter(axis = 'columns', regex = '^s').head()

Unnamed: 0,statefip,sex
0,alabama,female
1,alabama,male
2,alabama,female
3,alabama,female
4,alabama,male


In [62]:
# reindexing for something useful and filtering index
reindexed_data = data.set_index('statefip')
reindexed_data.head()

Unnamed: 0_level_0,year,month,county,wtfinl,age,sex,empstat,labforce,educ
statefip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
alabama,2004,march,1073,2180.8989,41,female,at work,"yes, in the labor force","associate's degree, occupational/vocational pr..."
alabama,2004,march,1073,3795.1238,23,male,"nilf, unable to work","no, not in the labor force",high school diploma or equivalent
alabama,2004,march,0,3039.8985,53,female,at work,"yes, in the labor force","associate's degree, academic program"
alabama,2004,march,0,2891.3329,36,female,"unemployed, experienced worker","yes, in the labor force",high school diploma or equivalent
alabama,2004,march,0,2420.0073,17,male,"nilf, other","no, not in the labor force",grade 10


In [63]:
# index starts with c
reindexed_data.filter(axis = 'index', regex = '^c').head()

Unnamed: 0_level_0,year,month,county,wtfinl,age,sex,empstat,labforce,educ
statefip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
california,2004,march,6073,4307.1373,33,female,at work,"yes, in the labor force",bachelor's degree
california,2004,march,0,4858.1338,61,female,at work,"yes, in the labor force",some college but no degree
california,2004,march,6037,2895.1947,63,male,at work,"yes, in the labor force",bachelor's degree
california,2004,march,6037,2680.936,57,female,at work,"yes, in the labor force",bachelor's degree
california,2004,march,6037,2671.5139,20,female,"nilf, other","no, not in the labor force",some college but no degree


In [64]:
# index has carolina in its name
reindexed_data.filter(axis = 'index', like='carolina')

Unnamed: 0_level_0,year,month,county,wtfinl,age,sex,empstat,labforce,educ
statefip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
north carolina,2004,march,0,3015.8797,46,female,"nilf, unable to work","no, not in the labor force",grade 10
north carolina,2004,march,0,3460.4450,16,male,"nilf, other","no, not in the labor force",grade 9
north carolina,2004,march,37183,2753.9863,29,female,"unemployed, experienced worker","yes, in the labor force",bachelor's degree
north carolina,2004,march,37183,3266.7691,30,male,at work,"yes, in the labor force",bachelor's degree
north carolina,2004,march,0,2491.2422,32,female,at work,"yes, in the labor force",bachelor's degree
north carolina,2004,march,0,2723.8521,37,male,"has job, not at work last week","yes, in the labor force",high school diploma or equivalent
north carolina,2004,march,0,2752.1155,27,female,at work,"yes, in the labor force",bachelor's degree
north carolina,2004,march,0,3775.5612,24,male,at work,"yes, in the labor force",high school diploma or equivalent
north carolina,2004,march,0,2372.2875,51,female,at work,"yes, in the labor force",high school diploma or equivalent
north carolina,2004,march,37183,2496.5739,20,female,at work,"yes, in the labor force",some college but no degree
