In [1]:
####################################################################
# Prepared for Gabor's Data Analysis
#
# Data Analysis for Business, Economics, and Policy
# by Gabor Bekes and  Gabor Kezdi
# Cambridge University Press 2021
#
# gabors-data-analysis.com
#
# License: Free to share, modify and use for educational purposes.
# 	Not to be used for commercial purposes.
#
####################################################################

####################################################################
# cps-earnings dataset
#
# input:
#       morg2014.csv

# output:
#       morg-2014-emp.csv

# version 1.0   2021-05-23
####################################################################

In [2]:
### SETTING UP DIRECTORIES

# import packages
import pandas as pd
import os
import numpy as np

# Project root that holds the raw/ and clean/ data folders.
# The default is the original author's local checkout; set the
# CPS_EARNINGS_DIR environment variable to run this notebook from another
# location without editing the cell.
base_dir = os.environ.get(
    "CPS_EARNINGS_DIR",
    "C:\\Users\\77774\\Documents\\GitHub\\Prediction_with_ML_for_Economists\\Assignment_1_cps_earnings_dataset",
)

# set working directory to the project root
os.chdir(base_dir)

# location folders -- kept as plain strings ending in a path separator,
# because file names are appended to them with "+" when reading and writing
data_in = os.path.join(base_dir, "raw", "")
data_out = os.path.join(base_dir, "clean", "")

In [3]:
# Read the raw MORG 2014 file. Every column is loaded as text
# (dtype="unicode") to avoid size and memory warnings while parsing.

df = pd.read_csv(
    filepath_or_buffer=data_in + "morg2014.csv",
    dtype="unicode",
    encoding="utf-8",
    delimiter=",",
    quotechar='"',
)

In [4]:
# Keep only the columns listed below (in this order); everything else
# from the raw file is dropped.
columns_to_keep = [
    "lfsr94",    # employment status
    "hhid",      # household id
    "lineno",
    "intmonth",  # int_month
    "stfips",    # state
    "weight",
    "earnwke",   # weekly earnings
    "uhourse",   # usual work hours
    "grade92",   # highest educ
    "race",      # race
    "ethnic",    # ethnicity
    "age",       # age
    "sex",       # gender
    "marital",
    "ownchild",
    "chldpres",
    "prcitshp",
    "state",
    "ind02",
    "occ2012",   # occupational code
    "class94",
    "unionmme",
    "unioncov",
]
df = df[columns_to_keep]

In [5]:
# Renumber the rows 0..n-1, then shorten two column names:
# class94 -> class and uhourse -> uhours.

df = df.reset_index(drop=True).rename(
    columns={"uhourse": "uhours", "class94": "class"}
)

In [6]:
# Turn the text columns used for filtering into numbers. Anything that
# cannot be parsed becomes missing (errors="coerce").

# age: nullable integer; missing values stay missing
df["age"] = pd.to_numeric(df["age"], errors="coerce").astype("Int64")

# weekly earnings: missing values are replaced by 0
df["earnwke"] = pd.to_numeric(df["earnwke"], errors="coerce").fillna(0)

# usual work hours: nullable integer; missing values are replaced by 0
df["uhours"] = (
    pd.to_numeric(df["uhours"], errors="coerce").astype("Int64").fillna(0)
)

In [7]:
# Restrict the sample step by step.

# 1) ages 16 to 64, both ends included
df = df[df["age"].between(16, 64)]

# 2) employed respondents only (at work or absent)
df = df[df["lfsr94"].isin(["Employed-At Work", "Employed-Absent"])]

# 3) remove rows where weekly earnings or usual hours are 0
#    (0 also stands for values that were missing before the fill)
df = df[(df["earnwke"] != 0) & (df["uhours"] != 0)]

# renumber the remaining rows 0..n-1
df = df.reset_index(drop=True)

In [8]:
# Write the filtered sample to the output folder; index=False keeps the
# row index out of the CSV.

df.to_csv(path_or_buf=data_out + "morg-2014-emp.csv", index=False)

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the filtered sample.
df.describe()

Unnamed: 0,earnwke,uhours,age
count,149316.0,149316.0,149316.0
mean,888.834343,38.939819,40.70681
std,643.737322,10.25928,12.776095
min,0.01,1.0,16.0
25%,430.0,40.0,30.0
50%,719.6,40.0,41.0
75%,1153.84,40.0,52.0
max,2884.61,99.0,64.0


In [10]:
# Display the filtered frame; the rendered output is truncated to the
# first and last rows.
df

Unnamed: 0,lfsr94,hhid,lineno,intmonth,stfips,weight,earnwke,uhours,grade92,race,...,marital,ownchild,chldpres,prcitshp,state,ind02,occ2012,class,unionmme,unioncov
0,Employed-At Work,002600310997690,3,January,AL,3151.6801,1692.00,40,43,1,...,7,0,0,"Native, Born In US",63,Employment services (5613),630,"Private, For Profit",No,No
1,Employed-Absent,075680310997590,1,January,AL,3457.1138,450.00,40,41,2,...,1,2,6,"Native, Born In US",63,Outpatient care centers (6214),5400,"Private, For Profit",No,No
2,Employed-At Work,075680310997590,2,January,AL,3936.911,1090.00,60,41,2,...,1,2,6,"Native, Born In US",63,Motor vehicles and motor vehicle equipment man...,8140,"Private, For Profit",No,No
3,Employed-At Work,179140131100930,1,January,AL,3288.364,769.23,40,40,1,...,1,2,4,"Native, Born In US",63,"**Publishing, except newspapers and software (...",8255,"Private, For Profit",Yes,
4,Employed-At Work,179140131100930,2,January,AL,3422.85,826.92,40,43,1,...,1,2,4,"Native, Born In US",63,"Banking and related activities (521, 52211,52219)",5940,"Private, For Profit",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149311,Employed-At Work,896679860459501,2,December,WY,346.2296,692.30,40,39,1,...,6,0,0,"Native, Born In US",8,Office supplies and stationery stores (45321),4760,"Private, For Profit",No,No
149312,Employed-At Work,907086820569600,1,December,WY,294.98,1984.61,40,44,1,...,1,1,3,"Native, Born In US",8,Administration of human resource programs (923),430,Government - State,No,No
149313,Employed-At Work,907086820569600,2,December,WY,324.1761,2884.61,55,43,1,...,1,1,3,"Native, Born In US",8,Nursing care facilities (6231),10,"Private, For Profit",No,No
149314,Employed-At Work,950868097156649,1,December,WY,321.6982,1153.84,40,42,1,...,1,0,0,"Native, Born In US",8,Hospitals (622),5820,"Private, Nonprofit",No,No
