In [1]:
import numpy as np
import pandas as pd

# Load the raw extract (cps_00054.csv) from the input directory into `df`.
df = pd.read_csv("../input/cps_00054.csv")

FileNotFoundError: [Errno 2] File b'../input/cps_00054.csv' does not exist: b'../input/cps_00054.csv'

In [None]:
# Sample Selection - HPV Sample C
# --------------------------------

# Survey questions ask about 'last year', so shift the recorded year back by one.
# NOTE: this overwrites YEAR in place; re-running the cell shifts it again.
df["YEAR"] = df["YEAR"] - 1

# Apply the sample restrictions as one chain, in the same order as before.
df = (
    df
    # Prime age: keep ages 25 through 60 (both ends included).
    .loc[lambda d: d["AGE"].between(25, 60)]
    # Employed at firm: keep CLASSWLY == 22 (drops self-employed, government workers).
    .loc[lambda d: d["CLASSWLY"].eq(22)]
    # Drop rows with a missing value in any of the variables used downstream.
    .dropna(subset=["OCC90LY", "IND90LY", "AGE", 'CLASSWLY', 'RACE', 'INCWAGE', 'WKSWORK1', 'UHRSWORKLY', 'EDUC'])
    # Drop occupation codes 991, 999 and 995 (NIU, unemployed).
    .loc[lambda d: ~d["OCC90LY"].isin([991, 999, 995])]
    # Topcoded income: keep INCWAGE strictly below 99999998.
    .loc[lambda d: d["INCWAGE"].lt(99999998)]
)

In [None]:
# Variables
# --------------------------------

# Sex
# FEMALE recodes SEX 1 -> 0 and 2 -> 1; MALE recodes SEX 2 -> 0 and leaves 1 as is.
# Any other SEX value passes through unchanged in both columns.
df["FEMALE"] = df["SEX"].replace({1: 0, 2: 1})
df["MALE"] = df["SEX"].replace({2: 0})

# Big firm has 1000+ employees (=9, 500-999 = 8)
# Indicator is 1 when FIRMSIZE == 9 and 0 for every other code and for missing
# values. A single vectorised comparison replaces the per-code replace loop and
# the `df['big_firm'].fillna(0, inplace=True)` call, which mutates a selected
# column in place and does not write back to `df` under pandas Copy-on-Write.
df["big_firm"] = df["FIRMSIZE"].eq(9).astype("int64")

# Race
# Collapse RACE to a 0/1 indicator: 1 when the code is 100, 0 for anything else.
# One vectorised comparison replaces the loop that issued a full-column replace
# per distinct code and was only correct when no raw code equalled 0 or 1.
# NOTE: RACE is overwritten, so re-running this cell on the recoded column
# yields all zeros (the raw code 100 is no longer present).
df["RACE"] = df["RACE"].eq(100).astype("int64")
        
        
# Occupation
# 2-digit groups: left-closed bins [lo, hi) over OCC90LY; each bin gets an integer label.
df["2digit occupation"] = pd.cut(
    df["OCC90LY"],
    bins=[0, 43, 203, 213, 290, 303, 405, 415, 434, 473, 479, 503, 558, 703, 803, 823, 905],
    labels=[3, 43, 203, 213, 290, 303, 405, 415, 434, 473, 479, 503, 558, 703, 803, 823],
    right=False,
)
# 1-digit groups: pd.cut's default right-closed bins (lo, hi].
# NOTE(review): the 2-digit bins above are left-closed while these are
# right-closed; confirm the difference is intended.
df["1digit occupation"] = pd.cut(
    df["OCC90LY"],
    bins=[0, 200, 400, 470, 500, 700, 900, 998],
    labels=[2, 3, 4, 5, 1, 7, 6],
)

# Industry
# 1-digit groups over IND90LY, again with the default right-closed bins.
df["1digit industry"] = pd.cut(
    df["IND90LY"],
    bins=[0, 200, 391, 473, 498, 890, 1000],
    labels=[2, 3, 4, 5, 1, 6],
)

# Education
# Lookup table read from Stata; its `educ` key is renamed to match df's EDUC.
kf = pd.read_stata("../input/educ_years_school.dta")
educ_df = kf.rename(columns={"educ": "EDUC"})

# Drop a years_school column left over from an earlier run of this cell so the
# merge cannot produce suffixed duplicates (years_school_x / years_school_y).
df = df.drop(columns=["years_school"], errors="ignore")

# Merge in years of schooling. An inner join keeps exactly the person rows
# whose EDUC code appears in the lookup. The previous how='right' also added
# an all-NaN person row for every lookup code absent from the sample and
# upcast integer columns to float as a side effect.
df = pd.merge(df, educ_df, on="EDUC", how="inner")

# Experience
# exp = AGE - max(years_school, 12) - 6, plus its square.
df["exp"] = df["AGE"].sub(np.maximum(df["years_school"], 12)).sub(6)
df["exp2"] = df["exp"].pow(2)
# Keep only rows with non-negative experience (rows where exp is NaN are dropped too).
df = df.loc[df["exp"].ge(0)]

# Hours
# Annual hours = weeks worked last year * usual hours per week last year.
df["annual_hours"] = df["WKSWORK1"] * df["UHRSWORKLY"]

# Drop if worked less than 260 hours in the year. The previous code assigned
# the filtered DataFrame to the `annual_hours` column instead of filtering rows.
df = df[df["annual_hours"] >= 260]

# Drop if report income, but no hours. After the 260-hour filter no row can
# have zero hours, so this is kept only as an explicit guard.
reports_income_without_hours = (df["INCWAGE"] > 0) & (df["annual_hours"] == 0)
# .copy() gives later cells an independent frame to add columns to.
df = df[~reports_income_without_hours].copy()


# Earnings
# inc_self is fixed at 0 here (the OINCBUS + OINCFARM sum is not used), so
# earnings = 2/3 * business income + labor income reduces to INCWAGE as a float.
df = df.assign(inc_self=0).assign(
    earnings=lambda d: (2 / 3) * d["inc_self"] + d["INCWAGE"]
)

In [2]:
# Adjust for inflation
# --------------------------------

# Both lookup tables are joined on a lowercase `year` column.
df = df.rename(columns={"YEAR": "year"})

# Merge in cpi data. Inner joins keep only person rows whose year is present in
# the lookup. The previous how='right' also added an all-NaN person row for
# every lookup year absent from the sample; those rows only disappeared because
# the wage filter at the end of this cell drops NaN wages.
cpi_df = pd.read_csv("../output/revised_CPI.csv")
df = pd.merge(df, cpi_df, on="year", how="inner")

# Merge in fedminwage data
fedminwage_df = pd.read_csv("../input/fedminwage.csv")
df = pd.merge(df, fedminwage_df, on="year", how="inner")

# Normalize to 2000-dollars (real_cpis is cpi / cpi[2000])
df["earnings"] = df["earnings"] / df["real_cpis"]
df["fedminwage"] = df["fedminwage"] / df["real_cpis"]

# Compute implied hourly wage
df["Wage"] = df["earnings"] / df["annual_hours"]

# Drop if wage is less than 1/2 fedminwage.
# Equivalent to df[df["Wage"] >= 0.5 * df["fedminwage"]]; rows with NaN wage are dropped too.
df = df.query("fedminwage * .5 <= Wage")

NameError: name 'df' is not defined

In [5]:
# Clean up
# --------------------------------
df = df.rename(columns={"OCC90LY": "3digit occupation", "RACE": "White"})

# Normalise header casing: first character upper-case, the rest lower-case
# (e.g. "ASECWTH" -> "Asecwth", "big_firm" -> "Big_firm").
df = df.rename(columns=str.capitalize)

columnsTitles = ['Year', '3digit occupation', '2digit occupation', '1digit occupation', 'Age', 'Asecwth', 'Exp','Male', 'Female','White', 'Exp2','1digit industry', 'Big_firm', "Wage"]

# Select and order the output columns. .loc raises a KeyError naming any missing
# column, whereas reindex would silently create an all-NaN column that only
# fails later in the integer cast.
df = df.loc[:, columnsTitles].reset_index(drop=True)

# Every column except the trailing Wage is stored as int64.
df = df.astype({col: "int64" for col in columnsTitles[:-1]})

# Wage is a float rounded to 2 decimals. Assigning to df["Wage"] rather than
# df[["Wage"]] avoids the "Columns must be same length as key" error that
# recent pandas raises when a Series is assigned to a list-of-columns key.
df["Wage"] = df["Wage"].astype("float64").round(2)

# Save
df.to_csv("../output/current_values.csv")

In [6]:
# Preview the first rows of `df` as a quick visual check.
df.head()

Unnamed: 0,Year,3digit occupation,2digit occupation,1digit occupation,Age,Asecwth,Exp,Male,Female,White,Exp2,1digit industry,Big_firm,Wage
0,1975,628,558,1,60,1729,42,1,0,1,1764,3,0,18.04
1,1975,785,703,7,45,1516,27,1,0,1,729,3,0,19.02
2,1975,453,434,4,27,1524,9,1,0,1,81,1,0,18.04
3,1975,796,703,7,49,1099,31,0,1,0,961,2,0,6.56
4,1975,779,703,7,49,1558,31,0,1,0,961,3,0,11.5
