In [1]:
import numpy as np
import pandas as pd

#after testing replace StringIO(temp) to filename
df = pd.read_csv("../../input/cps_00054.csv")

In [2]:
# Sample Selection - HPV Sample C
# --------------------------------

df["YEAR"] = df["year".upper()] - 1 # survey questions ask about 'last year'

# Prime age
df = df[df["AGE"]>=25] # Drop if if age < 25
df = df[df["AGE"]<=60] # drop if age > 60

# Employed at firm
df = df[ (df['CLASSWLY'] == 22) ]  # drop self-employed, government workers
     
# Drop if missing values    
df = df.dropna(subset=["OCCLY", "IND90LY", "AGE", 'CLASSWLY', 'RACE', 'INCWAGE', 'WKSWORK1', 'UHRSWORKLY', 'EDUC', "YEAR"])
df = df.rename(columns={"OCC90LY":"original"})
df = df[df["INCWAGE"] < 99999998] # Topcoded income


In [3]:

        
df['YEAR_BINS'] = pd.cut(x=df['YEAR'], bins=[1975, 1981, 1991, 2003, 2010], labels=[1970, 1980, 1990, 2000])

df = df.dropna(subset=["YEAR_BINS"])
codes_by_year = pd.DataFrame()
for x in df["YEAR_BINS"].sort_values().unique():
    hf = pd.read_stata("../../../../3_Notes/deming_xwalk_occ/occ"+str(x)+"_occ1990dd.dta")
    hf = hf.rename(columns={"occ1990dd":"OCC90LY", "occ":"OCCLY", "occ70":"OCCLY", "occ2010":"OCCLY"})
    if x==2000:
        hf['OCCLY'] = hf['OCCLY'] * 10
    hf["YEAR_BINS"] = x
    codes_by_year = codes_by_year.append(hf).reset_index(drop = True)

df = pd.merge(df,codes_by_year, on=['OCCLY','YEAR_BINS'], how='left', validate="m:1")




In [4]:

df=df[df["OCC90LY"].notnull()]

In [5]:
# Variables
# --------------------------------

# Sex
df["FEMALE"] = df["SEX"].replace(1, 0)
df["FEMALE"] = df["FEMALE"].replace(2, 1)
df["MALE"] = df["SEX"].replace(2  , 0)

# Big firm has 1000+ employees (=9, 500-999 = 8)
df["big_firm"]  = df["firmsize".upper()]
for x in range(9):
    df["big_firm"] = df["big_firm"].replace(x,0)

df["big_firm"] = df["big_firm"].replace(9,1)
df['big_firm'].fillna(0, inplace =True)

# Race 
for x in df["RACE"].unique():
    if x == 100:
        df["RACE"] = df["RACE"].replace(x,1)
    else:
        df["RACE"] = df["RACE"].replace(x,0)
        
        

# Occupation
df['2digit occupation'] = pd.cut(x=df['OCC90LY'], bins=[0,22, 37,200, 235,283,389,408,427,444,447,455,472,498,549,599,617,699,799,890], labels=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19], right=False)
df["1digit occupation"] = pd.cut(x=df["OCC90LY"], bins=[0,199,389,472,498,699,889], labels=[1,2,3,4,5,6])

# Industry
df["1digit industry"]   = pd.cut(x=df["IND90LY"], bins=[0,200,391, 473, 498, 890,1000], labels=[2,3,4,5,1,6])

# Education 
kf = pd.read_stata("../../input/educ_years_school.dta")

if "years_school" in df:
    df = df.drop(columns=["years_school"])
    
educ_df = kf.rename(columns={"educ":"EDUC"})
df = pd.merge(df, educ_df, on='EDUC', how='right') # merge in years of schooling
df = df.rename(columns={"years_school_y":"years_school"})

# Experience
df["exp"]  = (df["AGE"] - np.maximum(df["years_school"], 12)) - 6
df["exp2"] = (df["exp"] ** 2 )
df = df[df["exp"] >= 0] 

# Hours
df["annual_hours"] = df["WKSWORK1"] * df["UHRSWORKLY"] 
df["annual_hours"] = df[df["annual_hours"]>=260] # drop if worked less than a month of 8hr days

# Drop if report income, but no hours.
indexNames = df[(df['INCWAGE'] > 0) & (df['annual_hours'] == 0)].index
df.drop(indexNames , inplace=True)


# Earnings
df['inc_self'] = 0 # df['OINCBUS'] + df['OINCFARM']    
df['earnings'] = (2/3) * df['inc_self'] + df['INCWAGE'] # 2/3 business income + labor income

In [6]:
# Adjust for inflation
# --------------------------------

df = df.rename(columns={"YEAR":"year"})

#Merge in cpi data
cpi_df = pd.read_csv("../../output/revised_CPI.csv") 
df     = pd.merge(df, cpi_df, on='year', how='right')

# Merge in fedminwage data
fedminwage_df = pd.read_csv("../../input/fedminwage.csv") 
df = pd.merge(df, fedminwage_df, on='year', how='right') # real_cpis is cpi / cpi[2000]

# Normalize to 2000-dollars
df["earnings"]   = df["earnings"] / df["real_cpis"]
df["fedminwage"] = df["fedminwage"] / df["real_cpis"]

# Compute implied hourly wage
df["Wage"] = df["earnings"] / df["annual_hours"]

# Drop if wage is less than 1/2 fedminwage
df = df.query("fedminwage * .5 <= Wage")# same thing: df[(df['Wage'] > df["fedminwage"])]

In [7]:
# Clean up
# --------------------------------
df = df.rename(columns={'occ90ly'.upper(): "3digit occupation", "RACE":"White"})

k = {}
for x in df.columns:
    k[x] = x.capitalize()
df = df.rename(columns=k)
columnsTitles = ['Year', '3digit occupation', '2digit occupation', '1digit occupation', 'Age', 'Asecwt', 'Exp','Male', 'Female','White', 'Exp2','1digit industry', 'Big_firm', "Wage"]
df = df.reindex(columns=columnsTitles)
df = df.reset_index(drop=True)

df[df.columns[:-1]] = df[df.columns[:-1]].astype('int64')
df[["Wage"]] = df["Wage"].astype('float64')
df["Wage"]   = df["Wage"].round(2)

# Save
df.to_csv("../output/current_values2.csv")

FileNotFoundError: [Errno 2] No such file or directory: '../output/current_values2.csv'

In [16]:
#k = df[df['OCC90LY'].isnull()].groupby("YEAR_BINS")
#original is the original occ1990 codes from datatframe, when first imported



df["Year"].value_counts()

2000    61555
2001    61308
2006    57456
2007    57402
2005    57389
2008    57352
2004    57316
2009    55509
2010    47262
1990    41261
1989    40761
1992    39739
1994    39252
1987    38821
1980    38450
1993    38423
1986    38031
1979    37843
1985    37765
1999    37664
1991    37624
1984    37165
1998    36966
1988    36515
1997    36417
1996    35858
1983    35337
1982    35172
1981    35111
1995    34968
1978    31926
1977    31115
Name: Year, dtype: int64

In [18]:
len(df)








1364733