In [105]:
import numpy as np
import pandas as pd

#after testing replace StringIO(temp) to filename
df = pd.read_csv("../input/cps_00054.csv")

In [95]:
# Sample Selection - HPV Sample C
# --------------------------------

df["YEAR"] = df["year".upper()] - 1 # survey questions ask about 'last year'

# Prime age
df = df[df["AGE"]>=25] # Drop if if age < 25
df = df[df["AGE"]<=60] # drop if age > 60

# Employed at firm
df = df[ (df['CLASSWLY'] == 22) ]  # drop self-employed, government workers
     
# Drop if missing values    
df = df.dropna(subset=["OCCLY", "IND90LY", "AGE", 'CLASSWLY', 'RACE', 'INCWAGE', 'WKSWORK1', 'UHRSWORKLY', 'EDUC', "YEAR"])
df = df.rename(columns={"OCC90LY":"original"})
df = df[df["INCWAGE"] < 99999998] # Topcoded income


In [84]:
for x in df[df["YEAR"]<=2004]["OCCLY"].unique():
    if str(x)[-3]==0:
        df['OCCLY'] = df["OCCLY"].replace(x,int(str(x)[:-3]))

In [108]:

        
df['YEAR_BINS'] = pd.cut(x=df['YEAR'], bins=[1979, 1981, 1991, 2002, 2020], labels=[1970, 1980, 1990, 2000])

df = df.dropna(subset=["YEAR_BINS"])
codes_by_year = pd.DataFrame()
for x in df["YEAR_BINS"].sort_values().unique():
    hf = pd.read_stata("../../../3_Notes/deming_xwalk_occ/occ"+str(x)+"_occ1990dd.dta")
    hf = hf.rename(columns={"occ1990dd":"OCC90LY", "occ":"OCCLY", "occ70":"OCCLY", "occ2010":"OCCLY"})
    hf["YEAR_BINS"] = x
    codes_by_year = codes_by_year.append(hf).reset_index(drop = True)

df = pd.merge(df,codes_by_year, on=['OCCLY','YEAR_BINS'], how='outer', validate="m:1")




In [111]:
df["OCC90LY"].value_counts()
len(df["OCC90LY"].isnull())

9424429

In [97]:
df['OCCLY'] = np.where((df.OCCLY == 0) & (df.YEAR <=1981), np.nan,df.OCC90LY)

df['OCCLY'] = np.where((df.OCCLY == 590) & (df.YEAR <=1981), "Military",df.OCC90LY)

df['OCCLY'] = np.where((df.OCCLY == 995) & (df.YEAR <=1981), "Did not work",df.OCC90LY)

In [98]:
# Variables
# --------------------------------

# Sex
df["FEMALE"] = df["SEX"].replace(1, 0)
df["FEMALE"] = df["FEMALE"].replace(2, 1)
df["MALE"] = df["SEX"].replace(2  , 0)

# Big firm has 1000+ employees (=9, 500-999 = 8)
df["big_firm"]  = df["firmsize".upper()]
for x in range(9):
    df["big_firm"] = df["big_firm"].replace(x,0)

df["big_firm"] = df["big_firm"].replace(9,1)
df['big_firm'].fillna(0, inplace =True)

# Race 
for x in df["RACE"].unique():
    if x == 100:
        df["RACE"] = df["RACE"].replace(x,1)
    else:
        df["RACE"] = df["RACE"].replace(x,0)
        
        

# Occupation
df['2digit occupation'] = pd.cut(x=df['OCC90LY'], bins=[0,22, 37,200, 235,283,389,408,427,444,447,455,472,498,549,599,617,699,799,890], labels=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19], right=False)
df["1digit occupation"] = pd.cut(x=df["OCC90LY"], bins=[0,199,389,472,498,699,889], labels=[1,2,3,4,5,6])

# Industry
df["1digit industry"]   = pd.cut(x=df["IND90LY"], bins=[0,200,391, 473, 498, 890,1000], labels=[2,3,4,5,1,6])

# Education 
kf = pd.read_stata("../input/educ_years_school.dta")

if "years_school" in df:
    df = df.drop(columns=["years_school"])
    
educ_df = kf.rename(columns={"educ":"EDUC"})
df = pd.merge(df, educ_df, on='EDUC', how='right') # merge in years of schooling
df = df.rename(columns={"years_school_y":"years_school"})

# Experience
df["exp"]  = (df["AGE"] - np.maximum(df["years_school"], 12)) - 6
df["exp2"] = (df["exp"] ** 2 )
df = df[df["exp"] >= 0] 

# Hours
df["annual_hours"] = df["WKSWORK1"] * df["UHRSWORKLY"] 
df["annual_hours"] = df[df["annual_hours"]>=260] # drop if worked less than a month of 8hr days

# Drop if report income, but no hours.
indexNames = df[(df['INCWAGE'] > 0) & (df['annual_hours'] == 0)].index
df.drop(indexNames , inplace=True)


# Earnings
df['inc_self'] = 0 # df['OINCBUS'] + df['OINCFARM']    
df['earnings'] = (2/3) * df['inc_self'] + df['INCWAGE'] # 2/3 business income + labor income

In [99]:
# Adjust for inflation
# --------------------------------

df = df.rename(columns={"YEAR":"year"})

#Merge in cpi data
cpi_df = pd.read_csv("../output/revised_CPI.csv") 
df     = pd.merge(df, cpi_df, on='year', how='right')

# Merge in fedminwage data
fedminwage_df = pd.read_csv("../input/fedminwage.csv") 
df = pd.merge(df, fedminwage_df, on='year', how='right') # real_cpis is cpi / cpi[2000]

# Normalize to 2000-dollars
df["earnings"]   = df["earnings"] / df["real_cpis"]
df["fedminwage"] = df["fedminwage"] / df["real_cpis"]

# Compute implied hourly wage
df["Wage"] = df["earnings"] / df["annual_hours"]

# Drop if wage is less than 1/2 fedminwage
df = df.query("fedminwage * .5 <= Wage")# same thing: df[(df['Wage'] > df["fedminwage"])]

In [75]:
# Clean up
# --------------------------------
df = df.rename(columns={'occ90ly'.upper(): "3digit occupation", "RACE":"White"})

k = {}
for x in df.columns:
    k[x] = x.capitalize()
df = df.rename(columns=k)
columnsTitles = ['Year', '3digit occupation', '2digit occupation', '1digit occupation', 'Age', 'Asecwt', 'Exp','Male', 'Female','White', 'Exp2','1digit industry', 'Big_firm', "Wage"]
df = df.reindex(columns=columnsTitles)
df = df.reset_index(drop=True)

df[df.columns[:-1]] = df[df.columns[:-1]].astype('int64')
df[["Wage"]] = df["Wage"].astype('float64')
df["Wage"]   = df["Wage"].round(2)

# Save
df.to_csv("../output/current_values.csv")

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [103]:
#k = df[df['OCC90LY'].isnull()].groupby("YEAR_BINS")
#original is the original occ1990 codes from datatframe, when first imported



df[["year", "OCCLY", "OCC90LY", "original"]].head()


Unnamed: 0,year,OCCLY,OCC90LY,original
0,1980.0,405.0,405.0,405.0
1,1980.0,889.0,889.0,889.0
2,1980.0,479.0,479.0,479.0
3,1980.0,453.0,453.0,453.0
4,1980.0,745.0,745.0,745.0


In [78]:
df








Unnamed: 0,Year,3digit occupation,2digit occupation,1digit occupation,Age,Asecwt,Exp,Male,Female,White,Exp2,1digit industry,Big_firm,Wage
0,1980.0,405.0,7,3,53.0,412.87,35.0,0.0,1.0,1.0,1225.0,1,0.0,7.02661
1,1980.0,889.0,19,6,35.0,489.84,17.0,1.0,0.0,1.0,289.0,4,0.0,20.6637
2,1980.0,479.0,13,4,53.0,481.40,35.0,1.0,0.0,1.0,1225.0,2,0.0,7.97219
3,1980.0,453.0,11,3,45.0,481.40,27.0,0.0,1.0,1.0,729.0,1,0.0,6.53172
4,1980.0,745.0,18,6,46.0,486.34,28.0,0.0,1.0,1.0,784.0,3,0.0,6.46919
5,1980.0,738.0,18,6,34.0,228.67,16.0,1.0,0.0,1.0,256.0,2,0.0,4.85189
6,1980.0,849.0,19,6,53.0,444.27,35.0,1.0,0.0,1.0,1225.0,3,0.0,6.68483
7,1980.0,745.0,18,6,52.0,406.00,34.0,0.0,1.0,1.0,1156.0,3,0.0,7.33174
8,1980.0,779.0,18,6,57.0,421.84,39.0,1.0,0.0,1.0,1521.0,3,0.0,14.3228
9,1980.0,779.0,18,6,48.0,429.01,30.0,1.0,0.0,1.0,900.0,2,0.0,30.1895


In [10]:
for x in df[df["year"]<=2004]["OCCLY"].unique():
    if str(x)[-3]=="0": 
        df['OCCLY'] = df["OCCLY"].replace(x,int(str(x)[:-3]))

In [368]:
df[df["OCCLY"] == 590]

Unnamed: 0,year,SERIAL,MONTH,HWTFINL,CPSID,ASECFLAG,HFLAG,ASECWTH,PERNUM,WTFINL,...,exp,exp2,annual_hours,inc_self,earnings,Unnamed: 0_x,real_cpis,Unnamed: 0_y,fedminwage,Wage
