In [16]:
import numpy as np
import pandas as pd

# Read the CPS extract (the file carrying the occ1990dd occupation codes)
cps_extract_path = "../input/cps_00054_occ1990dd.csv"
df = pd.read_csv(cps_extract_path)

In [20]:
# Upper-case every column label so the cells below can rely on one spelling
df.columns = list(map(str.upper, df.columns))
df.head()

Unnamed: 0,YEAR,SERIAL,MONTH,HWTFINL,CPSID,ASECFLAG,HFLAG,ASECWTH,PERNUM,WTFINL,...,CLASSWLY,WKSWORK1,UHRSWORKLY,FULLPART,FIRMSIZE,INCWAGE,OINCBUS,OINCFARM,PAIDGH,OCC1990DD
0,1969,19629,3,,,,,1191.36,1,,...,22,,,1,,12500,,,,
1,1980,28185,3,,,1.0,,749.29999,1,,...,22,52.0,40.0,1,,20140,,,20.0,386.0
2,1988,57267,3,,,1.0,,2070.1001,1,,...,22,52.0,45.0,1,9.0,34000,0.0,0.0,21.0,48.0
3,1983,54658,3,,,1.0,,247.96001,2,,...,28,7.0,40.0,1,,1450,,,0.0,379.0
4,1983,8424,3,,,1.0,,1795.78,2,,...,22,1.0,37.0,1,,115,,,0.0,277.0


In [21]:
# Sample Selection - HPV Sample C
# --------------------------------

# The income questions ask about 'last year', so move both the survey year
# and the respondent's age back by one.
# NOTE: this shifts the columns in place, so re-running the cell shifts them again.
df["YEAR"] = df["YEAR"].sub(1)
df["AGE"] = df["AGE"].sub(1)

# Prime age: keep ages 25 through 60, both ends included
df = df[df["AGE"].between(25, 60)]

# Employed at firm: keep CLASSWLY == 22 only
# (per the original note this drops self-employed and government workers)
df = df[df["CLASSWLY"].eq(22)]

# Drop rows with a missing value in any field the analysis needs
required_fields = [
    "OCCLY", "IND90LY", "AGE", 'CLASSWLY', 'RACE', 'INCWAGE',
    'WKSWORK1', 'UHRSWORKLY', 'EDUC', "YEAR", "OCC1990DD",
]
df = df.dropna(subset=required_fields)

# NOTE(review): the column renamed here is "OCC90LY" while the missing-value
# check above uses "OCCLY" - confirm which name the extract actually has.
df = df.rename(columns={"OCC90LY": "original"})

# Topcoded income: keep INCWAGE strictly below 99999998
df = df[df["INCWAGE"].lt(99999998)]

In [22]:
# Give the occupation-code column its lower-case name used by the cells below
df = df.rename(columns={"OCC1990DD": "occ1990dd"})
df.head()

Unnamed: 0,YEAR,SERIAL,MONTH,HWTFINL,CPSID,ASECFLAG,HFLAG,ASECWTH,PERNUM,WTFINL,...,CLASSWLY,WKSWORK1,UHRSWORKLY,FULLPART,FIRMSIZE,INCWAGE,OINCBUS,OINCFARM,PAIDGH,occ1990dd
1,1979,28185,3,,,1.0,,749.29999,1,,...,22,52.0,40.0,1,,20140,,,20.0,386.0
2,1987,57267,3,,,1.0,,2070.1001,1,,...,22,52.0,45.0,1,9.0,34000,0.0,0.0,21.0,48.0
5,1993,55317,3,,19931200000000.0,1.0,,1166.05,2,,...,22,52.0,40.0,1,9.0,35000,0.0,0.0,22.0,744.0
7,2000,83084,3,,0.0,1.0,,538.63,2,,...,22,52.0,45.0,1,9.0,80000,0.0,0.0,21.0,585.0
9,1996,221,3,,19970200000000.0,1.0,,1025.47,1,,...,22,52.0,44.0,1,5.0,16000,0.0,0.0,22.0,783.0


In [23]:
# Variables
# --------------------------------

# Sex indicators built from SEX:
#   FEMALE: code 1 -> 0, then code 2 -> 1
#   MALE:   code 2 -> 0 (code 1 is left as 1)
sex_codes = df["SEX"]
df["FEMALE"] = sex_codes.replace(1, 0).replace(2, 1)
df["MALE"] = sex_codes.replace(2, 0)

# Big firm has 1000+ employees (=9, 500-999 = 8)
# Recode FIRMSIZE into a 0/1 flag: codes 0-8 -> 0, code 9 -> 1, missing -> 0.
firm_size_recode = {code: 0 for code in range(9)}
firm_size_recode[9] = 1

# Assign the filled result back to the column. Calling
# df['big_firm'].fillna(..., inplace=True) acts on a temporary selection and
# does not update df when pandas Copy-on-Write is active, which would leave
# NaNs behind for the integer cast done during clean-up.
df["big_firm"] = df["FIRMSIZE"].replace(firm_size_recode).fillna(0)

# Race
# Turn RACE into a 0/1 indicator: 1 where the code is 100, 0 for any other
# code (missing values also become 0, as before). A single vectorised
# comparison replaces the per-code replace() loop, whose result depended on
# the order in which codes were visited (a raw code of 1 processed after 100
# would have reset the freshly written 1s to 0).
df["RACE"] = df["RACE"].eq(100).astype("int64")
        
        

# Occupation
# 2-digit groups: left-closed bins [edge, next_edge) over occ1990dd, labelled 1..19
occ2_edges = [0, 22, 37, 200, 235, 283, 389, 408, 427, 444, 447, 455, 472, 498, 549, 599, 617, 699, 799, 890]
occ2_labels = list(range(1, 20))
df['2digit occupation'] = pd.cut(x=df['occ1990dd'], bins=occ2_edges, labels=occ2_labels, right=False)

# 1-digit groups: pd.cut's default right-closed bins (edge, next_edge], labelled 1..6
# NOTE(review): interval closure differs from the 2-digit grouping above
# (e.g. code 389 ends the second 1-digit group but starts a new 2-digit group) - confirm intended.
occ1_edges = [0, 199, 389, 472, 498, 699, 889]
occ1_labels = list(range(1, 7))
df["1digit occupation"] = pd.cut(x=df["occ1990dd"], bins=occ1_edges, labels=occ1_labels)

# Industry
# Right-closed bins over IND90LY; the labels are deliberately kept in the original order
ind1_edges = [0, 200, 391, 473, 498, 890, 1000]
ind1_labels = [2, 3, 4, 5, 1, 6]
df["1digit industry"] = pd.cut(x=df["IND90LY"], bins=ind1_edges, labels=ind1_labels)

# Education
# Lookup table used to attach years of schooling to each education code
kf = pd.read_stata("../input/educ_years_school.dta")

# Remove an existing years_school column first so the merge below does not
# produce _x/_y suffixed duplicates when this cell is run again.
if "years_school" in df:
    df = df.drop(columns=["years_school"])

educ_df = kf.rename(columns={"educ": "EDUC"})

# Inner join: keep only respondents whose EDUC code has an entry in the lookup.
# The previous how='right' also kept lookup rows with no respondent, which
# created all-NaN person records that were only removed incidentally by the
# exp >= 0 filter further down. The surviving rows are the same; their order
# may differ.
df = pd.merge(df, educ_df, on="EDUC", how="inner")  # merge in years of schooling
df = df.rename(columns={"years_school_y": "years_school"})

# Experience
# Potential experience = age - max(years of schooling, 12) - 6
schooling_floor_12 = np.maximum(df["years_school"], 12)
df["exp"] = df["AGE"] - schooling_floor_12 - 6
df["exp2"] = df["exp"].pow(2)

# Keep only non-negative experience (rows where exp is missing are dropped too)
df = df[df["exp"].ge(0)]


# Hours
df["annual_hours"] = df["WKSWORK1"] * df["UHRSWORKLY"]

# Drop if worked less than a month of 8hr days (fewer than 260 hours a year).
# Every remaining row therefore has annual_hours >= 260, so records that
# report wage income with zero hours are already excluded here. The separate
# "income but no hours" drop that used to follow could never match a row and
# has been removed.
df = df[df["annual_hours"] >= 260]


# Earnings
# earnings = 2/3 of self-employment income + wage income.
# Self-employment income is currently fixed at 0 (the OINCBUS + OINCFARM sum is disabled).
self_employment_weight = 2 / 3
df["inc_self"] = 0  # df['OINCBUS'] + df['OINCFARM']
weighted_self_income = self_employment_weight * df["inc_self"]
df["earnings"] = weighted_self_income + df["INCWAGE"]


In [24]:
# Adjust for inflation
# --------------------------------

# Both lookup tables below are joined on a lower-case "year" column
df = df.rename(columns={"YEAR": "year"})

# Merge in cpi data.
# Inner join: keep only respondent rows whose year appears in the table. The
# previous how='right' also kept table years with no respondents as all-NaN
# rows, which were only dropped incidentally by the wage filter that follows.
# The surviving rows are the same; their order may differ.
cpi_df = pd.read_csv("../output/revised_CPI.csv")
df = pd.merge(df, cpi_df, on="year", how="inner")

# Merge in fedminwage data (same join rule as above)
fedminwage_df = pd.read_csv("../input/fedminwage.csv")
df = pd.merge(df, fedminwage_df, on="year", how="inner")

# Original author's note: real_cpis is cpi / cpi[2000]
# (presumably supplied by one of the two tables merged above - confirm which).
# Keep a copy of earnings as they were before deflating
df["earnings1"] = df["earnings"]

# Normalize to 2000-dollars: divide by real_cpis
deflator = df["real_cpis"]
df["earnings"] = df["earnings"].div(deflator)
df["fedminwage"] = df["fedminwage"].div(deflator)

# Compute implied hourly wage
df["Wage"] = df["earnings"].div(df["annual_hours"])

# Drop if wage is less than 1/2 fedminwage (rows with a missing wage are dropped as well)
half_min_wage = df["fedminwage"] * .5
df = df[half_min_wage <= df["Wage"]]

In [30]:
# Clean up
# --------------------------------
df = df.rename(columns={'occ1990dd': "3digit occupation", "RACE":"White","EDUC":"education", "annual_hours":"Annual hours"})

# Normalise every header with str.capitalize (first character upper-cased,
# the rest lower-cased), e.g. "AGE" -> "Age", "big_firm" -> "Big_firm".
k = {name: name.capitalize() for name in df.columns}
df = df.rename(columns=k)

# Keep only the output columns, in this order, and renumber the rows from 0
columnsTitles = ['Year', '3digit occupation', '2digit occupation', '1digit occupation', 'Age', 'Asecwt', 'Exp','Male', 'Female','White', 'Exp2','1digit industry', 'Big_firm', "Education","Annual hours","Wage","Years_school"]
df = df.reindex(columns=columnsTitles)
df = df.reset_index(drop=True)

# Cast the integer-valued columns to int64. The columns are chosen by name:
# the earlier positional slice df.columns[:-1] left out Years_school (the last
# column) rather than Wage, so Wage was truncated to whole dollars before it
# could be rounded to cents. Years_school is left uncast, exactly as before.
non_integer_columns = ["Wage", "Years_school"]
integer_columns = [name for name in df.columns if name not in non_integer_columns]
df[integer_columns] = df[integer_columns].astype('int64')

# Hourly wage stays a float, rounded to two decimals
df["Wage"] = df["Wage"].astype('float64').round(2)

# Save
df.to_csv("../output/current_values(last).csv")

In [14]:
# Bind a second name to the same DataFrame object (plain assignment: no copy
# is made, so hf and df refer to the same data).
hf = df

In [29]:
# Latest year present in the cleaned sample
df["Year"].max()

2001