In [None]:
# H1B Labor Condition Applications (Form ETA-9035) 

In [1]:
# H1B Labor Condition Applications (Form ETA-9035)

# http://econdataus.com/h1bdata.htm

#source: US Department of Labor
#https://www.dol.gov/agencies/eta/foreign-labor/performance

#source: US Bureau of Labor Statistics 
#https://www.bls.gov/oes/tables.htm

# metadata: lca_cols
# https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/LCA_Record_Layout_FY2022_Q3.pdf

# DOL Guidance on Determining OES Wage Levels
# https://www.aila.org/infonet/dol-guidance-on-determining-oes-wage-levels

# Prevailing Wage Determination Policy Guidance
# https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/Policy_Nonag_Progs.pdf

# git pull https://github.com/JohnBroberg/H1B_LCA.git

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


## global variables

In [2]:

# Columns to be read from the LCA Excel file (passed as `usecols` to pd.read_excel).
lca_cols = [
    'CASE_NUMBER',
    'CASE_STATUS',
    'RECEIVED_DATE',
    'DECISION_DATE',
    'ORIGINAL_CERT_DATE',
    'VISA_CLASS',
    'SOC_CODE',
    'SOC_TITLE',
    'FULL_TIME_POSITION',
    'TOTAL_WORKER_POSITIONS',
    'EMPLOYER_NAME',
    'NAICS_CODE',
    'WORKSITE_WORKERS',
    'SECONDARY_ENTITY',
    'SECONDARY_ENTITY_BUSINESS_NAME',
    'WORKSITE_STATE',
    'WAGE_RATE_OF_PAY_FROM',
    'WAGE_RATE_OF_PAY_TO',
    'WAGE_UNIT_OF_PAY',
    'PREVAILING_WAGE',
    'PW_UNIT_OF_PAY',
    'PW_WAGE_LEVEL',
]
# Currently not loaded (kept for reference):
#   'PW_OES_YEAR', 'PW_OTHER_SOURCE', 'PW_OTHER_YEAR', 'PW_SURVEY_PUBLISHER', 'PW_SURVEY_NAME'



# Fiscal Year 2023

In [None]:
# Download LCA_Disclosure_Data_FY2023_Q2.xlsx from the DOL site, reading only
# the columns listed in lca_cols.

df23 = pd.read_excel("https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/LCA_Disclosure_Data_FY2023_Q2.xlsx"
                  , usecols = lca_cols)

# Keep full-time H-1B applications only.
# The explicit .copy() makes df23 an independent frame: later cells add and
# overwrite columns on it, which would otherwise raise SettingWithCopyWarning
# because a boolean-filtered frame is treated as a slice of the original.
df23 = df23[(df23['VISA_CLASS']=='H-1B') &
            (df23['FULL_TIME_POSITION']=='Y')].copy()

df23

In [None]:
# Earliest DECISION_DATE in df23 (vectorised; missing dates are skipped)
df23['DECISION_DATE'].min()

In [None]:
# Latest DECISION_DATE in df23 (vectorised; missing dates are skipped)
df23['DECISION_DATE'].max()

In [None]:
# Number of fully duplicated rows (0 means no duplicate rows)
int(df23.duplicated().sum())

In [None]:
# Number of repeated Case Numbers (0 means no duplicate Case Numbers)
int(df23['CASE_NUMBER'].duplicated().sum())

In [None]:
# remove duplicate cases, keeping row with latest Decision Date

#df23 = df23.sort_values(['CASE_NUMBER', 'DECISION_DATE'], ascending = False) \
#    .drop_duplicates(subset = 'CASE_NUMBER', keep = 'first', ignore_index = True)

In [None]:
#len(df23['CASE_NUMBER'])-len(df23['CASE_NUMBER'].drop_duplicates())

In [None]:
# Disabled: list rows that share a CASE_NUMBER, newest decision first.
# Both lines of the statement must stay commented together; leaving the
# continuation line uncommented makes the cell fail with an IndentationError.
#dup = df23[df23.duplicated(['CASE_NUMBER'], keep = False)]\
#    .sort_values(['CASE_NUMBER', 'DECISION_DATE'], ascending = False)

#dup

In [None]:
# Print column dtypes and non-null counts for df23
df23.info()

In [None]:
# Add WORKSITE_EMPLOYER column:
# the SECONDARY_ENTITY_BUSINESS_NAME where SECONDARY_ENTITY is 'Yes',
# otherwise the EMPLOYER_NAME.

uses_secondary_entity = df23['SECONDARY_ENTITY'].eq('Yes')
df23['WORKSITE_EMPLOYER'] = df23['SECONDARY_ENTITY_BUSINESS_NAME'].where(
    uses_secondary_entity, df23['EMPLOYER_NAME'])

# Side-by-side view of the two source columns and the derived one
df23[['EMPLOYER_NAME', 'SECONDARY_ENTITY_BUSINESS_NAME', 'WORKSITE_EMPLOYER']]

In [None]:
# Upper-case the employer names: the dict_emp keys and the 'FACEBOOK'
# substring test used in the following cells are written in upper case.
df23['WORKSITE_EMPLOYER'] = df23['WORKSITE_EMPLOYER'].str.upper()
df23['WORKSITE_EMPLOYER']

In [None]:
# Collapse alternative employer names onto one canonical name each.
# Series.replace with a dict only rewrites values that equal a key exactly.
#
# Facebook changes company name to Meta
# https://www.cnbc.com/2021/10/28/facebook-changes-company-name-to-meta.html

dict_emp = {
    'FACEBOOK INC': 'META PLATFORMS INC',
    'AMAZON CORPORATE LLC': 'AMAZON COM SERVICES LLC',
    'AMAZON WEB SERVICES INC': 'AMAZON COM SERVICES LLC',
    'AMAZON.COM SERVICES LLC': 'AMAZON COM SERVICES LLC',
    'AMAZON.COM SERVICES INC': 'AMAZON COM SERVICES LLC',
    'AMAZON FULFILLMENT SERVICES INC': 'AMAZON COM SERVICES LLC',
    'AMAZON DEVELOPMENT CENTER US INC': 'AMAZON COM SERVICES LLC',
    'AMAZON DATA SERVICES INC': 'AMAZON COM SERVICES LLC',
    'GOOGLE INC': 'GOOGLE LLC',
}

df23['WORKSITE_EMPLOYER'] = df23['WORKSITE_EMPLOYER'].replace(dict_emp)

In [None]:

# Replace missing names with an empty string so the substring test below
# produces a pure True/False mask (no NaN entries).
employer = df23['WORKSITE_EMPLOYER'].fillna('')

# Every name containing 'FACEBOOK' becomes 'META PLATFORMS INC';
# all other names (including the filled-in empty strings) are kept.
mask = employer.str.contains('FACEBOOK')
df23['WORKSITE_EMPLOYER'] = employer.where(~mask, 'META PLATFORMS INC')

In [None]:
# Display df23 as the cell output to inspect the current WORKSITE_EMPLOYER values
df23

In [None]:
# Boolean Series: True where WORKSITE_EMPLOYER is exactly 'META PLATFORMS INC'
df23['WORKSITE_EMPLOYER'].eq('META PLATFORMS INC')

In [None]:
#df23['WAGE_UNIT_OF_PAY'].unique()


In [None]:
#df23['PW_UNIT_OF_PAY'].unique()

In [None]:
# Add WAGE_RATE_OF_PAY_FROM_YR column

# df23['WAGE_RATE_OF_PAY_FROM_YR'] = df23['WAGE_RATE_OF_PAY_FROM']
# df23.loc[df23['WAGE_UNIT_OF_PAY'] == 'Hour', 'WAGE_RATE_OF_PAY_FROM_YR'] = df23.WAGE_RATE_OF_PAY_FROM * 2000
# df23.loc[df23['WAGE_UNIT_OF_PAY'] == 'Month', 'WAGE_RATE_OF_PAY_FROM_YR'] = df23.WAGE_RATE_OF_PAY_FROM * 12
# df23.loc[df23['WAGE_UNIT_OF_PAY'] == 'Week', 'WAGE_RATE_OF_PAY_FROM_YR'] = df23.WAGE_RATE_OF_PAY_FROM * 50
# df23.loc[df23['WAGE_UNIT_OF_PAY'] == 'Bi-Weekly', 'WAGE_RATE_OF_PAY_FROM_YR'] = df23.WAGE_RATE_OF_PAY_FROM * 25


# # Add PREVAILING_WAGE_YR column

# df23['PREVAILING_WAGE_YR'] = df23['PREVAILING_WAGE']
# df23.loc[df23['PW_UNIT_OF_PAY'] == 'Hour', 'PREVAILING_WAGE_YR'] = df23.PREVAILING_WAGE * 2000
# df23.loc[df23['PW_UNIT_OF_PAY'] == 'Month', 'PREVAILING_WAGE_YR'] = df23.PREVAILING_WAGE * 12
# df23.loc[df23['PW_UNIT_OF_PAY'] == 'Week', 'PREVAILING_WAGE_YR'] = df23.PREVAILING_WAGE * 50
# df23.loc[df23['PW_UNIT_OF_PAY'] == 'Bi-Weekly', 'PREVAILING_WAGE_YR'] = df23.PREVAILING_WAGE * 25

# # Add WAGE_DIFF column

# df23['WAGE_DIFF'] = (df23.WAGE_RATE_OF_PAY_FROM_YR- df23.PREVAILING_WAGE_YR)/df23.PREVAILING_WAGE_YR

# df23[['WAGE_RATE_OF_PAY_FROM', 'WAGE_UNIT_OF_PAY'
#       , 'PREVAILING_WAGE', 'PW_UNIT_OF_PAY'
#       , 'WAGE_RATE_OF_PAY_FROM_YR', 'PREVAILING_WAGE_YR', 'WAGE_DIFF']]


In [None]:
# df23['WAGE_DIFF'].max()

In [None]:
# df23['WAGE_DIFF'].min()

In [None]:
# len(df23[df23['WAGE_DIFF'] < 0])

In [None]:
# Rows of df23 whose WAGE_UNIT_OF_PAY is 'Year'
wage_unit_is_year = df23['WAGE_UNIT_OF_PAY'].eq('Year')
rslt_df = df23.loc[wage_unit_is_year]
rslt_df

In [None]:
# len(rslt_df[rslt_df['WAGE_DIFF'] < 0])

In [None]:
# rslt_df = rslt_df['WAGE_DIFF']
# rslt_df.hist(bins = 1000)

In [None]:
# Print column dtypes and non-null counts for df23, now including WORKSITE_EMPLOYER
df23.info()

In [None]:
# Distinct PW_WAGE_LEVEL values present in df23 (a missing value, if any, appears as NaN)
df23['PW_WAGE_LEVEL'].unique()

In [None]:
# Earliest DECISION_DATE in df23 (vectorised; missing dates are skipped)
df23['DECISION_DATE'].min()

In [None]:
# Latest DECISION_DATE in df23 (vectorised; missing dates are skipped)
df23['DECISION_DATE'].max()

In [None]:
# Number of fully duplicated rows in df23
int(df23.duplicated().sum())

In [None]:
# Number of repeated CASE_NUMBER values in df23
int(df23['CASE_NUMBER'].duplicated().sum())

In [None]:
# Export df to compressed .csv
# NOTE(review): the archive member and zip file are still named "fy22" although
# this notebook loads the FY2023 Q2 disclosure file - confirm the intended
# names before re-enabling the export below.

compression_opts = {'method': 'zip', 'archive_name': 'h1b_lca_fy22.csv'}

# df23.to_csv('h1b_lca_fy22_data.zip', index=False, compression=compression_opts)

In [None]:
# Display the final state of df23 as the cell output
df23