In [1]:
# H1B Labor Condition Applications (Form ETA-9035)

# http://econdataus.com/h1bdata.htm

#source: US Department of Labor
#https://www.dol.gov/agencies/eta/foreign-labor/performance

# metadata: lca_cols
# https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/LCA_Record_Layout_FY2022_Q3.pdf

# git pull https://github.com/JohnBroberg/H1B_LCA.git

import pandas as pd


In [2]:
#download .xlsx files
#concat into single dataframe (df)

lca_cols = ['CASE_NUMBER','CASE_STATUS', 'RECEIVED_DATE', 'DECISION_DATE', 'ORIGINAL_CERT_DATE'
           , 'VISA_CLASS', 'SOC_TITLE', 'FULL_TIME_POSITION'
           , 'TOTAL_WORKER_POSITIONS', 'EMPLOYER_NAME', 'NAICS_CODE', 'WORKSITE_WORKERS'
           , 'SECONDARY_ENTITY', 'SECONDARY_ENTITY_BUSINESS_NAME', 'WORKSITE_STATE'
           , 'WAGE_RATE_OF_PAY_FROM', 'WAGE_RATE_OF_PAY_TO', 'WAGE_UNIT_OF_PAY'
           , 'PREVAILING_WAGE', 'PW_UNIT_OF_PAY', 'PW_TRACKING_NUMBER', 'PW_WAGE_LEVEL']
#           , 'PW_OES_YEAR', 'PW_OTHER_SOURCE', 'PW_OTHER_YEAR', 'PW_SURVEY_PUBLISHER', 'PW_SURVEY_NAME']

df = pd.read_excel("https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/LCA_Disclosure_Data_FY2022_Q3.xlsx"
                  , usecols = lca_cols)
#                  , index_col = None)



In [3]:
df = df[(df['VISA_CLASS']=='H-1B') & \
        (df['FULL_TIME_POSITION']=='Y')]

df

Unnamed: 0,CASE_NUMBER,CASE_STATUS,RECEIVED_DATE,DECISION_DATE,ORIGINAL_CERT_DATE,VISA_CLASS,SOC_TITLE,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,EMPLOYER_NAME,...,SECONDARY_ENTITY,SECONDARY_ENTITY_BUSINESS_NAME,WORKSITE_STATE,WAGE_RATE_OF_PAY_FROM,WAGE_RATE_OF_PAY_TO,WAGE_UNIT_OF_PAY,PREVAILING_WAGE,PW_UNIT_OF_PAY,PW_TRACKING_NUMBER,PW_WAGE_LEVEL
0,I-200-19274-066496,Certified - Withdrawn,2019-10-01,2022-04-20,2019-10-08,H-1B,"Software Developers, Applications",Y,1,"Experis US, Inc.",...,Yes,Verizon Sourcing LLC,TX,53.37,70.0,Hour,53.37,Hour,,III
1,I-200-19274-066597,Certified - Withdrawn,2019-10-01,2022-04-05,2019-10-08,H-1B,Marketing Managers,Y,1,"SAMSUNG ELECTRONICS AMERICA, INC.",...,No,,TX,146077.63,,Year,139464.00,Year,,III
2,I-200-19274-066618,Certified - Withdrawn,2019-10-01,2021-12-01,2019-10-08,H-1B,"Computer Science Teachers, Postsecondary",Y,1,Stevens Institute of Technology,...,No,,NJ,110000.00,130000.0,Year,56290.00,Year,,I
3,I-200-19275-067882,Certified - Withdrawn,2019-10-02,2022-03-08,2019-10-09,H-1B,"Software Developers, Applications",Y,1,"DaVita, Inc.",...,No,,CO,88858.00,106038.0,Year,88858.00,Year,,II
4,I-200-19275-067945,Certified - Withdrawn,2019-10-02,2021-12-09,2019-10-09,H-1B,Mathematicians,Y,1,"Triad National Security, LLC",...,No,,NM,121000.00,148300.0,Year,90314.00,Year,,II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499270,I-200-22181-325657,Withdrawn,2022-06-30,2022-06-30,NaT,H-1B,"Software Developers, Applications",Y,1,"Technocorp Solutions, Inc.",...,Yes,Anthem,VA,89669.00,,Year,89669.00,Year,,II
499271,I-200-22181-325701,Withdrawn,2022-06-30,2022-06-30,NaT,H-1B,Computer Systems Analysts,Y,1,"COMPUNNEL SOFTWARE GROUP, INC",...,Yes,Fidelity Investments,NC,94500.00,109000.0,Year,94474.00,Year,,III
499272,I-200-22181-325974,Withdrawn,2022-06-30,2022-06-30,NaT,H-1B,"Biological Scientists, All Other",Y,1,"Novartis Institutes for BioMedical Research, Inc.",...,No,,MA,113214.00,,Year,113214.00,Year,,IV
499273,I-200-22181-326033,Withdrawn,2022-06-30,2022-06-30,NaT,H-1B,"Software Developers, Applications",Y,1,"KPIT Technologies, Inc.",...,Yes,Eaton Corporation,MI,95652.00,,Year,80163.00,Year,,II


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 493040 entries, 0 to 499274
Data columns (total 22 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   CASE_NUMBER                     493040 non-null  object        
 1   CASE_STATUS                     493040 non-null  object        
 2   RECEIVED_DATE                   493040 non-null  datetime64[ns]
 3   DECISION_DATE                   493040 non-null  datetime64[ns]
 4   ORIGINAL_CERT_DATE              23587 non-null   datetime64[ns]
 5   VISA_CLASS                      493040 non-null  object        
 6   SOC_TITLE                       493040 non-null  object        
 7   FULL_TIME_POSITION              493040 non-null  object        
 8   TOTAL_WORKER_POSITIONS          493040 non-null  int64         
 9   EMPLOYER_NAME                   493040 non-null  object        
 10  NAICS_CODE                      493040 non-null  int64  

In [5]:
df['PW_WAGE_LEVEL'].unique()

array(['III', 'I', 'II', 'IV', nan], dtype=object)

In [6]:
min(df['RECEIVED_DATE'])

Timestamp('2019-10-01 00:00:00')

In [7]:
min(df['DECISION_DATE'])

Timestamp('2021-10-01 00:00:00')

In [8]:
df.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
499270    False
499271    False
499272    False
499273    False
499274    False
Length: 493040, dtype: bool

In [9]:
len(df)-len(df.drop_duplicates())

0

In [10]:
len(df['CASE_NUMBER'])-len(df['CASE_NUMBER'].drop_duplicates())

0

In [11]:


df21 = pd.read_excel("https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/LCA_Disclosure_Data_FY2021_Q4.xlsx")
df21

Unnamed: 0,CASE_NUMBER,CASE_STATUS,RECEIVED_DATE,DECISION_DATE,ORIGINAL_CERT_DATE,VISA_CLASS,JOB_TITLE,SOC_CODE,SOC_TITLE,FULL_TIME_POSITION,...,WILLFUL_VIOLATOR,SUPPORT_H1B,STATUTORY_BASIS,APPENDIX_A_ATTACHED,PUBLIC_DISCLOSURE,PREPARER_LAST_NAME,PREPARER_FIRST_NAME,PREPARER_MIDDLE_INITIAL,PREPARER_BUSINESS_NAME,PREPARER_EMAIL
0,I-200-21175-424928,Certified,2021-06-24,2021-07-01,NaT,H-1B,Software Engineer 3,15-1132.00,"Software Developers, Applications",Y,...,No,Yes,"$60,000 or higher annual wage",,Disclose Business,Anyanwu,Helen,,Erickson Immigration Group,helena@eiglaw.com
1,I-200-21175-424959,Certified,2021-06-24,2021-07-01,NaT,H-1B,"MANAGER, STRESS",17-2011.00,Aerospace Engineers,Y,...,No,,,,Disclose Business,,,,,
2,I-200-21175-425244,Certified,2021-06-24,2021-07-01,NaT,H-1B,Manager JC50,15-1121.00,Computer Systems Analysts,Y,...,No,Yes,"$60,000 or higher annual wage",,Disclose Business,,,,,
3,I-200-21175-423449,Certified,2021-06-24,2021-07-01,NaT,H-1B,Investment Banking Senior Analyst,13-2051.00,Financial Analysts,Y,...,No,,,,Disclose Business,Cho / #9895,Yunuen,,"Gibney, Anthony & Flaherty, LLP",ycho@gibney.com
4,I-200-21175-425533,Certified,2021-06-24,2021-07-01,NaT,H-1B,Software Engineer,15-1133.00,"Software Developers, Systems Software",Y,...,No,,,,Disclose Business,BYNUM,KAYLA,,TAFAPOLSKY & SMITH LLP,kayla.bynum@tandslaw.com
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126571,I-200-21273-615946,Withdrawn,2021-09-30,2021-09-30,NaT,H-1B,SOFTWARE DEVELOPMENT,15-1133.00,"Software Developers, Systems Software",Y,...,No,,,,Disclose Business,,,,NOT ANY,
126572,I-200-21273-617427,Withdrawn,2021-09-30,2021-09-30,NaT,H-1B,Assistant Professor of Instruction,25-1011.00,"Business Teachers, Postsecondary",Y,...,No,,,,Disclose Business,,,,,
126573,I-200-21273-617937,Withdrawn,2021-09-30,2021-09-30,NaT,H-1B,Engineering Lead,15-1132.00,"Software Developers, Applications",Y,...,No,Yes,"$60,000 or higher annual wage",,Disclose Business,,,,,
126574,I-200-21273-617987,Withdrawn,2021-09-30,2021-09-30,NaT,H-1B,Engineering Lead,15-1132.00,"Software Developers, Applications",Y,...,No,Yes,"$60,000 or higher annual wage",,Disclose Business,,,,,


In [12]:
df_concat = pd.concat([df, df21])
df_concat

Unnamed: 0,CASE_NUMBER,CASE_STATUS,RECEIVED_DATE,DECISION_DATE,ORIGINAL_CERT_DATE,VISA_CLASS,SOC_TITLE,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,EMPLOYER_NAME,...,WILLFUL_VIOLATOR,SUPPORT_H1B,STATUTORY_BASIS,APPENDIX_A_ATTACHED,PUBLIC_DISCLOSURE,PREPARER_LAST_NAME,PREPARER_FIRST_NAME,PREPARER_MIDDLE_INITIAL,PREPARER_BUSINESS_NAME,PREPARER_EMAIL
0,I-200-19274-066496,Certified - Withdrawn,2019-10-01,2022-04-20,2019-10-08,H-1B,"Software Developers, Applications",Y,1,"Experis US, Inc.",...,,,,,,,,,,
1,I-200-19274-066597,Certified - Withdrawn,2019-10-01,2022-04-05,2019-10-08,H-1B,Marketing Managers,Y,1,"SAMSUNG ELECTRONICS AMERICA, INC.",...,,,,,,,,,,
2,I-200-19274-066618,Certified - Withdrawn,2019-10-01,2021-12-01,2019-10-08,H-1B,"Computer Science Teachers, Postsecondary",Y,1,Stevens Institute of Technology,...,,,,,,,,,,
3,I-200-19275-067882,Certified - Withdrawn,2019-10-02,2022-03-08,2019-10-09,H-1B,"Software Developers, Applications",Y,1,"DaVita, Inc.",...,,,,,,,,,,
4,I-200-19275-067945,Certified - Withdrawn,2019-10-02,2021-12-09,2019-10-09,H-1B,Mathematicians,Y,1,"Triad National Security, LLC",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126571,I-200-21273-615946,Withdrawn,2021-09-30,2021-09-30,NaT,H-1B,"Software Developers, Systems Software",Y,1,"ESCOBEDO CONSTRUCTION, LP",...,No,,,,Disclose Business,,,,NOT ANY,
126572,I-200-21273-617427,Withdrawn,2021-09-30,2021-09-30,NaT,H-1B,"Business Teachers, Postsecondary",Y,1,The University of Texas at Dallas,...,No,,,,Disclose Business,,,,,
126573,I-200-21273-617937,Withdrawn,2021-09-30,2021-09-30,NaT,H-1B,"Software Developers, Applications",Y,1,NIC INFO TEK INC,...,No,Yes,"$60,000 or higher annual wage",,Disclose Business,,,,,
126574,I-200-21273-617987,Withdrawn,2021-09-30,2021-09-30,NaT,H-1B,"Software Developers, Applications",Y,1,NIC INFO TEK INC,...,No,Yes,"$60,000 or higher annual wage",,Disclose Business,,,,,


In [13]:
len(df_concat)-len(df_concat.drop_duplicates())

0

In [14]:
len(df_concat['CASE_NUMBER'])-len(df_concat['CASE_NUMBER'].drop_duplicates())

2741

In [15]:
dup = df_concat[df_concat.duplicated(['CASE_NUMBER'], keep = False)]\
    .sort_values(['CASE_NUMBER', 'DECISION_DATE'], ascending = False)

dup

Unnamed: 0,CASE_NUMBER,CASE_STATUS,RECEIVED_DATE,DECISION_DATE,ORIGINAL_CERT_DATE,VISA_CLASS,SOC_TITLE,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,EMPLOYER_NAME,...,WILLFUL_VIOLATOR,SUPPORT_H1B,STATUTORY_BASIS,APPENDIX_A_ATTACHED,PUBLIC_DISCLOSURE,PREPARER_LAST_NAME,PREPARER_FIRST_NAME,PREPARER_MIDDLE_INITIAL,PREPARER_BUSINESS_NAME,PREPARER_EMAIL
15471,I-200-21267-603909,Certified - Withdrawn,2021-09-23,2022-03-16,2021-09-30,H-1B,Computer Systems Analysts,Y,1,"ZS Associates, Inc.",...,,,,,,,,,,
126105,I-200-21267-603909,Certified,2021-09-23,2021-09-30,NaT,H-1B,Computer Systems Analysts,Y,1,"ZS Associates, Inc.",...,No,,,,Disclose Business,Sanil,Ranjith,,Seyfarth Shaw LLP,rsanil@seyfarth.com
15470,I-200-21267-603854,Certified - Withdrawn,2021-09-23,2022-04-15,2021-09-30,H-1B,"Financial Specialists, All Other",Y,1,"ERGOTELES, LLC",...,,,,,,,,,,
125737,I-200-21267-603854,Certified,2021-09-23,2021-09-30,NaT,H-1B,"Financial Specialists, All Other",Y,1,"ERGOTELES, LLC",...,No,,,,Disclose Business and Employment,,,,,
15469,I-200-21267-603831,Certified - Withdrawn,2021-09-23,2022-06-21,2021-09-30,H-1B,Statisticians,Y,1,Matrix Analytics Inc,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1165,I-200-21175-423374,Certified,2021-06-24,2021-07-01,NaT,H-1B,Database Administrators,Y,1,TekOrg Inc.,...,No,Yes,"Both $60,000 or higher in annual wage and Mast...",,Disclose Business,Voruganti,Sumanth,,TekOrg Inc.,sumanthvbi281@gmail.com
12699,I-200-21175-423354,Certified - Withdrawn,2021-06-24,2022-03-16,2021-07-01,H-1B,Information Technology Project Managers,Y,1,"ZS Associates, Inc.",...,,,,,,,,,,
1374,I-200-21175-423354,Certified,2021-06-24,2021-07-01,NaT,H-1B,Information Technology Project Managers,Y,1,"ZS Associates, Inc.",...,No,,,,Disclose Business,Sanil,Ranjith,,Seyfarth Shaw LLP,rsanil@seyfarth.com
12698,I-200-21175-423341,Certified - Withdrawn,2021-06-24,2021-10-13,2021-07-01,H-1B,Operations Research Analysts,Y,1,"ZS Associates, Inc.",...,,,,,,,,,,


In [16]:
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 619616 entries, 0 to 126575
Data columns (total 96 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   CASE_NUMBER                     619616 non-null  object        
 1   CASE_STATUS                     619616 non-null  object        
 2   RECEIVED_DATE                   619616 non-null  datetime64[ns]
 3   DECISION_DATE                   619616 non-null  datetime64[ns]
 4   ORIGINAL_CERT_DATE              31006 non-null   datetime64[ns]
 5   VISA_CLASS                      619616 non-null  object        
 6   SOC_TITLE                       619616 non-null  object        
 7   FULL_TIME_POSITION              619616 non-null  object        
 8   TOTAL_WORKER_POSITIONS          619616 non-null  int64         
 9   EMPLOYER_NAME                   619616 non-null  object        
 10  NAICS_CODE                      619616 non-null  int64  