In [1]:
# H1B Hub data from USCIS

#NAICS codes: https://www.census.gov/naics/?58967?yearbck=2022
#left of string: https://datatofish.com/left-right-mid-pandas/


import pandas as pd


In [None]:
#download .csv files
#concat into single dataframe (df)

year_first = 2009
year_last = 2022

data_year_list = []
for year_temp in range(year_first, year_last +1):
    url_temp = f"https://www.uscis.gov/sites/default/files/document/data/h1b_datahubexport-{year_temp}.csv"
    data_year = pd.read_csv(url_temp)
    data_year = data_year.rename(columns={'Initial Approvals':'Initial Approval',
                                         'Initial Denials':'Initial Denial',
                                         'Continuing Approvals':'Continuing Approval',
                                         'Continuing Denials':'Continuing Denial'})
    data_year_list.append(data_year)
    
df = pd.concat(data_year_list, ignore_index=True)
df

  data_year = pd.read_csv(url_temp)


In [None]:
df.info()

In [None]:
#Clean df

#remove commas from numeric values
cols = ['Initial Approval', 'Initial Denial', 'Continuing Approval', 'Continuing Denial']
df[cols]=df[cols].replace(',','', regex=True)  

#convert data types
df=df.astype({'Fiscal Year':'int32',
              'Initial Approval':'int32',
              'Initial Denial':'int32',
              'Continuing Approval':'int32',
              'Continuing Denial':'int32',
              'NAICS':'str'})
#              'Employer':'str',
#              'Tax ID':'str',
#              'ZIP':'str'})  

df['Tax ID'] = pd.to_numeric(df['Tax ID'], errors='coerce').fillna(0).astype(int).astype(str)

df.info()


In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df

In [None]:
df.isna().sum()

In [None]:
#electing not to clean State na's, because only 40 fixable State values, with max Approvals = 14

df[(pd.isna(df['State'])) & pd.notna(df['ZIP'])]#.info()  

In [None]:
#Import NAICS table

url = "https://www.census.gov/programs-surveys/economic-census/guidance/understanding-naics.html"

df_naics = pd.read_html(url, header=0)[1]

#"NAICS code 99 means the industry is unknown. 
# Any petition that had a blank code was assigned as 99 as well." (ref. README.md)
df_naics = df_naics.append({'Sector':'99','Description':'unknown'}, ignore_index=True)

df_naics

In [None]:
df_naics.info()

In [None]:
df.NAICS.sort_values().unique()

In [None]:
#Continuing Approvals by NAICS code

df[['NAICS','Continuing Approval']].groupby('NAICS').sum()\
.sort_values(by='NAICS', ascending=True)


In [None]:
#Replace df.NAICS values to match NAICS table

dict_naics = {'31':'31-33', '32':'31-33', '33':'31-33'
                , '44':'44-45', '45':'44-45'
                , '48':'48-49', '49':'48-49'}

df.NAICS=df.NAICS.replace(dict_naics)  

In [None]:
df.NAICS.sort_values().unique()

In [None]:
#Continuing Approvals by NAICS code

df[['NAICS','Continuing Approval']].groupby('NAICS').sum()\
.sort_values(by='NAICS', ascending=True)


In [None]:
df = pd.merge(df, df_naics, how='left', left_on='NAICS', right_on='Sector', suffixes=(False, False))
df

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.columns

In [None]:
df = pd.melt(df, id_vars=['Fiscal Year', 'Employer', 'NAICS', 'Tax ID', 'State', 'City', 'ZIP', 'Sector', 'Description'],
       value_vars=['Initial Approval', 'Initial Denial', 'Continuing Approval', 'Continuing Denial', ],
       var_name='Decision', value_name='Petitions')

In [None]:
df

In [None]:
#Export df to compressed .csv

compression_opts = dict(method='zip', archive_name='h1b_hub.csv')  

df.to_csv('h1b_hub.zip', index=False, compression=compression_opts)  