# H1B Hub data from USCIS

In [1]:
# git pull https://github.com/JohnBroberg/H1B_Hub.git

import pandas as pd


## Download .csv files

_DtypeWarning: Columns (3) have mixed types._
Resolved in __Clean df__ step.

In [None]:
# Range of fiscal years to download (both ends are included by the loop).
year_first = 2009
year_last = 2022

# Plural measure headers, where present, are renamed to the singular form;
# labels that are absent from a given file are simply ignored by rename().
singular_names = {
    'Initial Approvals': 'Initial Approval',
    'Initial Denials': 'Initial Denial',
    'Continuing Approvals': 'Continuing Approval',
    'Continuing Denials': 'Continuing Denial',
}

# One DataFrame per fiscal year, collected in download order.
data_year_list = []
for year_temp in range(year_first, year_last + 1):
    url_temp = f"https://www.uscis.gov/sites/default/files/document/data/h1b_datahubexport-{year_temp}.csv"
    data_year = pd.read_csv(url_temp).rename(columns=singular_names)
    data_year_list.append(data_year)
    


  data_year = pd.read_csv(url_temp)


## Concat into single dataframe (df)

In [None]:
# Stack the yearly frames into one; ignore_index=True gives the result a
# fresh 0..n-1 row index instead of repeating each file's own row numbers.
df = pd.concat(data_year_list, ignore_index=True)
df

In [None]:
# Column dtypes and non-null counts of the concatenated frame.
df.info()

## Clean df

### Remove commas from numeric values

In [None]:
# The four petition-count columns that need their commas stripped.
cols = ['Initial Approval', 'Initial Denial', 'Continuing Approval', 'Continuing Denial']

# regex=True makes the replacement act on substrings, so every comma inside
# a string cell is removed.
df[cols] = df[cols].replace(to_replace=',', value='', regex=True)

### Convert data types

In [None]:
# The fiscal year and the four petition counts become 32-bit integers;
# NAICS becomes a string. Employer and ZIP are not converted in this cell.
int_cols = ['Fiscal Year', 'Initial Approval', 'Initial Denial',
            'Continuing Approval', 'Continuing Denial']
dtype_map = {col: 'int32' for col in int_cols}
dtype_map['NAICS'] = 'str'
df = df.astype(dtype_map)

# Tax ID: entries that cannot be parsed as numbers become NaN, NaN becomes 0,
# and the value is stored as the string form of the resulting integer.
tax_id = pd.to_numeric(df['Tax ID'], errors='coerce')
df['Tax ID'] = tax_id.fillna(0).astype(int).astype(str)

df.info()

In [None]:
# Preview the first rows after the dtype conversion.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Display the cleaned frame.
df

In [None]:
# Number of missing values in each column.
df.isna().sum()

Electing not to clean the missing `State` values: only 40 of them are fixable, and the largest of those has just 14 approvals.

In [None]:
#df[(pd.isna(df['State'])) & pd.notna(df['ZIP'])]#.info()  

## Replace df.Employer values to consolidate employers with multiple company names

In [None]:
# Facebook changes company name to Meta
# https://www.cnbc.com/2021/10/28/facebook-changes-company-name-to-meta.html

# Employer names that are all folded into 'AMAZON COM SERVICES LLC'.
amazon_aliases = [
    'AMAZON CORPORATE LLC',
    'AMAZON WEB SERVICES INC',
    'AMAZON.COM SERVICES LLC',
    'AMAZON.COM SERVICES INC',
    'AMAZON FULFILLMENT SERVICES INC',
    'AMAZON DEVELOPMENT CENTER US INC',
    'AMAZON DATA SERVICES INC',
]

# Mapping of old/alternate employer name -> consolidated employer name.
dict_emp = {
    'FACEBOOK INC': 'META PLATFORMS INC',
    **dict.fromkeys(amazon_aliases, 'AMAZON COM SERVICES LLC'),
    'GOOGLE INC': 'GOOGLE LLC',
}

# Whole-value replacement: only cells equal to a key are rewritten.
df['Employer'] = df['Employer'].replace(dict_emp)

In [None]:
# Display the frame after the employer names were consolidated.
df

## Import NAICS table

"NAICS code 99 means the industry is unknown. 
Any petition that had a blank code was assigned as 99 as well." (ref. README.md)

In [None]:

# NAICS codes: https://www.census.gov/naics/?58967?yearbck=2022

url = "https://www.census.gov/programs-surveys/economic-census/guidance/understanding-naics.html"

# read_html returns one DataFrame per <table> found on the page; the second
# one (index 1) is used as the sector lookup, with its first row as header.
# NOTE(review): the positional index breaks if the page layout changes — confirm
# the selected table still has 'Sector' and 'Description' columns.
df_naics = pd.read_html(url, header=0)[1]

# Extra lookup row for code 99 ("unknown"), which this cell adds to the
# table read from the page.
df_temp = pd.DataFrame(data={'Sector': ['99'], 'Description': ['unknown']})

# ignore_index=True renumbers the rows 0..n-1. Without it the added row keeps
# its own index label 0 and the lookup table ends up with a duplicate label.
# (This matches the former DataFrame.append(..., ignore_index=True) call;
# DataFrame.append was removed in pandas 2.0.)
df_naics = pd.concat([df_naics, df_temp], ignore_index=True)

df_naics

In [None]:
# Column dtypes and non-null counts of the NAICS lookup table.
df_naics.info()

In [None]:
# Distinct NAICS codes present in df, in sorted order.
df.NAICS.sort_values().unique()

## Continuing Approvals by NAICS code

In [None]:
# Total continuing approvals for each NAICS code, ordered by code.
(
    df.groupby('NAICS')[['Continuing Approval']]
    .sum()
    .sort_values(by='NAICS', ascending=True)
)


## Replace df.NAICS values to match NAICS table

In [None]:
# Two-digit codes that the NAICS table lists only as a combined range are
# mapped to that range label so they can be matched against the table.
dict_naics = {
    **dict.fromkeys(['31', '32', '33'], '31-33'),
    **dict.fromkeys(['44', '45'], '44-45'),
    **dict.fromkeys(['48', '49'], '48-49'),
}

# Whole-value replacement: codes that are not keys of dict_naics are unchanged.
df['NAICS'] = df['NAICS'].replace(dict_naics)

In [None]:
# Distinct NAICS values after the range replacement, in sorted order.
df.NAICS.sort_values().unique()

## Continuing Approvals by NAICS code (after consolidating NAICS ranges)

In [None]:
# Total continuing approvals for each NAICS value, ordered by NAICS.
(
    df.groupby('NAICS')[['Continuing Approval']]
    .sum()
    .sort_values(by='NAICS', ascending=True)
)


## MERGE NAICS Description to df

In [None]:
# Left join: every row of df is kept, and the Sector/Description columns are
# filled in wherever df.NAICS equals a Sector value in df_naics.
df = df.merge(
    df_naics,
    how='left',
    left_on='NAICS',
    right_on='Sector',
    # NOTE(review): with both suffixes falsy, pandas raises on overlapping
    # column names instead of renaming them — so re-running this cell on an
    # already-merged df is expected to fail rather than duplicate columns.
    suffixes=(False, False),
)
df

In [None]:
# Column dtypes and non-null counts after the NAICS merge.
df.info()

In [None]:
# Missing values per column after the merge; NaN in Sector/Description marks
# rows whose NAICS value had no match in df_naics (left join).
df.isna().sum()

## Melt four Decision measures into single Petition measure column with Decision dimension column  
_Commented out because resulting compressed CSV is too large for Github's 50MB file size limit_

In [None]:
#df = pd.melt(df, id_vars=['Fiscal Year', 'Employer', 'NAICS', 'Tax ID', 'State', 'City', 'ZIP', 'Sector', 'Description'],
#       value_vars=['Initial Approval', 'Initial Denial', 'Continuing Approval', 'Continuing Denial', ],
#       var_name='Decision', value_name='Petitions')

In [None]:
# Display the frame that will be exported.
df

## Export df to compressed .csv

In [None]:
# Zip compression settings: the CSV is stored inside the archive under the
# member name 'h1b_hub.csv'.
compression_opts = {'method': 'zip', 'archive_name': 'h1b_hub.csv'}

# index=False leaves the row index out of the written file.
df.to_csv(path_or_buf='h1b_hub_data.zip', index=False, compression=compression_opts)