# H1B Hub data from USCIS

In [1]:
# !pip show pandas
# !pip uninstall NumPy
# !pip install -U pandas
# !pip install --upgrade NumPy

In [None]:
# git pull https://github.com/JohnBroberg/H1B_Hub.git

import numpy as np
import pandas as pd


## Download .csv files

_DtypeWarning: Columns (3) have mixed types._
Resolved in __Clean df__ step.

In [3]:
# year_first = 2009
# year_last = 2023

# data_year_list = []
# for year_temp in range(year_first, year_last +1):
#     url_temp = f"https://www.uscis.gov/sites/default/files/document/data/h1b_datahubexport-{year_temp}.csv"
#     data_year = pd.read_csv(url_temp)
#     data_year = data_year.rename(columns={'Initial Approvals':'Initial Approval',
#                                          'Initial Denials':'Initial Denial',
#                                          'Continuing Approvals':'Continuing Approval',
#                                          'Continuing Denials':'Continuing Denial'})
#     data_year_list.append(data_year)
    


In [None]:
# Base URL for the raw files on your GitHub repository
github_repo_base_url = "https://raw.githubusercontent.com/JohnBroberg/H1B_Hub/refs/heads/main/data/"

# Initialize an empty list to store DataFrames
data_year_list = []

# Loop through the years and process each file
year_first = 2009
year_last = 2024  # Adjusted to include 2024
for year_temp in range(year_first, year_last + 1):
    file_name = f"Employer_Information_{year_temp}.csv"
    file_url = f"{github_repo_base_url}{file_name}"
    
    try:
        # Read the CSV file directly from GitHub
        data_year = pd.read_csv(file_url, encoding="utf-16", sep="\t", on_bad_lines="warn")
        
        # Rename columns for consistency
        data_year = data_year.rename(columns={
            'Initial Approvals': 'Initial Approval',
            'Initial Denials': 'Initial Denial',
            'Continuing Approvals': 'Continuing Approval',
            'Continuing Denials': 'Continuing Denial'
        })
        
        # Append the DataFrame to the list
        data_year_list.append(data_year)
        print(f"Processed: {file_url}")
    except Exception as e:
        print(f"Failed to process: {file_url}. Error: {e}")

# Combine all DataFrames into a single DataFrame
df = pd.concat(data_year_list, ignore_index=True)

# Clean column names by stripping extra spaces
df.columns = df.columns.str.strip()

# Output the combined DataFrame (optional)
print(f"Combined DataFrame has {len(df)} rows.")


## Concat into single dataframe (df)

In [5]:
# df = df.reset_index(drop=True)
df = df.drop(columns=["Line by line"])
df

Unnamed: 0,Fiscal Year,Employer (Petitioner) Name,Tax ID,Industry (NAICS) Code,Petitioner City,Petitioner State,Petitioner Zip Code,Initial Approval,Initial Denial,Continuing Approval,Continuing Denial
0,2009,,,,FLUSHING,NY,11355.0,0,0,0,1
1,2009,,,,LOS ANGELES,CA,90013.0,1,0,0,0
2,2009,,,,MIAMI,FL,33157.0,1,0,0,0
3,2009,,,,NEWARK,DE,19702.0,0,0,1,0
4,2009,,,,POTOMAC,MD,20854.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
955344,2024,ZYON DIGITAL SOLUTIONS LLC,3124.0,"54 - Professional, Scientific, and Technical S...",EL PASO,TX,79911.0,1,0,2,0
955345,2024,ZYSCOVICH LLC,4852.0,"54 - Professional, Scientific, and Technical S...",MIAMI,FL,33132.0,1,0,0,0
955346,2024,ZYTUS INC,96.0,"54 - Professional, Scientific, and Technical S...",HENRICO,VA,23238.0,1,0,0,0
955347,2024,ZYTUS INC,96.0,"54 - Professional, Scientific, and Technical S...",RICHMOND,VA,23238.0,1,0,0,0


In [6]:
df.info()

#955349  rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 955349 entries, 0 to 955348
Data columns (total 11 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Fiscal Year                 955349 non-null  int64  
 1   Employer (Petitioner) Name  955184 non-null  object 
 2   Tax ID                      943370 non-null  float64
 3   Industry (NAICS) Code       841767 non-null  object 
 4   Petitioner City             955323 non-null  object 
 5   Petitioner State            955163 non-null  object 
 6   Petitioner Zip Code         955164 non-null  float64
 7   Initial Approval            955349 non-null  object 
 8   Initial Denial              955349 non-null  object 
 9   Continuing Approval         955349 non-null  object 
 10  Continuing Denial           955349 non-null  object 
dtypes: float64(2), int64(1), object(8)
memory usage: 80.2+ MB


## Clean df

### Remove commas from numeric values

In [7]:
cols = ['Initial Approval', 'Initial Denial', 'Continuing Approval', 'Continuing Denial']
df[cols]=df[cols].replace(',','', regex=True)  

### Convert data types

In [8]:
df=df.astype({'Fiscal Year':'str',
              'Initial Approval':'int32',
              'Initial Denial':'int32',
              'Continuing Approval':'int32',
              'Continuing Denial':'int32',
              'Industry (NAICS) Code':'str',
#              'Employer':'str',
#              'Tax ID':'str',
             'Petitioner Zip Code':'str'})  

df['Tax ID'] = pd.to_numeric(df['Tax ID'], errors='coerce').fillna(0).astype(int).astype(str)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 955349 entries, 0 to 955348
Data columns (total 11 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   Fiscal Year                 955349 non-null  object
 1   Employer (Petitioner) Name  955184 non-null  object
 2   Tax ID                      955349 non-null  object
 3   Industry (NAICS) Code       955349 non-null  object
 4   Petitioner City             955323 non-null  object
 5   Petitioner State            955163 non-null  object
 6   Petitioner Zip Code         955349 non-null  object
 7   Initial Approval            955349 non-null  int32 
 8   Initial Denial              955349 non-null  int32 
 9   Continuing Approval         955349 non-null  int32 
 10  Continuing Denial           955349 non-null  int32 
dtypes: int32(4), object(7)
memory usage: 65.6+ MB


In [9]:
df.columns

Index(['Fiscal Year', 'Employer (Petitioner) Name', 'Tax ID',
       'Industry (NAICS) Code', 'Petitioner City', 'Petitioner State',
       'Petitioner Zip Code', 'Initial Approval', 'Initial Denial',
       'Continuing Approval', 'Continuing Denial'],
      dtype='object')

In [10]:
df.head()

Unnamed: 0,Fiscal Year,Employer (Petitioner) Name,Tax ID,Industry (NAICS) Code,Petitioner City,Petitioner State,Petitioner Zip Code,Initial Approval,Initial Denial,Continuing Approval,Continuing Denial
0,2009,,0,,FLUSHING,NY,11355.0,0,0,0,1
1,2009,,0,,LOS ANGELES,CA,90013.0,1,0,0,0
2,2009,,0,,MIAMI,FL,33157.0,1,0,0,0
3,2009,,0,,NEWARK,DE,19702.0,0,0,1,0
4,2009,,0,,POTOMAC,MD,20854.0,1,0,0,0


In [11]:
df.describe()

Unnamed: 0,Initial Approval,Initial Denial,Continuing Approval,Continuing Denial
count,955349.0,955349.0,955349.0,955349.0
mean,1.960923,0.201652,3.683525,0.185565
std,31.415016,5.558547,68.663044,7.493953
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,0.0
75%,1.0,0.0,1.0,0.0
max,9179.0,3067.0,25200.0,3693.0


In [12]:
df

Unnamed: 0,Fiscal Year,Employer (Petitioner) Name,Tax ID,Industry (NAICS) Code,Petitioner City,Petitioner State,Petitioner Zip Code,Initial Approval,Initial Denial,Continuing Approval,Continuing Denial
0,2009,,0,,FLUSHING,NY,11355.0,0,0,0,1
1,2009,,0,,LOS ANGELES,CA,90013.0,1,0,0,0
2,2009,,0,,MIAMI,FL,33157.0,1,0,0,0
3,2009,,0,,NEWARK,DE,19702.0,0,0,1,0
4,2009,,0,,POTOMAC,MD,20854.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
955344,2024,ZYON DIGITAL SOLUTIONS LLC,3124,"54 - Professional, Scientific, and Technical S...",EL PASO,TX,79911.0,1,0,2,0
955345,2024,ZYSCOVICH LLC,4852,"54 - Professional, Scientific, and Technical S...",MIAMI,FL,33132.0,1,0,0,0
955346,2024,ZYTUS INC,96,"54 - Professional, Scientific, and Technical S...",HENRICO,VA,23238.0,1,0,0,0
955347,2024,ZYTUS INC,96,"54 - Professional, Scientific, and Technical S...",RICHMOND,VA,23238.0,1,0,0,0


In [13]:
df.isna().sum()

Fiscal Year                     0
Employer (Petitioner) Name    165
Tax ID                          0
Industry (NAICS) Code           0
Petitioner City                26
Petitioner State              186
Petitioner Zip Code             0
Initial Approval                0
Initial Denial                  0
Continuing Approval             0
Continuing Denial               0
dtype: int64

Electing not to clean State na's, because only 40 fixable State values, with max Approvals = 14

In [14]:
#df[(pd.isna(df['State'])) & pd.notna(df['ZIP'])]#.info()  

## Replace df.Employer values to consolidate employers with multiple company names

In [15]:
# Facebook changes company name to Meta
# https://www.cnbc.com/2021/10/28/facebook-changes-company-name-to-meta.html

#Excel function =CONCAT(", ", "'",A2, "':'",$A$1,"'")

dict_emp = {'FACEBOOK INC':'META PLATFORMS INC'
            , 'FACEBOOK SERVICES INC':'META PLATFORMS INC'
            , 'FACEBOOK SVCS INC':'META PLATFORMS INC'
            , 'FACEBOOK PAYMENTS INC':'META PLATFORMS INC'
            , 'FACEBOOK MIAMI INC':'META PLATFORMS INC'
            , 'FACEBOOKSTER INC DBA AVENUESO':'META PLATFORMS INC'


            , 'AMAZON CORPORATE LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON WEB SERVICES INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM SERVICES LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM SERVICES INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON FULFILLMENT SERVICES INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON DEVELOPMENT CENTER US INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON DATA SERVICES INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON GLOBAL RESOURCES INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON ADVERTISING LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON ROBOTICS LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM.DEDC LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON SERVICES LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM.KYDC INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON COM SERVICES INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON PAYMENTS INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON WEB SERVICES LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM.KYDC LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON MECHANICAL TURK INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM DEDC LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON CAPITAL SERVICES INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON SVCS LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM KYDC LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON COM INDC LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON MEDIA GROUP LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON REGISTRY SERVICES INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON COM AZDC LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON MEDIA GROUP':'AMAZON COM SERVICES LLC'
            , 'AMAZON DIGITAL SVCS INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON TECHNOLOGIES INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON MEDIA GROUP LLC D/B/A AMAZO':'AMAZON COM SERVICES LLC'
            , 'AMAZON DIGITAL SERVICES INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON COM SVCS INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON STUDIOS LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON FULFILLMENT SVCS INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON COM DEDC LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON COM NVDC INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON MEDIA VENTURE LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON WEB SVCS INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON COM KYDC LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM.CA INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON FRESH LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON COM CA INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON PRODUCE NETWORK LLP':'AMAZON COM SERVICES LLC'
            , 'AMAZON COM KSDC LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON CAPITAL SVCS INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON PHARMACY INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON DIGITAL SERVICES LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON COM AZDC INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM SVCS LLC F/K/A AMAZON.C':'AMAZON COM SERVICES LLC'
            , 'AMAZON COM SVCS INC F/K/A AMAZON F':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM KYDC INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON REGISTRY SVCS INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM SERVICES LLC F/K/A AMAZON.COM SERVICES INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON RETAIL LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM.AZDC LLC':'AMAZON COM SERVICES LLC'
            , 'A2Z DVLP CTR INC DBA AMAZON MUSIC':'AMAZON COM SERVICES LLC'
            , 'AMAZON PRODUCE NETWORK LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON CORP LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM SERVICES LLC F/K/A AMAZ':'AMAZON COM SERVICES LLC'
            , 'AMAZON CARGO INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON. COM SEVS INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON COM SVCS LLC FKA AMAZON':'AMAZON COM SERVICES LLC'
            , 'AMAZON MEDIA GRP LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON COM SVCS LLC F/K/A AMAZON C':'AMAZON COM SERVICES LLC'
            , 'AMAZON STUDIOS INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM.NVDC INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON COM SERVICES INC FORMERLY K':'AMAZON COM SERVICES LLC'
            , 'AMAZON DEVELOPMENT CTR U S INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON SOLUTIONS, INC.':'AMAZON COM SERVICES LLC'
            , 'AMAZON WEB SERVICE INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM SERVICES SERVICES INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON.COM.INDC LLC':'AMAZON COM SERVICES LLC'
            , 'AMAZON WEB SREVICES INC':'AMAZON COM SERVICES LLC'
            , 'AMAZON LOGISTICS GROUP LLC':'AMAZON COM SERVICES LLC'

            , 'GOOGLE INC':'GOOGLE LLC'

            , 'COGNIZANT TECH SOLUTIONS US CORP':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECHNOLOGY SOLUTIONS US':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECHNOLOGY SOLNS US CORP':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECHNOLOGY SOLUTIONS US CORP':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TRIZETTO SOFTWARE GROUP':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT WORLDWIDE LIMITED':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECH SOLNS US CORP ON BE':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TRIZETTO SOFTWARE GR INC':'COGNIZANT TECH SOLNS US CORP'
            , 'TRIZETTO CORP A COGNIZANT COMPANY':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TRIZETTO SOFTWARE GRP IN':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT WORLDWIDE LIMITED COGNIZ':'COGNIZANT TECH SOLNS US CORP'
            , 'TMG HEALTH A COGNIZANT COMPANY':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECH SOLUTION US CORP':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT WORLDWIDE LIMITED COGNIZANT':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECHNOLOGY SOLUTIONS':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECHSOLUTIONS U S CORP':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECH SOLUTIONS CORP':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECH SOLUTIONS US CO':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT IT PROFESSIONALS INC':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT CONSULTING CORP':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECHNOLOGY':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECHNOLOGY SOLN US CORP':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECH SOLUTIONS  US CORP':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECH SOLNS SR US CORP':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECH SOLNS CORP':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANTS TECH SOLUTIONS US CORP':'COGNIZANT TECH SOLNS US CORP'
            , 'ITAAS INC ITAAS C/O COGNIZANT TECH':'COGNIZANT TECH SOLNS US CORP'
            , 'ITAAS INC COGNIZANT TECHNOLOGY SOL':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECHNOLOGY SOLUTIONS GRP':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TECHNOLOGY SOLUTIONS UC':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT MOBILITY INC':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT WORLDWIDE LTD COGNIZANT':'COGNIZANT TECH SOLNS US CORP'
            , 'TMG HEALTH A COGNIZANT CO-HQ 25 LA':'COGNIZANT TECH SOLNS US CORP'
            , 'TMG HEALTH INC A COGNIZANT COMPANY - HQ':'COGNIZANT TECH SOLNS US CORP'
            , 'COGNIZANT TRIZETTO SOFTWARE GROUP INC CTSG':'COGNIZANT TECH SOLNS US CORP'
            , 'TMG HEALTH INC A COGNIZANT COMPANY':'COGNIZANT TECH SOLNS US CORP'

            , 'TATA CONSULTANCY SERVICES LIMITED':'TATA CONSULTANCY SVCS LTD'
            , 'TATA TECHNOLOGIES INC':'TATA CONSULTANCY SVCS LTD'
            , 'TATA ELXSI LIMITED':'TATA CONSULTANCY SVCS LTD'
            , 'TATA COMMUNICATIONS AMERICA INC':'TATA CONSULTANCY SVCS LTD'
            , 'TATA AMERICA INTERNATIONAL CORP':'TATA CONSULTANCY SVCS LTD'
            , 'TATA TECHNOLOGIES INC TTI':'TATA CONSULTANCY SVCS LTD'
            , 'TATA INTERNATIONAL CORPORATION':'TATA CONSULTANCY SVCS LTD'
            , 'TATA TECHS INC':'TATA CONSULTANCY SVCS LTD'
            , 'TATA COMMUNICATIONS (AMERICA) INC':'TATA CONSULTANCY SVCS LTD'
            , 'TATA INTERACTIVE SYSTEMS USA':'TATA CONSULTANCY SVCS LTD'
            , 'TATA AUTOCOMP SYSTEMS LTD':'TATA CONSULTANCY SVCS LTD'
            , 'TATA ELXSI LTD':'TATA CONSULTANCY SVCS LTD'
            , 'TATA AUTOCOMP SYSTEMS LIMITED':'TATA CONSULTANCY SVCS LTD'
            , 'TATA CONSULTANCY SVCS LIMITED':'TATA CONSULTANCY SVCS LTD'
            , 'TATA CONSULTANCY SERVICES LTD':'TATA CONSULTANCY SVCS LTD'
            , 'TATA INTERACTIVE SYSTEMS':'TATA CONSULTANCY SVCS LTD'
            , 'TATA CHEMICALS SODA ASH PARTNERS':'TATA CONSULTANCY SVCS LTD'
            , 'TATA AMERICA INTL CORP':'TATA CONSULTANCY SVCS LTD'
            , 'TATA CONSUTANCY SVCS LTD':'TATA CONSULTANCY SVCS LTD'
            , 'TATA CONSULTING ENGINEERS LTD':'TATA CONSULTANCY SVCS LTD'
            , 'TATA CHEMICALS NORTH AMERICA INC':'TATA CONSULTANCY SVCS LTD'
            , 'TATA INTL METALS AMERICAS LTD':'TATA CONSULTANCY SVCS LTD'
            , 'TATA BUSINESS SUPPORT SERVICES LTD':'TATA CONSULTANCY SVCS LTD'
            , 'TATA JOHNSON SYSTEMS ENGINEERING L':'TATA CONSULTANCY SVCS LTD'
            , 'TATA AMERICA INTL CORPORATION':'TATA CONSULTANCY SVCS LTD'
            , 'TATA COSULTANCY SVCS LTD':'TATA CONSULTANCY SVCS LTD'
            , 'TATA STEEL INTL AMERICAS INC':'TATA CONSULTANCY SVCS LTD'
            , 'TATA BUSINESS SUPPORT SVCS LIMITED':'TATA CONSULTANCY SVCS LTD'
            , 'TATA AM INTL CORP DBA TCS AMERICA':'TATA CONSULTANCY SVCS LTD'
            , 'TATA ENTERPRISES INC':'TATA CONSULTANCY SVCS LTD'
            , 'TATA STEEL INTERNATIONAL NORTH AME':'TATA CONSULTANCY SVCS LTD'
            , 'TATA E-SERVE INTERNATIONAL LIMITED':'TATA CONSULTANCY SVCS LTD'
            , 'TATA CHEMICALS NORTH AMERICA':'TATA CONSULTANCY SVCS LTD'
            , 'TATA AUTOCOMP SYSTEMS NA':'TATA CONSULTANCY SVCS LTD'
            , 'TATA CONSULTANCY SVCS LTE':'TATA CONSULTANCY SVCS LTD'
            , 'TATA COMMUNICATIONS SVC AMERICA IN':'TATA CONSULTANCY SVCS LTD'
            , 'TATA CONCULTANCY SVCS LTD':'TATA CONSULTANCY SVCS LTD'

            , 'WIPRO VLSI DESIGN SERVICES LLC':'WIPRO LIMITED'
            , 'WIPRO LTD':'WIPRO LIMITED'
            , 'WIPRO APPIRIO INC':'WIPRO LIMITED'
            , 'WIPRO TECHNOLOGIES':'WIPRO LIMITED'
            , 'WIPROSOLUTIONS INC':'WIPRO LIMITED'
            , 'WIPRO LIMTED':'WIPRO LIMITED'
            , 'WIPRO DATA CTR & CLOUD SERVICES IN':'WIPRO LIMITED'
            , 'WIPRO LLC':'WIPRO LIMITED'

            , 'INFOSYS LTD':'INFOSYS LIMITED'
            , 'INFOSYS TECHNOLOGIES LIMITED':'INFOSYS LIMITED'
            , 'INFOSYS SOLUTIONS INC':'INFOSYS LIMITED'
            , 'INFOSYS BPO LIMITED':'INFOSYS LIMITED'
            , 'AMERICAN INFOSYS INC':'INFOSYS LIMITED'
            , 'PROSPECT INFOSYS INC':'INFOSYS LIMITED'
            , 'INFOSYS PUBLIC SERVICES':'INFOSYS LIMITED'
            , 'INFOSYS BPM LIMITED':'INFOSYS LIMITED'
            , 'SIERRA INFOSYS INC':'INFOSYS LIMITED'
            , 'SRIVEN INFOSYS INC':'INFOSYS LIMITED'
            , 'INFOSYS CONSULTING INC':'INFOSYS LIMITED'
            , 'INFOSYS PUBLIC SERVICES INC':'INFOSYS LIMITED'
            , 'INFOSYS BPO LTD':'INFOSYS LIMITED'
            , 'INFOSYS MCCAMISH SYSTEMS LLC':'INFOSYS LIMITED'
            , 'VECTRA INFOSYS INC':'INFOSYS LIMITED'
            , 'B2B INFOSYS INC':'INFOSYS LIMITED'
            , 'PLATINUM INFOSYS INC':'INFOSYS LIMITED'
            , 'SUNRAY INFOSYS INCORPORATION':'INFOSYS LIMITED'
            , 'ICONIC INFOSYS INC':'INFOSYS LIMITED'
            , 'INFOSYS INTERNATIONAL INC':'INFOSYS LIMITED'
            , 'SUNRAY INFOSYS INC':'INFOSYS LIMITED'
            , 'CUBE INFOSYS LLC':'INFOSYS LIMITED'
            , 'LN INFOSYS INC':'INFOSYS LIMITED'
            , 'RAVE INFOSYS INC':'INFOSYS LIMITED'

            , 'ACCENTURE TECHNOLOGY SOLUTIONS':'ACCENTURE LLP'
            , 'ACCENTURE TECH SOLUTIONS':'ACCENTURE LLP'
            , 'ACCENTURE INC':'ACCENTURE LLP'
            , 'ACCENTURE GLOBAL INC':'ACCENTURE LLP'
            , 'ACCENTURE LLP DBA IMAGINEA TECH':'ACCENTURE LLP'

            , 'HCL GLOBAL SYSTEMS INC':'HCL AMERICA INC'
            , 'HCL AMERICA SOLUTIONS INC':'HCL AMERICA INC'
            , 'HCL SYSTEMS INC':'HCL AMERICA INC'
            , 'HCL INFOSERV INC':'HCL AMERICA INC'
            , 'HCL INFOSERV INC DBA HBL INFOSERV':'HCL AMERICA INC'
            , 'HCL GLOBAL SYS INC':'HCL AMERICA INC'

            , 'DELOITTE & TOUCHE LLP':'DELOITTE CONSULTING LLP'
            , 'DELOITTE TAX LLP':'DELOITTE CONSULTING LLP'
            , 'DELOITT& TOUCHE LLP':'DELOITTE CONSULTING LLP'
            , 'DELOITTE TRANSACTIONS & BUS ANALYT':'DELOITTE CONSULTING LLP'
            , 'DELOITTE SERVICES LP':'DELOITTE CONSULTING LLP'
            , 'DELOITTE TOUCHE TOHMATSU SVCS INC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE FINANCIAL ADVISORY SVCS L':'DELOITTE CONSULTING LLP'
            , 'DELOITTE TOUCHE TOHMATSU SVCS LLC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE TOUCHE TOHMATSU SERVICES LLC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE LLP':'DELOITTE CONSULTING LLP'
            , 'DELOITTE TRANSACTIONS AND BUSINESS ANALYTICS LLP':'DELOITTE CONSULTING LLP'
            , 'DELOITTE FINANCIAL ADVISORY SVCS':'DELOITTE CONSULTING LLP'
            , 'DELOITTE CONSULTING EXTENDED B':'DELOITTE CONSULTING LLP'
            , 'DELOITTE SVCS LP':'DELOITTE CONSULTING LLP'
            , 'DELOITTE FIN ADVISORY SVCS LLP':'DELOITTE CONSULTING LLP'
            , 'DELOITTE TOUCHE TOHMATSU SVCS IN':'DELOITTE CONSULTING LLP'
            , 'DELOITTE CONSULTING EXTENDED':'DELOITTE CONSULTING LLP'
            , 'DELOITTE CORPORATE FINANCE LLC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE & TOUCHE LLC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE CONSULTING OVERSEAS PROJE':'DELOITTE CONSULTING LLP'
            , 'DELOITTE ANALYTICS LLC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE CONSULTING OVERSEAS':'DELOITTE CONSULTING LLP'
            , 'DELOITTE FINANCIAL ADVISORY SERVICES LLP':'DELOITTE CONSULTING LLP'
            , 'DELOITTE MARKETPOINT LLC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE LLP-DELOITTE LLP':'DELOITTE CONSULTING LLP'
            , 'DELOITTE FIN ADVISORY SERVICES LLP':'DELOITTE CONSULTING LLP'
            , 'DELOITTE TOUCHE TOHMATSU SERVICES':'DELOITTE CONSULTING LLP'
            , 'DELOITTE SERVICES LLP':'DELOITTE CONSULTING LLP'
            , 'DELOITTE CONSULTING':'DELOITTE CONSULTING LLP'
            , 'DELOITTE CONS OVERSEAS PROJECTS LL':'DELOITTE CONSULTING LLP'
            , 'DELOITTE CONSULTING OVERSEAS SVCS':'DELOITTE CONSULTING LLP'
            , 'DELOITTE INVESTMENTS LLC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE TRANSACTION & BUS ANALYTI':'DELOITTE CONSULTING LLP'
            , 'DELOITTE COSULTING LLP':'DELOITTE CONSULTING LLP'
            , 'DELOITTE CONSULTING OVERS':'DELOITTE CONSULTING LLP'
            , 'DELOITTE FINANCIAL ADVISORY SERVIC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE & TOUCHE OVERSEAS PROJECT':'DELOITTE CONSULTING LLP'
            , 'DELOITTE TAX OVERSEAS SVCS LLC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE TOUCHE LLP':'DELOITTE CONSULTING LLP'
            , 'DELOITTE TRANSACTIONS AND BUSINESS':'DELOITTE CONSULTING LLP'
            , 'DELOITTE TOUCHE OVERSEAS PROJECTS':'DELOITTE CONSULTING LLP'
            , 'DELOITTE CONS OVERSEAS PROJECT LLC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE TAX OVERSEAS SERVICES LLC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE & TOUCHE USA OVERSEAS SVC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE SVCS OVERSEAS SVCS LLC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE CORP FINANCE LLC':'DELOITTE CONSULTING LLP'
            , 'DELOITTE & TOUCHE OVERSEAS SVCS LL':'DELOITTE CONSULTING LLP'
            , 'DELOITTE FINANCIAL ADVISORY SRVCS':'DELOITTE CONSULTING LLP'

            , 'CAPGEMINI FINANCIAL SVCS USA INC':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI US LLC':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI U S LLC':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI FINANCIAL SVCS USA INC (':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI FINANCIAL SERVICES USA':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI AMERICA INC THROUGH ITS':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI GOVERNMENT SOLUTIONS LLC':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI APPLICATION SERVICES LLC':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI FINANCIAL SVCS USA INC L':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI FINANCIAL SERVICES USA I':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI APPLICATIONS SVCS LLC':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI ENERGY LP':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI AMERICAS INC':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI GOVT SOLUTIONS LLC':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI FIN. SERVICES USA INC':'CAPGEMINI AMERICA INC'
            , 'CAPGEMINI GOVERMENT SOLUTIONS LLC':'CAPGEMINI AMERICA INC'

            , 'TECH MAHINDRA (AMERICAS) INC':'TECH MAHINDRA AMERICAS INC'
            , 'TECH MAHINDRA TECHNOLOGIES INC':'TECH MAHINDRA AMERICAS INC'
            , 'TECH MAHINDRA INC':'TECH MAHINDRA AMERICAS INC'
            , 'TECH MAHINDRA NETWORK SERVICES INT':'TECH MAHINDRA AMERICAS INC'
            , 'TECH MAHINDRA NETWORK SVCS INTL IN':'TECH MAHINDRA AMERICAS INC'
            , 'TECH MAHINDRA CERIUM SYSTEMS INC':'TECH MAHINDRA AMERICAS INC'
            , 'TECH MAHINDRA TECHNOLOGIES IN':'TECH MAHINDRA AMERICAS INC'
            , 'TECH MAHINDRA NETWORK SERVICES INTERNATIONAL INC FORMERLY LLC INTERNATIONAL INC':'TECH MAHINDRA AMERICAS INC'
            , 'TECH MAHINDRA NETWORK SERVICES':'TECH MAHINDRA AMERICAS INC'
            , 'TECH MAHINDRA NETWORK SVCS INTL':'TECH MAHINDRA AMERICAS INC'
            , 'TECH MAHINDRA NTWRK SVCS INTL INC':'TECH MAHINDRA AMERICAS INC'
            , 'TECH MAHINDRAS AMERICAS INC':'TECH MAHINDRA AMERICAS INC'
            , 'TECH MAHINDRA R & D SERVICES INC':'TECH MAHINDRA AMERICAS INC'
            , 'TECH MAHINDRA AMERICANS INC':'TECH MAHINDRA AMERICAS INC'

            , 'IBM INDIA PRIVATE LIMITED':'IBM CORPORATION'
            , 'IBM INDIA PVT LTD':'IBM CORPORATION'
            , 'IBM CORP':'IBM CORPORATION'
            , 'IBM GLOBAL SYSTEMS DBA JOLT TECHS':'IBM CORPORATION'
            , 'IBM GLOBAL SYSTEMS INC DBA JOLT TE':'IBM CORPORATION'
            , 'IBM GLOBAL SYSS INC DBA JOLT TECHS':'IBM CORPORATION'
            , 'IBM INDIA PRIVATE LTD':'IBM CORPORATION'
            , 'IBM GLOBAL SYSTEMS INC':'IBM CORPORATION'
            , 'IBM GLOBAL SYSS INC DBA JOLT TECH':'IBM CORPORATION'
            , 'IBM SOUTHEAST EMPLOYEES FEDERAL':'IBM CORPORATION'
            , 'SOFTLAYER AN IBM COMPANY':'IBM CORPORATION'
            , 'UNICA CORPORATION AN IBM COMPANY':'IBM CORPORATION'
            , 'IBM INDIAN PRIVATE LIMITED':'IBM CORPORATION'
            , 'IBM GLOBAL SYSTEMS INC DBA JOLT':'IBM CORPORATION'
            , 'IBM INDIA PRIVATE LIMTIED':'IBM CORPORATION'
            , 'IBM INDIA PRIVATE LITD':'IBM CORPORATION'
            , 'IBM':'IBM CORPORATION'
            , 'KENEXA TECHNOLOGY INC/IBM':'IBM CORPORATION'


            , 'MICROSOFT CORP':'MICROSOFT CORPORATION'
            , 'MICROSOFT LICENSING GP':'MICROSOFT CORPORATION'
            , 'MICROSOFT ONLINE INC':'MICROSOFT CORPORATION'
            , 'MICROSOFT OPS PUERTO RICO LLC':'MICROSOFT CORPORATION'
            , 'MICROSOFT PAYMENTS INC':'MICROSOFT CORPORATION'
            , 'MICROSOFT':'MICROSOFT CORPORATION'
            , 'MICROSOFT CARIBBEAN INC':'MICROSOFT CORPORATION'
            , 'MICROSOFT CORPORATIOIN':'MICROSOFT CORPORATION'
            , 'MICROSOFT OPEN TECHNOLOGIES INC':'MICROSOFT CORPORATION'
            , 'MICROSOFT CORPORTION':'MICROSOFT CORPORATION'

            , 'ERNST YOUNG US LLP':'ERNST & YOUNG US LLP'
            , 'ERNST & YOUNG LLP':'ERNST & YOUNG US LLP'
            , 'ERNST & YOUNG U S LLP':'ERNST & YOUNG US LLP'
            , 'ERNST & YOUNG LLP EY GUAM':'ERNST & YOUNG US LLP'
            , 'ERNST YOUNG U S LLP':'ERNST & YOUNG US LLP'
            , 'ERNST & YOUNG LLP CNMI INC':'ERNST & YOUNG US LLP'
            , 'ERNST & YOUNG CNMI INC':'ERNST & YOUNG US LLP'
            , 'ERNST & YOUNG EY GUAM':'ERNST & YOUNG US LLP'
            , 'ERNST YOUNG US CORP LLP':'ERNST & YOUNG US LLP'
            , 'ERNST YOUNG LLP':'ERNST & YOUNG US LLP'
            , 'ERNST AND YOUNG US LLP':'ERNST & YOUNG US LLP'

           }

df["Employer (Petitioner) Name"] = df["Employer (Petitioner) Name"].replace(dict_emp)  

In [16]:
#df['Fiscal Year'] = df['Fiscal Year'].replace('2023', '2003 Q1')


In [17]:
df

Unnamed: 0,Fiscal Year,Employer (Petitioner) Name,Tax ID,Industry (NAICS) Code,Petitioner City,Petitioner State,Petitioner Zip Code,Initial Approval,Initial Denial,Continuing Approval,Continuing Denial
0,2009,,0,,FLUSHING,NY,11355.0,0,0,0,1
1,2009,,0,,LOS ANGELES,CA,90013.0,1,0,0,0
2,2009,,0,,MIAMI,FL,33157.0,1,0,0,0
3,2009,,0,,NEWARK,DE,19702.0,0,0,1,0
4,2009,,0,,POTOMAC,MD,20854.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
955344,2024,ZYON DIGITAL SOLUTIONS LLC,3124,"54 - Professional, Scientific, and Technical S...",EL PASO,TX,79911.0,1,0,2,0
955345,2024,ZYSCOVICH LLC,4852,"54 - Professional, Scientific, and Technical S...",MIAMI,FL,33132.0,1,0,0,0
955346,2024,ZYTUS INC,96,"54 - Professional, Scientific, and Technical S...",HENRICO,VA,23238.0,1,0,0,0
955347,2024,ZYTUS INC,96,"54 - Professional, Scientific, and Technical S...",RICHMOND,VA,23238.0,1,0,0,0


## Import NAICS table

"NAICS code 99 means the industry is unknown. 
Any petition that had a blank code was assigned as 99 as well." (ref. README.md)

In [18]:

# #NAICS codes: https://www.census.gov/naics/?58967?yearbck=2022

# url = "https://www.census.gov/programs-surveys/economic-census/guidance/understanding-naics.html"

# df_naics = pd.read_html(url, header=0)[1]


# #df_naics = df_naics.append({'Sector':'99','Description':'unknown'}, ignore_index=True)

# df_temp = pd.DataFrame(data={'Sector':['99'],'Description':['unknown']})

# df_naics = pd.concat([df_naics, df_temp])

# df_naics

In [19]:
# df_naics.info()

In [20]:
# df.NAICS.sort_values().unique()

## Continuing Approvals by NAICS code

In [21]:
# df[['NAICS','Continuing Approval']].groupby('NAICS').sum()\
# .sort_values(by='NAICS', ascending=True)


## Replace df.NAICS values to match NAICS table

In [22]:
# dict_naics = {'31':'31-33', '32':'31-33', '33':'31-33'
#                 , '44':'44-45', '45':'44-45'
#                 , '48':'48-49', '49':'48-49'}

# df.NAICS=df.NAICS.replace(dict_naics)  

In [23]:
# df.NAICS.sort_values().unique()

## Continuing Approvals by NAICS code

In [24]:
# df[['NAICS','Continuing Approval']].groupby('NAICS').sum()\
# .sort_values(by='NAICS', ascending=True)


## MERGE NAICS Description to df

In [25]:
# df = pd.merge(df, df_naics, how='left', left_on='NAICS', right_on='Sector', suffixes=(False, False))
# df

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 955349 entries, 0 to 955348
Data columns (total 11 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   Fiscal Year                 955349 non-null  object
 1   Employer (Petitioner) Name  955184 non-null  object
 2   Tax ID                      955349 non-null  object
 3   Industry (NAICS) Code       955349 non-null  object
 4   Petitioner City             955323 non-null  object
 5   Petitioner State            955163 non-null  object
 6   Petitioner Zip Code         955349 non-null  object
 7   Initial Approval            955349 non-null  int32 
 8   Initial Denial              955349 non-null  int32 
 9   Continuing Approval         955349 non-null  int32 
 10  Continuing Denial           955349 non-null  int32 
dtypes: int32(4), object(7)
memory usage: 65.6+ MB


In [27]:
df.isna().sum()

Fiscal Year                     0
Employer (Petitioner) Name    165
Tax ID                          0
Industry (NAICS) Code           0
Petitioner City                26
Petitioner State              186
Petitioner Zip Code             0
Initial Approval                0
Initial Denial                  0
Continuing Approval             0
Continuing Denial               0
dtype: int64

## Melt four Decision measures into single Petition measure column with Descision dimension column  
_Commented out because resulting compressed CSV is too large for Github's 50MB file size limit_

In [28]:
#df = pd.melt(df, id_vars=['Fiscal Year', 'Employer', 'NAICS', 'Tax ID', 'State', 'City', 'ZIP', 'Sector', 'Description'],
#       value_vars=['Initial Approval', 'Initial Denial', 'Continuing Approval', 'Continuing Denial', ],
#       var_name='Decision', value_name='Petitions')

In [29]:
df

Unnamed: 0,Fiscal Year,Employer (Petitioner) Name,Tax ID,Industry (NAICS) Code,Petitioner City,Petitioner State,Petitioner Zip Code,Initial Approval,Initial Denial,Continuing Approval,Continuing Denial
0,2009,,0,,FLUSHING,NY,11355.0,0,0,0,1
1,2009,,0,,LOS ANGELES,CA,90013.0,1,0,0,0
2,2009,,0,,MIAMI,FL,33157.0,1,0,0,0
3,2009,,0,,NEWARK,DE,19702.0,0,0,1,0
4,2009,,0,,POTOMAC,MD,20854.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
955344,2024,ZYON DIGITAL SOLUTIONS LLC,3124,"54 - Professional, Scientific, and Technical S...",EL PASO,TX,79911.0,1,0,2,0
955345,2024,ZYSCOVICH LLC,4852,"54 - Professional, Scientific, and Technical S...",MIAMI,FL,33132.0,1,0,0,0
955346,2024,ZYTUS INC,96,"54 - Professional, Scientific, and Technical S...",HENRICO,VA,23238.0,1,0,0,0
955347,2024,ZYTUS INC,96,"54 - Professional, Scientific, and Technical S...",RICHMOND,VA,23238.0,1,0,0,0


## Export df to compressed .csv

In [30]:
compression_opts = dict(method='zip', archive_name='h1b_hub.csv')  

df.to_csv('h1b_hub_data.zip', index=False, compression=compression_opts)  

