In [25]:
## IMPORTS
## helpful packages
import pandas as pd
import numpy as np
import random
import re
import recordlinkage

## repeated printouts: with ast_node_interactivity set to "all", every top-level
## expression in a cell is displayed, not only the last one (the cells below rely
## on this to show more than one table per cell)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [26]:
## H-2A applications: read from the local Excel workbook
h2a = pd.read_excel("./my_data/h2a_2018.xlsx")

## WHD investigations/violations: read the zipped CSV straight from the DOL site
## (column 7 is forced to the pandas 'string' dtype)
url = "https://enfxfr.dol.gov/data_catalog/WHD/whd_whisard_20210415.csv.zip"
investigations = pd.read_csv(url, index_col=None, dtype={7: 'string'})

## parse both findings dates as datetimes; values that cannot be parsed become NaT
for date_col in ('findings_start_date', 'findings_end_date'):
    investigations[date_col] = pd.to_datetime(investigations[date_col], errors='coerce')

In [27]:
## this function will pull out the certification status from a given h2a application
def find_status(one):
    """Return the text that follows the first "- " in a CASE_STATUS value.

    Parameters
    ----------
    one : object
        A single CASE_STATUS cell; it is converted with ``str()`` before matching,
        e.g. "DETERMINATION ISSUED - CERTIFICATION".

    Returns
    -------
    str or None
        Everything after the first hyphen-plus-whitespace separator
        ("CERTIFICATION" for the example above), or None when the value has no
        such separator (for instance a missing value, which stringifies to "nan").
    """
    ## search() returns the same leftmost match findall()[0] would, but lets us
    ## handle "no match" explicitly instead of raising IndexError on found[0]
    match = re.search(r'\-\s(.*)$', str(one))
    if match is None:
        return None
    return match.group(1)

## extract the status of every application into a new "status" column
h2a["status"] = list(map(find_status, h2a["CASE_STATUS"]))

## keep only applications that were certified, fully or partially
is_certified = h2a["status"] == "CERTIFICATION"
is_partially_certified = h2a["status"] == "PARTIAL CERTIFICATION"
approved_only = h2a[is_certified | is_partially_certified].copy()

In [28]:
## baseline: inner-join the approved applications to the investigations on the
## raw, uncleaned employer name to see how many names match exactly
merge_attempt = approved_only.merge(
    investigations,
    how="inner",
    left_on="EMPLOYER_NAME",
    right_on="legal_name",
    indicator="name_merge_status",
)

## for display only: one row per distinct (employer name, legal name, case status)
without_dups = merge_attempt.loc[:, ["EMPLOYER_NAME", "legal_name", "CASE_STATUS"]].drop_duplicates()
without_dups

Unnamed: 0,EMPLOYER_NAME,legal_name,CASE_STATUS
0,Sandy Webster,Sandy Webster,DETERMINATION ISSUED - CERTIFICATION
1,Warren Bailey,Warren Bailey,DETERMINATION ISSUED - CERTIFICATION
2,"Grand Louis Four, Inc.","Grand Louis Four, Inc.",DETERMINATION ISSUED - CERTIFICATION
3,Justin Jonas,Justin Jonas,DETERMINATION ISSUED - CERTIFICATION
4,"Olson's Greenhouse Gardens, Inc.","Olson's Greenhouse Gardens, Inc.",DETERMINATION ISSUED - CERTIFICATION
...,...,...,...
993,"KCK Farms, LLC","KCK Farms, LLC",DETERMINATION ISSUED - CERTIFICATION
994,"JFT Harvesting, Inc.","JFT Harvesting, Inc.",DETERMINATION ISSUED - CERTIFICATION
995,"Farmers Gin Company, LLC","Farmers Gin Company, LLC",DETERMINATION ISSUED - CERTIFICATION
998,Gustavo Cisneros,Gustavo Cisneros,DETERMINATION ISSUED - CERTIFICATION


In [29]:
## normalise one employer name so both sources can be compared; it is applied to
## EMPLOYER_NAME (h2a applications) and to legal_name (WHD investigations)
def clean_names(one):
    """Upper-case a name and drop the period that follows LLC, CO or INC.

    The input is converted with ``str()`` first, so a missing value (NaN)
    comes back as the literal string "NAN".
    """
    ## "\1" keeps the captured LLC/CO/INC and discards only the trailing "."
    return re.sub(r"(LLC|CO|INC)\.", r'\1', str(one).upper())

## add a cleaned "name" column to each data set
approved_only["name"] = list(map(clean_names, approved_only["EMPLOYER_NAME"]))
investigations["name"] = list(map(clean_names, investigations["legal_name"]))

## clean_names turns a missing legal_name into the string "NAN"; drop those rows
has_name = investigations["name"] != "NAN"
investigations_cleaned = investigations[has_name].copy()

In [30]:
## every named investigation with its H-2A violation count
investigations_cleaned[["name", "h2a_violtn_cnt"]]

## keep only the investigations that recorded at least one H-2A violation
has_h2a_violation = investigations_cleaned["h2a_violtn_cnt"] > 0
violations = investigations_cleaned[has_h2a_violation].copy()
violations[["name", "h2a_violtn_cnt"]]

Unnamed: 0,name,h2a_violtn_cnt
0,"RELIANT ENERGY RETAIL SERVICES, LLC",0
1,"HEALTHCARE SERVICES GROUP, INC",0
2,"CENTRAL AVENUE BAKERY, INC",0
3,CATHOLIC CHERITIES,0
5,DERRICK PLUMBING,0
...,...,...
313922,"EL TAPATIO MEXICAN RESTAURANT(LA FINCA, INC)",0
313923,WEST FLEET INC,0
313924,NESBITT FRUIT FARMS,1
313925,"MOTOS, INC",0


Unnamed: 0,name,h2a_violtn_cnt
72,"GLADES PLANTING, LLC",2
158,"KINDERHOOK CREEK FARM, INC",9
174,"BACK FORTY BLUEBERRY NURSERY, LLC",7
569,"GISI PHEASANT FARM, INC",40
910,COTTON HOPE FARMS,36
...,...,...
313892,CLARK BROTHERS NURSERY LLC,4
313894,FEENEY'S WHOLESALE NURSERY INC,10
313896,"BLOOMAKER USA, INC",52
313911,AURORA FARMS LLC,2


In [36]:
## STEP 1: BUILD THE CANDIDATE PAIRS, BLOCKING ON STATE
## (the indexer is configured with EMPLOYER_STATE on the applications side and
##  st_cd on the violations side)
link_apps_investigations = recordlinkage.Index()
link_apps_investigations.block(left_on="EMPLOYER_STATE", right_on="st_cd")
candidate_links_state = link_apps_investigations.index(approved_only, violations)

## STEP 2: DEFINE THE COMPARISON -- JARO-WINKLER SIMILARITY ON THE CLEANED NAMES
name_match_threshold = 0.85     ## similarity cut-off passed to the string comparison
compare = recordlinkage.Compare()
compare.string("name", "name", method="jarowinkler", threshold=name_match_threshold)

## STEP 3: SCORE EVERY CANDIDATE PAIR AND KEEP THE ONES FLAGGED AS A MATCH
compare_vectors = compare.compute(candidate_links_state, approved_only, violations)
compare_vectors.columns = ["Name"]              ## single comparison -> single column
is_name_match = compare_vectors["Name"] == 1
selected = compare_vectors[is_name_match].copy()

## STEP 4: PULL OUT THE INDEX VALUES SO THAT WE CAN MATCH THE CORRECT ENTRIES TO THE ORIGINAL DATA
## `selected` is indexed by (approved_only row label, violations row label) pairs;
## read each level directly instead of looping over the index one tuple at a time
n = selected.shape[0]           ## number of matched pairs
index_approved_only_values = selected.index.get_level_values(0).tolist()
index_violations_cleaned_values = selected.index.get_level_values(1).tolist()
selected["index_approved_only"] = index_approved_only_values
selected["index_violations_cleaned"] = index_violations_cleaned_values

## STEP 5: ATTACH THE H2A APPLICATION FIELDS TO EACH MATCH
## expose the row label as a regular column so it can serve as the join key
approved_only["index_approved_only"] = approved_only.index
approved_only_columns = [
    "status",
    "JOB_START_DATE",
    "JOB_END_DATE",
    "EMPLOYER_STATE",
    "name",
    "index_approved_only",
]
m1 = selected.merge(approved_only[approved_only_columns], on="index_approved_only", how="inner")

## STEP 6: ATTACH THE VIOLATION FIELDS TO THE RESULT OF STEP 5
violations["index_violations_cleaned"] = violations.index
violations_cleaned_columns = [
    "st_cd",
    "name",
    "index_violations_cleaned",
    "case_violtn_cnt",
    "findings_start_date",
    "findings_end_date",
]
## both sides carry a "name" column, so the suffixes tell the two apart
m2 = m1.merge(
    violations[violations_cleaned_columns],
    on="index_violations_cleaned",
    how="inner",
    suffixes=["_approved_only", "_violations_cleaned"],
)

## STEP 7: PRINT OUT MEANINGFUL SUBSETS
## keep matches whose findings period starts on or after the job start date, and,
## separately, the subset of those with more than 50 violations in the case
## NOTE(review): only the lower bound is checked -- findings that start after
## JOB_END_DATE are kept too; confirm that this is the intended time range
starts_on_or_after_job = m2["findings_start_date"] >= m2["JOB_START_DATE"]
over_50_violations = m2["case_violtn_cnt"] > 50

apps_with_violations = m2[starts_on_or_after_job].copy()
apps_50_plus_violatons = m2[over_50_violations & starts_on_or_after_job].copy()

## DISPLAY
apps_with_violations.head()
apps_50_plus_violatons.head()


<Index>

<Compare>

Unnamed: 0,Name,index_approved_only,index_violations_cleaned,status,JOB_START_DATE,JOB_END_DATE,EMPLOYER_STATE,name_approved_only,st_cd,name_violations_cleaned,case_violtn_cnt,findings_start_date,findings_end_date
35,1.0,1841,300645,CERTIFICATION,2017-12-07,2018-05-31,CA,CARNEROS CREEK WINERY INC,CA,CARNEROS CREEK WINERY INC,19,2017-12-07,2018-07-23
36,1.0,1862,279472,CERTIFICATION,2017-12-14,2018-09-05,CA,PACIFIC LIVESTOCK INC,CA,"PACIFICA PERSONNEL, INC",114,2019-04-10,2019-06-03
37,1.0,3486,279472,CERTIFICATION,2017-12-14,2018-07-16,CA,PACIFICA PERSONNEL,CA,"PACIFICA PERSONNEL, INC",114,2019-04-10,2019-06-03
38,1.0,3694,279472,CERTIFICATION,2017-12-27,2018-07-16,CA,"PACIFICA PERSONNEL, INC",CA,"PACIFICA PERSONNEL, INC",114,2019-04-10,2019-06-03
39,1.0,3890,279472,CERTIFICATION,2018-01-08,2018-07-23,CA,PACIFICA PERSONNEL,CA,"PACIFICA PERSONNEL, INC",114,2019-04-10,2019-06-03


Unnamed: 0,Name,index_approved_only,index_violations_cleaned,status,JOB_START_DATE,JOB_END_DATE,EMPLOYER_STATE,name_approved_only,st_cd,name_violations_cleaned,case_violtn_cnt,findings_start_date,findings_end_date
36,1.0,1862,279472,CERTIFICATION,2017-12-14,2018-09-05,CA,PACIFIC LIVESTOCK INC,CA,"PACIFICA PERSONNEL, INC",114,2019-04-10,2019-06-03
37,1.0,3486,279472,CERTIFICATION,2017-12-14,2018-07-16,CA,PACIFICA PERSONNEL,CA,"PACIFICA PERSONNEL, INC",114,2019-04-10,2019-06-03
38,1.0,3694,279472,CERTIFICATION,2017-12-27,2018-07-16,CA,"PACIFICA PERSONNEL, INC",CA,"PACIFICA PERSONNEL, INC",114,2019-04-10,2019-06-03
39,1.0,3890,279472,CERTIFICATION,2018-01-08,2018-07-23,CA,PACIFICA PERSONNEL,CA,"PACIFICA PERSONNEL, INC",114,2019-04-10,2019-06-03
40,1.0,5250,279472,CERTIFICATION,2017-12-29,2018-07-16,CA,PACIFICA PERSONNEL,CA,"PACIFICA PERSONNEL, INC",114,2019-04-10,2019-06-03
