In [1]:
## IMPORTS
## helpful packages
import pandas as pd
import numpy as np
import random
import re
import recordlinkage

## repeated printouts
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
## H-2A applications, read from a local Excel file
h2a = pd.read_excel("./my_data/h2a_2018.xlsx")

## WHD enforcement records, downloaded as a zipped CSV from the DOL data catalog;
## column position 7 is read with the pandas 'string' dtype
url = "https://enfxfr.dol.gov/data_catalog/WHD/whd_whisard_20210415.csv.zip"
violations = (
    pd.read_csv(url, index_col=None, dtype={7: 'string'})
    ## unparseable dates become NaT (errors='coerce') rather than raising
    .assign(
        findings_start_date=lambda frame: pd.to_datetime(frame['findings_start_date'], errors='coerce'),
        findings_end_date=lambda frame: pd.to_datetime(frame['findings_end_date'], errors='coerce'),
    )
)

In [5]:
def find_status(one):
    """Return the text that follows the first "- " in a case-status value.

    Parameters
    ----------
    one : any
        A single CASE_STATUS value; it is converted with ``str()`` before
        matching, so non-string inputs (e.g. NaN) are accepted.

    Returns
    -------
    str or None
        Everything after the first hyphen-plus-whitespace up to the end of
        the string, or ``None`` when the value contains no such separator.
    """
    string_version = str(one)
    ## a hyphen, one whitespace character, then capture the rest of the line
    pattern = r'\-\s(.*)$'
    found = re.search(pattern, string_version)
    ## values without the separator used to raise IndexError on `found[0]`;
    ## return None so one malformed / missing status does not abort the cell
    if found is None:
        return None
    return found.group(1)
## pull the decision text out of every CASE_STATUS value
h2a["status"] = h2a["CASE_STATUS"].map(find_status)
## keep only applications that were fully or partially certified
approved_only = h2a.loc[h2a["status"].isin(["CERTIFICATION", "PARTIAL CERTIFICATION"]), :].copy()

In [6]:
## exact-match inner join of approved applications to investigations on the raw names
merge_attempt = approved_only.merge(
    violations,
    how="inner",
    left_on="EMPLOYER_NAME",
    right_on="legal_name",
    indicator="name_merge_status",
)
## one row per distinct (application name, investigation name, case status) combination
without_dups = merge_attempt.loc[:, ["EMPLOYER_NAME", "legal_name", "CASE_STATUS"]].drop_duplicates()

In [7]:
def clean_names(one):
    """Upper-case a name and drop the period that directly follows LLC, CO or INC.

    The input is converted with ``str()`` first, so a missing value (NaN)
    comes back as the literal string "NAN".
    """
    ## \1 keeps the LLC/CO/INC text itself; only the trailing period is removed
    return re.sub(r"(LLC|CO|INC)\.", r'\1', str(one).upper())

## standardised name column on both sides of the match
approved_only["name"] = approved_only["EMPLOYER_NAME"].map(clean_names)
violations["name"] = violations["legal_name"].map(clean_names)
## clean_names turns a missing legal_name into the string "NAN"; drop those rows
violations_cleaned = violations.loc[violations["name"] != "NAN", :].copy()

In [8]:
## every cleaned employer name that appears in the investigations data
list_of_violators = violations_cleaned["name"].tolist()
## "Yes" when the application's cleaned name exactly matches an investigated name.
## A single vectorised isin() replaces the per-row scan of the Python list
## (rows x names comparisons), and one array-level np.where stores plain
## "Yes"/"No" strings instead of a separate 0-d numpy array per row.
approved_only["has_investigation"] = np.where(
    approved_only["name"].astype(str).isin(list_of_violators), "Yes", "No"
)

In [27]:
## show name and H-2A violation count for every cleaned investigation record
violations_cleaned.loc[:, ["name", "h2a_violtn_cnt"]]
## keep only investigations with at least one H-2A violation, then show the same two columns
nonzero_violations = violations_cleaned.loc[violations_cleaned["h2a_violtn_cnt"].gt(0), :].copy()
nonzero_violations.loc[:, ["name", "h2a_violtn_cnt"]]

Unnamed: 0,name,h2a_violtn_cnt
0,"RELIANT ENERGY RETAIL SERVICES, LLC",0
1,"HEALTHCARE SERVICES GROUP, INC",0
2,"CENTRAL AVENUE BAKERY, INC",0
3,CATHOLIC CHERITIES,0
5,DERRICK PLUMBING,0
...,...,...
313922,"EL TAPATIO MEXICAN RESTAURANT(LA FINCA, INC)",0
313923,WEST FLEET INC,0
313924,NESBITT FRUIT FARMS,1
313925,"MOTOS, INC",0


Unnamed: 0,name,h2a_violtn_cnt
72,"GLADES PLANTING, LLC",2
158,"KINDERHOOK CREEK FARM, INC",9
174,"BACK FORTY BLUEBERRY NURSERY, LLC",7
569,"GISI PHEASANT FARM, INC",40
910,COTTON HOPE FARMS,36
...,...,...
313892,CLARK BROTHERS NURSERY LLC,4
313894,FEENEY'S WHOLESALE NURSERY INC,10
313896,"BLOOMAKER USA, INC",52
313911,AURORA FARMS LLC,2


In [26]:
## --- candidate pairs -------------------------------------------------------
## block on state: only pair an application with an investigation when
## EMPLOYER_STATE equals st_cd
link_apps_investigations = recordlinkage.Index()
link_apps_investigations.block(left_on = "EMPLOYER_STATE", right_on = "st_cd")
candidate_links_state = link_apps_investigations.index(approved_only, nonzero_violations)

## --- comparison ------------------------------------------------------------
## compare the cleaned "name" columns with the jarowinkler method and a 0.85 threshold
compare = recordlinkage.Compare()
compare.string("name", "name", method = "jarowinkler", threshold = 0.85)

print("pre-compare")
compare_vectors = compare.compute(candidate_links_state, approved_only, nonzero_violations)
print("post-compare")
compare_vectors.columns = ["Name"]
## rows whose Name comparison equals 1 are treated as matches
selected = compare_vectors.loc[compare_vectors["Name"] == 1].copy()

print("CHECKPOINT 1")
n = selected.shape[0]

## each row label of `selected` is an (approved_only index, nonzero_violations index)
## pair; store the two halves as ordinary columns so they can be used as merge keys
index_approved_only_values = selected.index.get_level_values(0).tolist()
index_violations_cleaned_values = selected.index.get_level_values(1).tolist()
selected["index_approved_only"] = index_approved_only_values
selected["index_violations_cleaned"] = index_violations_cleaned_values

print("CHECKPINT 2")
## expose the application index as a column, then attach the application
## fields of interest to every selected match
approved_only["index_approved_only"] = approved_only.index
approved_only_columns = [
    "status", "has_investigation", "JOB_START_DATE", "JOB_END_DATE",
    "EMPLOYER_STATE", "name", "index_approved_only",
]
m1_add_approved_only = selected.merge(
    approved_only[approved_only_columns],
    on = "index_approved_only",
    how = "inner",
)
print("CHECKPINT 3")
## same again for the investigations side; "name" exists on both sides, so the
## suffixes tell the application name and the investigation name apart
nonzero_violations["index_violations_cleaned"] = nonzero_violations.index
violations_cleaned_columns = [
    "st_cd", "name", "index_violations_cleaned",
    "case_violtn_cnt", "findings_start_date", "findings_end_date",
]
m2_add_violations_cleaned = m1_add_approved_only.merge(
    nonzero_violations[violations_cleaned_columns],
    on = "index_violations_cleaned",
    how = "inner",
    suffixes = ["_approved_only", "_violations_cleaned"],
)
print("CHECKPINT 4")
## display the matched rows whose investigation has more than 4000 case violations
m2_add_violations_cleaned.loc[m2_add_violations_cleaned["case_violtn_cnt"] > 4000, :]


<Index>

<Compare>

pre-compare
post-compare
CHECKPOINT 1
CHECKPINT 2
CHECKPINT 3
CHECKPINT 4


Unnamed: 0,Name,index_approved_only,index_violations_cleaned,status,has_investigation,JOB_START_DATE,JOB_END_DATE,EMPLOYER_STATE,name_approved_only,st_cd,name_violations_cleaned,case_violtn_cnt,findings_start_date,findings_end_date
9723,1.0,1665,300336,CERTIFICATION,Yes,2017-11-28,2018-06-17,NV,"PERI & SONS FARMS, INC",NV,"PERI & SONS FARM, INC",4530,2013-04-01,2014-04-30
9724,1.0,2219,300336,CERTIFICATION,Yes,2017-12-13,2018-07-01,NV,"PERI & SONS FARMS, INC",NV,"PERI & SONS FARM, INC",4530,2013-04-01,2014-04-30
9725,1.0,2322,300336,CERTIFICATION,Yes,2017-12-13,2018-07-01,NV,"PERI & SONS FARMS, INC",NV,"PERI & SONS FARM, INC",4530,2013-04-01,2014-04-30
9726,1.0,2635,300336,CERTIFICATION,No,2017-12-07,2018-07-01,NV,"PERI & SONS FARMS OF CALIFORNIA, LLC",NV,"PERI & SONS FARM, INC",4530,2013-04-01,2014-04-30
9727,1.0,4420,300336,CERTIFICATION,Yes,2018-01-11,2018-07-08,NV,"PERI & SONS FARMS, INC",NV,"PERI & SONS FARM, INC",4530,2013-04-01,2014-04-30
9728,1.0,5001,300336,CERTIFICATION,Yes,2018-01-25,2018-07-17,NV,"PERI & SONS FARMS, INC",NV,"PERI & SONS FARM, INC",4530,2013-04-01,2014-04-30
9729,1.0,8133,300336,CERTIFICATION,Yes,2018-03-07,2018-08-01,NV,"PERI & SONS FARMS, INC",NV,"PERI & SONS FARM, INC",4530,2013-04-01,2014-04-30
9730,1.0,9078,300336,CERTIFICATION,No,2018-03-23,2018-07-28,NV,PERI & SONS FARMS. INC,NV,"PERI & SONS FARM, INC",4530,2013-04-01,2014-04-30
9731,1.0,9173,300336,CERTIFICATION,Yes,2018-03-23,2018-08-09,NV,"PERI & SONS FARMS, INC",NV,"PERI & SONS FARM, INC",4530,2013-04-01,2014-04-30
9732,1.0,9320,300336,CERTIFICATION,Yes,2018-03-23,2018-08-09,NV,"PERI & SONS FARMS, INC",NV,"PERI & SONS FARM, INC",4530,2013-04-01,2014-04-30
