## Imports 

In [11]:
#imports
import pandas as pd
import numpy as np
import random
import re
import recordlinkage
import time
import matplotlib.pyplot as plt

# ML imports 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer

# suppress deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
## display the value of every expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


## Read in the data, assign a unique identifier from the index, and convert the date columns to datetimes

In [2]:
# Load the matrix CSV written out in step A; the 'Unnamed: 0' column is discarded right away
preMatrix = pd.read_csv('RepMatrix.csv').drop(columns=['Unnamed: 0'])
preMatrix.head()

# Turn the positional index into an explicit 'unique_id' column
preMatrix = preMatrix.reset_index().rename(columns={"index": 'unique_id'})
preMatrix.is_violator.value_counts()

## Parse every date-like column into datetime objects in one pass
date_cols = ['CASE_RECEIVED_DATE', 'DECISION_DATE',
             'REQUESTED_START_DATE_OF_NEED', 'REQUESTED_END_DATE_OF_NEED',
             'JOB_START_DATE', 'JOB_END_DATE']
preMatrix[date_cols] = preMatrix[date_cols].apply(pd.to_datetime)

preMatrix.columns
preMatrix.info()

Unnamed: 0,CASE_NO,DECISION_DATE,VISA_CLASS,CASE_RECEIVED_DATE,CASE_STATUS,REQUESTED_START_DATE_OF_NEED,REQUESTED_END_DATE_OF_NEED,PRIMARY_SUB,EMPLOYER_NAME,TRADE_NAME_DBA,...,WORKSITE_POSTAL_CODE,OTHER_WORKSITE_LOCATION,ORGANIZATION_FLAG,SWA_NAME,JOB_IDNUMBER,JOB_START_DATE,JOB_END_DATE,status,name,is_violator
0,H-300-17258-492669,2017-10-02 13:46:39,H-2A,2017-09-15 00:00:00,DETERMINATION ISSUED - CERTIFICATION,2017-12-01 00:00:00,2018-11-29 00:00:00,PRI,ECOSYSTEM CONCEPTS INC.,,...,95620,Y,Association - Filing as Agent (H-2A Only),California Employment Development Department,15575066,2017-09-26 00:00:00,2018-05-31 00:00:00,CERTIFICATION,ECOSYSTEM CONCEPTS INC,0.0
1,H-300-17257-446860,2017-10-02 16:36:19,H-2A,2017-08-17 00:00:00,DETERMINATION ISSUED - CERTIFICATION,2017-10-16 00:00:00,2019-12-06 00:00:00,PRI,WESTERN RANGE ASSOCIATION,,...,93635,Y,Association - Joint Employer (H-2A Only),California Employment Development Department,,2017-08-22 00:00:00,2019-09-07 00:00:00,CERTIFICATION,WESTERN RANGE ASSOCIATION,0.0
2,H-300-17257-446860,2017-10-02 16:36:19,H-2A,2017-09-14 00:00:00,DETERMINATION ISSUED - CERTIFICATION,2017-12-08 00:00:00,2019-09-06 00:00:00,SUB,SHEEP COMPANY LLC,,...,93210,Y,,California Employment Development Department,15567271,2017-09-20 00:00:00,2019-03-08 00:00:00,CERTIFICATION,SHEEP COMPANY LLC,0.0
3,H-300-17262-816860,2017-10-02 16:44:54,H-2A,2017-09-19 00:00:00,DETERMINATION ISSUED - CERTIFICATION,2018-01-01 00:00:00,2018-12-30 00:00:00,PRI,Sieben Ranch Co.,,...,59648,Y,Association - Filing as Agent (H-2A Only),Montana State Workforce,10316116,2017-09-21 00:00:00,2018-06-29 00:00:00,CERTIFICATION,SIEBEN RANCH CO,0.0
4,H-300-17256-339375,2017-10-03 13:28:16,H-2A,2017-09-13 00:00:00,DETERMINATION ISSUED - CERTIFICATION,2017-12-08 00:00:00,2019-11-06 00:00:00,SUB,TUTTLE LIVESTOCK COMPANY LLC,,...,81625,Y,,Craig Workforce Center,CO 7136889,2017-09-15 00:00:00,2019-05-08 00:00:00,CERTIFICATION,TUTTLE LIVESTOCK COMPANY LLC,0.0


0.0    7426
1.0     217
Name: is_violator, dtype: int64

Index(['unique_id', 'CASE_NO', 'DECISION_DATE', 'VISA_CLASS',
       'CASE_RECEIVED_DATE', 'CASE_STATUS', 'REQUESTED_START_DATE_OF_NEED',
       'REQUESTED_END_DATE_OF_NEED', 'PRIMARY_SUB', 'EMPLOYER_NAME',
       'TRADE_NAME_DBA', 'EMPLOYER_ADDRESS1', 'EMPLOYER_ADDRESS2',
       'EMPLOYER_CITY', 'EMPLOYER_STATE', 'EMPLOYER_POSTAL_CODE',
       'EMPLOYER_COUNTRY', 'EMPLOYER_PROVINCE', 'EMPLOYER_PHONE',
       'EMPLOYER_PHONE_EXT', 'AGENT_POC_EMPLOYER_REP_BY_AGENT', 'LAWFIRM_NAME',
       'AGENT_ATTORNEY_NAME', 'AGENT_ATTORNEY_CITY', 'AGENT_ATTORNEY_STATE',
       'JOB_TITLE', 'SOC_CODE', 'SOC_TITLE', 'PRIMARY_CROP', 'NAICS_CODE',
       'NBR_WORKERS_REQUESTED', 'NBR_WORKERS_CERTIFIED', 'FULL_TIME',
       'NATURE_OF_TEMPORARY_NEED', 'BASIC_NUMBER_OF_HOURS',
       'HOURLY_WORK_SCHEDULE_AM', 'HOURLY_WORK_SCHEDULE_PM',
       'BASIC_RATE_OF_PAY', 'OVERTIME_RATE_FROM', 'OVERTIME_RATE_TO',
       'BASIC_UNIT_OF_PAY', 'SUPERVISE_OTHER_EMP', 'SUPERVISE_HOW_MANY',
       'EDUCATION_LEVEL', 'O

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7643 entries, 0 to 7642
Data columns (total 66 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   unique_id                        7643 non-null   int64         
 1   CASE_NO                          7643 non-null   object        
 2   DECISION_DATE                    7643 non-null   datetime64[ns]
 3   VISA_CLASS                       7643 non-null   object        
 4   CASE_RECEIVED_DATE               7643 non-null   datetime64[ns]
 5   CASE_STATUS                      7643 non-null   object        
 6   REQUESTED_START_DATE_OF_NEED     7641 non-null   datetime64[ns]
 7   REQUESTED_END_DATE_OF_NEED       7641 non-null   datetime64[ns]
 8   PRIMARY_SUB                      7643 non-null   object        
 9   EMPLOYER_NAME                    7643 non-null   object        
 10  TRADE_NAME_DBA                   926 non-null    object     

In [3]:
# SECOND_DIPLOMA_MAJOR has no non-null values, so drop it.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
preMatrix = preMatrix.drop(columns=['SECOND_DIPLOMA_MAJOR'], errors='ignore')

In [4]:
# Pull the label we want to predict (is_violator) out into a plain list `y`
y = list(preMatrix['is_violator'])

# ...and strip it from the feature frame so the model cannot simply read the answer
preMatrix = preMatrix.drop(columns='is_violator')

In [5]:
## Partition the columns by dtype: anything whose dtype is listed in
## `numeric_options` is treated as numeric, everything else as categorical.

numeric_options = ["int64", "float64", "datetime64[ns]"]
num_cols, cat_cols = [], []
for col_name, col_dtype in preMatrix.dtypes.items():
    bucket = num_cols if col_dtype in numeric_options else cat_cols
    bucket.append(col_name)

print('Numeric Columns:', num_cols, sep='\n')
print('\nCategorical Columns:', cat_cols, sep='\n')


Numeric Columns:
['unique_id', 'DECISION_DATE', 'CASE_RECEIVED_DATE', 'REQUESTED_START_DATE_OF_NEED', 'REQUESTED_END_DATE_OF_NEED', 'EMPLOYER_PHONE_EXT', 'NAICS_CODE', 'NBR_WORKERS_REQUESTED', 'NBR_WORKERS_CERTIFIED', 'BASIC_NUMBER_OF_HOURS', 'BASIC_RATE_OF_PAY', 'OVERTIME_RATE_FROM', 'OVERTIME_RATE_TO', 'SUPERVISE_HOW_MANY', 'NUM_MONTHS_TRAINING', 'EMP_EXP_NUM_MONTHS', 'JOB_START_DATE', 'JOB_END_DATE']

Categorical Columns:
['CASE_NO', 'VISA_CLASS', 'CASE_STATUS', 'PRIMARY_SUB', 'EMPLOYER_NAME', 'TRADE_NAME_DBA', 'EMPLOYER_ADDRESS1', 'EMPLOYER_ADDRESS2', 'EMPLOYER_CITY', 'EMPLOYER_STATE', 'EMPLOYER_POSTAL_CODE', 'EMPLOYER_COUNTRY', 'EMPLOYER_PROVINCE', 'EMPLOYER_PHONE', 'AGENT_POC_EMPLOYER_REP_BY_AGENT', 'LAWFIRM_NAME', 'AGENT_ATTORNEY_NAME', 'AGENT_ATTORNEY_CITY', 'AGENT_ATTORNEY_STATE', 'JOB_TITLE', 'SOC_CODE', 'SOC_TITLE', 'PRIMARY_CROP', 'FULL_TIME', 'NATURE_OF_TEMPORARY_NEED', 'HOURLY_WORK_SCHEDULE_AM', 'HOURLY_WORK_SCHEDULE_PM', 'BASIC_UNIT_OF_PAY', 'SUPERVISE_OTHER_EMP', 'EDUCA

In [6]:
# OLD USELESS CODE SAVED FOR POSTERITY...JUST IN CASE

# encoded_text_feature_pre = text_feature_pre.copy()
# for one in encoded_text_feature_pre.columns:
#     enc = LabelEncoder()
#     enc.fit(encoded_text_feature_pre[one].astype(str))
#     encoded_text_feature_pre[one] = enc.transform(encoded_text_feature_pre[one].astype(str))


In [7]:
# Split the feature matrix into a categorical frame and a numerical frame
cat_feature_pre = preMatrix.loc[:, cat_cols].copy()
print("Shape of non-imputed: ")
print(cat_feature_pre.shape)
num_feature_pre = preMatrix.loc[:, num_cols].copy()
print(num_feature_pre.shape)

# Categorical features: replace NaNs with the constant string "missing_value"
imputer_cat = SimpleImputer(strategy='constant', fill_value='missing_value')
imputed_cat_feature_pre = pd.DataFrame(
    imputer_cat.fit_transform(cat_feature_pre),
    columns=cat_feature_pre.columns,
)

# Numerical features: replace NaNs with the column mode.
# The `verbose` argument was dropped here: it is deprecated and raises a
# TypeError on scikit-learn releases that removed it.
imputer_num = SimpleImputer(strategy='most_frequent')
imputed_num_feature_pre = pd.DataFrame(
    imputer_num.fit_transform(num_feature_pre),
    columns=num_feature_pre.columns,
)

print("Shape of imputed: ")
print(imputed_cat_feature_pre.shape)
print(imputed_num_feature_pre.shape)

# Identifier-like columns could be an issue within our model, so take them out
# of the column lists.
# The lists are rebuilt through slice assignment (same list objects, new
# contents) rather than calling .remove() while iterating: removing during
# iteration skips the element that follows each removed one, which left
# 'TRADE_NAME_DBA' (directly after 'EMPLOYER_NAME') in cat_cols.
unique_cols_to_drop = ['unique_id', 'CASE_NO', 'EMPLOYER_NAME', 'TRADE_NAME_DBA']
for col_list in (cat_cols, num_cols):
    col_list[:] = [col for col in col_list if col not in unique_cols_to_drop]

# NOTE(review): only the cat_cols / num_cols lists are filtered here; the
# imputed frames above still contain the identifier columns. Confirm whether
# they should also be dropped before the frames are recombined.

Shape of non-imputed: 
(7643, 46)
(7643, 18)
Shape of imputed: 
(7643, 46)
(7643, 18)


In [25]:
# One-hot encode the train/test feature sets
def prepare_inputs(X_train, X_test):
    """One-hot encode both feature sets with an encoder fitted on the training set.

    The encoder is built with handle_unknown='ignore', so transforming X_test
    does not raise on categories that were not present in X_train.

    Parameters
    ----------
    X_train : features used to fit the encoder (and then transformed).
    X_test : features transformed with the already-fitted encoder.

    Returns
    -------
    (X_train_enc, X_test_enc) : the two encoded matrices, in that order.
    """
    encoder = OneHotEncoder(handle_unknown='ignore')
    train_encoded = encoder.fit_transform(X_train)
    test_encoded = encoder.transform(X_test)
    return train_encoded, test_encoded

# Stitch the two imputed frames back together, joining on their row position
imputed_combined = imputed_cat_feature_pre.reset_index().merge(
    imputed_num_feature_pre.reset_index(),
    how='left',
    on='index',
)
rows_lost = imputed_num_feature_pre.shape[0] - imputed_combined.shape[0]
print('%s rows lost in merge' % rows_lost)
print(imputed_combined.shape)
# the positional join key is no longer needed once the frames are merged
imputed_combined = imputed_combined.drop(columns='index')

# Hold out 20% of the rows for testing (fixed seed for a reproducible split).
# An earlier variant of this cell split only the categorical frame.
X_train, X_test, y_train, y_test = train_test_split(
    imputed_combined,
    y,
    test_size=0.20,
    random_state=3,
)

# apply the OneHotEncoder via prepare_inputs
X_train, X_test = prepare_inputs(X_train, X_test)

0 rows lost in merge
(7643, 65)


In [26]:
# Fit a depth-limited random forest on the encoded training data, then
# predict labels for the held-out rows.
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Cross-tabulate true labels (rows) against predicted labels (columns)
print("Confusion matrix \n")
actual_labels = pd.Series(y_test, name='Actual')
predicted_labels = pd.Series(y_pred, name='Predicted')
print(pd.crosstab(actual_labels, predicted_labels))
def get_metrics(y_test, y_predicted, average='weighted'):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    y_test : true labels.
    y_predicted : labels predicted by the model.
    average : averaging mode forwarded to precision_score, recall_score and
        f1_score. Defaults to 'weighted', the mode this function has always
        used. Pass e.g. 'binary' or 'macro' to see how the minority class is
        doing, which a weighted average over an imbalanced target can hide.

    Returns
    -------
    (accuracy, precision, recall, f1) : the four scores, in that order.
    """
    accuracy = accuracy_score(y_test, y_predicted)
    # the three averaged scorers share a signature, so compute them uniformly
    precision, recall, f1 = (
        scorer(y_test, y_predicted, average=average)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1
# Score the held-out predictions and report each metric to three decimals
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred)
metrics_report = "accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1)
print(metrics_report)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

Confusion matrix 

Predicted   0.0
Actual         
0.0        1495
1.0          34
accuracy = 0.978 
precision = 0.956 
recall = 0.978 
f1 = 0.967


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [10]:
# Disabled feature-importance plot (mean decrease in impurity). The code is
# wrapped in a bare string literal, so none of it runs; the notebook only
# echoes the string back as this cell's output.
# NOTE(review): if this is re-enabled, `index=cat_cols` presumably will not
# match the length of `clf.feature_importances_`, because the forest is fitted
# on one-hot encoded columns rather than the raw categorical columns — confirm.
'''
start_time = time.time()
importances = clf.feature_importances_
std = np.std([
    tree.feature_importances_ for tree in clf.estimators_], axis=0)
elapsed_time = time.time() - start_time

print(f"Elapsed time to compute the importances: "
      f"{elapsed_time:.3f} seconds")
print(importances)
forest_importances = pd.Series(importances, index=cat_cols)

fig, ax = plt.subplots()
fig.set_figheight(10)
fig.set_figwidth(10)
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()
'''

'\nstart_time = time.time()\nimportances = clf.feature_importances_\nstd = np.std([\n    tree.feature_importances_ for tree in clf.estimators_], axis=0)\nelapsed_time = time.time() - start_time\n\nprint(f"Elapsed time to compute the importances: "\n      f"{elapsed_time:.3f} seconds")\nprint(importances)\nforest_importances = pd.Series(importances, index=cat_cols)\n\nfig, ax = plt.subplots()\nfig.set_figheight(10)\nfig.set_figwidth(10)\nforest_importances.plot.bar(yerr=std, ax=ax)\nax.set_title("Feature importances using MDI")\nax.set_ylabel("Mean decrease in impurity")\nfig.tight_layout()\n'