In [6]:
#imports
import pandas as pd
import numpy as np
import random
import re
import recordlinkage

# ML imports 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer

# Suppress DeprecationWarning messages so they do not clutter cell output
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


## Read in and assign a unique identifier via the index

In [2]:
# Load the PreMatrix CSV produced in step A, drop the 'Unnamed: 0' column,
# and expose the positional row index as an explicit 'unique_id' column.
preMatrix = (
    pd.read_csv('PreMatrix.csv')
    .drop(columns=['Unnamed: 0'])
    .reset_index()
    .rename(columns={"index": 'unique_id'})
)
# TODO: the date columns are not converted in this cell; they stay as loaded.

# Inspect the column names, then the per-column non-null counts and dtypes
preMatrix.columns
preMatrix.info()

# Target: keep 'is_violator' as a single-column DataFrame (not a Series)
y = preMatrix[['is_violator']].copy()

Index(['unique_id', 'CASE_NO', 'DECISION_DATE', 'VISA_CLASS',
       'CASE_RECEIVED_DATE', 'CASE_STATUS', 'REQUESTED_START_DATE_OF_NEED',
       'REQUESTED_END_DATE_OF_NEED', 'PRIMARY_SUB', 'EMPLOYER_NAME',
       'TRADE_NAME_DBA', 'EMPLOYER_ADDRESS1', 'EMPLOYER_ADDRESS2',
       'EMPLOYER_CITY', 'EMPLOYER_STATE', 'EMPLOYER_POSTAL_CODE',
       'EMPLOYER_COUNTRY', 'EMPLOYER_PROVINCE', 'EMPLOYER_PHONE',
       'EMPLOYER_PHONE_EXT', 'AGENT_POC_EMPLOYER_REP_BY_AGENT', 'LAWFIRM_NAME',
       'AGENT_ATTORNEY_NAME', 'AGENT_ATTORNEY_CITY', 'AGENT_ATTORNEY_STATE',
       'JOB_TITLE', 'SOC_CODE', 'SOC_TITLE', 'PRIMARY_CROP', 'NAICS_CODE',
       'NBR_WORKERS_REQUESTED', 'NBR_WORKERS_CERTIFIED', 'FULL_TIME',
       'NATURE_OF_TEMPORARY_NEED', 'BASIC_NUMBER_OF_HOURS',
       'HOURLY_WORK_SCHEDULE_AM', 'HOURLY_WORK_SCHEDULE_PM',
       'BASIC_RATE_OF_PAY', 'OVERTIME_RATE_FROM', 'OVERTIME_RATE_TO',
       'BASIC_UNIT_OF_PAY', 'SUPERVISE_OTHER_EMP', 'SUPERVISE_HOW_MANY',
       'EDUCATION_LEVEL', 'O

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7643 entries, 0 to 7642
Data columns (total 66 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   unique_id                        7643 non-null   int64  
 1   CASE_NO                          7643 non-null   object 
 2   DECISION_DATE                    7643 non-null   object 
 3   VISA_CLASS                       7643 non-null   object 
 4   CASE_RECEIVED_DATE               7643 non-null   object 
 5   CASE_STATUS                      7643 non-null   object 
 6   REQUESTED_START_DATE_OF_NEED     7640 non-null   object 
 7   REQUESTED_END_DATE_OF_NEED       7640 non-null   object 
 8   PRIMARY_SUB                      7643 non-null   object 
 9   EMPLOYER_NAME                    7643 non-null   object 
 10  TRADE_NAME_DBA                   927 non-null    object 
 11  EMPLOYER_ADDRESS1                7641 non-null   object 
 12  EMPLOYER_ADDRESS2   

In [3]:
# prepare input data
def prepare_inputs(X_train, X_test, handle_unknown="ignore"):
    """One-hot encode the train and test features with a single shared encoder.

    The encoder is fitted on ``X_train`` only and then applied to both sets,
    so both outputs share the same columns and nothing is learned from the
    test rows.

    Parameters
    ----------
    X_train : array-like or DataFrame of categorical features
        Data used to learn the categories of every column.
    X_test : array-like or DataFrame with the same columns as ``X_train``
        Data encoded with the categories learned from ``X_train``.
    handle_unknown : str, default "ignore"
        Passed through to ``OneHotEncoder``. With "ignore", a test value that
        never appeared in training is encoded as all zeros for that column.
        Pass "error" to get the strict behaviour of a bare ``OneHotEncoder()``,
        which raises ``ValueError`` on such values.

    Returns
    -------
    (X_train_enc, X_test_enc)
        The encoded matrices, in the encoder's default output format.
    """
    # A bare OneHotEncoder() raises on any test category missing from the
    # training split, which is routine for high-cardinality columns.
    encoder = OneHotEncoder(handle_unknown=handle_unknown)
    X_train_enc = encoder.fit_transform(X_train)
    X_test_enc = encoder.transform(X_test)
    return X_train_enc, X_test_enc


In [4]:
# Text / categorical columns of interest, grouped by theme (order matters:
# it fixes the column order of the feature frame).
# NOTE(review): 'unique_id' is a row identifier rather than a predictor —
# confirm it is dropped before the features are encoded.
text_features_of_int = [
    'unique_id',
    'VISA_CLASS', 'PRIMARY_SUB',
    'EMPLOYER_ADDRESS1', 'EMPLOYER_CITY', 'EMPLOYER_STATE',
    'EMPLOYER_POSTAL_CODE', 'EMPLOYER_COUNTRY', 'EMPLOYER_PROVINCE',
    'AGENT_POC_EMPLOYER_REP_BY_AGENT', 'LAWFIRM_NAME', 'AGENT_ATTORNEY_NAME',
    'AGENT_ATTORNEY_CITY', 'AGENT_ATTORNEY_STATE',
    'JOB_TITLE', 'SOC_CODE', 'SOC_TITLE', 'PRIMARY_CROP', 'NAICS_CODE',
    'FULL_TIME', 'NATURE_OF_TEMPORARY_NEED', 'BASIC_UNIT_OF_PAY',
    'EDUCATION_LEVEL', 'SECOND_DIPLOMA', 'TRAINING_REQ', 'EMP_EXPERIENCE_REQD',
    'WORKSITE_CITY', 'WORKSITE_STATE', 'WORKSITE_POSTAL_CODE',
    'ORGANIZATION_FLAG',
]

# Independent copy of just those columns, so later edits leave preMatrix alone
text_feature_pre = preMatrix[text_features_of_int].copy()

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    text_feature_pre,
    y,
    test_size=0.20,
    random_state=1,
)

# Sanity check: print the shape of each of the four pieces
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

# prepare input data
from sklearn.preprocessing import LabelBinarizer



(6114, 30)
(1529, 30)
(6114, 1)
(1529, 1)


In [5]:
#transformer = ColumnTransformer(transformers= ['cat'] = OneHotEncoder(), 
