In [2]:
!pip install --upgrade pandas --index-url https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple

Keyring is skipped due to an exception: 'keyring.backends'
Looking in indexes: https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple
[0m

In [3]:
!pip install --upgrade numpy --index-url https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple

Keyring is skipped due to an exception: 'keyring.backends'
Looking in indexes: https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple
[0m

In [9]:
# Names of the raw CSV files to load; every file listed here must exist in the "raw_data_path" directory.
# Training files (passed to load_data_from_s3 to build the training set):
raw_data_train = ['train_raw_data.csv']

# Test files (passed to load_data_from_s3 to build the test set):
raw_data_X_test = ['test_raw_data.csv']

# Suffix used in the file names of the pre-processed data and the prediction results
fname_suffix = 'loa.csv'

In [5]:
#####################################################################################################################
# FOR LOADING RAW DATA
# Name of the S3 bucket where the raw data is stored
bucket='adl-core-sagemaker-studio'

# Directory (key prefix) inside the bucket where the raw data is stored; if the data is split across several files,
# make sure all of them are present in this directory
raw_data_path = f'external/artichauhan/LOA/loa_script/Data'

#####################################################################################################################
# FOR X DAYS LOGIC FOR CREATING TARGET VARIABLE
# n -> number of days to look forward when creating the target variable (30 at the time this use case was built).
# NOTE(review): the original comment described this as the window for checking HSA contributions and said the value
# is used "at code line 26", but `n` is not referenced by any other cell visible in this notebook - confirm whether
# it is still needed.
n = 30

#####################################################################################################################
# FOR STORING PRE-PROCESSED DATA
# Used to separate the different versions of the data across the whole lifecycle, i.e. data pre-processing,
# transformations, modeling and inference.
# Make sure "version" matches the version of the data pre-processing script, e.g. dpp0-xgb-v1-final.ipynb means "version-1"
version = 'version-1'

# Directory and file name where the pre-processed data would be stored (currently disabled)
#pre_processed_data_path_with_fname = f'external/artichauhan/LOA/loa_script/Artifacts/{version}/preprocessed-train-data-{fname_suffix}'

#####################################################################################################################
# FOR STORING TRANSFORMED DATA
# Used to separate the different versions of the data transformation pipeline.
# Make sure "dppn" matches the version of the data transformation script, e.g. dpp0-xgb-v1-final.ipynb means "dpp0"
dppn = 'dpp0'

# S3 key where the pre-processed training data produced by this notebook is stored
pre_processed_train_data_path = f'external/artichauhan/LOA/loa_script/Artifacts/{version}/preprocessed-train-data/{dppn}-train-{fname_suffix}'

#####################################################################################################################
# FOR SAVING PREPROCESSED TEST DATA
# S3 key where the pre-processed test data is stored
pre_processed_test_data_path = f'external/artichauhan/LOA/loa_script/Artifacts/{version}/preprocessed-test-data/{dppn}-x-test-{fname_suffix}'

#####################################################################################################################
# FOR LOADING TRAINED ML MODEL
# S3 key of the trained ML model (path pattern: Model/<version>/<dppn>-xgb.pkl)
model_path_with_fname = f'external/artichauhan/LOA/loa_script/Model/{version}/{dppn}-xgb.pkl'

#####################################################################################################################
# FOR STORING PREDICTION RESULTS
# S3 key where the final prediction output is stored.
# The file is saved in CSV format with two columns: "person_internal_id" and "prediction" (the model's predicted label)
prediction_output_path_with_fname = f'external/artichauhan/LOA/loa_script/Output/{version}/{dppn}-xgb-out-{fname_suffix}'

In [6]:
#importing librarires
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import boto3
import io
import time

from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score,\
roc_auc_score, make_scorer, plot_precision_recall_curve, plot_roc_curve, plot_confusion_matrix, average_precision_score,\
ConfusionMatrixDisplay


import tempfile
import boto3
import joblib

%matplotlib inline
pd.set_option('display.max_columns',None)
warnings.filterwarnings('ignore')
s3 = boto3.client('s3')

In [7]:
def load_data_from_s3(bucket, raw_data_path, raw_data_fnames, s3_client=None):
    """Read one or more CSV files from S3 and stack them into a single DataFrame.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket.
    raw_data_path : str
        Key prefix (directory) under which every file in ``raw_data_fnames`` lives.
    raw_data_fnames : iterable of str
        File names to read; each is fetched from ``{raw_data_path}/{fname}``.
    s3_client : object, optional
        Any object exposing ``get_object(Bucket=..., Key=...)`` that returns a mapping
        whose ``'Body'`` has a ``read()`` method. Defaults to ``boto3.client('s3')``.

    Returns
    -------
    pandas.DataFrame or None
        The rows of all files, in the order given, with a fresh 0..n-1 index.
        ``None`` if the combined row count differs from the sum of the per-file
        row counts.

    Raises
    ------
    ValueError
        If ``raw_data_fnames`` is empty.
    """
    fnames = list(raw_data_fnames)
    if not fnames:
        raise ValueError('raw_data_fnames must contain at least one file name')
    if s3_client is None:
        s3_client = boto3.client('s3')

    frames = []
    for fname in fnames:
        print(f'Reading file: {fname}')
        key = f'{raw_data_path}/{fname}'
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        frame = pd.read_csv(io.BytesIO(obj['Body'].read()))
        frames.append(frame)
        print(f'\tFile read successfully | Shape: {frame.shape}')

    # A single concat replaces the repeated DataFrame.append calls: append was removed
    # in pandas 2.0 and re-copied the accumulated frame on every iteration.
    data = pd.concat(frames, ignore_index=True)

    # Sanity check that no rows were lost or duplicated while combining the files.
    expected_rows = sum(frame.shape[0] for frame in frames)
    if expected_rows == data.shape[0]:
        print(f'Data from all files loaded successfully | Final Shape: {data.shape}')
        return data

    print('There is discrepency in numbers')
    print(f'\tTotal number of rows combined in all files: {expected_rows}')
    print(f'\tAfter combining all files total number of rows are: {data.shape[0]}')
    return None

In [10]:
train_set = load_data_from_s3(bucket,raw_data_path,raw_data_train)

Reading file: train_raw_data.csv


NoSuchKey: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.

In [None]:
train_set.head(2)

In [None]:
train_set.isna().sum()

In [None]:
train_set.shape

In [None]:
train_set.columns

#### Preprocessing of the Train Set

In [None]:
#Drop Unnecessary columns
drop_list = ['new_id','new_id_3','udp_global_id', 'state','mapped_employment_status_code', 'mapped_fullpart_description', 
             'mapped_permanent_temporary_description', 'mapped_hourly_salary_description', 'mapped_flex_status_description', 
            'original_hire_date', 'rehire_date', 'termination_date', 'base_pay_regular_frequency_description',
             'base_pay_regular_expectedannualsalary_range','annual_benefits_base_rate']

In [None]:
train_set.drop(columns = drop_list, axis=1, inplace = True)

#### Data cleaning: several columns contain multiple spellings of the same value, so each variant is mapped to a single canonical entry:

In [None]:
Divorce_list = ['Divorced', 'Divorced_United_States_of_America', 'Separated_United_States_of_America', \
                           'Divorced_USA', 'USA_Divorced', 'USA_Separated', 'Separated_USA', 'Separated USA', 'Divorced USA', \
                           'Legally_Separated_United_States_of_America', 'USA-Divorced', 'Widowed_United_States_of_America', 'USA-Separated','Widowed_USA','PR_Divorced']
# Raw marital-status labels that are normalised to 'Single'.
# NOTE(review): 'USA-Single, PR_Single' is a single string literal - it looks like a missing quote merged two
# labels. Both 'USA-Single' and 'PR_Single' are also listed separately, so this entry only matches that exact
# combined text; confirm and remove it if unintended.
Single_list = ['S', 'Single', 'Single_United_States_of_America', 'Single_USA', 'USA-Single', 'USA_Single', 'Single USA',\
                                           'S-USA', 'USA-Single, PR_Single', 'PR_Single']   
Unknown_list = ['Unknown_USA', 'Unknown_United_States_of_America', 'Not_Indicated_United_States_of_America', \
                  'PR_Living Together','USA_Living together','USA_Not Disclosed', 'USA_Not Disclosed','Living_Together_United_States_of_America',
       'Domestic_Partner _United_States_America', 'Civil P_United_States_of_America']
Married_list = ['M','Married_United_States_of_America','Married_USA','Married USA','Married','M_USA','M-USA','USA-Married','USA_Married','Married_USA','Maried','USA-Married/ Civil Partnership','Domestic_Partner_United_States_of_America','Partnered_United_States_of_America','Co-Habiting_United_States_of_America','Partnered USA','D-USA']
values = ['Single_Canada','Single_MEX', 'Single_DEU','Head_of_Household_USA','DE_FACTO','O-USA','MI_NOT_DISCLOSED','Domestic Partner', 'MARITAL_STATUS-3-321', 'Single_COL', 'PR_Partnered','Divorced_United_Kingdom','Single_MEX','MARITAL_STATUS-6-321','Unknown_United_States_of_America','Not_Indicated_United_States_of_America','Not_Disclosed_United_States_of_America','IN_Single',
          'Unknown_USA','MARITAL_STATUS-3-321','Married_MEX', 'S-HKG',
       'Unknown_New_Zealand', 'Unknown_Korea_Republic_of',
       'Married_Saudi_Arabia', 'Single_United Kingdom', 'Single_Thailand',
       'Unknown_United Kingdom', 'Unknown_India', 'Married_Taiwan',
       'Single_Taiwan', 'Single_Hong_Kong', 'Single_Korea_Republic_of', 'P-USA', 'Married_BRA', 'Married_DEU',
       'Unknown_SWE','Single_DOM',
       'Married_CHN', 'Married_ESP', 'Single_THA',
       'Married_Kenya', 'Unknown_Taiwan', 'Married_China',
       'Married_New_Zealand', 'Single0_Indonesia', 'Single_Spain',
       'Unknown_Thailand', 'Unknown_Spain', 'Married_Malaysia',
       'Married2_Indonesia', 'Married_Thailand','MARITAL_STATUS-3-40', 'Married_United_Kingdom',
       'Unknown_Puerto Rico','Single_United Arab Emirates', 'Divorced_Hong_Kong',
       'Civil_Partnership_MEX', 'Married_SGP', 'Unknown_AUS',
        'Single_SGP', 'Married_Spain',
       'Divorced_Singapore', 'Divorced_Canada', 'Married_Italy',
       'Married_MEX', 'Married_AUS', 'Single_ESP', 'Unknown_JPN',
       'PR_Married', 'M-GBR', 'Married_Switzerland', 'Domestic Partner',
       'W-USA', 'DE_FACTO', 'PR_Widowed', 'MI_NOT_DISCLOSED',
       'Married_CAN', 'Divorced_DEU', 'Dissolved_Civil_Partnership_MEX',
       'Single_FRA', 'Single_COL', 'Single_Puerto Rico', 'MARITAL_STATUS-6-301', 'MARITAL_STATUS-6-322', 'Hd Hsehld_United_States_of_America',
       'Common_Law_United_States_of_America', 'Married_Ireland',
       'Common_law_Canada', 'Married_Canada','Domestic_Partner_Canada',
       'U-USA','MARITAL_STATUS-3-323', 'USA-Cohabit', 'Married0_Indonesia', 'Married_Korea_Republic_of', 'Single_China',
       'Married1_Indonesia', 'Single_CAN', 'Married_SWE',
       'Single_Belgium', 'Married_United Kingdom', 'Married_Puerto Rico',
       'Married_ITA', 'Single_BEL', 'Married_BEL', 'Single_Lebanon',
       'Unknown_Kenya', 'M-DEU', 'Unknown_GBR', 'Civil Partner_Belgium',
       'Married_IRL', 'USA-Civil Partnership', 'Married_Hong_Kong',
       'Domestic Partner Civil Union_United Kingdom',
       'Civil_Partnership_COL', 'Unknown_Canada', 'Civil_Partnership_USA',
       'Single_JPN', 'Married_MYS', 'Single_IRL', 'Single_New_Zealand',
       'Married_Belgium', 'Civil_Partnership_United_States_of_America',
       'Married_COL', 'Common_Law_COL', 'Married_Austria', 'Single_GBR',
       'Married_GBR', 'Single_SWE', 'Married_JPN', 'Unknown_MLT',
       'Married_NZL', 'Unknown_CHN', 'PR_Not Disclosed', 'R-USA', 'RDP',
       'S-GBR', 'Married_Puerto_Rico', 'Married_Singapore', 'S-CHN',
       'Married_United Arab Emirates', 'M-SGP', 'Single_Singapore',
       'Separated_MLT', 'USA_Common-law', 'C-USA', 'S-ARE', 'S-SGP',
       'Single_CHN', 'Single_United_Kingdom', 'M-CAN', 'M-IND',
       'Single_BRA', 'Divorced_ESP', 'M-AUS', 'USA-Widowed',
       'PRI-Married', 'Single_Argentina', 'Unspecified_United_Kingdom',
       'Domestic_Partnered_United_States_of_America',
       'Co-Habiting_United_Kingdom', 'Married_Turkey',
       'Registered_Partnership_United_States_of_America',
       'Unknown_Malaysia', 'USA-Unknown', 'USA_Widowed', 'M-HKG']

In [None]:
# Collapse every raw marital-status spelling into its canonical label.
# The groups are applied in the same order as before: Divorced, Single, Married, unknown.
for raw_labels, canonical_label in (
    (Divorce_list, 'Divorced'),
    (Single_list, 'Single'),
    (Married_list, 'Married'),
    (Unknown_list, 'unknown'),
):
    label_matches = train_set['marital_status'].isin(raw_labels)
    train_set.loc[label_matches, 'marital_status'] = canonical_label

In [None]:
# Drop every row whose marital status is one of the labels listed in `values`;
# rows with a missing marital status do not match and are therefore kept.
keep_row = ~train_set['marital_status'].isin(values)
train_set = train_set[keep_row]

In [None]:
train_set['country_description'] = train_set['country_description'].replace(['United States of America', 'United States', 'UNITED STATES', 'USA'],'United States of America')
train_set = train_set[train_set['country_description'] == 'United States of America']

In [None]:
train_set['mapped_fullpart_code'] = train_set['mapped_fullpart_code'].replace(['FT','PT'],['FLTM', 'PRTM'])
fullpart_code_remove_list = ['DNM', 'ERROR']
train_set = train_set[train_set.mapped_fullpart_code.isin(fullpart_code_remove_list) == False]

In [None]:
train_set['mapped_permanent_temporary_code'] = train_set['mapped_permanent_temporary_code'].replace(['P','T'],['PERM', 'TEMP'])
pertemp_code_remove_list = ['DNM', 'R', 'F', 'FR']
train_set= train_set[train_set['mapped_permanent_temporary_code'].isin(pertemp_code_remove_list) == False]

In [None]:
train_set['mapped_hourly_salary_code'] = train_set['mapped_hourly_salary_code'].replace(['S', 'H'],['SLRY', 'HRLY'])
hr_code_remove_list = ['DNM', 'A', 'E', '1', 'L', 'N']
train_set = train_set[train_set['mapped_hourly_salary_code'].isin(hr_code_remove_list) == False]

In [None]:
train_set['mapped_flex_status_code'] = train_set['mapped_flex_status_code'].replace(['INELIG'],['NOTELIGIBLE'])
flex_code_remove_list = ['DNM', 'ACTIVE', 'INACTIVE', 'TSDACT', 'RETIRE', 'HMFT']
train_set = train_set[train_set['mapped_flex_status_code'].isin(flex_code_remove_list) == False]

In [None]:
male = ['M', 'Male', 'male', 'Gender_Male']
female = ['F', 'Female', 'female', 'Gender_Female']
unknown = ['U', 'Unknown', 'Not_declared', 'D', 'Decline to answer', 'Undeclared',
           'Not specified', 'Declined to State', 'N', 'O', 'Not Declared']

In [None]:
train_set.loc[train_set['gender'].isin(male), 'gender'] = 'male'
train_set.loc[train_set['gender'].isin(female), 'gender'] = 'female'
train_set.loc[train_set['gender'].isin(unknown), 'gender'] = 'unknown'

In [None]:
# Replace missing values in the categorical columns with an explicit 'unknown' category.
# The filled column is assigned back to the frame instead of calling
# `train_set[col].fillna(..., inplace=True)`: that chained in-place form operates on an
# intermediate Series and does not update the parent DataFrame under pandas Copy-on-Write.
unknown_fill_columns = [
    'gender',
    'marital_status',
    'is_union',
    'platform_indicator_code',
    'mapped_fullpart_code',
    'mapped_hourly_salary_code',
    'is_rehire',
    'base_pay_regular_frequency_code',
]
for column in unknown_fill_columns:
    train_set[column] = train_set[column].fillna('unknown')

In [None]:
# Harmonise the spelling/capitalisation variants of the leave-of-absence status labels.
employment_status_fixes = {
    "LOA - Workers Compensation": "LOA - Worker's Compensation",
    "LOA - no Pay": "LOA - No Pay",
    "LOA - with Pay": "LOA - With Pay",
}
train_set['mapped_employment_status_description'] = (
    train_set['mapped_employment_status_description'].replace(employment_status_fixes)
)

In [None]:
x = train_set.groupby(['mapped_employment_status_description'])['age'].quantile([0.01, 0.99]).unstack()
capping=x.reset_index()
capping

In [None]:
train_set = train_set.merge(capping, how='left', on='mapped_employment_status_description')
train_set.shape

In [None]:
train_set.rename(columns={0.01:'lower_age_cap',0.99:'upper_age_cap'},inplace=True)

In [None]:
train_set[['age','lower_age_cap','upper_age_cap']]

In [None]:
train_set['age'] = np.where(train_set['age'] > train_set['upper_age_cap'], train_set['upper_age_cap'], train_set['age'])

In [None]:
train_set['age'] = np.where(train_set['age'] < train_set['lower_age_cap'], train_set['lower_age_cap'], train_set['age'])

In [None]:
col=['base_pay_regular_payrate_amount','lower_age_cap','upper_age_cap','country_description','mapped_flex_status_code',
    'mapped_permanent_temporary_code']
train_set.drop(columns=col,axis=1,inplace=True)

In [None]:
train_cat_columns = train_set.select_dtypes(include=['object']).columns
train_num_columns = train_set.select_dtypes(include=['int']).columns
train_float_columns = train_set.select_dtypes(include=['float']).columns

In [None]:
print(train_cat_columns)
print(train_num_columns)
print(train_float_columns)

#### Numeric Imputer for missing values

In [None]:
class CustomNumericImputer:
    """Group-wise median/mean imputer for numeric columns.

    ``fit`` learns, for every combination of the grouping columns, the median (or
    mean) of each column to impute. ``transform`` fills missing values with the
    statistic of the row's group and, where that is unavailable (unseen group or
    missing key), with an overall fallback learned during ``fit``.

    Attributes set by ``fit``
    -------------------------
    using_cols : list      grouping (key) columns
    impute_cols : list     columns whose NaNs are filled
    method : str           'median' or 'mean'
    new_cols : list        names of the statistic columns in ``fit_values``
    fit_values : DataFrame one row per group: key columns + statistic columns
    fallback_values : Series  per-column statistic of the group-imputed training data
    """

    _METHODS = ('median', 'mean')

    def __init__(self):
        # Nothing is configured at construction time; all state is learned in fit().
        pass

    def _statistic(self, frame):
        """Column-wise median or mean of ``frame``, depending on the fitted method."""
        return frame.median() if self.method == 'median' else frame.mean()

    def _group_impute(self, X):
        """Fill NaNs in the impute columns with the fitted per-group statistic."""
        cols = self.using_cols + self.impute_cols
        merged = X[cols].merge(self.fit_values, how='left', on=self.using_cols)
        # merge() returns a fresh 0..n-1 index. fit_values has one row per group, so the
        # left merge keeps row count and order; restoring the caller's index makes the
        # result align correctly when it is assigned back into X.
        merged.index = X.index
        for col, stat_col in zip(self.impute_cols, self.new_cols):
            merged[col] = merged[col].fillna(merged[stat_col])
        return merged[self.impute_cols]

    def fit(self, X, impute_cols, using_cols, method='median'):
        """Learn the per-group statistics and the overall fallback from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing both ``using_cols`` and ``impute_cols``.
        impute_cols : list of str
            Numeric columns to impute.
        using_cols : list of str
            Columns that define the groups.
        method : {'median', 'mean'}

        Returns
        -------
        CustomNumericImputer
            ``self``, fitted.

        Raises
        ------
        ValueError
            If ``method`` is neither 'median' nor 'mean'.
        """
        if method not in self._METHODS:
            raise ValueError(f'Method can be "median" or "mean", got {method!r}')

        self.using_cols = list(using_cols)
        self.impute_cols = list(impute_cols)
        self.method = method
        self.new_cols = [f'{col}_{method}' for col in self.impute_cols]

        grouped = X[self.using_cols + self.impute_cols].groupby(self.using_cols)[self.impute_cols]
        group_stats = grouped.median() if method == 'median' else grouped.mean()
        group_stats = group_stats.rename(columns=dict(zip(self.impute_cols, self.new_cols)))
        # A group whose values are all missing has no statistic of its own; give it the
        # statistic taken across the other groups. This is done before reset_index so
        # that only the numeric statistic columns are aggregated, not the key columns.
        group_stats = group_stats.fillna(self._statistic(group_stats))
        self.fit_values = group_stats.reset_index()

        # Fallback for rows that no group statistic can fill, learned from the fitted
        # data so that transform() never derives fill values from the data it is given.
        self.fallback_values = self._statistic(self._group_impute(X))
        return self

    def fit_transform(self, X, impute_cols, using_cols, method='median'):
        """Fit on ``X`` and return its imputed ``impute_cols`` (indexed like ``X``)."""
        self.fit(X, impute_cols, using_cols, method)
        return self.transform(X)

    def transform(self, X, impute_cols=None):
        """Return the fitted impute columns of ``X`` with missing values filled.

        ``impute_cols`` is accepted for backward compatibility only; the columns
        given to ``fit`` are always the ones imputed and returned, in that order.
        The result carries the same index as ``X``.
        """
        return self._group_impute(X).fillna(self.fallback_values)
         

In [None]:
custom_numeric_imputer = CustomNumericImputer()

In [None]:
train_set[train_num_columns].describe()

In [None]:
train_set[train_float_columns].describe()

In [None]:
train_set[train_float_columns] = custom_numeric_imputer.fit_transform(X=train_set, impute_cols=train_float_columns.tolist(),
                                                              using_cols=['mapped_employment_status_description'], method='median')

In [None]:
custom_numeric_imputer.fit_values

In [None]:
# train_set.isna().sum()

In [None]:
# creating bins on the column base_pay_regular_expectedannualsalary and converting it into a range
bins = [-1, 19999, 39999, 59999, 79999, 99999, 174999,249999, 999999999999]
labels = ['<20,000', '20,000 - 39,999', '40,000 - 59,999', '60,000 - 79,999', '80,000 - 99,999', '100,000-174999', '175000-249999', '>250000']
train_set['base_pay_regular_expected_annual_salary_range'] = pd.cut(x=train_set['base_pay_regular_expectedannualsalary'], bins=bins, labels=labels)

In [None]:
train_set.info()

In [None]:
#Let's drop some columns which are not required
col = ['person_internal_id','client_id','base_pay_regular_expectedannualsalary','mapped_employment_status_description']
train_set.drop(columns=col,axis=1,inplace=True)

In [None]:
train_set.shape

In [None]:
train_set['age'] =train_set['age'].astype('int')

In [None]:
train_set['base_pay_regular_expected_annual_salary_range'] =train_set['base_pay_regular_expected_annual_salary_range'].astype('object')

### Exporting pre-processed train data to S3

In [None]:
print(f'Uploading pre-processed data here -> s3://{bucket}/{pre_processed_train_data_path}')

train_set.to_csv(f's3://{bucket}/{pre_processed_train_data_path}', index=False)

### Reading Test Data

In [None]:
test_set = load_data_from_s3(bucket,raw_data_path,raw_data_X_test)

#### Preprocessing of the Test Set

In [None]:
# Harmonise the spelling/capitalisation variants of the leave-of-absence status labels.
employment_status_fixes = {
    "LOA - Workers Compensation": "LOA - Worker's Compensation",
    "LOA - no Pay": "LOA - No Pay",
    "LOA - with Pay": "LOA - With Pay",
}
test_set['mapped_employment_status_description'] = (
    test_set['mapped_employment_status_description'].replace(employment_status_fixes)
)

In [None]:
#Drop Unnecessary columns
drop_list = ['new_id','new_id_3','udp_global_id', 'state','mapped_employment_status_code', 'mapped_fullpart_description', \
             'mapped_permanent_temporary_description', 'mapped_hourly_salary_description', 'mapped_flex_status_description', 
            'original_hire_date', 'rehire_date', 'termination_date', 'base_pay_regular_frequency_description',
             'base_pay_regular_expectedannualsalary_range']

In [None]:
test_set.drop(columns = drop_list, axis=1, inplace = True)

#### Data cleaning: several columns contain multiple spellings of the same value, so each variant is mapped to a single canonical entry:

In [None]:
Divorce_list = ['Divorced', 'Divorced_United_States_of_America', 'Separated_United_States_of_America', \
                           'Divorced_USA', 'USA_Divorced', 'USA_Separated', 'Separated_USA', 'Separated USA', 'Divorced USA', \
                           'Legally_Separated_United_States_of_America', 'USA-Divorced', 'Widowed_United_States_of_America', 'USA-Separated','Widowed_USA','PR_Divorced']
# Raw marital-status labels that are normalised to 'Single'.
# NOTE(review): 'USA-Single, PR_Single' is a single string literal - it looks like a missing quote merged two
# labels. Both 'USA-Single' and 'PR_Single' are also listed separately, so this entry only matches that exact
# combined text; confirm and remove it if unintended.
Single_list = ['S', 'Single', 'Single_United_States_of_America', 'Single_USA', 'USA-Single', 'USA_Single', 'Single USA',\
                                           'S-USA', 'USA-Single, PR_Single', 'PR_Single']   
Unknown_list = ['Unknown_USA', 'Unknown_United_States_of_America', 'Not_Indicated_United_States_of_America', \
                  'PR_Living Together','USA_Living together','USA_Not Disclosed', 'USA_Not Disclosed','Living_Together_United_States_of_America',
       'Domestic_Partner _United_States_America', 'Civil P_United_States_of_America']
Married_list = ['M','Married_United_States_of_America','Married_USA','Married USA','Married','M_USA','M-USA','USA-Married','USA_Married','Married_USA','Maried','USA-Married/ Civil Partnership','Domestic_Partner_United_States_of_America','Partnered_United_States_of_America','Co-Habiting_United_States_of_America','Partnered USA','D-USA']
values = ['Single_Canada','Single_MEX', 'Single_DEU','Head_of_Household_USA','DE_FACTO','O-USA','MI_NOT_DISCLOSED','Domestic Partner', 'MARITAL_STATUS-3-321', 'Single_COL', 'PR_Partnered','Divorced_United_Kingdom','Single_MEX','MARITAL_STATUS-6-321','Unknown_United_States_of_America','Not_Indicated_United_States_of_America','Not_Disclosed_United_States_of_America','IN_Single',
          'Unknown_USA','MARITAL_STATUS-3-321','Married_MEX', 'S-HKG',
       'Unknown_New_Zealand', 'Unknown_Korea_Republic_of',
       'Married_Saudi_Arabia', 'Single_United Kingdom', 'Single_Thailand',
       'Unknown_United Kingdom', 'Unknown_India', 'Married_Taiwan',
       'Single_Taiwan', 'Single_Hong_Kong', 'Single_Korea_Republic_of', 'P-USA', 'Married_BRA', 'Married_DEU',
       'Unknown_SWE','Single_DOM',
       'Married_CHN', 'Married_ESP', 'Single_THA',
       'Married_Kenya', 'Unknown_Taiwan', 'Married_China',
       'Married_New_Zealand', 'Single0_Indonesia', 'Single_Spain',
       'Unknown_Thailand', 'Unknown_Spain', 'Married_Malaysia',
       'Married2_Indonesia', 'Married_Thailand','MARITAL_STATUS-3-40', 'Married_United_Kingdom',
       'Unknown_Puerto Rico','Single_United Arab Emirates', 'Divorced_Hong_Kong',
       'Civil_Partnership_MEX', 'Married_SGP', 'Unknown_AUS',
        'Single_SGP', 'Married_Spain',
       'Divorced_Singapore', 'Divorced_Canada', 'Married_Italy',
       'Married_MEX', 'Married_AUS', 'Single_ESP', 'Unknown_JPN',
       'PR_Married', 'M-GBR', 'Married_Switzerland', 'Domestic Partner',
       'W-USA', 'DE_FACTO', 'PR_Widowed', 'MI_NOT_DISCLOSED',
       'Married_CAN', 'Divorced_DEU', 'Dissolved_Civil_Partnership_MEX',
       'Single_FRA', 'Single_COL', 'Single_Puerto Rico', 'MARITAL_STATUS-6-301', 'MARITAL_STATUS-6-322', 'Hd Hsehld_United_States_of_America',
       'Common_Law_United_States_of_America', 'Married_Ireland',
       'Common_law_Canada', 'Married_Canada','Domestic_Partner_Canada',
       'U-USA','MARITAL_STATUS-3-323', 'USA-Cohabit', 'Married0_Indonesia', 'Married_Korea_Republic_of', 'Single_China',
       'Married1_Indonesia', 'Single_CAN', 'Married_SWE',
       'Single_Belgium', 'Married_United Kingdom', 'Married_Puerto Rico',
       'Married_ITA', 'Single_BEL', 'Married_BEL', 'Single_Lebanon',
       'Unknown_Kenya', 'M-DEU', 'Unknown_GBR', 'Civil Partner_Belgium',
       'Married_IRL', 'USA-Civil Partnership', 'Married_Hong_Kong',
       'Domestic Partner Civil Union_United Kingdom',
       'Civil_Partnership_COL', 'Unknown_Canada', 'Civil_Partnership_USA',
       'Single_JPN', 'Married_MYS', 'Single_IRL', 'Single_New_Zealand',
       'Married_Belgium', 'Civil_Partnership_United_States_of_America',
       'Married_COL', 'Common_Law_COL', 'Married_Austria', 'Single_GBR',
       'Married_GBR', 'Single_SWE', 'Married_JPN', 'Unknown_MLT',
       'Married_NZL', 'Unknown_CHN', 'PR_Not Disclosed', 'R-USA', 'RDP',
       'S-GBR', 'Married_Puerto_Rico', 'Married_Singapore', 'S-CHN',
       'Married_United Arab Emirates', 'M-SGP', 'Single_Singapore',
       'Separated_MLT', 'USA_Common-law', 'C-USA', 'S-ARE', 'S-SGP',
       'Single_CHN', 'Single_United_Kingdom', 'M-CAN', 'M-IND',
       'Single_BRA', 'Divorced_ESP', 'M-AUS', 'USA-Widowed',
       'PRI-Married', 'Single_Argentina', 'Unspecified_United_Kingdom',
       'Domestic_Partnered_United_States_of_America',
       'Co-Habiting_United_Kingdom', 'Married_Turkey',
       'Registered_Partnership_United_States_of_America',
       'Unknown_Malaysia', 'USA-Unknown', 'USA_Widowed', 'M-HKG']

In [None]:
# Collapse every raw marital-status spelling into its canonical label.
# The groups are applied in the same order as before: Divorced, Single, Married, unknown.
for raw_labels, canonical_label in (
    (Divorce_list, 'Divorced'),
    (Single_list, 'Single'),
    (Married_list, 'Married'),
    (Unknown_list, 'unknown'),
):
    label_matches = test_set['marital_status'].isin(raw_labels)
    test_set.loc[label_matches, 'marital_status'] = canonical_label

In [None]:
# Drop every row whose marital status is one of the labels listed in `values`;
# rows with a missing marital status do not match and are therefore kept.
keep_row = ~test_set['marital_status'].isin(values)
test_set = test_set[keep_row]

In [None]:
test_set['country_description'] = test_set['country_description'].replace(['United States of America', 'United States', 'UNITED STATES', 'USA'],'United States of America')
test_set = test_set[test_set['country_description'] == 'United States of America']

In [None]:
test_set['mapped_fullpart_code'] = test_set['mapped_fullpart_code'].replace(['FT','PT'],['FLTM', 'PRTM'])
fullpart_code_remove_list = ['DNM', 'ERROR']
test_set = test_set[test_set.mapped_fullpart_code.isin(fullpart_code_remove_list) == False]

In [None]:
test_set['mapped_permanent_temporary_code'] = test_set['mapped_permanent_temporary_code'].replace(['P','T'],['PERM', 'TEMP'])
pertemp_code_remove_list = ['DNM', 'R', 'F', 'FR']
test_set= test_set[test_set['mapped_permanent_temporary_code'].isin(pertemp_code_remove_list) == False]

In [None]:
test_set['mapped_hourly_salary_code'] = test_set['mapped_hourly_salary_code'].replace(['S', 'H'],['SLRY', 'HRLY'])
hr_code_remove_list = ['DNM', 'A', 'E', '1', 'L', 'N']
test_set = test_set[test_set['mapped_hourly_salary_code'].isin(hr_code_remove_list) == False]

In [None]:
test_set['mapped_flex_status_code'] = test_set['mapped_flex_status_code'].replace(['INELIG'],['NOTELIGIBLE'])
flex_code_remove_list = ['DNM', 'ACTIVE', 'INACTIVE', 'TSDACT', 'RETIRE', 'HMFT']
test_set = test_set[test_set['mapped_flex_status_code'].isin(flex_code_remove_list) == False]

In [None]:
male = ['M', 'Male', 'male', 'Gender_Male']
female = ['F', 'Female', 'female', 'Gender_Female']
unknown = ['U', 'Unknown', 'Not_declared', 'D', 'Decline to answer', 'Undeclared',
           'Not specified', 'Declined to State', 'N', 'O', 'Not Declared']

In [None]:
test_set.loc[test_set['gender'].isin(male), 'gender'] = 'male'
test_set.loc[test_set['gender'].isin(female), 'gender'] = 'female'
test_set.loc[test_set['gender'].isin(unknown), 'gender'] = 'unknown'

#### Test set transformation (cleaning steps)

In [None]:
# Replace missing values in the categorical columns with an explicit 'unknown' category.
# The filled column is assigned back to the frame instead of calling
# `test_set[col].fillna(..., inplace=True)`: that chained in-place form operates on an
# intermediate Series and does not update the parent DataFrame under pandas Copy-on-Write.
unknown_fill_columns = [
    'gender',
    'marital_status',
    'is_union',
    'platform_indicator_code',
    'mapped_fullpart_code',
    'mapped_hourly_salary_code',
    'is_rehire',
    'base_pay_regular_frequency_code',
]
for column in unknown_fill_columns:
    test_set[column] = test_set[column].fillna('unknown')

In [None]:
test_set = test_set.merge(capping, how='left', on='mapped_employment_status_description')
test_set.shape

In [None]:
test_set.rename(columns={0.01:'lower_age_cap',0.99:'upper_age_cap'},inplace=True)

In [None]:
test_set['age'] = np.where(test_set['age'] > test_set['upper_age_cap'], test_set['upper_age_cap'], test_set['age'])

In [None]:
test_set['age'] = np.where(test_set['age'] < test_set['lower_age_cap'], test_set['lower_age_cap'], test_set['age'])

In [None]:
col=['base_pay_regular_payrate_amount','lower_age_cap','upper_age_cap','country_description','mapped_flex_status_code',
    'mapped_permanent_temporary_code']
test_set.drop(columns=col,axis=1,inplace=True)

In [None]:
test_cat_columns = test_set.select_dtypes(include=['object']).columns
test_num_columns = test_set.select_dtypes(include=['int']).columns
test_float_columns = test_set.select_dtypes(include=['float']).columns

In [None]:
print(test_cat_columns)
print(test_num_columns)
print(test_float_columns)

#### Numeric Imputer

In [None]:
# transform() always returns the columns the imputer was fitted on, in that order, and
# ignores the `impute_cols` argument. Assign back to exactly those columns rather than to
# `test_float_columns`: assigning a DataFrame to a list of column keys is positional, so any
# difference in the test frame's float columns or their order would otherwise fail or write
# values into the wrong columns.
fitted_impute_cols = custom_numeric_imputer.impute_cols
test_set[fitted_impute_cols] = custom_numeric_imputer.transform(X=test_set, impute_cols=fitted_impute_cols)
test_set.shape

In [None]:
test_set[test_float_columns].isna().sum()

In [None]:
# creating bins on the column base_pay_regular_expectedannualsalary and converting it into a range
bins = [-1, 19999, 39999, 59999, 79999, 99999, 174999,249999, 999999999999]
labels = ['<20,000', '20,000 - 39,999', '40,000 - 59,999', '60,000 - 79,999', '80,000 - 99,999', '100,000-174999', '175000-249999', '>250000']
test_set['base_pay_regular_expected_annual_salary_range'] = pd.cut(x=test_set['base_pay_regular_expectedannualsalary'], bins=bins, labels=labels)

In [None]:
test_set['age'] = test_set['age'].astype(int)

In [None]:
test_set['base_pay_regular_expected_annual_salary_range'] = test_set['base_pay_regular_expected_annual_salary_range'].astype('object')

In [None]:
test_set.shape

In [None]:
test_set.isna().sum()

In [None]:
col = ['person_internal_id','client_id','base_pay_regular_expectedannualsalary','mapped_employment_status_description']
new_test_set=test_set.drop(columns=col,axis=1)

In [None]:
new_test_set.shape

In [None]:
new_test_set.head(2)

### Exporting pre-processed test data to S3

In [None]:
print(f'Uploading pre-processed data here -> s3://{bucket}/{pre_processed_test_data_path}')

new_test_set.to_csv(f's3://{bucket}/{pre_processed_test_data_path}', index=False)

#### One-hot encoding the categorical variables in the training dataset:

In [None]:
!pip install category_encoders --index-url https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple --trusted-host=artifactory.alight.com

In [None]:
import category_encoders as ce

#### Separating out dependent and independent variables from the train data:

In [None]:
X_train = train_set.drop(columns=['mapped_employment_status_code_label'], axis=1)
y_train = train_set['mapped_employment_status_code_label']

In [None]:
X_train.shape

In [None]:
cat_columns = X_train.select_dtypes(include=['object']).columns

In [None]:
encoder = ce.OneHotEncoder(cols=cat_columns, drop_invariant = True)

In [None]:
X_train = encoder.fit_transform(X_train)

In [None]:
#Hot encoded data
X_train.head(2)

#### One-hot encoding the test data:

In [None]:
X_test = encoder.transform(new_test_set)

### Loading Trained Model

In [None]:
!pip install xgboost --index-url https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple --trusted-host=artifactory.alight.com

In [None]:
# READ: download the serialized model from S3 into a temporary file and deserialize it with joblib.
s3_client = boto3.client('s3')

# The temporary file is closed (and discarded) automatically when the `with` block exits.
with tempfile.TemporaryFile() as fp:
    s3_client.download_fileobj(Fileobj=fp, Bucket=bucket, Key=model_path_with_fname)
    # Rewind so that joblib reads the file from its beginning rather than from where the download stopped writing.
    fp.seek(0)
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary code - only load
    # model artifacts from a trusted bucket/key.
    model = joblib.load(fp)

In [None]:
print(model)

In [None]:
pred = model.predict(X_test)

In [None]:
result = pd.DataFrame({'person_internal_id':test_set['person_internal_id'],'prediction':pred})
result.shape

In [None]:
print(f'Uploading prediction results here -> s3://{bucket}/{prediction_output_path_with_fname}')

result.to_csv(f's3://{bucket}/{prediction_output_path_with_fname}', index=False)

### Artifacts

In [None]:
# Summary of where every artifact of this run lives in S3.
# Each raw file is listed on its own line: interpolating the `raw_data_train` list directly
# would print its Python repr (e.g. "['train_raw_data.csv']") inside the S3 URI.
raw_train_uris = '\n\t'.join(f's3://{bucket}/{raw_data_path}/{fname}' for fname in raw_data_train)
print(f'Raw Data is available at ->\n\t{raw_train_uris}')
print(f'\nPre-processed train data is available at ->\n\ts3://{bucket}/{pre_processed_train_data_path}')
# This line reports the test-data path, so it is labelled as such.
print(f'\nPre-processed test data is available at ->\n\ts3://{bucket}/{pre_processed_test_data_path}')
print(f'\nPrediction results is available at ->\n\ts3://{bucket}/{prediction_output_path_with_fname}')