* This file is to explore the dataset used and clean in terms of duplicates. Final Data is saved in S3. 
* The two files used are Hype_ML_demographics_2022_06_01_LOA_AC.csv and Hype_ML_demographics_2022_06_01_LOA_ACTV_AC.csv.
* The difference in two files is just in one column- mapped_employment_status_code. Hype_ML_demographics_2022_06_01_LOA_AC includes LOA related data and Hype_ML_demographics_2022_06_01_LOA_ACTV_AC.csv includes Active employees data.

In [None]:
!pip install --upgrade pip --index-url https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple

In [3]:
!pip install --upgrade pandas --index-url https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple

Looking in indexes: https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple
[0m

In [4]:
!pip install --upgrade numpy --index-url https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple

Looking in indexes: https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple
[0m

In [5]:
# Mention all file names as string in the below list which are available at "raw_data_path" directory

data_path = f'external/artichauhan/LOA/loa_script/Data'

raw_data_path = f'{data_path}/raw_data'

raw_data_fnames = ['Hype_ML_demographics_2022_06_01_LOA_AC.csv','Hype_ML_demographics_2022_06_01_LOA_ACTV_AC.csv']

bucket='adl-core-sagemaker-studio'

In [6]:
#importing librarires
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import boto3
import io
import time

from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score,\
roc_auc_score, make_scorer, plot_precision_recall_curve, plot_roc_curve, plot_confusion_matrix, average_precision_score,\
ConfusionMatrixDisplay


import tempfile
import boto3
import joblib

%matplotlib inline
pd.set_option('display.max_columns',None)
warnings.filterwarnings('ignore')
s3 = boto3.client('s3')

In [7]:
def load_data_from_s3(bucket,raw_data_path,raw_data_fnames):
    s3 = boto3.client('s3')
    dataset_nrows = []
    for i, fname in enumerate(raw_data_fnames):
        if i==0:
            print(f'Reading file: {fname}')
            key = f'{raw_data_path}/{fname}'
            obj = s3.get_object(Bucket=bucket, Key=key)
            data = pd.read_csv(io.BytesIO(obj['Body'].read()))
            dataset_nrows.append(data.shape[0])
            print(f'\tFile read successfully | Shape: {data.shape}')
        else:
            print(f'Reading file: {fname}')
            key = f'{raw_data_path}/{fname}'
            obj = s3.get_object(Bucket=bucket, Key=key)
            data2 = pd.read_csv(io.BytesIO(obj['Body'].read()))
            data = data.append(data2,ignore_index=True)
            dataset_nrows.append(data2.shape[0])
            print(f'\tFile read successfully | Shape: {data2.shape}')

    if sum(dataset_nrows) == data.shape[0]:
        print(f'Data from all files loaded successfully | Final Shape: {data.shape}')
        return data.copy()
    else:
        print('There is discrepency in numbers')
        print(f'\tTotal number of rows combined in all files: {sum(dataset_nrows)}')
        print(f'\tAfter combining all files total number of rows are: {data.shape[0]}')
        return None

In [8]:
DF = load_data_from_s3(bucket,raw_data_path,raw_data_fnames)

Reading file: Hype_ML_demographics_2022_06_01_LOA_AC.csv
	File read successfully | Shape: (301406, 33)
Reading file: Hype_ML_demographics_2022_06_01_LOA_ACTV_AC.csv
	File read successfully | Shape: (1000000, 33)
Data from all files loaded successfully | Final Shape: (1301406, 33)


In [9]:
DF.duplicated().sum()

197

In [10]:
DF['cpid']=DF['person_internal_id']+'/'+ DF['client_id'].astype(str)

In [11]:
DF['cpid'].duplicated().sum()3

533

In [12]:
DF.iloc[:,:2].duplicated().sum()

505

In [13]:
value_loa=DF['cpid'].value_counts()

In [14]:
value_loa.sort_values(ascending=False)

267747321/3979     6
219351251/1362     4
599477890/1362     3
324334/1012        2
353268/1012        2
                  ..
0D7259/3099        1
593241747/14470    1
606214205/936      1
33829552/7538      1
582636779/16878    1
Name: cpid, Length: 1300872, dtype: int64

In [15]:
new_df_loa = pd.DataFrame(value_loa)

In [16]:
new_df_loa = new_df_loa.reset_index()

In [17]:
new_df_loa['TF'] = new_df_loa['cpid']>1

In [18]:
new_df_loa.head(2)

Unnamed: 0,index,cpid,TF
0,267747321/3979,6,True
1,219351251/1362,4,True


In [19]:
True_df_loa =new_df_loa[new_df_loa['TF']==True]

In [20]:
True_df_loa.shape

(477, 3)

In [21]:
duplicated_df_loa = pd.merge(DF,True_df_loa,left_on='cpid',right_on='index',how='inner')

In [22]:
duplicated_df_loa.shape

(961, 37)

In [23]:
duplicated_df_loa.columns

Index(['person_internal_id', 'client_id', 'udp_global_id', 'age', 'gender',
       'marital_status', 'is_union', 'primary_language_code',
       'country_description', 'state', 'platform_indicator_code',
       'mapped_employment_status_code', 'mapped_employment_status_description',
       'mapped_fullpart_code', 'mapped_fullpart_description',
       'mapped_permanent_temporary_code',
       'mapped_permanent_temporary_description', 'mapped_hourly_salary_code',
       'mapped_hourly_salary_description', 'subsidiary_code',
       'subsidiary_description', 'mapped_flex_status_code',
       'mapped_flex_status_description', 'original_hire_date', 'rehire_date',
       'is_rehire', 'termination_date', 'base_pay_regular_payrate_amount',
       'base_pay_regular_frequency_code',
       'base_pay_regular_frequency_description',
       'base_pay_regular_expectedannualsalary',
       'base_pay_regular_expectedannualsalary_range',
       'annual_benefits_base_rate', 'cpid_x', 'index', 'cpid_y', '

In [24]:
duplicated_df_loa['cpid_x'].value_counts()

267747321/3979    6
219351251/1362    4
599477890/1362    3
371142/1012       2
182297679/695     2
                 ..
820512562/695     2
479907352/7544    2
514330/1012       2
673173432/695     2
129742526/7825    2
Name: cpid_x, Length: 477, dtype: int64

In [25]:
duplicated_df_loa.nunique()

person_internal_id                             477
client_id                                       24
udp_global_id                                  493
age                                             52
gender                                           6
marital_status                                  37
is_union                                         1
primary_language_code                            2
country_description                             23
state                                           43
platform_indicator_code                          5
mapped_employment_status_code                    8
mapped_employment_status_description             7
mapped_fullpart_code                             4
mapped_fullpart_description                      3
mapped_permanent_temporary_code                  4
mapped_permanent_temporary_description           3
mapped_hourly_salary_code                        4
mapped_hourly_salary_description                 3
subsidiary_code                

In [26]:
DF['duplicated_loa'] = DF['cpid'].duplicated()

In [27]:
DF[DF['duplicated_loa']==True].shape

(533, 35)

In [28]:
DF[DF['duplicated_loa']==True].nunique()

person_internal_id                             477
client_id                                       51
udp_global_id                                  526
age                                             52
gender                                           6
marital_status                                  37
is_union                                         1
primary_language_code                            2
country_description                             22
state                                           38
platform_indicator_code                          5
mapped_employment_status_code                    9
mapped_employment_status_description             8
mapped_fullpart_code                             5
mapped_fullpart_description                      4
mapped_permanent_temporary_code                  4
mapped_permanent_temporary_description           3
mapped_hourly_salary_code                        4
mapped_hourly_salary_description                 3
subsidiary_code                

In [29]:
Hype_LOA_Active = DF[DF['duplicated_loa']!=True]

In [30]:
Hype_LOA_Active.shape

(1300873, 35)

In [31]:
Hype_LOA_Active.duplicated().sum()

0

In [32]:
Hype_LOA_Active.columns

Index(['person_internal_id', 'client_id', 'udp_global_id', 'age', 'gender',
       'marital_status', 'is_union', 'primary_language_code',
       'country_description', 'state', 'platform_indicator_code',
       'mapped_employment_status_code', 'mapped_employment_status_description',
       'mapped_fullpart_code', 'mapped_fullpart_description',
       'mapped_permanent_temporary_code',
       'mapped_permanent_temporary_description', 'mapped_hourly_salary_code',
       'mapped_hourly_salary_description', 'subsidiary_code',
       'subsidiary_description', 'mapped_flex_status_code',
       'mapped_flex_status_description', 'original_hire_date', 'rehire_date',
       'is_rehire', 'termination_date', 'base_pay_regular_payrate_amount',
       'base_pay_regular_frequency_code',
       'base_pay_regular_frequency_description',
       'base_pay_regular_expectedannualsalary',
       'base_pay_regular_expectedannualsalary_range',
       'annual_benefits_base_rate', 'cpid', 'duplicated_loa'],
   

In [33]:
col = ['cpid','duplicated_loa']
Hype_LOA_Active.drop(columns=col,axis=1,inplace=True)

In [34]:
Hype_LOA_Active.shape

(1300873, 33)

In [33]:
# print(f'Uploading pre-processed data here -> s3://{bucket}/{raw_data_path}')

# Hype_LOA_Active.to_csv(f's3://{bucket}/{raw_data_path}/preprocessed_raw_data.csv', index=False)

Uploading pre-processed data here -> s3://adl-core-sagemaker-studio/external/artichauhan/LOA/loa_script/Data/raw_data
