    Pre-process the Amazon access datasets: sanity-check the labels, then write the train, validation and test CSV files.

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os

import re
from typing import Dict, Tuple

import numpy as np

In [None]:
import random
# Seed Python's built-in RNG. The train/validation split further down does not
# rely on it: train_test_split is given its own random_state.
random.seed(42)

## Check sanity of labels

In [None]:
def put_labels(df: pd.DataFrame, threshold: float = 0.21) -> pd.DataFrame:
  """Binarise the ``ACTION`` column of ``df`` around ``threshold``.

  Values strictly below ``threshold`` become 0; every other value becomes 1.
  NaN also becomes 1, because the ``<`` comparison is false for it.

  Args:
    df: Frame containing an ``ACTION`` column of numeric scores.
    threshold: Cut-off separating label 0 from label 1. Defaults to 0.21,
      the value that was previously hard-coded.

  Returns:
    A new DataFrame with the same columns as ``df`` in the same order, with
    ``ACTION`` replaced by int64 0/1 labels. ``df`` itself is not modified.

  Raises:
    KeyError: If ``df`` has no ``ACTION`` column.
  """
  below_threshold = df['ACTION'].lt(threshold)
  # Negate the "<" mask instead of using ">=" so NaN scores keep mapping to 1.
  return df.assign(ACTION=(~below_threshold).astype('int64'))

# Root directory used for both the raw inputs and the generated CSV files.
parent_path = "/DATA/"

# Binarise each label file and rename ACTION in one chain: the train labels
# become "ACTION_pred", the test labels become "access_granted".
train_label = (
  put_labels(pd.read_csv(parent_path + "amazon/train_label1.csv"))
  .rename(columns={'ACTION': 'ACTION_pred'})
)
test_label = (
  put_labels(pd.read_csv(parent_path + "amazon/test_label1.csv"))
  .rename(columns={'ACTION': 'access_granted'})
)

In [None]:
train_data_path = parent_path + "amazon/data.csv"

# Load the ground truth and place the binarised train labels beside it.
# NOTE(review): the column-wise concat aligns on the row index, so this assumes
# data.csv and the label file list the same rows in the same order - TODO confirm.
ground_truth = pd.read_csv(train_data_path).rename(columns={'ACTION': 'access_granted'})
train_data = pd.concat([ground_truth, train_label], axis=1)

# Rows whose ACTION_pred label disagrees with access_granted.
disagreement = train_data['ACTION_pred'] != train_data['access_granted']
mismatched_rows = train_data[disagreement]
mismatch_pct = (len(mismatched_rows) / len(train_data)) * 100

print(len(mismatched_rows), " mismatched instanes")
print(f"Misclassification percentage: {mismatch_pct:.2f} %")
print(f"Accuracy:  {100 - mismatch_pct:.2f} %")

538  mismatched instanes
Misclassification percentage: 1.64 %
Accuracy:  98.36 %


In [None]:
test_data_path = parent_path + "amazon/test.csv"

# Read the test features and attach the binarised test labels column-wise.
test_features = pd.read_csv(test_data_path)
test_data = pd.concat([test_features, test_label], axis=1)

## Merge Data

In [None]:
# Validation data
# Remove the columns that were only needed for the label sanity check, then
# hold out 10 % of the rows (shuffled, fixed random_state) for validation.
train_data = train_data.drop(['ACTION_pred', 'id'], axis=1)
train_dataset, val_data = train_test_split(
  train_data,
  test_size=0.1,
  shuffle=True,
  random_state=42,
)

In [None]:
# Write the validation split under parent_path, leaving out the index column.
file_name = 'amazon_val_data.csv'
output_file = os.path.join(parent_path, file_name)
val_data.to_csv(path_or_buf=output_file, index=False)

print('VAL Original shape: ', val_data.shape)

VAL Original shape:  (3277, 10)


In [None]:
print('TEST Original shape: ', test_data.shape)
# Dropping by label removes every column named 'id'; the recorded output shows
# the column count going from 12 to 10 at this step.
test_data = test_data.drop(['id'], axis=1)
print('Final shape: ', test_data.shape)

# Write the test split under parent_path, leaving out the index column.
file_name = 'amazon_test_data.csv'
output_file = os.path.join(parent_path, file_name)
test_data.to_csv(path_or_buf=output_file, index=False)

TEST Original shape:  (58921, 12)
Final shape:  (58921, 10)


In [None]:
print('TRAIN Original shape: ', train_dataset.shape)
# Keep only the rows whose access_granted label is non-zero (to remove the noise).
# NOTE(review): in the recorded run the shape is identical before and after
# this filter - confirm the filter is still needed.
non_zero_label = train_dataset['access_granted'].ne(0)
train_dataset = train_dataset[non_zero_label]
print('Final shape: ', train_dataset.shape)

# Write the training split under parent_path, leaving out the index column.
file_name = 'amazon_train_data.csv'
output_file = os.path.join(parent_path, file_name)
train_dataset.to_csv(path_or_buf=output_file, index=False)

TRAIN Original shape:  (29492, 10)
Final shape:  (29492, 10)


In [None]:
# Stack the three splits row-wise; pd.concat aligns the columns by name.
full_data = pd.concat([train_dataset, val_data, test_data], axis=0)
full_data = full_data.reset_index(drop=True)
print(full_data.shape)
# drop_duplicates() returns a new frame rather than modifying full_data, so the
# result must be re-assigned for the second shape print to show its effect.
full_data = full_data.drop_duplicates().reset_index(drop=True)
print(full_data.shape)

# Report how many distinct values each column holds, skipping access_granted.
for column in full_data.columns:
  # Compare by equality: `in ('access_granted')` is a substring test against a
  # plain string (the parentheses do not make a tuple), so any column whose
  # name is a substring of 'access_granted' would have been skipped as well.
  if column != 'access_granted': #'userID', 'resourceID',
    unique_values = full_data[column].unique()
    print(f"Unique values in column '{column}': {len(unique_values)}")

(91690, 10)
(91690, 10)
Unique values in column 'RESOURCE': 7518
Unique values in column 'MGR_ID': 4913
Unique values in column 'ROLE_ROLLUP_1': 130
Unique values in column 'ROLE_ROLLUP_2': 183
Unique values in column 'ROLE_DEPTNAME': 476
Unique values in column 'ROLE_TITLE': 361
Unique values in column 'ROLE_FAMILY_DESC': 2951
Unique values in column 'ROLE_FAMILY': 68
Unique values in column 'ROLE_CODE': 361


In [None]:
# Preview the first rows of train_data (the frame as it was before the train/validation split).
train_data.head()

Unnamed: 0,access_granted,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325


In [None]:
# Preview the first rows of test_data.
test_data.head()

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE,access_granted
0,78766,72734,118079,118080,117878,117879,118177,19721,117880,1
1,40644,4378,117961,118327,118507,118863,122008,118398,118865,1
2,75443,2395,117961,118300,119488,118172,301534,249618,118175,1
3,43219,19986,117961,118225,118403,120773,136187,118960,120774,1
4,42093,50015,117961,118343,119598,118422,300136,118424,118425,1


## DO NOT RUN BOTH CODE VERSIONS AT THE SAME TIME

In [None]:
# from sklearn.model_selection import train_test_split
# import pandas as pd
# import os

# import re
# from typing import Dict, Tuple

# import random
# import numpy as np
# ##############################################

# parent_path = "/DATA/"
# train_data_path = parent_path + "amazon/data.csv"
# train_data = pd.read_csv(train_data_path)

# train_dataf, test_data = train_test_split(train_data, test_size=0.2, random_state=42, shuffle=True)
# for run_script, df in ('train', train_dataf), ('test', test_data):
#   if run_script == 'train':
#     print('Original shape: ',df.shape)
#     df = df[df['ACTION'] != 0]
#     print('Final shape: ',df.shape)

#   df = df.rename(columns={'ACTION':'access_granted'})
#   file_name = f'amazon_{run_script}_data.csv'
#   output_file = os.path.join(parent_path, file_name)
#   df.to_csv(output_file, index=False)
#   # print(df.head())