In [1]:
import os
# Project root, expressed relative to the directory the kernel started in.
root_dir ='../'
# Record the starting directory only once per kernel session, so re-running this
# cell always lands in the same place instead of climbing one level higher on
# every execution (a bare relative os.chdir is not idempotent).
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, root_dir))

In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to local modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
# Directory holding the pickled artifacts loaded later in this notebook
# (risk_map.pkl, txt_value_idx_map.pkl and bst.pkl).
model_dir = './model_checkpoint'

### load package

In [4]:
import pandas as pd
import pickle
import copy

In [5]:
# Let pandas render up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

### load test dataset

In [6]:
# Path to the Excel workbook to score, relative to the current working directory.
test_path ='./data/testset-for-participants.xlsx'
# Read the workbook into the DataFrame that every later cell transforms.
test_df = pd.read_excel(test_path)

### feature processing

In [7]:
from data_processing import processing

# Helper object from the project-local data_processing package; later cells call
# its operate_hour and timedelta_day methods on the timestamp columns.
p_conn =processing.processing()

#### missing value

In [8]:
# Numeric placeholder for missing values; matches the value used in the fillna step below.
missing_value =-999
# Fallback category assigned when a value has no entry in the risk map.
missing_label ='missing'

In [9]:
# Spellings that stand for "no value"; compared against the lowercase string form of each cell.
null_tokens = ('none', 'non', 'nan')


def _blank_out_null_token(cell):
    """Return None when the cell's lowercase string form is a null token, else the cell unchanged."""
    if str(cell).lower() in null_tokens:
        return None
    return cell


# Normalise every column so textual null markers become real nulls.
for column_name in test_df.columns:
    test_df[column_name] = test_df[column_name].apply(_blank_out_null_token)

In [10]:
# Replace every remaining null with the shared sentinel. Using the missing_value
# constant (instead of repeating the literal -999) keeps the two from drifting
# apart, and rebinding the result avoids in-place mutation of the frame.
test_df = test_df.fillna(missing_value)

#### discrete features -more than 10

In [11]:
# Categorical columns handled under the "more than 10" heading; each one is
# re-coded through the pickled risk_map in the following cells.
discrete_cols_over_10 =['CARR_NAME','RGN_NAME','STATE_PRVNC_TXT','CUST_STATE']

In [12]:
# Load the per-column mapping used to re-code the columns in discrete_cols_over_10.
# The context manager closes the file handle as soon as the pickle is read
# (the bare open() call left it to the garbage collector).
# NOTE: pickle.load can execute arbitrary code; only load checkpoints from a trusted source.
with open(f"{model_dir}/risk_map.pkl", "rb") as risk_map_file:
    risk_map = pickle.load(risk_map_file)

In [13]:
# reduce dimension with risk ratio
for idx,col in enumerate(discrete_cols_over_10):
    print(f"...{col}")
    target_col = f"{col}_bin_10_feature"
    value_map =risk_map[col]
    test_df[target_col] = test_df[col].apply(lambda x : value_map.get(x) if x in value_map.keys() else missing_label)

...CARR_NAME
...RGN_NAME
...STATE_PRVNC_TXT
...CUST_STATE


####  discrete features -less than 10

    no treatment needed

#### datetime format features

In [14]:
# Timestamp columns that are passed one by one to p_conn.operate_hour below.
ts_cols =['PWD_UPDT_TS','PH_NUM_UPDT_TS', 'TRAN_TS']

##### operating hour 

In [15]:
# Run the project helper over every timestamp column, rebinding test_df to the
# frame it returns. Presumably this adds a "<col>_hour" column (later cells read
# PH_NUM_UPDT_TS_hour and PWD_UPDT_TS_hour) - confirm in data_processing.
# The unused loop index and unused target_col local were dropped.
for col in ts_cols:
    test_df = p_conn.operate_hour(test_df, col)

##### gap between an update operation, such as a password or phone number change, and the transaction

In [16]:
# For each account-update timestamp, store the value p_conn.timedelta_day returns
# for (transaction timestamp, update timestamp) in a "<column>_day" feature,
# evaluated row by row.
for update_col in ('PWD_UPDT_TS', 'PH_NUM_UPDT_TS'):
    test_df[f'{update_col}_day'] = test_df.apply(
        lambda row: p_conn.timedelta_day(row['TRAN_TS'], row[update_col]),
        axis=1,
    )

### feature label to idx

In [17]:
# Load the mapping from feature values to the integer indices applied to the
# discrete features below. The context manager closes the file handle as soon as
# the pickle is read (the bare open() call never closed it explicitly).
# NOTE: pickle.load can execute arbitrary code; only load checkpoints from a trusted source.
with open(f'{model_dir}/txt_value_idx_map.pkl', 'rb') as idx_map_file:
    txt_value_idx_map = pickle.load(idx_map_file)

In [18]:
# Columns whose values are replaced by integer indices via txt_value_idx_map in the next cell.
# NOTE(review): CUST_STATE_bin_10_feature and PWD_UPDT_TS_day are built earlier in this
# notebook but are not listed here - confirm that matches the training feature set.
discrete_features =['CARR_NAME_bin_10_feature','PH_NUM_UPDT_TS_hour','RGN_NAME_bin_10_feature',
                   'STATE_PRVNC_TXT_bin_10_feature','PWD_UPDT_TS_hour','PH_NUM_UPDT_TS_day','DVC_TYPE_TXT']

# Columns that receive no index encoding in this notebook.
continuous_features =['ACCT_PRE_TRAN_AVAIL_BAL','TRAN_AMT','OPEN_ACCT_CT','WF_dvc_age',]


# Full feature list, in the column order handed to xgb_model.predict.
filterd_features = discrete_features+continuous_features

In [19]:
def _label_to_index(label):
    """Look up the integer index stored for a feature value; raises KeyError when it is unknown."""
    return txt_value_idx_map[label]


# Swap every discrete feature value for its integer index, echoing each column name as it is processed.
for feature_name in discrete_features:
    print(feature_name)
    test_df[feature_name] = test_df[feature_name].apply(_label_to_index)

CARR_NAME_bin_10_feature
PH_NUM_UPDT_TS_hour
RGN_NAME_bin_10_feature
STATE_PRVNC_TXT_bin_10_feature
PWD_UPDT_TS_hour
PH_NUM_UPDT_TS_day
DVC_TYPE_TXT


### model predict

In [20]:
# Load the pickled model used for prediction. The context manager closes the
# file handle as soon as the pickle is read (the bare open() call never closed it).
# NOTE: pickle.load can execute arbitrary code; only load checkpoints from a trusted source.
with open(f"{model_dir}/bst.pkl", 'rb') as model_file:
    xgb_model = pickle.load(model_file)

In [21]:
# Model output class index -> human-readable label.
idx2label_map ={1:'Fraud',0:'Non-Fraud'}

# Label -> value written to the FRAUD_NONFRAUD output column. Note the encoding is
# inverted relative to the model's class index (Fraud is 0 here).
# Presumably this is the encoding the submission format requires - confirm.
label2requirement ={'Fraud':0,'Non-Fraud':1}

In [22]:
# Score the selected feature columns with the loaded model.
raw_predictions = xgb_model.predict(test_df[filterd_features])

# Translate class index -> label -> output encoding in a single pass.
predict_y = [
    label2requirement[idx2label_map[class_idx]]
    for class_idx in raw_predictions
]

test_df['FRAUD_NONFRAUD'] = predict_y

In [23]:
# Rule-based override: per the statistics gathered on the training set, a
# 'bad-case' value in PH_NUM_UPDT_TS_hour or PWD_UPDT_TS_hour is always treated
# as fraud, regardless of what the model predicted.
bad_case_idx = txt_value_idx_map['bad-case']
is_bad_case = (
    (test_df['PH_NUM_UPDT_TS_hour'] == bad_case_idx)
    | (test_df['PWD_UPDT_TS_hour'] == bad_case_idx)
)
# Take the fraud code from label2requirement rather than a bare 0, so the
# override cannot drift from the encoding used for the model predictions.
test_df.loc[is_bad_case, 'FRAUD_NONFRAUD'] = label2requirement['Fraud']

In [24]:
# Distribution of the final predictions (0 = Fraud, 1 = Non-Fraud per label2requirement).
test_df['FRAUD_NONFRAUD'].value_counts()

1    4141
0    1859
Name: FRAUD_NONFRAUD, dtype: int64

In [25]:
# Write the row identifier and its prediction to predict_result.csv in the
# current working directory, omitting the DataFrame index.
test_df[['dataset_id','FRAUD_NONFRAUD']].to_csv('predict_result.csv',index=False)