In [1]:
import os
root_dir ='../'
os.chdir(root_dir)

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
model_dir = './model_checkpoint'

### load package

In [4]:
import pandas as pd
import pickle
import copy

In [5]:
pd.set_option('display.max_columns', 50)

### load training dataset

In [6]:
train_path ='./data/trainset-281-29.xlsx'
train_df = pd.read_excel(train_path)

### feature processing

In [7]:
from data_processing import processing

p_conn =processing.processing()

#### missing value

In [8]:
missing_value =-999
missing_label ='missing'

In [9]:
for col in train_df.columns:
    train_df[col] =train_df[col].apply(lambda x : None if str.lower(str(x)) in ['none','non','nan'] else x)

In [10]:
train_df.fillna(-999,inplace=True)

#### discrete features -more than 10

In [11]:
discrete_cols_over_10 =['CARR_NAME','RGN_NAME','STATE_PRVNC_TXT','CUST_STATE']

In [12]:
risk_map =pickle.load(open(f"{model_dir}/risk_map.pkl", "rb"))

In [13]:
# reduce dimension with risk ratio
for idx,col in enumerate(discrete_cols_over_10):
    print(f"...{col}")
    target_col = f"{col}_bin_10_feature"
    value_map =risk_map[col]
    train_df[target_col] = train_df[col].apply(lambda x : value_map.get(x))

...CARR_NAME
...RGN_NAME
...STATE_PRVNC_TXT
...CUST_STATE


####  discrete features -less than 10

    none treatment

#### datetime formate features

In [14]:
ts_cols =['PWD_UPDT_TS','PH_NUM_UPDT_TS', 'TRAN_TS']

##### operating hour 

In [15]:
for idx,col in enumerate(ts_cols):
    target_col = f"{col}_hour"
    train_df = p_conn.operate_hour(train_df,col)

##### gap between updating operation, like passwaor or phone number,and transaction 

In [16]:
train_df['PWD_UPDT_TS_day'] = train_df.apply(lambda x : p_conn.timedelta_day(x.TRAN_TS,x.PWD_UPDT_TS),axis =1)    
train_df['PH_NUM_UPDT_TS_day'] = train_df.apply(lambda x : p_conn.timedelta_day(x.TRAN_TS,x.PH_NUM_UPDT_TS),axis =1)    

### feature label to idx

In [17]:
txt_value_idx_map =pickle.load(open(f'{model_dir}/txt_value_idx_map.pkl','rb'))

In [18]:
discrete_features =['CARR_NAME_bin_10_feature','PH_NUM_UPDT_TS_hour','RGN_NAME_bin_10_feature',
                   'STATE_PRVNC_TXT_bin_10_feature','PWD_UPDT_TS_hour','PH_NUM_UPDT_TS_day','DVC_TYPE_TXT']

continuous_features =['ACCT_PRE_TRAN_AVAIL_BAL','TRAN_AMT','OPEN_ACCT_CT','WF_dvc_age',]


filterd_features = discrete_features+continuous_features

In [19]:
for col in discrete_features:
    train_df[col] = train_df[col].apply(lambda x : txt_value_idx_map[x])

### model select & training

In [20]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

In [21]:
label ='FRAUD_NONFRAUD'
label_map ={'Fraud':1,'Non-Fraud':0}


xgb_df =copy.deepcopy(train_df[filterd_features+[label]])

train_x,test_x = train_test_split(xgb_df,
                                 test_size=0.3,
                                 shuffle=True)


train_x =pd.concat([train_x,train_x[train_x['FRAUD_NONFRAUD']=='Fraud']])

train_x = train_x.sample(frac=1)

train_y =train_x.pop('FRAUD_NONFRAUD')
train_y =[label_map.get(i) for i in train_y ]


test_y =test_x.pop('FRAUD_NONFRAUD')
test_y =[label_map.get(i) for i in test_y ]

#### random forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
clf = RandomForestClassifier(max_depth=10, random_state =2021)
clf.fit(train_x, train_y)

# train dataset
predict_train_y = clf.predict(train_x)
predict_train_y =predict_train_y.tolist()
print(f"....random forest classification report---training dataset")
print('\n')
print(classification_report(y_true=train_y,y_pred=predict_train_y))

# test dataset
predict_y = clf.predict(test_x)
predict_y =predict_y.tolist()
print(f"....random forest classification report---test dataset")
print('\n')
print(classification_report(y_true=test_y,y_pred=predict_y))

print('++'*40)

print(f"feature importance")

print(dict(zip(filterd_features,clf.feature_importances_.tolist())))

....random forest classification report---training dataset


              precision    recall  f1-score   support

           0       0.98      0.98      0.98      6869
           1       0.98      0.98      0.98      5862

    accuracy                           0.98     12731
   macro avg       0.98      0.98      0.98     12731
weighted avg       0.98      0.98      0.98     12731

....random forest classification report---test dataset


              precision    recall  f1-score   support

           0       0.98      0.96      0.97      2967
           1       0.91      0.95      0.93      1233

    accuracy                           0.96      4200
   macro avg       0.95      0.96      0.95      4200
weighted avg       0.96      0.96      0.96      4200

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
feature importance
{'CARR_NAME_bin_10_feature': 0.16296522285073528, 'PH_NUM_UPDT_TS_hour': 0.013030193444410981, 'RGN_NAME_bin_10_feature': 0.0604

####  save model -random forest

In [24]:
pickle.dump(clf, open(f"{model_dir}/randomforest.pkl", "wb"))

#### xgboost

In [25]:
import xgboost as xgb

In [26]:
thres =0.5

bst =xgb.XGBClassifier(base_score=thres,
                       scale_pos_weight=0.7,
                       min_child_weight=10,
                       subsample =0.7,
                       colsample_bytree=0.7,
                      max_depth =8)

bst.fit(train_x.to_numpy(),train_y)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=8,
              min_child_weight=10, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=0.7, subsample=0.7,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [27]:
# training dataset
# predict_train_y = bst.predict(dtrain)
predict_train_y = bst.predict(train_x)

predict_train_y =predict_train_y.tolist()
predict_train_y =[1 if i >=thres else 0 for i in predict_train_y]
print(f"....xgboost classification report---training dataset")
print('\n')
print(classification_report(y_true=train_y,y_pred=predict_train_y))


predict_y_xgb = bst.predict(test_x)
predict_y_xgb =predict_y_xgb.tolist()
predict_y_xgb =[1 if i >=thres else 0 for i in predict_y_xgb]
print(f"....xgboost classification report---testing dataset")
print('\n')
print(classification_report(y_true=test_y,y_pred=predict_y_xgb))


print('++'*40)

print(f"feature importance")

print(dict(zip(filterd_features,bst.feature_importances_.tolist())))

....xgboost classification report---training dataset


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6869
           1       0.99      0.98      0.99      5862

    accuracy                           0.99     12731
   macro avg       0.99      0.99      0.99     12731
weighted avg       0.99      0.99      0.99     12731

....xgboost classification report---testing dataset


              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2967
           1       0.94      0.95      0.95      1233

    accuracy                           0.97      4200
   macro avg       0.96      0.96      0.96      4200
weighted avg       0.97      0.97      0.97      4200

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
feature importance
{'CARR_NAME_bin_10_feature': 0.32374516129493713, 'PH_NUM_UPDT_TS_hour': 0.02467169798910618, 'RGN_NAME_bin_10_feature': 0.05744619667530

#### save model- xgboost

In [28]:
pickle.dump(bst, open(f"{model_dir}/bst.pkl", "wb"))

##### DeepFM model

In [29]:
import torch
from sklearn.preprocessing import MinMaxScaler

import numpy as np
import random
from deepctrmodels.deepfm import Deepfm

In [30]:
dense_features =['ACCT_PRE_TRAN_AVAIL_BAL', 'OPEN_ACCT_CT', 'TRAN_AMT', 'WF_dvc_age']
sparse_features =list(set(filterd_features)- set(dense_features))

feature_names =filterd_features
target = ['FRAUD_NONFRAUD']                     

In [31]:
seed = 1024
torch.manual_seed(seed)  
torch.cuda.manual_seed(seed) 
torch.cuda.manual_seed_all(seed)  
np.random.seed(seed)
random.seed(seed)

In [32]:
test_xx = copy.deepcopy(test_x)

In [33]:
mms = MinMaxScaler(feature_range=(-1, 1))
for df in [train_x,test_xx]:
    df[dense_features] = mms.fit_transform(df[dense_features])

# test_xx[dense_features] =mms.transform(test_xx[dense_features])

pickle.dump(mms, open(f"{model_dir}/max_min_scaler.pkl", "wb"))

In [34]:
# 2.count #unique features for each sparse field,and record dense feature field name
feat_sizes1={ feat:1 for feat in dense_features}
feat_sizes2 = {feat: max(txt_value_idx_map.values())+1 for feat in sparse_features}

# feat_sizes2 = {feat: len(train_df[feat].unique()) for feat in sparse_features}
feat_sizes={}
feat_sizes.update(feat_sizes1)
feat_sizes.update(feat_sizes2)

In [35]:
train_model_input = {name: train_x[name] for name in feature_names}
test_model_input =  {name: test_xx[name]  for name in feature_names}

In [36]:
# 4.Define Model,train,predict and evaluate
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

model = Deepfm(feat_sizes ,
               sparse_feature_columns = sparse_features,
               sparse_shared_embedding_map ={},
               dense_feature_columns = dense_features,
               model_checkpoint_path =f"{model_dir}/deepfm.pt",
               dnn_hidden_units=[32,16] , dnn_dropout=0.5 , ebedding_size = 10,
               l2_reg_linear=1e-3, device=device)

In [37]:
model.fit(train_model_input, np.array(train_y) , 
          test_model_input , np.array(test_y)
          ,batch_size=256, epochs=150, verbose=1)

cpu
Train on 12731 samples,  50 steps per epoch
epoch 0 train loss is 0.6394 train AUC is 0.8413
test LogLoss is 0.5526 test AUC is 0.9093
f1_score :0.8318708792849314
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      2967
           1       0.71      0.73      0.72      1233

    accuracy                           0.83      4200
   macro avg       0.80      0.80      0.80      4200
weighted avg       0.83      0.83      0.83      4200

epoch 1 train loss is 0.4716 train AUC is 0.8884
test LogLoss is 0.4055 test AUC is 0.9062
f1_score :0.8311132520920345
epoch 2 train loss is 0.3784 train AUC is 0.9240
test LogLoss is 0.3501 test AUC is 0.9241
f1_score :0.8587791160759275
              precision    recall  f1-score   support

           0       0.91      0.88      0.90      2967
           1       0.74      0.79      0.77      1233

    accuracy                           0.86      4200
   macro avg       0.83      0.84      0.83    

epoch 30 train loss is 0.2420 train AUC is 0.9677
test LogLoss is 0.2323 test AUC is 0.9624
f1_score :0.900612219015868
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      2967
           1       0.80      0.88      0.84      1233

    accuracy                           0.90      4200
   macro avg       0.87      0.89      0.88      4200
weighted avg       0.90      0.90      0.90      4200

epoch 31 train loss is 0.2421 train AUC is 0.9675
test LogLoss is 0.2395 test AUC is 0.9632
f1_score :0.8985651389509576
epoch 32 train loss is 0.2409 train AUC is 0.9679
test LogLoss is 0.2410 test AUC is 0.9632
f1_score :0.8986360997893492
epoch 33 train loss is 0.2406 train AUC is 0.9680
test LogLoss is 0.2370 test AUC is 0.9634
f1_score :0.8996355414952261
epoch 34 train loss is 0.2409 train AUC is 0.9680
test LogLoss is 0.2353 test AUC is 0.9632
f1_score :0.8999760879538653
epoch 35 train loss is 0.2401 train AUC is 0.9682
test LogLoss is 0.2

epoch 79 train loss is 0.2298 train AUC is 0.9715
test LogLoss is 0.2197 test AUC is 0.9653
f1_score :0.9094755064702977
              precision    recall  f1-score   support

           0       0.95      0.92      0.93      2967
           1       0.83      0.87      0.85      1233

    accuracy                           0.91      4200
   macro avg       0.89      0.90      0.89      4200
weighted avg       0.91      0.91      0.91      4200

epoch 80 train loss is 0.2305 train AUC is 0.9712
test LogLoss is 0.2245 test AUC is 0.9661
f1_score :0.9067392926542063
epoch 81 train loss is 0.2292 train AUC is 0.9715
test LogLoss is 0.2293 test AUC is 0.9660
f1_score :0.9070376747062462
epoch 82 train loss is 0.2293 train AUC is 0.9714
test LogLoss is 0.2329 test AUC is 0.9662
f1_score :0.9050588699494951
epoch 83 train loss is 0.2294 train AUC is 0.9715
test LogLoss is 0.2241 test AUC is 0.9665
f1_score :0.908271992558092
epoch 84 train loss is 0.2294 train AUC is 0.9715
test LogLoss is 0.2

epoch 142 train loss is 0.2190 train AUC is 0.9746
test LogLoss is 0.2242 test AUC is 0.9673
f1_score :0.9078893478103308
epoch 143 train loss is 0.2191 train AUC is 0.9747
test LogLoss is 0.2238 test AUC is 0.9675
f1_score :0.9074720251563859
epoch 144 train loss is 0.2181 train AUC is 0.9749
test LogLoss is 0.2182 test AUC is 0.9672
f1_score :0.9082667607758226
epoch 145 train loss is 0.2187 train AUC is 0.9747
test LogLoss is 0.2196 test AUC is 0.9671
f1_score :0.9079049169651577
epoch 146 train loss is 0.2175 train AUC is 0.9751
test LogLoss is 0.2178 test AUC is 0.9665
f1_score :0.9097982432230101
epoch 147 train loss is 0.2192 train AUC is 0.9746
test LogLoss is 0.2149 test AUC is 0.9662
f1_score :0.9113411037010294
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      2967
           1       0.84      0.86      0.85      1233

    accuracy                           0.91      4200
   macro avg       0.89      0.90      0.89      4

### model merge

In [38]:
xgb_y_pred =bst.predict(test_x).tolist()
rf_y_pred =clf.predict(test_x).tolist()
df_y_pred =model.predict(test_xx,128)
df_y_pred =df_y_pred.reshape((1,-1)).tolist()[0]

In [39]:
test_x['xgb'] =xgb_y_pred
test_x['rf'] =rf_y_pred
test_x['df'] =[1 if i >0.55 else 0 for i in df_y_pred]
test_x['ground_truth'] = test_y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['xgb'] =xgb_y_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['rf'] =rf_y_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['df'] =[1 if i >0.55 else 0 for i in df_y_pred]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

In [40]:
def label(value_list):
    label_sum = sum(value_list)
    label =0
    if label_sum >=2:
        label =1
    return label

In [41]:
test_x['merged_label'] = test_x.apply(lambda x : label([x.xgb,x.rf,x.df]),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['merged_label'] = test_x.apply(lambda x : label([x.xgb,x.rf,x.df]),axis=1)


In [42]:
print(classification_report(test_x['ground_truth'],test_x['merged_label']))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      2967
           1       0.94      0.95      0.95      1233

    accuracy                           0.97      4200
   macro avg       0.96      0.96      0.96      4200
weighted avg       0.97      0.97      0.97      4200



### model redict result analysis

In [63]:
test_x['correct_flag'] = test_x['ground_truth'] ==test_x['xgb']

test_x['trans_deposit_ratio'] = test_x['TRAN_AMT']/(test_x['ACCT_PRE_TRAN_AVAIL_BAL']+test_x['TRAN_AMT'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['correct_flag'] = test_x['ground_truth'] ==test_x['xgb']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['trans_deposit_ratio'] = test_x['TRAN_AMT']/(test_x['ACCT_PRE_TRAN_AVAIL_BAL']+test_x['TRAN_AMT'])


In [64]:
test_x['trans_deposit_ratio_cls'] =pd.cut(test_x['trans_deposit_ratio'],bins=[i/10 for i in range(0,11)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['trans_deposit_ratio_cls'] =pd.cut(test_x['trans_deposit_ratio'],bins=[i/10 for i in range(0,11)])


In [65]:
pd.pivot_table(data=test_x,index='trans_deposit_ratio_cls',columns='correct_flag',values ='xgb',aggfunc='count')

correct_flag,False,True
trans_deposit_ratio_cls,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0.0, 0.1]",99,1709
"(0.1, 0.2]",19,878
"(0.2, 0.3]",10,162
"(0.3, 0.4]",3,62
"(0.4, 0.5]",4,31
"(0.5, 0.6]",0,29
"(0.6, 0.7]",0,24
"(0.7, 0.8]",0,29
"(0.8, 0.9]",0,15
"(0.9, 1.0]",0,1126


In [66]:
# groud truth ==non-fraud; predict result ==fraud
test_x[(test_x['correct_flag']==False)&
      (test_x['xgb']==1)]['trans_deposit_ratio_cls'].value_counts()

(0.0, 0.1]    46
(0.1, 0.2]    14
(0.2, 0.3]     8
(0.4, 0.5]     2
(0.3, 0.4]     1
(0.5, 0.6]     0
(0.6, 0.7]     0
(0.7, 0.8]     0
(0.8, 0.9]     0
(0.9, 1.0]     0
Name: trans_deposit_ratio_cls, dtype: int64

In [67]:
# groud truth == fraud; predict result ==non-fraud
test_x[(test_x['correct_flag']==False)&
      (test_x['xgb']==0)]['trans_deposit_ratio_cls'].value_counts()

(0.0, 0.1]    53
(0.1, 0.2]     5
(0.2, 0.3]     2
(0.3, 0.4]     2
(0.4, 0.5]     2
(0.5, 0.6]     0
(0.6, 0.7]     0
(0.7, 0.8]     0
(0.8, 0.9]     0
(0.9, 1.0]     0
Name: trans_deposit_ratio_cls, dtype: int64

# conclusion


1. we use three different models(random forest,xgboost,DeepFM), the classification result show that random forest and xgboost have similar performance, both get f1 score 96. however,the f1 score of DeepFM is the lowest among the three, probably due to limited number of dataset.



2. we also merge the predict result of the three, however, the performance does not improve.



3. Comparing the performance of random forest and xgboost, we finnaly choose xgboost on ground of common sense that  higher recall score of fraud  is more importance in reality.

4. the classificaiton report("0" equals to"non-fraud";"1" equals to "fraud") on xgboost shows that:

    a.The model performs better than "fraud" in predicting "non-fraud" transactions, with higher precision and recall. For example, out of 100 transactions predicted by the model to be fraudulent, there is a probability of error for 6 times; and out of 100 transactions predicted by the model to be non-fraud, the probability of error is 2 times.

    b. In terms of feature importance, the derived features from "RGN_NAME", "STATE_PRVNC_TXT" and "DVC_TYPE_TXT",and the original feature "TRAN_AMT" contribute significantly to the model classification results.

    c. On the test data set, when the transaction amount accounts for less than 10% of the account balance, the prediction error probability of the model increases;