In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder

In [2]:
#loading train_data and divide them into different array: alertkey, custinfo, dp_5_arr
train_data = np.load("train_set_5.npy",allow_pickle = True)
print(train_data.shape)
alert_key_arr = train_data[:,0]

custinfo_arr = train_data[:,1]
train_custinfo = np.array([np.array(i) for i in custinfo_arr]) #convert all into array

dp_5_arr = train_data[:,2]
train_dp_5 = np.array([np.array(i) for i in dp_5_arr]) #convert all into array

ccba_5_arr = train_data[:,3]
train_ccba_5 = np.array([np.array(i) for i in ccba_5_arr]) #convert all into array

cdtx_5_arr = train_data[:,4]
train_cdtx_5 = np.array([np.array(i) for i in cdtx_5_arr]) #convert all into array

remit_5_arr = train_data[:,5]
train_remit_5 = np.array([np.array(i) for i in remit_5_arr]) #convert all into array

sarflag_arr = pd.read_csv("train_y_answer.csv").values
#capture the alertkey which is money laundry record
sarflag1_alertkey = np.array([row[0] for row in sarflag_arr if row[1]==1])

laundry_label = np.array([0,1])
normal_label = np.array([1,0])

train_sarflag = np.array([laundry_label if row[1] == 1 else normal_label for row in sarflag_arr])

(23906, 7)


In [3]:
#load testing dataset
test_data = np.load("public_set_5.npy",allow_pickle = True)

test_alert_key = test_data[:,0]

custinfo_arr = test_data[:,1]
test_custinfo = np.array([np.array(i) for i in custinfo_arr]) #convert all into array

dp_5_arr = test_data[:,2]
test_dp_5 = np.array([np.array(i) for i in dp_5_arr]) #convert all into array

ccba_5_arr = test_data[:,3]
test_ccba_5 = np.array([np.array(i) for i in ccba_5_arr]) #convert all into array

cdtx_5_arr = test_data[:,4]
test_cdtx_5 = np.array([np.array(i) for i in cdtx_5_arr]) #convert all into array

remit_5_arr = test_data[:,5]
test_remit_5 = np.array([np.array(i) for i in remit_5_arr]) #convert all into array


sarflag_arr = pd.read_csv("test_y_answer.csv").values
#capture the alertkey which is money laundry record
sarflag1_alertkey = np.array([row[0] for row in sarflag_arr if row[1]==1])

laundry_label = np.array([0,1])
normal_label = np.array([1,0])

test_sarflag = np.array([laundry_label if row[1] == 1 else normal_label for row in sarflag_arr])
print(test_sarflag)

[[1 0]
 [1 0]
 [1 0]
 ...
 [1 0]
 [1 0]
 [1 0]]


In [4]:
#find test_alertkey which is money laundry record
test_laundry_alertkey = [key for key in test_alert_key if key in sarflag1_alertkey]
print("test money laundry alert key:",test_laundry_alertkey)
print("len of test laundry records:",len(test_laundry_alertkey))

test money laundry alert key: [354939, 355091, 355152, 355724, 359668, 356602, 363320, 358453, 363896, 361617, 363033]
len of test laundry records: 11


In [5]:
#calculate class weight:
neg = 0
pos = 0
total = len(train_sarflag)

for label in train_sarflag:
    if label[0] == 1:
        neg += 1
    else:
        pos += 1

weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)/(3/2)

class_weight = {0: weight_for_0, 1: weight_for_1}

print(class_weight)

{0: 0.5049425481581615, 1: 34.054131054131055}


In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import math
import tensorflow as tf
import logging
tf.get_logger().setLevel(logging.ERROR)
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras import Input
#https://keras.io/api/layers/core_layers/input/
from tensorflow.keras.layers import concatenate

In [7]:
METRICS = [
      tf.keras.metrics.Recall(name='recall'),
]


In [8]:
def capture_sarflags_cost(real_laundry_key,y_pred,shuffle_alert_key):
    
    #concat alertkey with predicted probability
    alert_key_with_prob = np.hstack((np.array(shuffle_alert_key).reshape(-1,1),
                                     y_pred.reshape(-1,1)))
    
    #sorted by probability
    alert_key_with_prob = np.array(sorted(alert_key_with_prob, key=lambda x: x[1],reverse=True))
    #capture only index
    sorted_alert_key = alert_key_with_prob[:,0].tolist()
    
    distance = [0,0]
    for index in real_laundry_key:
        if distance[1] < sorted_alert_key.index(index):
            distance[0] = distance[1]
            distance[1] = sorted_alert_key.index(index)
        else:
            distance[0] = max(distance[0],sorted_alert_key.index(index))
    
    precision = (len(real_laundry_key)-1)/distance[0]
    return precision

In [9]:
def model_phase(epoch_num,comb):
    
    #define input_shape
    custinfo_input = Input(shape=(train_custinfo.shape[1]))
    dp_input = Input(shape=train_dp_5.shape[1:])
    ccba_input = Input(shape=train_ccba_5.shape[1:])
    cdtx_input = Input(shape=train_cdtx_5.shape[1:])
    remit_input = Input(shape=train_remit_5.shape[1:])
    
    
    
    dp_embedding = LSTM(train_dp_5.shape[2],activation="relu")(dp_input)
    dp_embedding = Dropout(0.2)(dp_embedding)
    
    ccba_embedding = LSTM(train_ccba_5.shape[2],activation="relu")(ccba_input)
    ccba_embedding = Dropout(0.2)(ccba_embedding)
    
    cdtx_embedding = LSTM(train_cdtx_5.shape[2],activation="relu")(cdtx_input)
    cdtx_embedding = Dropout(0.2)(cdtx_embedding)
    
    remit_embedding = LSTM(train_remit_5.shape[2],activation="relu")(remit_input)
    remit_embedding = Dropout(0.2)(remit_embedding)
    
    embedding_list = [custinfo_input,dp_embedding,ccba_embedding,cdtx_embedding,remit_embedding]
    
    if len(comb) == 1:
        concat_embedding = custinfo_input
    else:
        embed_to_concat = [embedding_list[i] for i in comb]
        concat_embedding = concatenate(embed_to_concat)
    
    out128 = Dense(128, activation="relu")(concat_embedding)
    
    out64 = Dense(64, activation="relu")(out128)
    
    out32 = Dense(32, activation="relu")(out64)
    
    out16 = Dense(16, activation="relu")(out32)
    
    out8 = Dense(8, activation="relu")(out16)
    
    out4 = Dense(4, activation="relu")(out8)
    
    out2 = Dense(2, activation="softmax")(out4)

    model = Model(inputs=[custinfo_input, dp_input,ccba_input,cdtx_input,remit_input], outputs=out2)
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
            optimizer='adam',
            metrics=METRICS)

    #callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=20)
    # 進行訓練
    history = model.fit([train_custinfo,train_dp_5,train_ccba_5,train_cdtx_5,train_remit_5], 
                        train_sarflag, 
                        epochs = epoch_num, 
                        batch_size = 256,
                        class_weight=class_weight,
                        #callbacks=[callback],
                        validation_data=([test_custinfo,test_dp_5,test_ccba_5,test_cdtx_5,test_remit_5],
                                         test_sarflag),
                        verbose=0,
                       )  
    
    
    
    y_pred = model.predict([test_custinfo,test_dp_5,test_ccba_5,test_cdtx_5,test_remit_5])
    
    tf.keras.backend.clear_session()
    return y_pred

In [10]:
#窮舉所有可能的排列組合
from itertools import combinations

def order_and_group(length):
    #because we need to keep a
    _len = length-1
    numerical_list = [i for i in range(1,_len+1,1)]
    
    possible_group = []
    for num in numerical_list:
        possible_group += list(combinations(numerical_list,num))
    
    comb_list = [[0]]
    for group in possible_group:
        temp_list = [0] + [i for i in group]
        comb_list.append(temp_list)
    return comb_list
comb_list = order_and_group(5)

In [11]:
#right now num of epoch
epoch_list = range(1,31,1)

#store the result of each epoch
result_list = []
column_name = ["epoch_num","combination","average_precision","min_value","max_value"]
for epoch in epoch_list:
    for comb in comb_list:
        precision_list = []
        for _ in range(10):#try 10 times
            pred = model_phase(epoch,comb)
            precision = capture_sarflags_cost(test_laundry_alertkey,pred[:,1],test_alert_key)
            precision_list.append(precision)
        average_precision = round(sum(precision_list)/len(precision_list),5)
        max_precision = round(max(precision_list),5)
        min_precision = round(min(precision_list),5)
        result_list.append([epoch,comb,average_precision,min_precision,max_precision])
        result_df = pd.DataFrame(result_list,columns=column_name)
        print(f"epoch: {epoch}  combination: {comb}  ave_precision: {average_precision}",
             f"  min_precision: {min_precision}  min_precision: {max_precision}")
        result_df.to_csv("precision_result.csv",index=False)

epoch: 1  combination: [0]  ave_precision: 0.00721   min_precision: 0.00631  min_precision: 0.00951
epoch: 1  combination: [0, 1]  ave_precision: 0.01432   min_precision: 0.00969  min_precision: 0.02033
epoch: 1  combination: [0, 2]  ave_precision: 0.00721   min_precision: 0.00588  min_precision: 0.00904
epoch: 1  combination: [0, 3]  ave_precision: 0.00834   min_precision: 0.00607  min_precision: 0.01037
epoch: 1  combination: [0, 4]  ave_precision: 0.00753   min_precision: 0.00645  min_precision: 0.01038
epoch: 1  combination: [0, 1, 2]  ave_precision: 0.014   min_precision: 0.00602  min_precision: 0.01832
epoch: 1  combination: [0, 1, 3]  ave_precision: 0.01074   min_precision: 0.00621  min_precision: 0.01408
epoch: 1  combination: [0, 1, 4]  ave_precision: 0.01304   min_precision: 0.00668  min_precision: 0.01631
epoch: 1  combination: [0, 2, 3]  ave_precision: 0.0076   min_precision: 0.00605  min_precision: 0.00882
epoch: 1  combination: [0, 2, 4]  ave_precision: 0.00734   min_prec

epoch: 5  combination: [0, 2, 3, 4]  ave_precision: 0.01061   min_precision: 0.00655  min_precision: 0.01372
epoch: 5  combination: [0, 1, 2, 3, 4]  ave_precision: 0.01381   min_precision: 0.01168  min_precision: 0.01587
epoch: 6  combination: [0]  ave_precision: 0.00883   min_precision: 0.00655  min_precision: 0.01114
epoch: 6  combination: [0, 1]  ave_precision: 0.01495   min_precision: 0.01225  min_precision: 0.02247
epoch: 6  combination: [0, 2]  ave_precision: 0.0085   min_precision: 0.00655  min_precision: 0.01239
epoch: 6  combination: [0, 3]  ave_precision: 0.00898   min_precision: 0.00655  min_precision: 0.01056
epoch: 6  combination: [0, 4]  ave_precision: 0.01012   min_precision: 0.00733  min_precision: 0.01344
epoch: 6  combination: [0, 1, 2]  ave_precision: 0.01447   min_precision: 0.00655  min_precision: 0.02028
epoch: 6  combination: [0, 1, 3]  ave_precision: 0.01208   min_precision: 0.00655  min_precision: 0.01471
epoch: 6  combination: [0, 1, 4]  ave_precision: 0.0148 

epoch: 10  combination: [0, 1, 2, 4]  ave_precision: 0.01658   min_precision: 0.01057  min_precision: 0.02174
epoch: 10  combination: [0, 1, 3, 4]  ave_precision: 0.01459   min_precision: 0.01258  min_precision: 0.01701
epoch: 10  combination: [0, 2, 3, 4]  ave_precision: 0.0111   min_precision: 0.00655  min_precision: 0.01623
epoch: 10  combination: [0, 1, 2, 3, 4]  ave_precision: 0.01374   min_precision: 0.01057  min_precision: 0.01821
epoch: 11  combination: [0]  ave_precision: 0.01104   min_precision: 0.0081  min_precision: 0.0152
epoch: 11  combination: [0, 1]  ave_precision: 0.01554   min_precision: 0.01325  min_precision: 0.01838
epoch: 11  combination: [0, 2]  ave_precision: 0.01086   min_precision: 0.00655  min_precision: 0.01931
epoch: 11  combination: [0, 3]  ave_precision: 0.0103   min_precision: 0.00655  min_precision: 0.01427
epoch: 11  combination: [0, 4]  ave_precision: 0.01043   min_precision: 0.00924  min_precision: 0.0119
epoch: 11  combination: [0, 1, 2]  ave_precis

epoch: 15  combination: [0, 2, 4]  ave_precision: 0.00972   min_precision: 0.00655  min_precision: 0.01376
epoch: 15  combination: [0, 3, 4]  ave_precision: 0.01128   min_precision: 0.00655  min_precision: 0.01502
epoch: 15  combination: [0, 1, 2, 3]  ave_precision: 0.01237   min_precision: 0.00655  min_precision: 0.01618
epoch: 15  combination: [0, 1, 2, 4]  ave_precision: 0.0151   min_precision: 0.01202  min_precision: 0.01779
epoch: 15  combination: [0, 1, 3, 4]  ave_precision: 0.0151   min_precision: 0.01256  min_precision: 0.01767
epoch: 15  combination: [0, 2, 3, 4]  ave_precision: 0.01197   min_precision: 0.00947  min_precision: 0.01515
epoch: 15  combination: [0, 1, 2, 3, 4]  ave_precision: 0.01363   min_precision: 0.00655  min_precision: 0.01792
epoch: 16  combination: [0]  ave_precision: 0.01066   min_precision: 0.00596  min_precision: 0.01471
epoch: 16  combination: [0, 1]  ave_precision: 0.01358   min_precision: 0.00655  min_precision: 0.02347
epoch: 16  combination: [0, 2]

epoch: 20  combination: [0, 1, 3]  ave_precision: 0.01297   min_precision: 0.00655  min_precision: 0.01739
epoch: 20  combination: [0, 1, 4]  ave_precision: 0.01527   min_precision: 0.00655  min_precision: 0.0304
epoch: 20  combination: [0, 2, 3]  ave_precision: 0.00959   min_precision: 0.00655  min_precision: 0.01227
epoch: 20  combination: [0, 2, 4]  ave_precision: 0.00959   min_precision: 0.00817  min_precision: 0.01229
epoch: 20  combination: [0, 3, 4]  ave_precision: 0.00898   min_precision: 0.00655  min_precision: 0.01247
epoch: 20  combination: [0, 1, 2, 3]  ave_precision: 0.01229   min_precision: 0.00906  min_precision: 0.01905
epoch: 20  combination: [0, 1, 2, 4]  ave_precision: 0.01543   min_precision: 0.00655  min_precision: 0.02415
epoch: 20  combination: [0, 1, 3, 4]  ave_precision: 0.01406   min_precision: 0.00655  min_precision: 0.01894
epoch: 20  combination: [0, 2, 3, 4]  ave_precision: 0.00929   min_precision: 0.00655  min_precision: 0.01282
epoch: 20  combination: [0

epoch: 25  combination: [0, 3]  ave_precision: 0.008   min_precision: 0.00655  min_precision: 0.01078
epoch: 25  combination: [0, 4]  ave_precision: 0.0119   min_precision: 0.00655  min_precision: 0.01499
epoch: 25  combination: [0, 1, 2]  ave_precision: 0.0139   min_precision: 0.00655  min_precision: 0.02008
epoch: 25  combination: [0, 1, 3]  ave_precision: 0.01202   min_precision: 0.00842  min_precision: 0.01757
epoch: 25  combination: [0, 1, 4]  ave_precision: 0.01654   min_precision: 0.01171  min_precision: 0.01949
epoch: 25  combination: [0, 2, 3]  ave_precision: 0.0079   min_precision: 0.00655  min_precision: 0.0107
epoch: 25  combination: [0, 2, 4]  ave_precision: 0.00997   min_precision: 0.00651  min_precision: 0.01279
epoch: 25  combination: [0, 3, 4]  ave_precision: 0.00879   min_precision: 0.00724  min_precision: 0.01206
epoch: 25  combination: [0, 1, 2, 3]  ave_precision: 0.0123   min_precision: 0.00639  min_precision: 0.01815
epoch: 25  combination: [0, 1, 2, 4]  ave_preci

epoch: 30  combination: [0]  ave_precision: 0.01044   min_precision: 0.00655  min_precision: 0.01318
epoch: 30  combination: [0, 1]  ave_precision: 0.01315   min_precision: 0.00655  min_precision: 0.02092
epoch: 30  combination: [0, 2]  ave_precision: 0.00792   min_precision: 0.00651  min_precision: 0.01033
epoch: 30  combination: [0, 3]  ave_precision: 0.00933   min_precision: 0.00671  min_precision: 0.01235
epoch: 30  combination: [0, 4]  ave_precision: 0.01018   min_precision: 0.00655  min_precision: 0.0141
epoch: 30  combination: [0, 1, 2]  ave_precision: 0.01286   min_precision: 0.00655  min_precision: 0.01949
epoch: 30  combination: [0, 1, 3]  ave_precision: 0.01077   min_precision: 0.00655  min_precision: 0.01368
epoch: 30  combination: [0, 1, 4]  ave_precision: 0.01829   min_precision: 0.01264  min_precision: 0.02695
epoch: 30  combination: [0, 2, 3]  ave_precision: 0.00851   min_precision: 0.00655  min_precision: 0.01192
epoch: 30  combination: [0, 2, 4]  ave_precision: 0.0105

In [12]:
from sklearn.metrics import classification_report

y_pred = [0 if label[0]> label[1] else 1 for label in pred]
y_true = [0 if label[0]> label[1] else 1 for label in test_sarflag]

print(classification_report(y_true,y_pred))
print(len(pred))

y_pred_index = [i for i in range(len(y_pred)) if y_pred[i]==1]
y_true_index = [i for i in range(len(y_true)) if y_true[i]==1]
print("predict index:",y_pred_index)
print("true index:",y_true_index)

              precision    recall  f1-score   support

           0       1.00      0.79      0.88      1834
           1       0.02      0.64      0.04        11

    accuracy                           0.79      1845
   macro avg       0.51      0.72      0.46      1845
weighted avg       0.99      0.79      0.88      1845

1845
predict index: [4, 6, 11, 12, 13, 18, 19, 24, 27, 29, 47, 57, 61, 62, 64, 71, 74, 77, 80, 81, 94, 95, 96, 97, 98, 114, 121, 146, 157, 162, 171, 176, 177, 183, 189, 199, 207, 212, 213, 220, 230, 231, 237, 247, 254, 256, 257, 262, 267, 269, 271, 273, 275, 281, 284, 285, 300, 303, 304, 310, 311, 312, 319, 326, 328, 333, 335, 344, 345, 346, 347, 364, 365, 368, 377, 379, 388, 396, 397, 400, 412, 414, 422, 424, 425, 426, 430, 431, 432, 438, 443, 447, 456, 460, 461, 462, 463, 466, 467, 468, 469, 471, 474, 477, 484, 486, 499, 513, 518, 522, 532, 533, 534, 536, 540, 541, 548, 550, 552, 555, 562, 572, 583, 588, 590, 592, 593, 595, 604, 609, 619, 624, 640, 642, 656, 661,