In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder

In [2]:
#loading train_data and divide them into different array: alertkey, custinfo, dp_5_arr
train_data = np.load("train_set_5.npy",allow_pickle = True)
print(train_data.shape)
alert_key_arr = train_data[:,0]

custinfo_arr = train_data[:,1]
train_custinfo = np.array([np.array(i) for i in custinfo_arr]) #convert all into array

dp_5_arr = train_data[:,2]
train_dp_5 = np.array([np.array(i) for i in dp_5_arr]) #convert all into array

ccba_5_arr = train_data[:,3]
train_ccba_5 = np.array([np.array(i) for i in ccba_5_arr]) #convert all into array

cdtx_5_arr = train_data[:,4]
train_cdtx_5 = np.array([np.array(i) for i in cdtx_5_arr]) #convert all into array

remit_5_arr = train_data[:,5]
train_remit_5 = np.array([np.array(i) for i in remit_5_arr]) #convert all into array

sarflag_arr = pd.read_csv("train_y_answer.csv").values
#capture the alertkey which is money laundry record
sarflag1_alertkey = np.array([row[0] for row in sarflag_arr if row[1]==1])

laundry_label = np.array([0,1])
normal_label = np.array([1,0])

train_sarflag = np.array([laundry_label if row[1] == 1 else normal_label for row in sarflag_arr])

(23906, 7)


In [3]:
#load testing dataset
test_data = np.load("public_set_5.npy",allow_pickle = True)

test_alert_key = test_data[:,0]

custinfo_arr = test_data[:,1]
test_custinfo = np.array([np.array(i) for i in custinfo_arr]) #convert all into array

dp_5_arr = test_data[:,2]
test_dp_5 = np.array([np.array(i) for i in dp_5_arr]) #convert all into array

ccba_5_arr = test_data[:,3]
test_ccba_5 = np.array([np.array(i) for i in ccba_5_arr]) #convert all into array

cdtx_5_arr = test_data[:,4]
test_cdtx_5 = np.array([np.array(i) for i in cdtx_5_arr]) #convert all into array

remit_5_arr = test_data[:,5]
test_remit_5 = np.array([np.array(i) for i in remit_5_arr]) #convert all into array


sarflag_arr = pd.read_csv("test_y_answer.csv").values
#capture the alertkey which is money laundry record
sarflag1_alertkey = np.array([row[0] for row in sarflag_arr if row[1]==1])

laundry_label = np.array([0,1])
normal_label = np.array([1,0])

test_sarflag = np.array([laundry_label if row[1] == 1 else normal_label for row in sarflag_arr])
print(test_sarflag)

[[1 0]
 [1 0]
 [1 0]
 ...
 [1 0]
 [1 0]
 [1 0]]


In [4]:
#find test_alertkey which is money laundry record
test_laundry_alertkey = [key for key in test_alert_key if key in sarflag1_alertkey]
print("test money laundry alert key:",test_laundry_alertkey)
print("len of test laundry records:",len(test_laundry_alertkey))

test money laundry alert key: [354939, 355091, 355152, 355724, 359668, 356602, 363320, 358453, 363896, 361617, 363033]
len of test laundry records: 11


In [5]:
#calculate class weight:
neg = 0
pos = 0
total = len(train_sarflag)

for label in train_sarflag:
    if label[0] == 1:
        neg += 1
    else:
        pos += 1

weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)/(3/2)

class_weight = {0: weight_for_0, 1: weight_for_1}

print(class_weight)

{0: 0.5049425481581615, 1: 34.054131054131055}


In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import math
import tensorflow as tf
import logging
tf.get_logger().setLevel(logging.ERROR)
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras import Input
#https://keras.io/api/layers/core_layers/input/
from tensorflow.keras.layers import concatenate

In [7]:
METRICS = [
      tf.keras.metrics.Recall(name='recall'),
]


In [8]:
def capture_sarflags_cost(real_laundry_key,y_pred,shuffle_alert_key):
    
    #concat alertkey with predicted probability
    alert_key_with_prob = np.hstack((np.array(shuffle_alert_key).reshape(-1,1),
                                     y_pred.reshape(-1,1)))
    
    #sorted by probability
    alert_key_with_prob = np.array(sorted(alert_key_with_prob, key=lambda x: x[1],reverse=True))
    #capture only index
    sorted_alert_key = alert_key_with_prob[:,0].tolist()
    
    distance = [0,0]
    for index in real_laundry_key:
        if distance[1] < sorted_alert_key.index(index):
            distance[0] = distance[1]
            distance[1] = sorted_alert_key.index(index)
        else:
            distance[0] = max(distance[0],sorted_alert_key.index(index))
    
    precision = (len(real_laundry_key)-1)/distance[0]
    return precision

In [9]:
def model_phase(epoch_num,comb):
    
    #define input_shape
    custinfo_input = Input(shape=(train_custinfo.shape[1]))
    dp_input = Input(shape=train_dp_5.shape[1:])
    ccba_input = Input(shape=train_ccba_5.shape[1:])
    cdtx_input = Input(shape=train_cdtx_5.shape[1:])
    remit_input = Input(shape=train_remit_5.shape[1:])
    
    
    
    dp_embedding = LSTM(train_dp_5.shape[2],activation="relu")(dp_input)
    dp_embedding = Dropout(0.2)(dp_embedding)
    
    ccba_embedding = LSTM(train_ccba_5.shape[2],activation="relu")(ccba_input)
    ccba_embedding = Dropout(0.2)(ccba_embedding)
    
    cdtx_embedding = LSTM(train_cdtx_5.shape[2],activation="relu")(cdtx_input)
    cdtx_embedding = Dropout(0.2)(cdtx_embedding)
    
    remit_embedding = LSTM(train_remit_5.shape[2],activation="relu")(remit_input)
    remit_embedding = Dropout(0.2)(remit_embedding)
    
    embedding_list = [custinfo_input,dp_embedding,ccba_embedding,cdtx_embedding,remit_embedding]
    
    if len(comb) == 1:
        concat_embedding = custinfo_input
    else:
        embed_to_concat = [embedding_list[i] for i in comb]
        concat_embedding = concatenate(embed_to_concat)
    
    out128 = Dense(128, activation="relu")(concat_embedding)
    
    out64 = Dense(64, activation="relu")(out128)
    
    out16 = Dense(16, activation="relu")(out64)
    
    out2 = Dense(2, activation="softmax")(out16)

    model = Model(inputs=[custinfo_input, dp_input,ccba_input,cdtx_input,remit_input], outputs=out2)
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
            optimizer='adam',
            metrics=METRICS)

    #callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=20)
    # 進行訓練
    history = model.fit([train_custinfo,train_dp_5,train_ccba_5,train_cdtx_5,train_remit_5], 
                        train_sarflag, 
                        epochs = epoch_num, 
                        batch_size = 256,
                        class_weight=class_weight,
                        #callbacks=[callback],
                        validation_data=([test_custinfo,test_dp_5,test_ccba_5,test_cdtx_5,test_remit_5],
                                         test_sarflag),
                        verbose=0,
                       )  
    
    
    
    y_pred = model.predict([test_custinfo,test_dp_5,test_ccba_5,test_cdtx_5,test_remit_5])
    
    tf.keras.backend.clear_session()
    return y_pred

In [10]:
#窮舉所有可能的排列組合
from itertools import combinations

def order_and_group(length):
    #because we need to keep a
    _len = length-1
    numerical_list = [i for i in range(1,_len+1,1)]
    
    possible_group = []
    for num in numerical_list:
        possible_group += list(combinations(numerical_list,num))
    
    comb_list = [[0]]
    for group in possible_group:
        temp_list = [0] + [i for i in group]
        comb_list.append(temp_list)
    return comb_list
comb_list = order_and_group(5)

In [11]:
#right now num of epoch
epoch_list = range(1,31,1)

#store the result of each epoch
result_list = []
column_name = ["epoch_num","combination","average_precision","min_value","max_value"]
for epoch in epoch_list:
    for comb in comb_list:
        precision_list = []
        for _ in range(10):#try 10 times
            pred = model_phase(epoch,comb)
            precision = capture_sarflags_cost(test_laundry_alertkey,pred[:,1],test_alert_key)
            precision_list.append(precision)
        average_precision = round(sum(precision_list)/len(precision_list),5)
        max_precision = round(max(precision_list),5)
        min_precision = round(min(precision_list),5)
        result_list.append([epoch,comb,average_precision,min_precision,max_precision])
        result_df = pd.DataFrame(result_list,columns=column_name)
        print(f"epoch: {epoch}  combination: {comb}  ave_precision: {average_precision}",
             f"  min_precision: {min_precision}  min_precision: {max_precision}")
        result_df.to_csv("precision_result_ver2.csv",index=False)

epoch: 1  combination: [0]  ave_precision: 0.00773   min_precision: 0.00595  min_precision: 0.01055
epoch: 1  combination: [0, 1]  ave_precision: 0.01659   min_precision: 0.01272  min_precision: 0.02247
epoch: 1  combination: [0, 2]  ave_precision: 0.00846   min_precision: 0.00609  min_precision: 0.01318
epoch: 1  combination: [0, 3]  ave_precision: 0.00831   min_precision: 0.00615  min_precision: 0.00958
epoch: 1  combination: [0, 4]  ave_precision: 0.00828   min_precision: 0.00632  min_precision: 0.01025
epoch: 1  combination: [0, 1, 2]  ave_precision: 0.01648   min_precision: 0.01115  min_precision: 0.02336
epoch: 1  combination: [0, 1, 3]  ave_precision: 0.01127   min_precision: 0.00907  min_precision: 0.01248
epoch: 1  combination: [0, 1, 4]  ave_precision: 0.01574   min_precision: 0.01389  min_precision: 0.01848
epoch: 1  combination: [0, 2, 3]  ave_precision: 0.00812   min_precision: 0.00602  min_precision: 0.00969
epoch: 1  combination: [0, 2, 4]  ave_precision: 0.00798   min_p

epoch: 5  combination: [0, 2, 3, 4]  ave_precision: 0.01096   min_precision: 0.00865  min_precision: 0.01385
epoch: 5  combination: [0, 1, 2, 3, 4]  ave_precision: 0.01532   min_precision: 0.01351  min_precision: 0.01739
epoch: 6  combination: [0]  ave_precision: 0.00954   min_precision: 0.00787  min_precision: 0.01214
epoch: 6  combination: [0, 1]  ave_precision: 0.01647   min_precision: 0.0134  min_precision: 0.01916
epoch: 6  combination: [0, 2]  ave_precision: 0.01093   min_precision: 0.00838  min_precision: 0.01555
epoch: 6  combination: [0, 3]  ave_precision: 0.01051   min_precision: 0.00849  min_precision: 0.01205
epoch: 6  combination: [0, 4]  ave_precision: 0.00944   min_precision: 0.00824  min_precision: 0.01044
epoch: 6  combination: [0, 1, 2]  ave_precision: 0.01537   min_precision: 0.01235  min_precision: 0.02053
epoch: 6  combination: [0, 1, 3]  ave_precision: 0.01378   min_precision: 0.01196  min_precision: 0.01582
epoch: 6  combination: [0, 1, 4]  ave_precision: 0.01718

epoch: 10  combination: [0, 1, 2, 4]  ave_precision: 0.01813   min_precision: 0.01318  min_precision: 0.03205
epoch: 10  combination: [0, 1, 3, 4]  ave_precision: 0.01588   min_precision: 0.01364  min_precision: 0.02222
epoch: 10  combination: [0, 2, 3, 4]  ave_precision: 0.01173   min_precision: 0.0089  min_precision: 0.01397
epoch: 10  combination: [0, 1, 2, 3, 4]  ave_precision: 0.01426   min_precision: 0.00955  min_precision: 0.01748
epoch: 11  combination: [0]  ave_precision: 0.01127   min_precision: 0.0088  min_precision: 0.01425
epoch: 11  combination: [0, 1]  ave_precision: 0.01731   min_precision: 0.01376  min_precision: 0.02257
epoch: 11  combination: [0, 2]  ave_precision: 0.01102   min_precision: 0.00829  min_precision: 0.01577
epoch: 11  combination: [0, 3]  ave_precision: 0.01056   min_precision: 0.0086  min_precision: 0.01266
epoch: 11  combination: [0, 4]  ave_precision: 0.01079   min_precision: 0.0089  min_precision: 0.01282
epoch: 11  combination: [0, 1, 2]  ave_preci

epoch: 15  combination: [0, 2, 4]  ave_precision: 0.01299   min_precision: 0.00905  min_precision: 0.0198
epoch: 15  combination: [0, 3, 4]  ave_precision: 0.01193   min_precision: 0.0077  min_precision: 0.01767
epoch: 15  combination: [0, 1, 2, 3]  ave_precision: 0.01628   min_precision: 0.01225  min_precision: 0.02404
epoch: 15  combination: [0, 1, 2, 4]  ave_precision: 0.01538   min_precision: 0.01159  min_precision: 0.02105
epoch: 15  combination: [0, 1, 3, 4]  ave_precision: 0.01636   min_precision: 0.01159  min_precision: 0.02128
epoch: 15  combination: [0, 2, 3, 4]  ave_precision: 0.01156   min_precision: 0.00984  min_precision: 0.01499
epoch: 15  combination: [0, 1, 2, 3, 4]  ave_precision: 0.01717   min_precision: 0.00839  min_precision: 0.02688
epoch: 16  combination: [0]  ave_precision: 0.01104   min_precision: 0.00982  min_precision: 0.01508
epoch: 16  combination: [0, 1]  ave_precision: 0.02035   min_precision: 0.01661  min_precision: 0.025
epoch: 16  combination: [0, 2]  

epoch: 20  combination: [0, 1, 3]  ave_precision: 0.01407   min_precision: 0.01056  min_precision: 0.01715
epoch: 20  combination: [0, 1, 4]  ave_precision: 0.02128   min_precision: 0.01355  min_precision: 0.03817
epoch: 20  combination: [0, 2, 3]  ave_precision: 0.01047   min_precision: 0.00707  min_precision: 0.01316
epoch: 20  combination: [0, 2, 4]  ave_precision: 0.01038   min_precision: 0.00674  min_precision: 0.01279
epoch: 20  combination: [0, 3, 4]  ave_precision: 0.00929   min_precision: 0.00784  min_precision: 0.01307
epoch: 20  combination: [0, 1, 2, 3]  ave_precision: 0.01276   min_precision: 0.00801  min_precision: 0.01608
epoch: 20  combination: [0, 1, 2, 4]  ave_precision: 0.01894   min_precision: 0.01224  min_precision: 0.02457
epoch: 20  combination: [0, 1, 3, 4]  ave_precision: 0.01527   min_precision: 0.01311  min_precision: 0.01792
epoch: 20  combination: [0, 2, 3, 4]  ave_precision: 0.00993   min_precision: 0.00688  min_precision: 0.01684
epoch: 20  combination: [

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report

y_pred = [0 if label[0]> label[1] else 1 for label in pred]
y_true = [0 if label[0]> label[1] else 1 for label in test_sarflag]

print(classification_report(y_true,y_pred))
print(len(pred))

y_pred_index = [i for i in range(len(y_pred)) if y_pred[i]==1]
y_true_index = [i for i in range(len(y_true)) if y_true[i]==1]
print("predict index:",y_pred_index)
print("true index:",y_true_index)