In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder

In [2]:
#loading train_data and divide them into different array: alertkey, custinfo, dp_5_arr
train_data = np.load("train_set_6.npy",allow_pickle = True)
print(train_data.shape)
alert_key_arr = train_data[:,0]

custinfo_arr = train_data[:,1]
train_custinfo = np.array([np.array(i) for i in custinfo_arr]) #convert all into array

dp_5_arr = train_data[:,2]
train_dp_5 = np.array([np.array(i) for i in dp_5_arr]) #convert all into array

ccba_5_arr = train_data[:,3]
train_ccba_5 = np.array([np.array(i) for i in ccba_5_arr]) #convert all into array

cdtx_5_arr = train_data[:,4]
train_cdtx_5 = np.array([np.array(i) for i in cdtx_5_arr]) #convert all into array

remit_5_arr = train_data[:,5]
train_remit_5 = np.array([np.array(i) for i in remit_5_arr]) #convert all into array

sarflag_arr = pd.read_csv("train_y_answer.csv").values
#capture the alertkey which is money laundry record
sarflag1_alertkey = np.array([row[0] for row in sarflag_arr if row[1]==1])

laundry_label = np.array([0,1])
normal_label = np.array([1,0])

train_sarflag = np.array([laundry_label if row[1] == 1 else normal_label for row in sarflag_arr])

(23906, 7)


In [3]:
#load testing dataset
test_data = np.load("public_set_6.npy",allow_pickle = True)

test_alert_key = test_data[:,0]

custinfo_arr = test_data[:,1]
test_custinfo = np.array([np.array(i) for i in custinfo_arr]) #convert all into array

dp_5_arr = test_data[:,2]
test_dp_5 = np.array([np.array(i) for i in dp_5_arr]) #convert all into array

ccba_5_arr = test_data[:,3]
test_ccba_5 = np.array([np.array(i) for i in ccba_5_arr]) #convert all into array

cdtx_5_arr = test_data[:,4]
test_cdtx_5 = np.array([np.array(i) for i in cdtx_5_arr]) #convert all into array

remit_5_arr = test_data[:,5]
test_remit_5 = np.array([np.array(i) for i in remit_5_arr]) #convert all into array


sarflag_arr = pd.read_csv("test_y_answer.csv").values
#capture the alertkey which is money laundry record
sarflag1_alertkey = np.array([row[0] for row in sarflag_arr if row[1]==1])

laundry_label = np.array([0,1])
normal_label = np.array([1,0])

test_sarflag = np.array([laundry_label if row[1] == 1 else normal_label for row in sarflag_arr])
print(test_sarflag)

[[1 0]
 [1 0]
 [1 0]
 ...
 [1 0]
 [1 0]
 [1 0]]


In [4]:
#find test_alertkey which is money laundry record
test_laundry_alertkey = [key for key in test_alert_key if key in sarflag1_alertkey]
print("test money laundry alert key:",test_laundry_alertkey)
print("len of test laundry records:",len(test_laundry_alertkey))

test money laundry alert key: [354939, 355091, 355152, 355724, 359668, 356602, 363320, 358453, 363896, 361617, 363033]
len of test laundry records: 11


In [5]:
#calculate class weight:
neg = 0
pos = 0
total = len(train_sarflag)

for label in train_sarflag:
    if label[0] == 1:
        neg += 1
    else:
        pos += 1

weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)/(3/2)

class_weight = {0: weight_for_0, 1: weight_for_1}

print(class_weight)

{0: 0.5049425481581615, 1: 34.054131054131055}


In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import math
import tensorflow as tf
import logging
tf.get_logger().setLevel(logging.ERROR)
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras import Input
#https://keras.io/api/layers/core_layers/input/
from tensorflow.keras.layers import concatenate

In [7]:
METRICS = [
      tf.keras.metrics.Recall(name='recall'),
]


In [8]:
def capture_sarflags_cost(real_laundry_key,y_pred,shuffle_alert_key):
    
    #concat alertkey with predicted probability
    alert_key_with_prob = np.hstack((np.array(shuffle_alert_key).reshape(-1,1),
                                     y_pred.reshape(-1,1)))
    
    #sorted by probability
    alert_key_with_prob = np.array(sorted(alert_key_with_prob, key=lambda x: x[1],reverse=True))
    #capture only index
    sorted_alert_key = alert_key_with_prob[:,0].tolist()
    
    distance = [0,0]
    for index in real_laundry_key:
        if distance[1] < sorted_alert_key.index(index):
            distance[0] = distance[1]
            distance[1] = sorted_alert_key.index(index)
        else:
            distance[0] = max(distance[0],sorted_alert_key.index(index))
    
    precision = (len(real_laundry_key)-1)/distance[0]
    return precision

In [9]:
def model_phase(epoch_num,comb,count):
    
    #define input_shape
    custinfo_input = Input(shape=(train_custinfo.shape[1]))
    dp_input = Input(shape=train_dp_5.shape[1:])
    ccba_input = Input(shape=train_ccba_5.shape[1:])
    cdtx_input = Input(shape=train_cdtx_5.shape[1:])
    remit_input = Input(shape=train_remit_5.shape[1:])
    
    
    
    dp_embedding = LSTM(train_dp_5.shape[2],activation="relu")(dp_input)
    dp_embedding = Dropout(0.2)(dp_embedding)
    
    ccba_embedding = LSTM(train_ccba_5.shape[2],activation="relu")(ccba_input)
    ccba_embedding = Dropout(0.2)(ccba_embedding)
    
    cdtx_embedding = LSTM(train_cdtx_5.shape[2],activation="relu")(cdtx_input)
    cdtx_embedding = Dropout(0.2)(cdtx_embedding)
    
    remit_embedding = LSTM(train_remit_5.shape[2],activation="relu")(remit_input)
    remit_embedding = Dropout(0.2)(remit_embedding)
    
    embedding_list = [custinfo_input,dp_embedding,ccba_embedding,cdtx_embedding,remit_embedding]
    
    if len(comb) == 1:
        concat_embedding = custinfo_input
    else:
        embed_to_concat = [embedding_list[i] for i in comb]
        concat_embedding = concatenate(embed_to_concat)
    
    out128 = Dense(128, activation="relu")(concat_embedding)
    
    out64 = Dense(64, activation="relu")(out128)
    
    out16 = Dense(16, activation="relu")(out64)
    
    out2 = Dense(2, activation="softmax")(out16)

    model = Model(inputs=[custinfo_input, dp_input,ccba_input,cdtx_input,remit_input], outputs=out2)
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
            optimizer='adam',
            metrics=METRICS)

    #callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=20)
    # 進行訓練
    history = model.fit([train_custinfo,train_dp_5,train_ccba_5,train_cdtx_5,train_remit_5], 
                        train_sarflag, 
                        epochs = epoch_num, 
                        batch_size = 256,
                        class_weight=class_weight,
                        #callbacks=[callback],
                        validation_data=([test_custinfo,test_dp_5,test_ccba_5,test_cdtx_5,test_remit_5],
                                         test_sarflag),
                        verbose=0,
                       )  
    
    
    
    y_pred = model.predict([test_custinfo,test_dp_5,test_ccba_5,test_cdtx_5,test_remit_5])
    model.save(f'saved_model/my_model{count}')
    tf.keras.backend.clear_session()
    return y_pred

In [10]:
#窮舉所有可能的排列組合
from itertools import combinations

def order_and_group(length):
    #because we need to keep a
    _len = length-1
    numerical_list = [i for i in range(1,_len+1,1)]
    
    possible_group = []
    for num in numerical_list:
        possible_group += list(combinations(numerical_list,num))
    
    comb_list = [[0]]
    for group in possible_group:
        temp_list = [0] + [i for i in group]
        comb_list.append(temp_list)
    return comb_list
comb_list = order_and_group(5)
print(comb_list)

[[0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 1, 2], [0, 1, 3], [0, 1, 4], [0, 2, 3], [0, 2, 4], [0, 3, 4], [0, 1, 2, 3], [0, 1, 2, 4], [0, 1, 3, 4], [0, 2, 3, 4], [0, 1, 2, 3, 4]]


In [11]:
#right now num of epoch
comb_list = [0,1,4]
epoch_ = 16
pred_list= []
for _ in range(30):
    #store the result of each epoch
    result_list = []
    column_name = ["epoch_num","combination","average_precision","min_value","max_value"]
    precision_list = []
    pred = model_phase(epoch_,comb_list,_)
    precision = capture_sarflags_cost(test_laundry_alertkey,pred[:,1],test_alert_key)
    precision_list.append(precision)
    average_precision = round(sum(precision_list)/len(precision_list),5)
    max_precision = round(max(precision_list),5)
    min_precision = round(min(precision_list),5)
    pred_list.append(pred)
    print(f"epoch: {epoch_}  combination: {comb_list}  ave_precision: {average_precision}",
         f"  min_precision: {min_precision}  min_precision: {max_precision}")

epoch: 16  combination: [0, 1, 4]  ave_precision: 0.01425   min_precision: 0.01425  min_precision: 0.01425
epoch: 16  combination: [0, 1, 4]  ave_precision: 0.02268   min_precision: 0.02268  min_precision: 0.02268
epoch: 16  combination: [0, 1, 4]  ave_precision: 0.0173   min_precision: 0.0173  min_precision: 0.0173
epoch: 16  combination: [0, 1, 4]  ave_precision: 0.01828   min_precision: 0.01828  min_precision: 0.01828
epoch: 16  combination: [0, 1, 4]  ave_precision: 0.03021   min_precision: 0.03021  min_precision: 0.03021
epoch: 16  combination: [0, 1, 4]  ave_precision: 0.02008   min_precision: 0.02008  min_precision: 0.02008
epoch: 16  combination: [0, 1, 4]  ave_precision: 0.02398   min_precision: 0.02398  min_precision: 0.02398
epoch: 16  combination: [0, 1, 4]  ave_precision: 0.01605   min_precision: 0.01605  min_precision: 0.01605
epoch: 16  combination: [0, 1, 4]  ave_precision: 0.02208   min_precision: 0.02208  min_precision: 0.02208
epoch: 16  combination: [0, 1, 4]  ave_p

In [12]:
print(len(pred_list))
pred = pred_list[8]

30


In [13]:
ans_csv = pd.DataFrame([])
ans_csv["alert_key"] = test_data[:,0]
display(ans_csv)
ans_csv["probability"] = pred[:,1]
display(ans_csv)
ans_csv.to_csv("ans_csv1204.csv",index=False)

handout_data = pd.read_csv("D:\Python_projects\DataMining\yushan\預測的案件名單及提交檔案範例.csv")
handout_data = handout_data.merge(ans_csv,on="alert_key",how="left")
display(handout_data)
handout_data =  handout_data.drop(columns="probability_x")
handout_data = handout_data.rename(columns={"probability_y":"probability"})
handout_data = handout_data.fillna(0)
display(handout_data)
#handout_data.to_csv("handout_dataVer7.csv",index=False)


Unnamed: 0,alert_key
0,352342
1,352866
2,352696
3,352330
4,352683
...,...
1840,364472
1841,364788
1842,364673
1843,364626


Unnamed: 0,alert_key,probability
0,352342,1.632545e-21
1,352866,2.951764e-19
2,352696,1.641340e-07
3,352330,0.000000e+00
4,352683,8.480431e-01
...,...,...
1840,364472,8.859868e-01
1841,364788,9.187136e-07
1842,364673,8.388287e-01
1843,364626,3.711990e-01


Unnamed: 0,alert_key,probability_x,probability_y
0,357307,0.000017,2.378167e-28
1,376329,0.000324,
2,373644,0.000372,
3,357668,0.000489,6.269974e-10
4,354443,0.000526,1.430079e-25
...,...,...,...
3845,364485,0.997702,5.554610e-05
3846,363155,0.998987,1.077026e-01
3847,368710,0.999694,
3848,358067,0.999821,1.846920e-02


Unnamed: 0,alert_key,probability
0,357307,2.378167e-28
1,376329,0.000000e+00
2,373644,0.000000e+00
3,357668,6.269974e-10
4,354443,1.430079e-25
...,...,...
3845,364485,5.554610e-05
3846,363155,1.077026e-01
3847,368710,0.000000e+00
3848,358067,1.846920e-02


In [14]:
from sklearn.metrics import classification_report

y_pred = [0 if label[0]> label[1] else 1 for label in pred]
y_true = [0 if label[0]> label[1] else 1 for label in test_sarflag]

print(classification_report(y_true,y_pred))
print(len(pred))

y_pred_index = [i for i in range(len(y_pred)) if y_pred[i]==1]
y_true_index = [i for i in range(len(y_true)) if y_true[i]==1]
print("predict index:",y_pred_index)
print("true index:",y_true_index)

              precision    recall  f1-score   support

           0       1.00      0.72      0.83      1834
           1       0.02      0.91      0.04        11

    accuracy                           0.72      1845
   macro avg       0.51      0.81      0.44      1845
weighted avg       0.99      0.72      0.83      1845

1845
predict index: [4, 6, 8, 11, 12, 13, 17, 18, 19, 24, 27, 29, 32, 43, 46, 47, 54, 55, 56, 62, 63, 64, 65, 68, 70, 71, 77, 80, 81, 82, 94, 95, 96, 97, 98, 113, 133, 135, 137, 138, 140, 146, 152, 155, 157, 162, 167, 171, 177, 178, 183, 189, 192, 200, 207, 212, 213, 214, 218, 224, 230, 231, 237, 245, 247, 257, 262, 264, 269, 271, 273, 274, 275, 281, 284, 285, 294, 303, 304, 310, 311, 312, 313, 315, 319, 326, 329, 334, 335, 340, 342, 344, 345, 347, 354, 365, 368, 374, 375, 377, 379, 381, 388, 396, 397, 400, 409, 412, 414, 419, 421, 422, 424, 425, 432, 438, 443, 460, 461, 462, 463, 466, 467, 468, 469, 471, 474, 477, 484, 486, 489, 497, 498, 499, 500, 503, 506, 513, 