In [1]:
import pandas as pd
import numpy as np
from torch import nn
import torch.optim as optim
import json, copy, pickle, torch
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

## custinfo & ccba

In [2]:
info_raw = pd.read_csv("data/inverse/info.csv")
ccba_raw = pd.read_csv("data/inverse/ccba.csv")
cdtx_raw = pd.read_csv("data/inverse/cdtx.csv")
dp_raw = pd.read_csv("data/inverse/dp.csv")
remit_raw = pd.read_csv("data/inverse/remit.csv")

In [3]:
info_raw["month"] = info_raw["date"].apply(lambda X: X[:7])
ccba_raw["month"] = ccba_raw["byymm"].apply(lambda X: X[:7])
info_raw = pd.merge(info_raw, ccba_raw, on = ["cust_id", "month"], how = "left")
info_raw = info_raw.drop(["month", "byymm"], axis = 1)
info_raw.head()

Unnamed: 0,alert_key,date,sar_flag,cust_id,risk_rank,occupation_code,total_asset,AGE,lupay,cycam,usgam,clamt,csamt,inamt,cucsm,cucah
0,171142,2021-04-01,0.0,a39fea9aec90969fe66a2b2b4d1b86368a2d38e8b8d4bf...,3,12.0,241719.0,3,12565.0,150744.0,82748.0,0.0,0.0,12477.0,12477.0,0.0
1,171152,2021-04-01,0.0,7e42b5dca9b28ee8e5545beb834361e90e6197d176b389...,3,13.0,599497.0,6,3581.0,324783.0,64363.0,0.0,0.0,0.0,4981.0,0.0
2,171177,2021-04-01,0.0,a6cdf6302aead77112013168c6d546d2df3bcb551956d2...,1,19.0,51160.0,4,,,,,,,,
3,171178,2021-04-01,0.0,1a3efa69705f611c7ef2384a715c8142e2ee801cfec9df...,3,9.0,3634343.0,6,829364.0,7666339.0,2343836.0,0.0,0.0,781279.0,781279.0,0.0
4,171180,2021-04-01,0.0,67f8cbb64dd3d447e992b1b299e0ceed3372188e47c88e...,1,17.0,4076287.0,4,636.0,256134.0,3538.0,0.0,0.0,0.0,3410.0,0.0


In [4]:
info_raw = info_raw[~info_raw["occupation_code"].isnull()]
info_raw

Unnamed: 0,alert_key,date,sar_flag,cust_id,risk_rank,occupation_code,total_asset,AGE,lupay,cycam,usgam,clamt,csamt,inamt,cucsm,cucah
0,171142,2021-04-01,0.0,a39fea9aec90969fe66a2b2b4d1b86368a2d38e8b8d4bf...,3,12.0,241719.0,3,12565.0,150744.0,82748.0,0.0,0.0,12477.0,12477.0,0.0
1,171152,2021-04-01,0.0,7e42b5dca9b28ee8e5545beb834361e90e6197d176b389...,3,13.0,599497.0,6,3581.0,324783.0,64363.0,0.0,0.0,0.0,4981.0,0.0
2,171177,2021-04-01,0.0,a6cdf6302aead77112013168c6d546d2df3bcb551956d2...,1,19.0,51160.0,4,,,,,,,,
3,171178,2021-04-01,0.0,1a3efa69705f611c7ef2384a715c8142e2ee801cfec9df...,3,9.0,3634343.0,6,829364.0,7666339.0,2343836.0,0.0,0.0,781279.0,781279.0,0.0
4,171180,2021-04-01,0.0,67f8cbb64dd3d447e992b1b299e0ceed3372188e47c88e...,1,17.0,4076287.0,4,636.0,256134.0,3538.0,0.0,0.0,0.0,3410.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25746,365001,2022-04-29,,18ee644a371548e9780d701aaa7e0c8c42a7794cdee755...,1,17.0,135072.0,3,7918.0,313340.0,58134.0,0.0,0.0,0.0,0.0,0.0
25747,365004,2022-04-29,,7f69fa9eab8f397d367e2bb61ee1fa008999a0aab91e06...,3,12.0,2285386.0,3,284394.0,342995.0,79765.0,0.0,0.0,0.0,97941.0,0.0
25748,365008,2022-04-29,,12c9e6d35500d2a96fc2b22a9da8e3deb6048de515a16e...,3,19.0,1230244.0,2,0.0,120106.0,0.0,0.0,0.0,0.0,0.0,0.0
25749,365009,2022-04-29,,d24d46c19002ab1f9a02801af5e4be6a154b3c5adc0417...,1,17.0,162418.0,2,843074.0,31322.0,16124.0,0.0,0.0,0.0,817962.0,0.0


In [5]:
X = info_raw.drop("sar_flag", axis = 1)
y = info_raw["sar_flag"]

In [6]:
def cyclical_feat_encode(df):
    df["date"] = pd.to_datetime(df["date"])

    df['month'] = df["date"].dt.month
    df['day'] = df["date"].dt.day

    df['month_sin'] = np.sin(2 * np.pi *  df['month']/ df["month"].max())
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / df["month"].max())

    df['day_sin'] = np.sin(2 * np.pi * df['day'] / df["day"].max())
    df['day_cos'] = np.cos(2 * np.pi * df['day'] / df["day"].max())

    df = df.drop(["month", "day", "date"], axis = 1)
    
    return df

In [7]:
X = cyclical_feat_encode(X)
X.head()

Unnamed: 0,alert_key,cust_id,risk_rank,occupation_code,total_asset,AGE,lupay,cycam,usgam,clamt,csamt,inamt,cucsm,cucah,month_sin,month_cos,day_sin,day_cos
0,171142,a39fea9aec90969fe66a2b2b4d1b86368a2d38e8b8d4bf...,3,12.0,241719.0,3,12565.0,150744.0,82748.0,0.0,0.0,12477.0,12477.0,0.0,0.866025,-0.5,0.201299,0.97953
1,171152,7e42b5dca9b28ee8e5545beb834361e90e6197d176b389...,3,13.0,599497.0,6,3581.0,324783.0,64363.0,0.0,0.0,0.0,4981.0,0.0,0.866025,-0.5,0.201299,0.97953
2,171177,a6cdf6302aead77112013168c6d546d2df3bcb551956d2...,1,19.0,51160.0,4,,,,,,,,,0.866025,-0.5,0.201299,0.97953
3,171178,1a3efa69705f611c7ef2384a715c8142e2ee801cfec9df...,3,9.0,3634343.0,6,829364.0,7666339.0,2343836.0,0.0,0.0,781279.0,781279.0,0.0,0.866025,-0.5,0.201299,0.97953
4,171180,67f8cbb64dd3d447e992b1b299e0ceed3372188e47c88e...,1,17.0,4076287.0,4,636.0,256134.0,3538.0,0.0,0.0,0.0,3410.0,0.0,0.866025,-0.5,0.201299,0.97953


In [8]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
X["cust_id"] = le.fit_transform(X["cust_id"])
X.head()

Unnamed: 0,alert_key,cust_id,risk_rank,occupation_code,total_asset,AGE,lupay,cycam,usgam,clamt,csamt,inamt,cucsm,cucah,month_sin,month_cos,day_sin,day_cos
0,171142,4927,3,12.0,241719.0,3,12565.0,150744.0,82748.0,0.0,0.0,12477.0,12477.0,0.0,0.866025,-0.5,0.201299,0.97953
1,171152,3745,3,13.0,599497.0,6,3581.0,324783.0,64363.0,0.0,0.0,0.0,4981.0,0.0,0.866025,-0.5,0.201299,0.97953
2,171177,5017,1,19.0,51160.0,4,,,,,,,,,0.866025,-0.5,0.201299,0.97953
3,171178,786,3,9.0,3634343.0,6,829364.0,7666339.0,2343836.0,0.0,0.0,781279.0,781279.0,0.0,0.866025,-0.5,0.201299,0.97953
4,171180,3057,1,17.0,4076287.0,4,636.0,256134.0,3538.0,0.0,0.0,0.0,3410.0,0.0,0.866025,-0.5,0.201299,0.97953


In [9]:
alert_key = X.pop("alert_key")

In [10]:
X_train = X.iloc[:-1845]
X_test = X.iloc[-1845:]
y_train = y.iloc[:-1845]
y_test = y.iloc[-1845:]

In [11]:
import category_encoders as ce

cat_feat = ["cust_id", "risk_rank", "occupation_code", "AGE"]
ce_target = ce.TargetEncoder(cols = cat_feat)
X_train = ce_target.fit_transform(X_train, y_train)
X_test = ce_target.transform(X_test)
X_train.head()

Unnamed: 0,cust_id,risk_rank,occupation_code,total_asset,AGE,lupay,cycam,usgam,clamt,csamt,inamt,cucsm,cucah,month_sin,month_cos,day_sin,day_cos
0,0.0,0.002141,0.007585,241719.0,0.011447,12565.0,150744.0,82748.0,0.0,0.0,12477.0,12477.0,0.0,0.866025,-0.5,0.201299,0.97953
1,1.642763e-07,0.002141,0.01693,599497.0,0.005569,3581.0,324783.0,64363.0,0.0,0.0,0.0,4981.0,0.0,0.866025,-0.5,0.201299,0.97953
2,0.002645325,0.012996,0.00942,51160.0,0.008417,,,,,,,,,0.866025,-0.5,0.201299,0.97953
3,3.38527e-16,0.002141,0.00563,3634343.0,0.005569,829364.0,7666339.0,2343836.0,0.0,0.0,781279.0,781279.0,0.0,0.866025,-0.5,0.201299,0.97953
4,8.178964e-09,0.012996,0.010621,4076287.0,0.008417,636.0,256134.0,3538.0,0.0,0.0,0.0,3410.0,0.0,0.866025,-0.5,0.201299,0.97953


In [12]:
from sklearn.impute import KNNImputer

imputer = KNNImputer()
X_train1 = imputer.fit_transform(X_train)
X_train = pd.DataFrame(X_train1, columns = imputer.get_feature_names_out())

In [13]:
X_test1 = imputer.transform(X_test)
X_test = pd.DataFrame(X_test1, columns = imputer.get_feature_names_out())

In [26]:
sar1_idx = (y_train == 1).tolist()
sar0_idx = (y_train == 0).tolist()

X_train1 = X_train[sar1_idx]
X_train0 = X_train[sar0_idx]
y_train1 = y_train[sar1_idx]
y_train0 = y_train[sar0_idx]

In [28]:
from sklearn import svm
clf = svm.OneClassSVM()
clf.fit(X_train0)

In [29]:
outlier = clf.predict(X_train1)

In [30]:
len(outlier), len(X_train1), len(y_train1)

(234, 234, 234)

In [31]:
outlier

array([-1, -1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1, -1,  1,  1,
       -1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1,
        1, -1, -1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1,  1,
       -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1, -1,  1,  1,
       -1,  1, -1,  1, -1, -1,  1,  1,  1,  1,  1, -1,  1, -1, -1, -1,  1,
       -1,  1,  1,  1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1,
        1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1,  1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1,  1,
        1, -1, -1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1, -1,  1, -1, -1,
        1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1, -1,  1,  1, -1,  1,
        1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1,
       -1, -1,  1,  1, -1,  1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,
       -1,  1, -1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1, -1, -1,
        1, -1, -1, -1, -1

In [34]:
print("1:", (outlier == 1).sum())
print("-1:", (outlier == -1).sum())

1: 136
-1: 98


## Embedding

In [2]:
import pandas as pd
import numpy as np
from torch import nn
import torch.optim as optim
import json, copy, pickle, torch
from tqdm import tqdm

In [3]:
data = pickle.load(open('data/inverse/cust_id2.pkl', 'rb'))

In [4]:
max_len = 256

In [5]:
def data_process(data, mode = "train"):
    data_X = []
    data_Y = []
    for k1, v1 in data.items():
        for k2, v2 in v1.items():
            # 判斷要抓的是train data還是test data
            if mode == "train":
                if v2["data_type"] == "test":
                    continue
            else:
                if v2["data_type"] == "train":
                    continue
            
            idx = [[] for _ in range(4)]
            data2 = [[] for _ in range(4)]

            max_idx = len(v2["data"]) - 1

            for i, trade in enumerate(v2["data"]):
                source = trade["source"]
                trade1 = {k: v for k, v in trade.items() if k not in ["date", "source", "alert_key"]} #刪除日期跟資料來源

                if i == max_idx:
                    data_Y.append(trade["sar_flag"])
                    trade1["sar_flag"] = 2
                

                idx[source].append(i)
                data2[source].append(list(trade1.values()))

            data_X.append([idx, data2, trade["alert_key"]])


    # train - X shape = (總樣本數(23906), 該樣本下的交易index、內容和alert_key(3), 資料源(4), 數據)
    print(f"Mode: {mode}, Total sample: {len(data_X)}")

    return data_X, data_Y

In [6]:
X_train, y_train = data_process(data, mode = "train")
X_test, y_test = data_process(data, mode = "test")

Mode: train, Total sample: 23906
Mode: test, Total sample: 1845


In [7]:
sar1 = sum(y_train)
sar0 = len(y_train) - sum(y_train)
total = len(y_train)

print(f"train - 0: {sar1}, 1: {sar0}, total: {total}, 0/1: {round(sar0/sar1)}")

train - 0: 234, 1: 23672, total: 23906, 0/1: 101


from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.2, shuffle = True, stratify = y_train, random_state = 99)
print(f"X_train: {len(X_train)}, X_valid: {len(X_valid)}, X_test: {len(X_test)}\ny_train: {len(y_train)}, y_valid: {len(y_valid)}, y_test: {len(y_test)}")

print(f"train - 0: {len(y_train) - sum(y_train)}, 1: {sum(y_train)}, total: {len(y_train)}")
print(f"valid - 0: {len(y_valid) - sum(y_valid)}, 1: {sum(y_valid)}, total: {len(y_valid)}")

In [8]:
import gc

del data
gc.collect()

16

In [9]:
from torch.utils.data import DataLoader, Dataset
class Dataset_transform(Dataset):
    def __init__(self, X, y):
        self.n_samples = len(y)
        self.X = X
        self.y = torch.tensor(y).float().reshape(-1, 1)
                                            
    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        seq_idx = self.X[idx][0]
        x1 = self.X[idx][1]
        alert_key = self.X[idx][2]
        y1 = self.y[idx]


        return [torch.tensor(s).long() for s in seq_idx], [torch.tensor(x2) for x2 in x1], y1, alert_key

In [10]:
train_dataset = Dataset_transform(X_train, y_train)
# valid_dataset = Dataset_transform(X_valid, y_valid)
test_dataset = Dataset_transform(X_test, y_test)

In [11]:
def BatchCollate(data):
    batch_idxs = [torch.tensor([], dtype = torch.long) for i in range(4)]
    seq_idxs = [torch.tensor([], dtype = torch.long) for i in range(4)]
    xs = [torch.tensor([]) for i in range(4)]
    targets = torch.tensor([])
    alert_keys = []

    for batch, d in enumerate(data):
        for i in range(4):
            seq_idxs[i] = torch.cat((seq_idxs[i], d[0][i]))
            xs[i] = torch.cat((xs[i], d[1][i]))
            
            batch1 = torch.tensor([batch] * len(d[0][i])).long()
            batch_idxs[i] = torch.cat((batch_idxs[i], batch1))

        targets = torch.cat((targets, d[2]))
        alert_keys.append(d[3])


    return [batch_idxs, seq_idxs, xs], targets.reshape(-1, 1), alert_keys

In [12]:
batch_size = 64
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, collate_fn = BatchCollate)
# valid_dataloader = DataLoader(valid_dataset, batch_size = len(valid_dataset), collate_fn = BatchCollate)
test_dataloader = DataLoader(test_dataset, batch_size = len(test_dataset), collate_fn = BatchCollate)

In [13]:
with open("feats_type.json", newline='') as file:
    feats_type = json.load(file)

with open("category_num.json", newline='') as file:
    category_num = json.load(file)

In [14]:
import torch.nn.functional as F

class Encoder(nn.Module):
    """
    src: https://github.com/baosenguo/Kaggle-MoA-2nd-Place-Solution/blob/main/training/1d-cnn-train.ipynb
    """
    def __init__(self, num_features, embed_output=128, hidden_size=512, dropout=0.3):
        super().__init__()
        cha_1 = 64
        cha_2 = 128
        cha_3 = 128

        cha_1_reshape = int(hidden_size/cha_1)
        cha_po_1 = int(hidden_size/cha_1/2)
        cha_po_2 = int(hidden_size/cha_1/2/2) * cha_3

        self.cha_1 = cha_1
        self.cha_2 = cha_2
        self.cha_3 = cha_3
        self.cha_1_reshape = cha_1_reshape
        self.cha_po_1 = cha_po_1
        self.cha_po_2 = cha_po_2

        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(dropout)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))

        self.batch_norm_c1 = nn.BatchNorm1d(cha_1)
        self.dropout_c1 = nn.Dropout(dropout*0.9)
        self.conv1 = nn.utils.weight_norm(nn.Conv1d(cha_1,cha_2, kernel_size = 5, stride = 1, padding=2,  bias=False),dim=None)

        self.ave_po_c1 = nn.AdaptiveAvgPool1d(output_size = cha_po_1)

        self.batch_norm_c2 = nn.BatchNorm1d(cha_2)
        self.dropout_c2 = nn.Dropout(dropout*0.8)
        self.conv2 = nn.utils.weight_norm(nn.Conv1d(cha_2,cha_2, kernel_size = 3, stride = 1, padding=1, bias=True),dim=None)

        self.batch_norm_c2_1 = nn.BatchNorm1d(cha_2)
        self.dropout_c2_1 = nn.Dropout(dropout*0.6)
        self.conv2_1 = nn.utils.weight_norm(nn.Conv1d(cha_2,cha_2, kernel_size = 3, stride = 1, padding=1, bias=True),dim=None)

        self.batch_norm_c2_2 = nn.BatchNorm1d(cha_2)
        self.dropout_c2_2 = nn.Dropout(dropout*0.5)
        self.conv2_2 = nn.utils.weight_norm(nn.Conv1d(cha_2,cha_3, kernel_size = 5, stride = 1, padding=2, bias=True),dim=None)

        self.max_po_c2 = nn.MaxPool1d(kernel_size=4, stride=2, padding=1)

        self.flt = nn.Flatten()
        
        self.batch_norm3 = nn.BatchNorm1d(cha_po_2)
        self.dropout3 = nn.Dropout(dropout)
        self.dense3 = nn.utils.weight_norm(nn.Linear(cha_po_2, embed_output))

    def forward(self, x):

        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.celu(self.dense1(x), alpha=0.06)

        x = x.reshape(x.shape[0],self.cha_1,
                        self.cha_1_reshape)

        x = self.batch_norm_c1(x)
        x = self.dropout_c1(x)
        x = F.relu(self.conv1(x))

        x = self.ave_po_c1(x)

        x = self.batch_norm_c2(x)
        x = self.dropout_c2(x)
        x = F.relu(self.conv2(x))
        x_s = x

        x = self.batch_norm_c2_1(x)
        x = self.dropout_c2_1(x)
        x = F.relu(self.conv2_1(x))

        x = self.batch_norm_c2_2(x)
        x = self.dropout_c2_2(x)
        x = F.relu(self.conv2_2(x))
        x =  x * x_s

        x = self.max_po_c2(x)

        x = self.flt(x)

        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)

        return x

In [15]:
from traceback import format_exc

In [16]:
class FeatureEmbedder(torch.nn.Module):
    def __init__(self, feat_type, category_num, embed_dim = 4, embed_output = 32, hidden_size = 256, dropout = 0.3):
        super().__init__()

        feat_type = {k: v for k, v in feat_type.items() if v in ["category", "int", "float"]}

        layers = []
        for k, v in feat_type.items():
            if v == "category":
                layers.append(nn.Embedding(category_num[k], embed_dim))
            else:
                layers.append(nn.Linear(1, embed_dim))
        self.embeddings = torch.nn.ModuleList(layers)

        self.encoder = Encoder(
            num_features = len(feat_type) * embed_dim, 
            embed_output = embed_output, 
            hidden_size = hidden_size,
            dropout = dropout
        )

        self.feat_type = feat_type
    
    def forward(self, x):
        # 透過embedding把每個feature的數據轉成(embed_dim)維的向量
        embs = []
        
        for i, (type_, emb_layer) in enumerate(zip(self.feat_type.values(), self.embeddings)):
            if type_ == "category": # 類別變數
                x1 = emb_layer(x[:, i].long())
            else: # 連續變數
                x1 = emb_layer(x[:, i].reshape(-1, 1))
            
            embs.append(x1)
        
        
        embs = torch.cat(embs, dim=1)# 把所有feature的向量合併
        embs = self.encoder(embs) # 透過encoder轉換，統一輸出維度(embed_output)
            
        return embs


In [17]:
embed_dim = 4
embed_output = 32
embed_hidden_size = 256
embed_dropout = 0.3

In [18]:
# 每個資料源建一個embedder
layers = []
for k, v in feats_type.items():
    embedder = FeatureEmbedder(v, category_num, embed_dim, embed_output, embed_hidden_size, embed_dropout)
    layers.append(embedder)
embedders = torch.nn.ModuleList(layers)

In [19]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [20]:
from temporal_aggregator import *

class Model(nn.Module):
    def __init__(self, embedders, input_size, max_len, hidden_size, temporal_aggregator_type="TemporalDebertaAggregator", 
                temporal_aggregator_args={
                    "hidden_size": 32,
                    "num_layers": 3,
                    "dropout": 0.3,
                    "max_len": 512
                }
                ):
        super().__init__()
        self.embedders = embedders
        
        self.max_len = max_len
        self.input_size = input_size

        self.temporal_aggregator = eval(
            f"{temporal_aggregator_type}")(**temporal_aggregator_args)
        self.classifier = nn.Sequential(
            nn.Linear(temporal_aggregator_args["hidden_size"], 1),
            nn.Sigmoid()
        )
    
    def forward(self, x, batch):
        with torch.no_grad():
            # 透過embedder統一各資料源的feature數量
            for s in range(4):
                if len(x[2][s]) == 0:
                    continue
                elif len(x[2][s]) == 1:
                    x[2][s] = torch.zeros(1, self.input_size)# 只有1個sample，沒辦法做batch_norm1，encoder會出錯，直接給0
                else:
                    x[2][s] = self.embedders[s](x[2][s].to(device))
            

            x1 = torch.zeros(batch, self.max_len, self.input_size).to(device) # shape: (batch, max_len, features)
            mask = torch.zeros((batch, self.max_len)).long().to(device)
            
            # 合併各資料源的資料
            for s in range(4):
                for i in range(len(x[0][s])):
                    batch_idx, seq_idx, features = x[0][s][i], x[1][s][i], x[2][s][i]
                    x1[batch_idx][seq_idx] = features
                    mask[batch_idx, seq_idx] = 1


        out = self.temporal_aggregator(x1, mask)
        out = self.classifier(out).squeeze(-1)

        return out.reshape(-1, 1), x1

In [21]:
hidden_size = 16

In [22]:
epochs = 100
model = Model(embedders, embed_output, max_len, hidden_size).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr = 1e-3)

In [23]:
def recall_n(output, target):
    comb = list(zip(output, target))
    comb.sort(key=lambda x:x[0])
    flag = False
    for i, (out, gt) in enumerate(comb):
        if gt == 1:
            if flag:
                break
            flag = True

    recall = ((sum(target)-1) / (len(target)-i))
    
    return recall.item()

In [24]:
def loss_weight(labels):
    weight = []
    for label in labels:
        if label == 1:
            weight.append(75)
        elif label == 0:
            weight.append(1)
            
    return torch.tensor(weight).reshape(-1, 1)

In [25]:
# validate、test預測後的損失函數，以及相關分數
def eval_score(dataloader, model, criterion, mode = "eval"):
    with torch.no_grad():
        losses = 0
        pred1, y1 = torch.Tensor([]).to(device), torch.Tensor([]).to(device)
        for batch, batch_data in enumerate(dataloader):
            X, y, alert_key = batch_data
            y = y.to(device)

            pred = model(X, len(y)) #預測

            if mode == "train":
                weight = loss_weight(y)
                criterion = nn.BCELoss(weight = weight).to(device)
                loss = criterion(pred, y) #計算損失函數
                losses += loss.item()

            pred1 = torch.concat([pred1, pred])
            y1 = torch.concat([y1, y])


        if mode == "train":
            losses /= (batch + 1)
            
            recall = recall_n(pred1, y1)

            return losses, recall
            
    return pred1, alert_key

In [26]:
model.load_state_dict(torch.load(f"model/20221206.pt")) # 更改model權重
model.eval()

Model(
  (embedders): ModuleList(
    (0): FeatureEmbedder(
      (embeddings): ModuleList(
        (0): Embedding(7708, 4)
        (1): Embedding(128, 4)
        (2): Embedding(51, 4)
        (3): Linear(in_features=1, out_features=4, bias=True)
        (4): Linear(in_features=1, out_features=4, bias=True)
        (5): Linear(in_features=1, out_features=4, bias=True)
        (6): Linear(in_features=1, out_features=4, bias=True)
        (7): Linear(in_features=1, out_features=4, bias=True)
      )
      (encoder): Encoder(
        (batch_norm1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dense1): Linear(in_features=32, out_features=256, bias=True)
        (batch_norm_c1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (dropout_c1): Dropout(p=0.27, inplace=False)
        (conv1): Conv1d(64, 128, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)


In [27]:
X_embeds = torch.tensor([], device=device)
y_trains = torch.tensor([], device=device)
for batch, batch_data in enumerate(tqdm(train_dataloader)):
    X_train1, y_train1, alert_key = batch_data
    y_train1 = y_train1.to(device)

    train_pred, X_embed = model(X_train1, len(y_train1)) #預測
    X_embeds = torch.concat((X_embeds, X_embed))
    y_trains = torch.concat((y_trains, y_train1))

100%|██████████| 374/374 [02:28<00:00,  2.53it/s]


In [28]:
X_embeds.shape, y_trains.shape

(torch.Size([23906, 256, 32]), torch.Size([23906, 1]))

In [29]:
X_train = X_embeds.cpu().view(23906, -1)
X_train.shape

torch.Size([23906, 8192])

In [30]:
y_train = y_trains.cpu()

In [31]:
sar1_idx = (y_train == 1).view(-1)
sar0_idx = (y_train == 0).view(-1)

X_train1 = X_train[sar1_idx]
X_train0 = X_train[sar0_idx]
y_train1 = y_train[sar1_idx]
y_train0 = y_train[sar0_idx]

from sklearn.cluster import DBSCAN
clustering = DBSCAN().fit(X_train0)
outlier1 = clustering.labels_
outlier1

(outlier1 == 1).sum(), pd.DataFrame(y_train)[(outlier1 == 1)].sum()[0]

In [89]:
from sklearn.ensemble import IsolationForest
clf = IsolationForest(random_state=0).fit(X_train0)
outlier2 = clf.predict(X_train1)
outlier2

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1

In [90]:
print("total:", len(outlier))
print("1:", (outlier2 == 1).sum())
print("-1:", (outlier2 == -1).sum())

total: 234
1: 230
-1: 4


In [32]:
from sklearn import svm
clf = svm.OneClassSVM().fit(X_train0)
outlier3 = clf.predict(X_train1)
outlier3

array([ 1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1, -1, -1,  1,  1,
        1, -1,  1, -1, -1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,
       -1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,
       -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,  1,  1,
        1, -1, -1, -1,  1,  1, -1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1,
       -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1,
       -1,  1, -1, -1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1,
        1, -1, -1,  1,  1,  1, -1,  1,  1, -1,  1, -1, -1, -1,  1,  1, -1,
       -1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,
       -1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
       -1,  1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1,  1,
       -1, -1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,
        1, -1, -1,  1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1,  1, -1,  1,
       -1, -1,  1,  1,  1

In [35]:
print("total:", len(outlier3))
print("1:", (outlier3 == 1).sum())
print("-1:", (outlier3 == -1).sum())

total: 234
1: 140
-1: 94
