In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
import warnings, pickle, os, json, copy
warnings.filterwarnings("ignore")

In [2]:
# Read the five raw source tables (alerts/info, credit-card bills, card
# transactions, deposit transactions, remittances) in a fixed order.
info_raw, ccba_raw, cdtx_raw, dp_raw, remit_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/inverse/info.csv",
        "data/inverse/ccba.csv",
        "data/inverse/cdtx.csv",
        "data/inverse/dp.csv",
        "data/inverse/remit.csv",
    )
)

In [3]:
# Build a join key from the first seven characters of each date string
# (the "YYYY-MM" prefix for values such as "2021-04-01") on both tables.
info_raw["month"] = info_raw["date"].map(lambda value: value[:7])
ccba_raw["month"] = ccba_raw["byymm"].map(lambda value: value[:7])

# Left join keeps every alert row; alerts without a matching bill row get NaN
# in the bill columns. The helper key columns are removed afterwards.
info_with_bills = info_raw.merge(ccba_raw, how="left", on=["cust_id", "month"])
info_raw = info_with_bills.drop(columns=["month", "byymm"])
info_raw.head()

Unnamed: 0,alert_key,date,sar_flag,cust_id,risk_rank,occupation_code,total_asset,AGE,lupay,cycam,usgam,clamt,csamt,inamt,cucsm,cucah
0,171142,2021-04-01,0.0,a39fea9aec90969fe66a2b2b4d1b86368a2d38e8b8d4bf...,3,12.0,241719.0,3,12565.0,150744.0,82748.0,0.0,0.0,12477.0,12477.0,0.0
1,171152,2021-04-01,0.0,7e42b5dca9b28ee8e5545beb834361e90e6197d176b389...,3,13.0,599497.0,6,3581.0,324783.0,64363.0,0.0,0.0,0.0,4981.0,0.0
2,171177,2021-04-01,0.0,a6cdf6302aead77112013168c6d546d2df3bcb551956d2...,1,19.0,51160.0,4,,,,,,,,
3,171178,2021-04-01,0.0,1a3efa69705f611c7ef2384a715c8142e2ee801cfec9df...,3,9.0,3634343.0,6,829364.0,7666339.0,2343836.0,0.0,0.0,781279.0,781279.0,0.0
4,171180,2021-04-01,0.0,67f8cbb64dd3d447e992b1b299e0ceed3372188e47c88e...,1,17.0,4076287.0,4,636.0,256134.0,3538.0,0.0,0.0,0.0,3410.0,0.0


In [4]:
# One entry per source table: [frame, integer source code, name of its date column].
# Entries are lists (not tuples) because the preprocessing cell overwrites
# element 0 of each entry with the processed frame.
data_raw = [
    [cdtx_raw, 0, "date"],
    [dp_raw, 1, "tx_date"],
    [remit_raw, 2, "trans_date"],
    [info_raw, 3, "date"],
]

In [5]:
def cyclical_feat_encode(df, month_period=12, day_period=31):
    """Return a copy of ``df`` with sine/cosine encodings of its ``date`` column.

    Four columns are appended: ``month_sin``, ``month_cos``, ``day_sin`` and
    ``day_cos``. The ``date`` column of the returned frame is converted with
    ``pd.to_datetime``. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``date`` column parseable by ``pd.to_datetime``.
    month_period : int, default 12
        Length of the month cycle.
    day_period : int, default 31
        Length of the day-of-month cycle.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the four encoded columns added.

    Notes
    -----
    The periods are fixed constants rather than the largest month/day present
    in ``df``. Deriving them from the data made the same calendar date encode
    differently in different tables, and wrapped the circle early whenever a
    table did not contain December or a 31st.
    """
    df = df.copy()  # avoid adding columns to the caller's frame
    df["date"] = pd.to_datetime(df["date"])

    month_angle = 2 * np.pi * df["date"].dt.month / month_period
    day_angle = 2 * np.pi * df["date"].dt.day / day_period

    df["month_sin"] = np.sin(month_angle)
    df["month_cos"] = np.cos(month_angle)
    df["day_sin"] = np.sin(day_angle)
    df["day_cos"] = np.cos(day_angle)

    return df

In [6]:
# feature_type is looked up by column name in the preprocessing cell, where a
# value of "category" selects label encoding. Read the file as UTF-8
# explicitly so the result does not depend on the platform's default encoding.
with open("feature_type.json", encoding="utf-8") as file:
    feature_type = json.load(file)

In [7]:
def process_catgorical(df, col):
    """Label-encode ``df[col]`` in place and return ``df``.

    Missing values form their own category, ``'NULL'``. Observed values are
    numbered ``0..k-1`` in sorted order and ``'NULL'`` (when present) gets the
    largest code ``k``, so the encoding is the same on every run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    col : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``col`` replaced by integer codes.
    """
    # Build the filled column explicitly. ``df[col].fillna(..., inplace=True)``
    # operates on a temporary Series and is not guaranteed to write back to
    # ``df`` (it does not under pandas Copy-on-Write).
    filled = df[col].astype(object).where(df[col].notna(), "NULL")

    observed = list(filled[filled != "NULL"].unique())
    try:
        ordered = sorted(observed)
    except TypeError:
        # Values of mutually incomparable types: fall back to a stable order
        # by type name, then by text representation.
        ordered = sorted(observed, key=lambda v: (type(v).__name__, str(v)))

    map_dict = {value: code for code, value in enumerate(ordered)}
    if (filled == "NULL").any():
        map_dict["NULL"] = len(map_dict)  # missing values always come last

    df[col] = filled.map(map_dict)
    return df

In [8]:
# Integer id for every customer that appears in the alert (info) table.
cust_id = {k: i for i, k in enumerate(info_raw["cust_id"].unique())}

# Columns that must not go through missing-value handling:
#   sar_flag - a missing value is used later to mark test rows
#   source   - constant assigned below
#   cust_id  - already encoded through the shared cust_id mapping
#   date     - rows with a missing date are dropped before this point
SKIP_IMPUTE = ("sar_flag", "source", "cust_id", "date")

# NOTE: this cell overwrites the frames stored in data_raw, so it must be run
# exactly once after the cell that builds data_raw; re-running it would try to
# map already-encoded customer ids again.
for entry in data_raw:
    raw_df, source_code, date_col = entry
    df = raw_df.copy()

    df = df.rename(columns={date_col: "date"})  # unify the date column name
    df = df.dropna(subset=["date"])
    df["date"] = pd.to_datetime(df["date"])  # convert to datetime

    df = cyclical_feat_encode(df)  # add cyclical time features

    df["source"] = source_code  # which table the row came from

    # Encode customers with the shared mapping. Customers that never appear in
    # the alert table map to NaN; drop them so they are not later filled with 0
    # and mixed into the records of the customer whose id is 0.
    df["cust_id"] = df["cust_id"].map(cust_id)
    df = df.dropna(subset=["cust_id"])
    df["cust_id"] = df["cust_id"].astype(int)

    # Missing-value handling: label-encode categoricals, zero-fill the rest.
    for col in df.columns:
        if col in SKIP_IMPUTE:
            continue
        if feature_type[col] == "category":
            df = process_catgorical(df, col)
        else:
            # Assign the result back; an in-place fillna on df[col] is not
            # guaranteed to modify df.
            df[col] = df[col].fillna(0)

    entry[0] = df

In [11]:
# Group every source table by customer.
# A fresh outer/inner list is built so data_raw keeps its DataFrames, but the
# frames themselves are not copied: each GroupBy only reads from the frame it
# was created on, so deep-copying all tables first is unnecessary.
data_g = [
    [frame.groupby("cust_id"), source_code, date_col]
    for frame, source_code, date_col in data_raw
]

In [14]:
# For every customer, split their time-ordered records into windows, each one
# ending at (and including) one alert row (source == 3).
data = {i: {} for i in cust_id.values()}

for id_ in cust_id.values():
    # Collect this customer's rows from every source table.
    cust_data1 = []
    for df_g, _source, _date_col in data_g:
        if id_ in df_g.groups:
            cust_data1.extend(df_g.get_group(id_).to_dict('records'))

    # Sort by timestamp, then by source code, so that on equal timestamps the
    # alert row (the highest source code) comes after the transaction rows.
    cust_data1.sort(key=lambda X: (X["date"].timestamp(), X["source"]))
    cust_data1 = np.array(cust_data1)

    # Cut one window per alert row. Rows dated after the customer's last alert
    # are not assigned to any window.
    cust_data2 = {}
    start = 0
    for i, s in enumerate(cust_data1):
        if s["source"] != 3:
            continue

        window = {}
        if pd.isnull(s["sar_flag"]):
            # No label available: this alert belongs to the test set.
            window["data_type"] = "test"
        else:
            window["sar"] = s["sar_flag"]
            window["data_type"] = "train"

        window["data"] = cust_data1[start:i + 1]
        cust_data2[len(cust_data2)] = window  # windows are numbered 0, 1, 2, ...
        start = i + 1

    data[id_] = cust_data2

# Use a context manager so the file handle is flushed and closed.
with open('data/inverse/cust_id1.pkl', 'wb') as pkl_file:
    pickle.dump(data, pkl_file)

In [15]:
# Inspect the windows built for customer id 1 (outer keys are window indices).
data[1]

{0: {'sar': 0.0,
  'data_type': 'train',
  'data': array([{'alert_key': 171152, 'date': Timestamp('2021-04-01 00:00:00'), 'sar_flag': 0.0, 'cust_id': 1, 'risk_rank': 3, 'occupation_code': 13, 'total_asset': 599497.0, 'AGE': 6, 'lupay': 3581.0, 'cycam': 324783.0, 'usgam': 64363.0, 'clamt': 0.0, 'csamt': 0.0, 'inamt': 0.0, 'cucsm': 4981.0, 'cucah': 0.0, 'month_sin': 0.8660254037844387, 'month_cos': -0.4999999999999998, 'day_sin': 0.20129852008866006, 'day_cos': 0.9795299412524945, 'source': 3}],
        dtype=object)},
 1: {'sar': 0.0,
  'data_type': 'train',
  'data': array([{'cust_id': 1, 'debit_credit': 0, 'date': Timestamp('2021-04-01 06:00:00'), 'tx_type': 1, 'tx_amt': 20153.0, 'exchg_rate': 1.0, 'info_asset_code': 16, 'fiscTxId': 29, 'txbranch': 349, 'cross_bank': 0, 'ATM': 0, 'month_sin': 0.8660254037844387, 'month_cos': -0.4999999999999998, 'day_sin': 0.20129852008866006, 'day_cos': 0.9795299412524945, 'source': 1},
         {'cust_id': 1, 'debit_credit': 0, 'date': Timestamp('20

In [61]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 1)

# For each customer window, take the rows with source == 1, transpose them so
# that each feature column becomes one sample, and project onto a single
# principal component. The result is one value per feature column.
# Windows without any source-1 rows contribute None.
dimen_reduct = []
for windows in data.values():
    for window in windows.values():
        source1_rows = [rec for rec in window["data"] if rec["source"] == 1]
        if not source1_rows:
            dimen_reduct.append(None)
            continue

        features = pd.DataFrame(source1_rows).drop(columns=["cust_id", "date", "source"])
        reduced = pca.fit_transform(features.T)
        dimen_reduct.append(reduced.T)


In [62]:
# Display the per-window PCA results (None where a window had no source-1 rows).
dimen_reduct

[None,
 array([[-115815.80432933, -115815.5815669 , 1388558.52699048,
         -115815.5815669 , -115782.9508246 , -115731.03863352,
         -114689.77080113, -115818.7691155 , -115815.5815669 ,
         -115816.23773961, -115822.92817652, -115817.29486132,
         -115816.98780825]]),
 array([[-71206.07536098, -71206.03205847, 852663.17289752,
         -71206.03205847, -71175.23019502, -71100.74524484,
         -69514.52650747, -71209.51754382, -71206.03205847,
         -71206.81239068, -71214.76877684, -71206.10031389,
         -71211.30038858]]),
 array([[ -85322.04548486,  -85322.01649148, 1022422.5729335 ,
          -85322.01649148,  -85282.30971106,  -85244.7568435 ,
          -83978.30126236,  -85324.58415337,  -85322.01649148,
          -85322.70640536,  -85329.74087246,  -85322.63283057,
          -85329.44589551]]),
 array([[-145003.99688669, -145003.65407095, 1738262.08572418,
         -145003.65407095, -144976.05964177, -144891.74305634,
         -143334.8499017 , -145007