## Read File

In [1]:
import pandas as pd
# from pandas_profiling import ProfileReport



# Directory that holds every competition CSV except the sample submission.
# All table paths below are derived from it so the data location is set in
# exactly one place.
TRAIN_DIR = 'train_first'

CCBA_PATH = f'{TRAIN_DIR}/public_train_x_ccba_full_hashed.csv'
CDTX_PATH = f'{TRAIN_DIR}/public_train_x_cdtx0001_full_hashed.csv'
CUSTINFO_PATH = f'{TRAIN_DIR}/public_train_x_custinfo_full_hashed.csv'
DP_PATH = f'{TRAIN_DIR}/public_train_x_dp_full_hashed.csv'
REMIT_PATH = f'{TRAIN_DIR}/public_train_x_remit_full_hashed.csv'
PDATE_PATH = f'{TRAIN_DIR}/public_x_alert_date.csv'
TDATE_PATH = f'{TRAIN_DIR}/train_x_alert_date.csv'
ANSWER_PATH = f'{TRAIN_DIR}/train_y_answer.csv'
# The sample submission sits next to the notebook, not inside TRAIN_DIR.
SAMPLE_PATH = './sample_submission.csv'


# Load the feature tables and the public alert-date table, in the same
# order as their path constants are declared.
ccba, cdtx, cinfo, dp, remit, pdate = (
    pd.read_csv(path)
    for path in (CCBA_PATH, CDTX_PATH, CUSTINFO_PATH, DP_PATH, REMIT_PATH, PDATE_PATH)
)

# Load the training alert dates, the labels and the sample submission.
tdate, answer, sample = (
    pd.read_csv(path)
    for path in (TDATE_PATH, ANSWER_PATH, SAMPLE_PATH)
)

## Generate Report

In [2]:
# Pair each loaded table with the label used for its report, then split the
# pairs into two parallel lists.
names, datas = map(list, zip(
    ('ccba', ccba),
    ('cdtx', cdtx),
    ('custinfo', cinfo),
    ('dp', dp),
    ('remit', remit),
    ('pdate', pdate),
    ('tdate', tdate),
    ('answer', answer),
    ('sample', sample),
))
num_files = len(names)

# Profiling is disabled; it needs `ProfileReport`, whose import is also
# commented out in the first cell.
# for i in range(num_files):
#     print(f'{names[i]}: {datas[i].shape}')
#     profile = ProfileReport(datas[i], minimal=True, title=names[i])
#     profile.to_file(f'./data_report/{names[i]}.html', )

## Preprocess

In [3]:

import pickle

from sklearn.preprocessing import QuantileTransformer
import numpy as np
from easydict import EasyDict as edict
import pandas as pd
from tqdm import tqdm


from process_data.data_config import (DataSource, FeatureType,
                         CCBAConfig, CDTXConfig, DPConfig, REMITConfig, CUSTINFOConfig,
                         CONFIG_MAP)
from process_data.utils import load_yaml, save_yaml, save_pickle, load_pickle

### Map Date and target to `custinfo`

In [4]:
# Stack the public and training alert dates into one lookup table.
date = pd.concat((pdate, tdate), axis=0)

# Left-join dates and labels onto custinfo by alert_key; alerts with no
# entry in `answer` keep NaN in `sar_flag`.
cinfo = (
    cinfo
    .merge(date, on='alert_key', how='left')
    .merge(answer, on='alert_key', how='left')
)
cinfo

Unnamed: 0,alert_key,cust_id,risk_rank,occupation_code,total_asset,AGE,date,sar_flag
0,352249,82595ac69158ae08d34156784bdec0d9e2ca5b242b6d2a...,1,19.0,1465816.0,7,365,
1,352253,b212d14cb35676926682b2cf849e295d948888f556c07e...,1,2.0,98177.0,2,365,
2,352254,e5b0002791c7852644a2730abeaa893cdf14a072ef7812...,1,19.0,2052922.0,7,365,
3,352280,74214c478dc6519fbefe4bc31693865bdcd698ab974b64...,3,15.0,201906.0,5,365,
4,352282,0340e7611f0d82c3cb87e6194fa14bb2ccf8afbf1b3418...,1,12.0,7450.0,5,365,
...,...,...,...,...,...,...,...,...
25746,352123,b600c0720096c241f5aec16b096c6a353492eee8a5855a...,1,17.0,12207.0,2,364,0.0
25747,352124,06c5ea5ccda4dfd8839c0dec8646fb3071d2c02883ef8a...,1,17.0,259985.0,4,364,0.0
25748,352125,0c2dc5fedc3689abf5ff4be14fe8fea8d23d22068297c2...,3,19.0,928963.0,3,364,0.0
25749,352128,ffe5f2bd86ecfd7d0a0f4c4b76dd5c312649be95eb8b42...,3,19.0,21647.0,4,364,0.0


### Process Numerical and Categorical

In [5]:
def normalize(col):
    """Quantile-transform numerical data towards a normal distribution.

    Args:
        col: 2-D array-like (e.g. a DataFrame slice) of numerical columns.

    Returns:
        The array returned by ``QuantileTransformer.fit_transform``.
    """
    n_rows = len(col)
    subsample = min(5 * 10**5, n_rows)
    qt = QuantileTransformer(
        # QuantileTransformer refuses to fit when n_quantiles exceeds
        # subsample, so cap it for inputs shorter than 10_000 rows. Larger
        # inputs keep the original 10_000 quantiles.
        n_quantiles=min(10_000, subsample),
        random_state=0,
        subsample=subsample,
        output_distribution='normal',
    )
    return qt.fit_transform(col)

def process_numerical(col):
    """Normalize numerical columns and replace remaining NaN values with 0.

    Args:
        col: 2-D array-like of numerical columns, passed on to ``normalize``.

    Returns:
        The normalized array with every NaN replaced by 0.
    """
    return np.nan_to_num(normalize(col), nan=0)


def process_catgorical(col):
    """Encode a categorical Series as integer codes.

    Missing values are first replaced by the string ``'NULL'`` so they get
    a code of their own. Codes are assigned in order of first appearance,
    which makes the mapping reproducible across interpreter runs; iterating
    a ``set`` of strings would not be, because string hashing is randomized
    per process. The input Series is left untouched.

    Args:
        col: pandas Series of category values (may contain NaN/None).

    Returns:
        A new Series with the same index holding one integer code per row.
    """
    # fillna returns a new Series, so the caller's data is not modified.
    filled = col.fillna('NULL')
    map_dict = {value: code for code, value in enumerate(filled.unique())}
    return filled.map(map_dict)

In [7]:
# Every raw table together with the DataSource tag used to look up its
# per-column feature configuration.
datas = [
    (ccba, DataSource.CCBA),
    (cdtx, DataSource.CDTX),
    (dp, DataSource.DP),
    (remit, DataSource.REMIT),
    (cinfo, DataSource.CUSTINFO),
]

# data_source -> {categorical column -> number of distinct codes}
num_cat_dict = {}

# Encode categorical columns, normalize numerical ones, and tag each table
# with its data source. The tables are modified in place.
for data, data_source in datas:
    config = CONFIG_MAP[data_source]
    numericals = []
    for col in data.columns:
        feature_type = getattr(config, col)
        if feature_type == FeatureType.NUMERICAL and col != 'sar_flag':
            # Numerical columns are collected and normalized together below.
            numericals.append(col)
        elif feature_type == FeatureType.CATEGORICAL:  # sar_flag should be configured as the TARGET type
            print(f'process categorical {col}')
            data[col] = process_catgorical(data[col].copy())
            num_cat = data[col].nunique()
            num_cat_dict.setdefault(data_source, {})[col] = num_cat
    print(f'numericals: {numericals}')
    if numericals:
        data[numericals] = process_numerical(data[numericals].copy())
    data['data_source'] = data_source

numericals: ['lupay', 'cycam', 'usgam', 'clamt', 'csamt', 'inamt', 'cucsm', 'cucah']
process categorical country
process categorical cur_type
numericals: ['amt']
process categorical debit_credit
process categorical tx_time
process categorical tx_type
process categorical info_asset_code
process categorical fiscTxId
process categorical txbranch
process categorical cross_bank
process categorical ATM
numericals: ['tx_amt', 'exchg_rate']
process categorical trans_no
numericals: ['trade_amount_usd']
process categorical risk_rank
process categorical occupation_code
process categorical AGE
numericals: ['total_asset']


In [8]:
# Store the per-source category counts collected while encoding the tables
# (presumably written as YAML by the project helper — confirm in process_data.utils).
save_yaml(num_cat_dict, 'num_cat_dict.yml')

In [9]:
# Keep only the DataFrame from each (table, data_source) pair.
datas = [table for table, _source in datas]

In [10]:
# One groupby object per table so a customer's rows can be fetched by cust_id.
datas_g = [table.groupby('cust_id') for table in datas]

In [29]:
def get_date(d):
    """Return the sort key ``(date, data_source)`` for one table row.

    Each source table stores its date under a different attribute, so the
    attribute is chosen from the row's ``data_source``.

    Args:
        d: row object exposing ``data_source`` and that source's date attribute.

    Returns:
        Tuple ``(date, data_source)``.

    Raises:
        ValueError: if ``data_source`` is not one of the known sources.
            Previously this fell through and failed with UnboundLocalError.
    """
    ds = d.data_source

    if ds == DataSource.CCBA:
        date = d.byymm
    elif ds == DataSource.CDTX:
        date = d.date
    elif ds == DataSource.DP:
        date = d.tx_date
    elif ds == DataSource.REMIT:
        date = d.trans_date
    elif ds == DataSource.CUSTINFO:
        date = d.date
    else:
        raise ValueError(f'unknown data_source: {ds!r}')
    return date, ds


cust_ids = cinfo.cust_id.unique()
save_data = edict()
for cust_id in tqdm(cust_ids):
    # Collect this customer's rows from every table that contains them.
    cust_data = []
    for d in datas_g:
        if cust_id in d.groups:
            cust_data.extend(d.get_group(cust_id).to_dict('records'))
    # Wrap each record so its fields can be read as attributes.
    cust_data = [edict(record) for record in cust_data]

    # Order the combined rows by (date, data_source).
    cust_data.sort(key=get_date)

    # Record each row's source and the positions of the custinfo rows:
    # rows with a label go to train_mask, rows with a NaN label to test_mask.
    source_list, train_mask, test_mask = [], [], []
    for i, c in enumerate(cust_data):
        ds = c.data_source
        source_list.append(ds)

        # sar_flag is only read on custinfo rows.
        if ds == DataSource.CUSTINFO:
            (test_mask if np.isnan(c.sar_flag) else train_mask).append(i)

    # Store everything for this customer under its cust_id.
    save_data[cust_id] = edict({
        'sources': source_list,
        'train_mask': train_mask,
        'test_mask': test_mask,
        'cust_data': cust_data,
    })


100%|██████████| 7708/7708 [00:56<00:00, 136.65it/s]


In [21]:
# Debug probe: `c` is the last row left over from the mask-building loop,
# so this cell only works after that loop has run in the same kernel.
np.isnan(c.sar_flag)

True

### Check that the train_mask and test_mask counts are correct

In [24]:
# Label distribution, including unlabeled alerts (NaN), which become test_mask entries.
# NOTE(review): the original note here said NaN had already been remapped to 2,
# but the mask-building loop still relies on np.isnan(sar_flag) — confirm which is intended.
cinfo.sar_flag.value_counts(dropna=False)

0.0    23672
NaN     1845
1.0      234
Name: sar_flag, dtype: int64

In [25]:
# Expected counts taken directly from the merged custinfo table.
isna = cinfo.sar_flag.isna()
train_num = (~isna).sum()
test_num = isna.sum()

# Actual counts taken from the masks stored for every customer.
train_num2 = sum(len(v.train_mask) for v in save_data.values())
test_num2 = sum(len(v.test_mask) for v in save_data.values())

# Each printed pair should match.
print(train_num, train_num2)
print(test_num, test_num2)

23906 23906
1845 1845


In [27]:
# Persist the per-customer records built above.
# NOTE(review): this is an absolute, user-specific path; the notebook will not
# run on another machine without editing it — consider a project-relative path.
save_pickle(save_data, '/home/nanaeilish/projects/Github/esun_sar_baseline/cust_data.pkl')


In [None]:
# save_data = load_pickle('cust_data.pkl')

: 

In [None]:
# All customer ids that have an entry in save_data.
custids = list(save_data)

: 

In [28]:
from pprint import pprint
from random import choice

# Pick one customer at random and dump its stored record.
custid = choice(custids)
cust_data = save_data[custid]
pprint(cust_data)

# Report the split: 'train' wins when the customer has both kinds of mask,
# and nothing is printed when both masks are empty.
split = 'train' if cust_data.train_mask else ('test' if cust_data.test_mask else None)
if split is not None:
    print(split)

NameError: name 'custids' is not defined

In [None]:
## testing the data loader code (test.py bug)
pkl = save_data 
data = [] 
data_count = 0 
for k, v in pkl.items():
    masks = v.test_mask
    for e in masks:
        e += 1
        s = max(e -512, 0)
        data.append(edict({
            'sources': v.sources[s:e],
            'cust_data': v.cust_data[s:e]
        }))
        data_count += 1
print(f'num of data: {len(data)}')

: 

### length distribution

In [None]:
# get len of cust_data of save_data
lens = []
for k, v in save_data.items():
    lens.append(len(v.sources))
pd.DataFrame(data=lens, columns=None).describe(percentiles=[.25, .5, .75, .9, .95, .99])

: 

### check target_mask distribution

In [None]:
# Flatten the mask positions of all customers into one list per split.
train_mask = [idx for v in save_data.values() for idx in v.train_mask]
test_mask = [idx for v in save_data.values() for idx in v.test_mask]

# Show the decile summary of the positions for each split.
deciles = np.linspace(0, 1, 11)
for column, positions in (('train', train_mask), ('test', test_mask)):
    display(pd.DataFrame(data=positions, columns=[column]).describe(percentiles=deciles))

: 

In [None]:
# Reload the per-customer records for the label statistics below.
# NOTE(review): this absolute path differs from the one passed to save_pickle
# earlier in the notebook — confirm both refer to the same artifact.
data = load_pickle('/media/hd03/axot_data/sar/data/cust_data.pkl')
len(data)

: 

In [None]:
# Customers with at least one labeled alert whose sar_flag equals 1.
sars = {
    cust_id
    for cust_id, v in data.items()
    if any(v.cust_data[idx].sar_flag == 1 for idx in v.train_mask)
}
len(sars)

: 

In [None]:
# For each customer in `sars`, count the labeled alerts with sar_flag == 1.
num1 = [
    sum(data[cust_id].cust_data[idx].sar_flag == 1 for idx in data[cust_id].train_mask)
    for cust_id in sars
]
sum(num1)

: 

In [None]:
# num_sar = []
num_len = []
num0 = []
num1 = []
for k, d in data.items():
    tmp0 = 0
    tmp1 = 0
    for idx in d.train_mask:
        tmp0 += d.cust_data[idx].sar_flag == 0
        tmp1 += d.cust_data[idx].sar_flag == 1
    num0.append(tmp0)
    num1.append(tmp1)
    num_len.append(len(d.cust_data))

: 

In [None]:
# One row per customer: counts of 0-labels, 1-labels and total rows.
df = pd.DataFrame(dict(num0=num0, num1=num1, num_len=num_len))

: 

In [None]:
# Summary statistics over all customers.
df.describe()

: 

In [None]:
# Summary restricted to customers with at least one positive label.
df.loc[df['num1'] > 0].describe()

: 

In [None]:
# Customer(s) with the largest number of stored rows.
df.loc[df['num_len'] == df['num_len'].max()]

: 

In [None]:
# Summary restricted to customers with no positive label.
df.loc[df['num1'] == 0].describe()

: 

In [None]:
# Gap between consecutive train_mask positions for each customer. The first
# entry of each customer is the position itself (gap measured from 0).
mask_ids = []
for v in data.values():
    previous = 0
    for idx in v.train_mask:
        mask_ids.append(idx - previous)
        previous = idx
pd.DataFrame({'mask_ids': mask_ids}).describe(percentiles=np.arange(.9, 1.01, 0.01))

: 

: 