# Pipeline Overview
### 1 to 1 relation
1. Merge core data
2. Filter customer info (keep real SAR and non-SAR)

#### Repeated process
3. Aggregation & Imputation
4. Resample (function)
5. Fit (function)
6. Evaluate
7. Submission

## 1. Merge Core Data

In [3]:
import pandas as pd

# Locations of the raw competition CSV files (relative to the notebook).
CUSTINFO_PATH = 'dataset/train_first/public_train_x_custinfo_full_hashed.csv'
PDATE_PATH = 'dataset/train_first/public_x_alert_date.csv'
TDATE_PATH = 'dataset/train_first/train_x_alert_date.csv'
ANSWER_PATH = 'dataset/train_first/train_y_answer.csv'
SAMPLE_PATH = 'dataset/train_first/sample_submission.csv'

# Read every file into its own DataFrame, in the same order as the paths above.
cinfo, pdate, tdate, answer, sample = (
    pd.read_csv(csv_path)
    for csv_path in (CUSTINFO_PATH, PDATE_PATH, TDATE_PATH, ANSWER_PATH, SAMPLE_PATH)
)

In [4]:
# Attach the alert date, the split label and the target to the customer info.

# Tag each alert-date table with the split it came from. `assign` overwrites an
# existing 'data_label' column, so re-running this step does not append a
# second, duplicate 'data_label' column the way
# `insert(..., allow_duplicates=True)` did.
pdate = pdate.assign(data_label="test")
tdate = tdate.assign(data_label="train")
date = pd.concat([pdate, tdate], axis=0)

# Left joins keep every row of `cinfo`; rows whose 'alert_key' has no match in
# `answer` end up with NaN in the columns coming from `answer`.
# NOTE: `cinfo` is rebound here, so this cell must be run exactly once after
# the cell that loads `cinfo` from disk.
cinfo = cinfo.merge(date, on='alert_key', how='left')
cinfo = cinfo.merge(answer, on='alert_key', how='left')

In [22]:
# Summary statistics (count / mean / std / quartiles) of the numeric columns of cinfo.
# NOTE(review): the saved output below lists the *_encoding columns, which are
# only added to cinfo by a later cell, so this cell was last executed out of
# order; a fresh top-to-bottom run will show fewer columns here.
cinfo.describe()

Unnamed: 0,alert_key,risk_rank,occupation_code,total_asset,AGE,date,sar_flag,AGE_encoding,risk_rank_encoding,date_encoding,occupation_code_encoding
count,25751.0,25751.0,25635.0,25751.0,25751.0,25751.0,23906.0,25751.0,25751.0,25751.0,25751.0
mean,265685.626927,1.610578,14.251063,713742.7,3.633024,198.164032,0.009788,0.980928,0.983034,0.908703,0.98526
std,58623.840868,0.906222,4.690025,2435461.0,1.309948,118.263229,0.098453,0.270991,0.495489,1.249958,0.62507
min,171142.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,212536.0,1.0,12.0,7508.0,3.0,92.0,0.0,0.837438,0.214133,0.0,0.758534
50%,266346.0,1.0,15.0,128880.0,3.0,210.0,0.0,1.139073,1.293184,0.0,0.941968
75%,316658.5,3.0,19.0,597231.5,4.0,295.0,0.0,1.139073,1.293184,1.369863,1.062135
max,365073.0,3.0,20.0,73863210.0,10.0,393.0,1.0,2.352941,1.442308,5.747126,6.25


In [5]:
import copy

# Training subset: an independent copy of cinfo restricted to the rows whose
# 'sar_flag' is present (rows without a flag are left out).
has_flag = ~cinfo['sar_flag'].isna()
train_cinfo = copy.deepcopy(cinfo)[has_flag]
train_cinfo.head()

Unnamed: 0,alert_key,cust_id,risk_rank,occupation_code,total_asset,AGE,date,data_label,sar_flag
1845,171142,a39fea9aec90969fe66a2b2b4d1b86368a2d38e8b8d4bf...,3,12.0,241719.0,3,0,train,0.0
1846,171152,7e42b5dca9b28ee8e5545beb834361e90e6197d176b389...,3,13.0,599497.0,6,0,train,0.0
1847,171177,a6cdf6302aead77112013168c6d546d2df3bcb551956d2...,1,19.0,51160.0,4,0,train,0.0
1848,171178,1a3efa69705f611c7ef2384a715c8142e2ee801cfec9df...,3,9.0,3634343.0,6,0,train,0.0
1849,171180,67f8cbb64dd3d447e992b1b299e0ceed3372188e47c88e...,1,17.0,4076287.0,4,0,train,0.0


In [6]:
# column name -> {category value -> percentage of rows flagged as SAR}
mean_encoding_map = {}


def mean_encoding(key, data=None, target='sar_flag'):
    """Compute the mean (target) encoding of one column and register it.

    For every distinct value of ``data[key]`` the encoding is the percentage
    (0-100) of rows with that value whose ``target`` equals ``1.0``. A missing
    value (NaN) in ``key`` is always encoded as ``0``.

    Parameters
    ----------
    key : str
        Name of the column to encode.
    data : pandas.DataFrame, optional
        Frame holding ``key`` and ``target``. Defaults to the notebook-level
        ``train_cinfo`` so existing ``mean_encoding('AGE')`` calls keep working.
    target : str, optional
        Name of the 0/1 label column. Defaults to ``'sar_flag'``.

    Returns
    -------
    dict
        ``{value: percentage}`` in order of first appearance of each value.
        The same dict is also stored in ``mean_encoding_map[key]``.
    """
    frame = train_cinfo if data is None else data

    # One pass over the data: the mean of the boolean "is SAR" indicator per
    # group is the SAR ratio. `sort=False` is only for speed; the final key
    # order is driven by `pd.unique` below. NaN groups are dropped by groupby
    # and handled explicitly afterwards.
    is_sar = frame[target] == 1.0
    sar_pct = is_sar.groupby(frame[key], sort=False).mean() * 100

    key_map = {}
    for k in pd.unique(frame[key]):
        # Missing categories carry no information: encode them as 0.
        key_map[k] = 0 if pd.isna(k) else float(sar_pct.loc[k])

    mean_encoding_map[key] = key_map
    return key_map


# Register the mean encoding of each categorical column, in this order.
for encoded_column in ('AGE', 'risk_rank', 'date', 'occupation_code'):
    mean_encoding(encoded_column)

# Date 365 is given an explicit encoding of 0.
# NOTE(review): presumably 365 is a date that never occurs in the training
# rows; the cell that applies the encodings already falls back to 0 for any
# value missing from the map, so this entry may be redundant — confirm.
mean_encoding_map['date'][365] = 0
mean_encoding_map

{'AGE': {3: 1.1390728476821192,
  6: 0.5569306930693069,
  4: 0.8374384236453201,
  5: 0.7261724659606656,
  2: 1.2917115177610334,
  9: 0.0,
  1: 1.1363636363636365,
  7: 0.22522522522522523,
  8: 2.3529411764705883,
  10: 0.0,
  0: 0.0},
 'risk_rank': {3: 0.21413276231263384,
  1: 1.2931842318985445,
  2: 1.4423076923076923,
  0: 0.0},
 'date': {0: 1.1363636363636365,
  5: 0.6578947368421052,
  6: 0.0,
  7: 1.2048192771084338,
  8: 0.0,
  11: 0.0,
  12: 2.380952380952381,
  13: 0.4878048780487805,
  14: 2.127659574468085,
  15: 0.0,
  18: 0.9900990099009901,
  19: 1.1494252873563218,
  20: 0.0,
  21: 1.3157894736842104,
  22: 0.0,
  25: 0.847457627118644,
  26: 4.477611940298507,
  27: 2.197802197802198,
  28: 1.1627906976744187,
  32: 2.2058823529411766,
  33: 1.2987012987012987,
  34: 1.098901098901099,
  35: 1.2658227848101267,
  36: 3.7037037037037033,
  39: 1.8518518518518516,
  40: 1.2658227848101267,
  41: 3.7037037037037033,
  42: 2.197802197802198,
  43: 5.747126436781609,
 

In [7]:
# Add one "<column>_encoding" feature to cinfo per mean-encoded column.
# Values that are missing (NaN) or absent from the encoding map become 0.
#
# `Series.map` keeps cinfo's own index and `assign` overwrites a column that
# already exists, so this cell can be re-run safely. The previous positional
# `pd.concat(..., axis=1)` appended a duplicate "*_encoding" column on every
# re-run and only lined the rows up when cinfo had a default 0..n-1 index.
for k, v in mean_encoding_map.items():
    column_name = k + "_encoding"
    cinfo = cinfo.assign(**{column_name: cinfo[k].map(v).fillna(0)})


Unnamed: 0,alert_key,cust_id,risk_rank,occupation_code,total_asset,AGE,date,data_label,sar_flag,AGE_encoding,risk_rank_encoding,date_encoding,occupation_code_encoding
0,352249,82595ac69158ae08d34156784bdec0d9e2ca5b242b6d2a...,1,19.0,1465816.0,7,365,test,,0.225225,1.293184,0.0,0.941968
1,352253,b212d14cb35676926682b2cf849e295d948888f556c07e...,1,2.0,98177.0,2,365,test,,1.291712,1.293184,0.0,1.282051
2,352254,e5b0002791c7852644a2730abeaa893cdf14a072ef7812...,1,19.0,2052922.0,7,365,test,,0.225225,1.293184,0.0,0.941968
3,352280,74214c478dc6519fbefe4bc31693865bdcd698ab974b64...,3,15.0,201906.0,5,365,test,,0.726172,0.214133,0.0,1.521739
4,352282,0340e7611f0d82c3cb87e6194fa14bb2ccf8afbf1b3418...,1,12.0,7450.0,5,365,test,,0.726172,1.293184,0.0,0.758534


In [11]:
# Sanity check: the number of distinct alert keys should equal the row count
# (first element of the shape) if every alert appears exactly once.
cinfo['alert_key'].nunique(dropna=False), cinfo.shape

(25751, (25751, 13))

In [13]:
# Remove the split label, the target and the customer identifier so that only
# the alert key and feature columns remain.
drop_feature = ['data_label', 'sar_flag', 'cust_id']
cinfo_encoding = cinfo.drop(columns=drop_feature)

In [19]:
# Write cinfo_encoding to a pickle file in the current working directory.
cinfo_encoding.to_pickle("encoding_all.pkl")

In [20]:
# Drop the raw categorical columns, keeping the remaining columns (including
# the "*_encoding" features derived from them), then save and show the result.
drop_feature = ["AGE", "date", "risk_rank", "occupation_code"]

encoding_significant = cinfo_encoding.drop(columns=drop_feature)
encoding_significant.to_pickle("encoding_significant.pkl")
encoding_significant

Unnamed: 0,alert_key,total_asset,AGE_encoding,risk_rank_encoding,date_encoding,occupation_code_encoding
0,352249,1465816.0,0.225225,1.293184,0.0,0.941968
1,352253,98177.0,1.291712,1.293184,0.0,1.282051
2,352254,2052922.0,0.225225,1.293184,0.0,0.941968
3,352280,201906.0,0.726172,0.214133,0.0,1.521739
4,352282,7450.0,0.726172,1.293184,0.0,0.758534
...,...,...,...,...,...,...
25746,352123,12207.0,1.291712,1.293184,0.0,1.062135
25747,352124,259985.0,0.837438,1.293184,0.0,1.062135
25748,352125,928963.0,1.139073,0.214133,0.0,0.941968
25749,352128,21647.0,0.837438,0.214133,0.0,0.941968
