In [23]:
#Base

import pandas as pd
import os

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import RFE

import time
from matplotlib import pyplot
from sklearn.metrics import ConfusionMatrixDisplay
import xgboost as xgb

import statsmodels.api as sm

from tqdm import tqdm


dataset_folder = os.path.join(os.getcwd(), 'dataset')


def df_from_csv(filename):
    """Load one file from ``dataset_folder`` into a DataFrame.

    Fields are split on a tab, a newline or a comma. The separator is a
    regular expression, which is why the pure-Python parser engine is used.
    """
    csv_path = os.path.join(dataset_folder, filename)
    frame = pd.read_csv(csv_path, delimiter='\t|\n|,', engine='python')
    return frame

# Raw input tables, all read from dataset_folder.
x_train = df_from_csv('ori_trainx.csv')
y_train = df_from_csv('ori_trainy.csv')
alert_date = df_from_csv('ori_alert_date.csv')
alert_cust = df_from_csv('ori_custinfo.csv')
# NOTE: the name `xdp` is re-bound later in the notebook to the contents of
# 'xdp_x_train.pkl'; calculate_tx() reads whichever frame the name holds.
xdp = df_from_csv('ori_xdp.csv')

# Shared, seeded oversampler used by SMOTE_oversampling().
smote = SMOTE(random_state=0)
# Enables the tqdm progress-bar variants of pandas apply (progress_apply).
tqdm.pandas(desc='Progress Bar')

def SMOTE_oversampling(input_x, input_y, test_size=0.3, random_state=None):
    """Oversample the training part of a train/test split with the shared SMOTE.

    The data is split first and only the training part is resampled; the
    held-out part is discarded by this helper.

    Args:
        input_x: feature DataFrame.
        input_y: label DataFrame containing a 'sar_flag' column.
        test_size: fraction of rows held out before resampling (default 0.3,
            the value that used to be hard-coded).
        random_state: seed for the split; None keeps the previous unseeded
            behaviour.

    Returns:
        (sm_data_x, sm_data_y): the resampled features and labels as
        DataFrames carrying the original column names.
    """
    x_train, _, y_train, _ = train_test_split(
        input_x, input_y, test_size=test_size, random_state=random_state
    )

    sm_data_x, sm_data_y = smote.fit_resample(x_train, y_train)
    sm_data_x = pd.DataFrame(data=sm_data_x, columns=x_train.columns)
    sm_data_y = pd.DataFrame(data=sm_data_y, columns=y_train.columns)

    # Count each class once instead of re-filtering the frame for every print.
    total = len(sm_data_x)
    n_non_sar = int((sm_data_y['sar_flag'] == 0).sum())
    n_sar = int((sm_data_y['sar_flag'] == 1).sum())

    print("length of oversampled data is ", total)
    print("Number of non-sar in oversampled data", n_non_sar)
    print("Number of sar", n_sar)
    # The labels previously read "no subscription"/"subscription", which does
    # not describe this data set; the classes are non-SAR and SAR.
    print("Proportion of non-sar data in oversampled data is ", n_non_sar / total)
    print("Proportion of sar data in oversampled data is ", n_sar / total)

    return sm_data_x, sm_data_y

def RFE_analysis(input_x, input_y, model):
    """Run recursive feature elimination and report which columns survive.

    Prints the candidate column names, the boolean support mask and the
    feature ranking, then returns the integer positions of the selected
    columns.
    """
    selector = RFE(model, step=300)
    flat_labels = input_y.values.ravel()  # fit on a flat label vector
    selector.fit(input_x, flat_labels)
    for report in (input_x.columns, selector.support_, selector.ranking_):
        print(report)
    return selector.get_support(indices=True)

def LR_analysis(input_x, input_y):
    """Fit a statsmodels logit of ``input_y`` on ``input_x`` and print its summary."""
    fitted = sm.Logit(input_y, input_x).fit()
    print(fitted.summary2())

def LR_training(input_x, input_y, LR_model, test_size=0.25, random_state=None):
    """Fit ``LR_model`` on a random split and print train/hold-out metrics.

    Despite the name, any estimator with ``fit``/``predict`` works.

    Args:
        input_x: feature matrix.
        input_y: labels; a single-column DataFrame, a Series or an array.
        LR_model: estimator to fit (fitted in place).
        test_size: hold-out fraction (default 0.25, the value that used to be
            hard-coded).
        random_state: seed for the split; None keeps the previous unseeded
            behaviour, pass an int for reproducible metrics.

    Returns:
        The fitted ``LR_model``.
    """
    import numpy as np  # local import: numpy is not imported at file level

    x_train, x_test, y_train, y_test = train_test_split(
        input_x, input_y, test_size=test_size, random_state=random_state
    )
    # Flatten single-column label frames: handing a column vector to fit() is
    # what triggers the "column_or_1d" conversion warning.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    LR_model.fit(x_train, y_train)
    y_train_predict = LR_model.predict(x_train)
    y_test_predict = LR_model.predict(x_test)

    train_acc = accuracy_score(y_train, y_train_predict)
    test_acc = accuracy_score(y_test, y_test_predict)
    print(f'{train_acc}  {test_acc}')

    # Confusion matrix on the hold-out part only.
    cm = confusion_matrix(y_true=y_test, y_pred=y_test_predict)
    print(cm)
    return LR_model

def LR_test(input_x, input_y, LR_model):
    """Score an already-fitted model on (input_x, input_y).

    Prints, in order: accuracy, the confusion matrix and the per-class
    classification report. Nothing is returned.
    """
    predictions = LR_model.predict(input_x)
    print(f'{accuracy_score(input_y, predictions)}')
    print(confusion_matrix(y_true=input_y, y_pred=predictions))
    print(classification_report(y_true=input_y, y_pred=predictions))

def calculate_tx(row):
    """Add transaction statistics up to the alert date to one alert row.

    Reads the module-level ``xdp`` frame and selects the customer's
    transactions whose ``tx_date`` is not after ``row['date']``. Amounts are
    multiplied by ``exchg_rate`` before aggregating.

    Rows whose ``risk_rank`` is missing are returned unchanged.

    Args:
        row: a Series with at least 'risk_rank', 'cust_id' and 'date'.

    Returns:
        The same ``row`` object, with 'total_tx', 'avg_tx', 'count_tx',
        'std_tx' and 'tx_per_day' added when statistics were computed.
    """
    if pd.isna(row['risk_rank']):
        return row

    history = xdp[(xdp['tx_date'] <= row['date']) & (xdp['cust_id'] == row['cust_id'])]
    amounts = history['tx_amt'] * history['exchg_rate']

    row['total_tx'] = amounts.sum()
    row['avg_tx'] = amounts.mean()
    row['count_tx'] = amounts.count()
    row['std_tx'] = amounts.std()
    # `date` can be 0, so the elapsed span is date + 1 days. The previous form
    # `count / date + 1` divided by zero on day 0 and added 1 to every rate.
    row['tx_per_day'] = row['count_tx'] / (row['date'] + 1)
    return row

def generate_train_dataset(X, y, model):
    """Resample (X, y) with ``model`` and return a seeded 75/25 train/test split.

    ``model`` must expose ``fit_resample``; the result is whatever
    ``train_test_split`` returns for the resampled pair.
    """
    features, labels = model.fit_resample(X, y)
    split = train_test_split(features, labels, test_size=0.25, random_state=123)
    return split

# XGBoost hyper-parameters; the four tunables stay available as module names.
eta, max_depth = 0.1, 3
subsample, colsample_bytree = 0.9, 1

params_1 = dict(
    booster="gbtree",
    eval_metric="auc",
    eta=eta,
    tree_method='auto',
    max_depth=max_depth,
    sampling_method="uniform",
    subsample=subsample,
    colsample_bytree=colsample_bytree,
)

In [24]:
# Join data for training
# Feature tables stored as pickles (presumably produced by earlier
# preprocessing — confirm where they come from).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.

ccba = pd.read_pickle('aggregated_ccba_cust.pkl')
cdtx = pd.read_pickle('cdtx_agg.pkl')
# Re-binds `xdp`, which the first cell loaded from 'ori_xdp.csv'.
xdp = pd.read_pickle('xdp_x_train.pkl')
remit = pd.read_pickle('cust_remit_raw.pkl')

In [25]:
# Preview the first rows of the xdp feature table loaded from the pickle.
xdp.head()

Unnamed: 0,alert_key,date,sar_flag,cust_id,risk_rank,total_asset,AGE,occupation_code_0,occupation_code_1,occupation_code_2,...,db_2_total_tx,db_2_avg_tx,db_2_count_tx,db_2_std_tx,db_2_tx_per_day,db_3_total_tx,db_3_avg_tx,db_3_count_tx,db_3_std_tx,db_3_tx_per_day
0,171189,0,0,972ee157e63316e8a50dd489bc93730a3ee8a8959d5c6b...,1,375576.0,4,0,0,0,...,0.0,,0,,0.0,0.0,,0,,0.0
1,171202,0,0,a10ab33f90926fb18d7bb5e78034d7f04a1fbed95b7951...,3,2717416.0,2,0,0,0,...,0.0,,0,,0.0,156902.0,156902.0,1,,1.0
2,171599,0,0,3433ecc068ed1c9e2f5037cab5d42d7b901e9bd624c1fb...,1,326517.0,4,0,0,0,...,0.0,,0,,0.0,0.0,,0,,0.0
3,171737,0,0,a0861608615a9365d90f4ba0a813c0ea0471987f925c8b...,3,1014759.0,4,0,0,0,...,0.0,,0,,0.0,0.0,,0,,0.0
4,171142,0,0,a39fea9aec90969fe66a2b2b4d1b86368a2d38e8b8d4bf...,3,241719.0,3,0,0,0,...,0.0,,0,,0.0,0.0,,0,,0.0


In [26]:
# Start from the alerts present in both the feature and label files, attach
# each alert's customer id, keep only the two keys, then left-join every
# feature table and sort the columns alphabetically.
train = (
    df_from_csv('ori_trainx.csv')
    .merge(df_from_csv('ori_trainy.csv'), on='alert_key', how='inner')
    .merge(alert_cust, on='alert_key', how='left')
    .loc[:, ['alert_key', 'cust_id']]
    .merge(ccba, on='cust_id', how='left')
    .merge(cdtx, on='cust_id', how='left')
    .merge(xdp, on='alert_key', how='left')
    .merge(remit, on='alert_key', how='left')
    .pipe(lambda merged: merged.reindex(sorted(merged.columns), axis=1))
)
train

Unnamed: 0,AGE,alert_key,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,4,171189,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,...,,688721.0,375576.0,375576.0,4.886652e+05,27.0,45.0,,496.0,
1,2,171202,,,,,,,,,...,,,2717416.0,2717416.0,2.621840e+06,,,,,
2,4,171599,,,,,,,,,...,,,326517.0,326517.0,3.265170e+05,,,,,
3,4,171737,,,,1893.333333,,,,1.500000,...,,5680.0,1014759.0,1014759.0,1.275499e+05,,,,3.0,
4,3,171142,,,,7048.368421,,,,1.055556,...,,133919.0,241719.0,241719.0,2.212473e+05,,,,19.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,352132,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,...,,2583949.0,3218731.0,3218731.0,2.361632e+06,65.0,578.0,,2614.0,364459.0
23902,3,352125,,,,,,,,,...,,,928963.0,928963.0,5.041325e+05,,,,,22127.0
23903,6,352080,,,,,,,,,...,,,69080.0,69080.0,7.500171e+04,,,,,43676.0
23904,6,352075,,,,,,,,,...,,,262604.0,262604.0,3.333440e+05,,,,,


In [27]:
# Per-row count of "empty" cells: NaN entries plus entries equal to 0.
train['bad_value'] = train.isnull().sum(axis=1) + (train == 0).astype(int).sum(axis=1)

In [28]:
# Remove the suffixed customer-id columns produced by the merges and the date
# column, then save the joined table (alert_key still included) to disk.
train = train.drop(['cust_id_x', 'cust_id_y', 'date'], axis=1)
train.to_pickle('aggregate.pkl')

In [29]:
# alert_key is dropped here, after the pickle above was written with it.
train = train.drop(['alert_key'], axis=1)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum,bad_value
0,4,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,,...,688721.0,375576.0,375576.0,4.886652e+05,27.0,45.0,,496.0,,74
1,2,,,,,,,,,,...,,2717416.0,2717416.0,2.621840e+06,,,,,,80
2,4,,,,,,,,,,...,,326517.0,326517.0,3.265170e+05,,,,,,92
3,4,,,,1893.333333,,,,1.500000,,...,5680.0,1014759.0,1014759.0,1.275499e+05,,,,3.0,,82
4,3,,,,7048.368421,,,,1.055556,,...,133919.0,241719.0,241719.0,2.212473e+05,,,,19.0,,81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,932.000000,...,2583949.0,3218731.0,3218731.0,2.361632e+06,65.0,578.0,,2614.0,364459.0,63
23902,3,,,,,,,,,,...,,928963.0,928963.0,5.041325e+05,,,,,22127.0,81
23903,6,,,,,,,,,360378.928571,...,,69080.0,69080.0,7.500171e+04,,,,,43676.0,80
23904,6,,,,,,,,,369905.885463,...,,262604.0,262604.0,3.333440e+05,,,,,,76


In [30]:
# Recompute bad_value now that the id/date columns are gone.
# NOTE: the existing bad_value column is itself part of the frame being counted.
train['bad_value'] = train.isnull().sum(axis=1) + (train == 0).astype(int).sum(axis=1)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum,bad_value
0,4,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,,...,688721.0,375576.0,375576.0,4.886652e+05,27.0,45.0,,496.0,,73
1,2,,,,,,,,,,...,,2717416.0,2717416.0,2.621840e+06,,,,,,79
2,4,,,,,,,,,,...,,326517.0,326517.0,3.265170e+05,,,,,,91
3,4,,,,1893.333333,,,,1.500000,,...,5680.0,1014759.0,1014759.0,1.275499e+05,,,,3.0,,81
4,3,,,,7048.368421,,,,1.055556,,...,133919.0,241719.0,241719.0,2.212473e+05,,,,19.0,,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,932.000000,...,2583949.0,3218731.0,3218731.0,2.361632e+06,65.0,578.0,,2614.0,364459.0,63
23902,3,,,,,,,,,,...,,928963.0,928963.0,5.041325e+05,,,,,22127.0,81
23903,6,,,,,,,,,360378.928571,...,,69080.0,69080.0,7.500171e+04,,,,,43676.0,80
23904,6,,,,,,,,,369905.885463,...,,262604.0,262604.0,3.333440e+05,,,,,,76


In [31]:
# Keep an independent copy of the frame. The bad_value row filter below is
# commented out, so train and ori_train currently hold the same rows.
ori_train = train.copy()
# train = train.loc[train['bad_value']<41]
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum,bad_value
0,4,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,,...,688721.0,375576.0,375576.0,4.886652e+05,27.0,45.0,,496.0,,73
1,2,,,,,,,,,,...,,2717416.0,2717416.0,2.621840e+06,,,,,,79
2,4,,,,,,,,,,...,,326517.0,326517.0,3.265170e+05,,,,,,91
3,4,,,,1893.333333,,,,1.500000,,...,5680.0,1014759.0,1014759.0,1.275499e+05,,,,3.0,,81
4,3,,,,7048.368421,,,,1.055556,,...,133919.0,241719.0,241719.0,2.212473e+05,,,,19.0,,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,932.000000,...,2583949.0,3218731.0,3218731.0,2.361632e+06,65.0,578.0,,2614.0,364459.0,63
23902,3,,,,,,,,,,...,,928963.0,928963.0,5.041325e+05,,,,,22127.0,81
23903,6,,,,,,,,,360378.928571,...,,69080.0,69080.0,7.500171e+04,,,,,43676.0,80
23904,6,,,,,,,,,369905.885463,...,,262604.0,262604.0,3.333440e+05,,,,,,76


In [32]:
# Zero-fill missing values in every column except the first one
# (iloc[:, 1:] skips column 0, which is AGE after the alphabetical sort).
ori_train.iloc[:, 1:] = ori_train.iloc[:, 1:].fillna(0)
train.iloc[:, 1:] = train.iloc[:, 1:].fillna(0)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum,bad_value
0,4,9044.740741,3033.355556,0.0,1388.550403,1.350000,1.097561,0.0,2.147186,0.000000,...,688721.0,375576.0,375576.0,4.886652e+05,27.0,45.0,0.0,496.0,0.0,73
1,2,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,2717416.0,2717416.0,2.621840e+06,0.0,0.0,0.0,0.0,0.0,79
2,4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,326517.0,326517.0,3.265170e+05,0.0,0.0,0.0,0.0,0.0,91
3,4,0.000000,0.000000,0.0,1893.333333,0.000000,0.000000,0.0,1.500000,0.000000,...,5680.0,1014759.0,1014759.0,1.275499e+05,0.0,0.0,0.0,3.0,0.0,81
4,3,0.000000,0.000000,0.0,7048.368421,0.000000,0.000000,0.0,1.055556,0.000000,...,133919.0,241719.0,241719.0,2.212473e+05,0.0,0.0,0.0,19.0,0.0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,0.0,988.503826,1.160714,2.513043,0.0,6.668367,932.000000,...,2583949.0,3218731.0,3218731.0,2.361632e+06,65.0,578.0,0.0,2614.0,364459.0,63
23902,3,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,928963.0,928963.0,5.041325e+05,0.0,0.0,0.0,0.0,22127.0,81
23903,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,360378.928571,...,0.0,69080.0,69080.0,7.500171e+04,0.0,0.0,0.0,0.0,43676.0,80
23904,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,369905.885463,...,0.0,262604.0,262604.0,3.333440e+05,0.0,0.0,0.0,0.0,0.0,76


In [33]:
# bad_value is a row-quality helper, not a model feature; remove it from both
# frames before the feature/label split.
ori_train = ori_train.drop(['bad_value'], axis=1)
train = train.drop(['bad_value'], axis=1)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,4,9044.740741,3033.355556,0.0,1388.550403,1.350000,1.097561,0.0,2.147186,0.000000,...,0.0,688721.0,375576.0,375576.0,4.886652e+05,27.0,45.0,0.0,496.0,0.0
1,2,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,2717416.0,2717416.0,2.621840e+06,0.0,0.0,0.0,0.0,0.0
2,4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,326517.0,326517.0,3.265170e+05,0.0,0.0,0.0,0.0,0.0
3,4,0.000000,0.000000,0.0,1893.333333,0.000000,0.000000,0.0,1.500000,0.000000,...,0.0,5680.0,1014759.0,1014759.0,1.275499e+05,0.0,0.0,0.0,3.0,0.0
4,3,0.000000,0.000000,0.0,7048.368421,0.000000,0.000000,0.0,1.055556,0.000000,...,0.0,133919.0,241719.0,241719.0,2.212473e+05,0.0,0.0,0.0,19.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,0.0,988.503826,1.160714,2.513043,0.0,6.668367,932.000000,...,0.0,2583949.0,3218731.0,3218731.0,2.361632e+06,65.0,578.0,0.0,2614.0,364459.0
23902,3,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,928963.0,928963.0,5.041325e+05,0.0,0.0,0.0,0.0,22127.0
23903,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,360378.928571,...,0.0,0.0,69080.0,69080.0,7.500171e+04,0.0,0.0,0.0,0.0,43676.0
23904,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,369905.885463,...,0.0,0.0,262604.0,262604.0,3.333440e+05,0.0,0.0,0.0,0.0,0.0


In [34]:
# Training
# Split features from the 'sar_flag' label. Each frame is masked with its own
# columns, so the split stays correct even if the two frames ever diverge.
ori_train_x = ori_train.loc[:, ori_train.columns != 'sar_flag']
ori_train_y = ori_train.loc[:, ori_train.columns == 'sar_flag']
train_x = train.loc[:, train.columns != 'sar_flag']
train_y = train.loc[:, train.columns == 'sar_flag']
train_x

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,4,9044.740741,3033.355556,0.0,1388.550403,1.350000,1.097561,0.0,2.147186,0.000000,...,0.0,688721.0,375576.0,375576.0,4.886652e+05,27.0,45.0,0.0,496.0,0.0
1,2,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,2717416.0,2717416.0,2.621840e+06,0.0,0.0,0.0,0.0,0.0
2,4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,326517.0,326517.0,3.265170e+05,0.0,0.0,0.0,0.0,0.0
3,4,0.000000,0.000000,0.0,1893.333333,0.000000,0.000000,0.0,1.500000,0.000000,...,0.0,5680.0,1014759.0,1014759.0,1.275499e+05,0.0,0.0,0.0,3.0,0.0
4,3,0.000000,0.000000,0.0,7048.368421,0.000000,0.000000,0.0,1.055556,0.000000,...,0.0,133919.0,241719.0,241719.0,2.212473e+05,0.0,0.0,0.0,19.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,0.0,988.503826,1.160714,2.513043,0.0,6.668367,932.000000,...,0.0,2583949.0,3218731.0,3218731.0,2.361632e+06,65.0,578.0,0.0,2614.0,364459.0
23902,3,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,928963.0,928963.0,5.041325e+05,0.0,0.0,0.0,0.0,22127.0
23903,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,360378.928571,...,0.0,0.0,69080.0,69080.0,7.500171e+04,0.0,0.0,0.0,0.0,43676.0
23904,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,369905.885463,...,0.0,0.0,262604.0,262604.0,3.333440e+05,0.0,0.0,0.0,0.0,0.0


In [35]:
#normalize
from sklearn import preprocessing

def normalize_mms(input_df, scaler=None):
    """Min-max scale every column of ``input_df``, keeping its index and columns.

    Args:
        input_df: numeric DataFrame to scale.
        scaler: optional, already-fitted scaler exposing ``transform``. Pass
            the scaler fitted on the training features to scale validation or
            test data with the training minima/maxima. When omitted, a fresh
            ``MinMaxScaler`` is fitted on ``input_df`` itself, which is the
            original behaviour.

    Returns:
        A new DataFrame with the same index and columns as ``input_df``.
    """
    if scaler is None:
        scaled = preprocessing.MinMaxScaler().fit_transform(input_df)
    else:
        # Reuse existing statistics; do not refit on the data being scaled.
        scaled = scaler.transform(input_df)
    return pd.DataFrame(scaled, index=input_df.index, columns=input_df.columns)

In [36]:
# Min-max scale the training features; the scaler is fitted on this frame itself.
train_x = normalize_mms(train_x)
train_x

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,0.4,0.005648,0.018591,0.0,0.000110,0.004122,0.055273,0.0,0.031604,0.000308,...,0.0,0.001897,0.005085,0.005085,0.009321,0.000229,0.011081,0.0,0.023780,0.000000
1,0.2,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000308,...,0.0,0.000000,0.036790,0.036790,0.050010,0.000000,0.000000,0.0,0.000000,0.000000
2,0.4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000308,...,0.0,0.000000,0.004421,0.004421,0.006228,0.000000,0.000000,0.0,0.000000,0.000000
3,0.4,0.000000,0.000000,0.0,0.000151,0.000000,0.000000,0.0,0.022078,0.000308,...,0.0,0.000016,0.013738,0.013738,0.002433,0.000000,0.000000,0.0,0.000144,0.000000
4,0.3,0.000000,0.000000,0.0,0.000560,0.000000,0.000000,0.0,0.015536,0.000308,...,0.0,0.000369,0.003273,0.003273,0.004220,0.000000,0.000000,0.0,0.000911,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,0.3,0.002705,0.007501,0.0,0.000079,0.003544,0.126556,0.0,0.098149,0.000354,...,0.0,0.007115,0.043577,0.043577,0.045047,0.000551,0.142329,0.0,0.125324,0.008875
23902,0.3,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000308,...,0.0,0.000000,0.012577,0.012577,0.009616,0.000000,0.000000,0.0,0.000000,0.000539
23903,0.6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.018332,...,0.0,0.000000,0.000935,0.000935,0.001431,0.000000,0.000000,0.0,0.000000,0.001064
23904,0.6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.018809,...,0.0,0.000000,0.003555,0.003555,0.006358,0.000000,0.000000,0.0,0.000000,0.000000


In [37]:
# Per-column check for NaN values left after scaling.
train_x.isna().any()

AGE                        False
avg_amt_nontw_nonntd       False
avg_amt_nontw_ntd          False
avg_amt_tw_nonntd          False
avg_amt_tw_ntd             False
                           ...  
total_freq_nontw_nonntd    False
total_freq_nontw_ntd       False
total_freq_tw_nonntd       False
total_freq_tw_ntd          False
trade_sum                  False
Length: 97, dtype: bool

In [38]:
# Rows that still contain a NaN; an empty frame means there are none.
train_x[train_x.isna().any(axis=1)]

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum


In [39]:
# sm_x, sm_y = SMOTE_oversampling(train_x, train_y)

# Resample the whole training set with the seeded SMOTETomek combiner.
# NOTE(review): this happens before LR_training() makes its train/test split,
# so resampled rows derived from hold-out rows can land in the training part;
# the hold-out metrics printed for sm_x/sm_y are likely optimistic.
smt = SMOTETomek(random_state=42)
sm_x, sm_y = smt.fit_resample(train_x, train_y)

In [40]:
# Classifier built from params_1; `missing=-99` is passed as the missing-value
# marker. NOTE(review): features were zero-filled earlier, so -99 presumably
# never occurs in the data — confirm.
XG = xgb.XGBClassifier(**params_1, missing=-99)
# rfe_columns = RFE_analysis(sm_x, sm_y, XG)

In [41]:
# RFE-based column selection is disabled; show the resampled features as-is.
# sm_x = sm_x[sm_x.columns[rfe_columns]]
sm_x

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,0.400000,0.005648,0.018591,0.0,0.000110,0.004122,0.055273,0.0,0.031604,0.000308,...,0.0,0.001897,0.005085,0.005085,0.009321,0.000229,0.011081,0.0,0.023780,0.000000
1,0.200000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000308,...,0.0,0.000000,0.036790,0.036790,0.050010,0.000000,0.000000,0.0,0.000000,0.000000
2,0.400000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000308,...,0.0,0.000000,0.004421,0.004421,0.006228,0.000000,0.000000,0.0,0.000000,0.000000
3,0.400000,0.000000,0.000000,0.0,0.000151,0.000000,0.000000,0.0,0.022078,0.000308,...,0.0,0.000016,0.013738,0.013738,0.002433,0.000000,0.000000,0.0,0.000144,0.000000
4,0.300000,0.000000,0.000000,0.0,0.000560,0.000000,0.000000,0.0,0.015536,0.000308,...,0.0,0.000369,0.003273,0.003273,0.004220,0.000000,0.000000,0.0,0.000911,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47315,0.485870,0.000000,0.000000,0.0,0.000122,0.000000,0.000000,0.0,0.018686,0.005609,...,0.0,0.000674,0.025197,0.025197,0.035500,0.000000,0.000000,0.0,0.007819,0.000000
47316,0.351037,0.000000,0.017786,0.0,0.000183,0.000000,0.025702,0.0,0.024051,0.000571,...,0.0,0.004728,0.005338,0.005338,0.009643,0.000000,0.000251,0.0,0.018254,0.000000
47317,0.300000,0.000000,0.004686,0.0,0.000130,0.000000,0.051383,0.0,0.020810,0.017825,...,0.0,0.000787,0.000125,0.000125,0.000176,0.000000,0.003630,0.0,0.010877,0.083501
47318,0.400000,0.000000,0.000000,0.0,0.000440,0.000000,0.000000,0.0,0.009347,0.001652,...,0.0,0.000183,0.017222,0.017222,0.024264,0.000000,0.000000,0.0,0.000365,0.000000


In [42]:
# Baseline: fit on the original (not resampled) features and labels.
XG = LR_training(train_x, train_y, XG)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


0.9932511573428524  0.992136523339468
[[5925    2]
 [  45    5]]


In [43]:
# Fit the same estimator object on the resampled data.
# NOTE(review): presumably this replaces the fit from the previous cell rather
# than continuing it — confirm.
XG = LR_training(sm_x, sm_y, XG)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


0.9723302338686954  0.9703296703296703
[[5684  229]
 [ 122 5795]]


In [44]:
# Training Evaluation: DONT USE RFE IN RF
# train_x_rfe = train_x[train_x.columns[rfe_columns]]
# Scores the full original training set. NOTE(review): the model was fitted on
# data resampled from these same rows, so this is not an unbiased estimate.
LR_test(train_x, train_y, XG)


0.9584204802141721
[[22751   921]
 [   73   161]]
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     23672
           1       0.15      0.69      0.24       234

    accuracy                           0.96     23906
   macro avg       0.57      0.82      0.61     23906
weighted avg       0.99      0.96      0.97     23906



In [None]:
# Fine-tuning test


In [None]:
# ori_train_x is the un-normalised feature copy; scale it with a scaler fitted
# on itself and score the fitted model on it.
ori_train_x_normal = normalize_mms(ori_train_x)
LR_test(ori_train_x_normal, ori_train_y, XG)

In [None]:
# xdp features for the alerts to be scored (presumably the test-set
# counterpart of 'xdp_x_train.pkl' — confirm).
test_xdp = pd.read_pickle('xdp_x_test.pkl')

In [None]:
# Test
# Build the scoring frame from the submission template and the feature tables.
test = df_from_csv('submit_format.csv')
test = test.drop(['probability'], axis=1)
test = test.merge(test_xdp, on='alert_key', how='left')
test = test.merge(ccba, on='cust_id', how='left')
test = test.merge(cdtx, on='cust_id', how='left')
test = test.merge(remit, on='alert_key', how='left')
test = test.drop(['alert_key', 'cust_id', 'date'], axis=1)

# Rows with too many NaN/zero cells are not scored at all.
test['bad_value'] = test.isnull().sum(axis=1) + (test == 0).astype(int).sum(axis=1)
test = test.loc[test['bad_value'] < 77]
test = test.fillna(0)
test = test.drop(['bad_value'], axis=1)

# Align to the training features: same columns, same order. Training columns
# that are absent from the test data are reported and filled with 0.
missing_in_test = [col for col in ori_train_x.columns if col not in test.columns]
if missing_in_test:
    print('Columns absent from test, filled with 0:', missing_in_test)
test = test.reindex(columns=ori_train_x.columns, fill_value=0)

# Scale with the minima/maxima of the TRAINING features. Fitting a new scaler
# on the test set would put the two sets on different scales, and the model's
# split thresholds were learned on the training scale.
train_scaler = preprocessing.MinMaxScaler().fit(ori_train_x)
test = pd.DataFrame(train_scaler.transform(test), index=test.index, columns=test.columns)
test

In [None]:
#handle missing column
# Reindex to the exact training feature set instead of appending hard-coded
# columns. Appending put the new columns last, so the feature order differed
# from training. Any training column absent from the test data is added as 0.
test = test.reindex(columns=train_x.columns, fill_value=0)

In [None]:
# Keep only the second predict_proba column (named 'probability'), indexed
# like the scored rows.
# y_test = RF_random.predict_proba(test)
y_test = pd.DataFrame({'probability': XG.predict_proba(test)[:, 1]}, index=test.index)
# y_test['probability'] = round(y_test['probability'], 6)
y_test

In [None]:
# Inspect the scores in ascending order (lowest probabilities first).
y_test.sort_values(by='probability', ascending=True)

In [None]:
# Replace exact-zero scores with 0.000001.
# NOTE(review): presumably so that no scored alert has probability 0 — confirm.
y_test.loc[y_test['probability'] == 0, 'probability'] = 0.000001
y_test.sort_values(by='probability', ascending=True)

In [None]:
# Assemble the submission: scores are attached to alert_key by row index.
xy_test = pd.concat([test, y_test], axis=1)
submit = df_from_csv('submit_format.csv')
submit = submit.drop(columns=['probability'])
submit = pd.concat([submit, xy_test], axis=1)
submit = submit[['alert_key','probability']]
# Rows without a prediction (those filtered out before scoring) get 0.
submit['probability'] = submit['probability'].fillna(0)
submit = submit.sort_values(by='probability', ascending=False)
# index=False: write only alert_key and probability, matching the columns of
# submit_format.csv, instead of adding the row index as an unnamed column.
submit.to_csv('final_submit.csv', index=False)
submit

In [None]:
# Sanity check: report per column whether the submission contains any NaN.
submit.isnull().any()

In [None]:
# Sanity check: list rows that are exact duplicates of another row, if any.
submit.loc[submit.duplicated(keep=False)]